Python 数据科学 学习笔记 - 钢钢更新

Numpy

数组Array

初始化数组

1
2
3
4
import numpy as np

list_1 = [1,2,3,4]
array_1 = np.array(list_1)

array([1, 2, 3, 4])

多维数组

1
2
list_2 = [5,6,7,8]
array_2 = np.array([list_1, list_2])

array([[1, 2, 3, 4],
[5, 6, 7, 8]])

获取数组信息

1
array_2.shape

(2, 4)

获取数组类型

1
array_2.dtype

dtype(‘int64’)

获取数组尺寸

1
array_2.size

8

1
array_4 = np.array([[1.0,2,3],[4.0,5,6]])

array([[1., 2., 3.],
[4., 5., 6.]])

1
array_4.dtype

dtype(‘float64’)

1
array_5 = np.arange(1,10,2)

array([1, 3, 5, 7, 9])

初始化元素为0的数组

1
np.zeros(5)

array([0., 0., 0., 0., 0.])

1
np.zeros([2,3])

array([[0., 0., 0.],
[0., 0., 0.]])

初始化数组

1
a = np.arange(1,10)

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

数组切片

1
a[1]

2

1
a[1:5]

array([2, 3, 4, 5])

1
b = np.array([[1,2,3],[4,5,6]])

array([[1, 2, 3],
[4, 5, 6]])

1
b[1][0]

4

1
b[1,0]

4

1
c = np.array([[1,2,3],[4,5,6],[7,8,9]])

array([[1, 2, 3],
[4, 5, 6],
[7, 8, 9]])

1
c[:2,1:]

array([[2, 3],
[5, 6]])

1
c[1:]

array([[4, 5, 6],
[7, 8, 9]])

1
c[0:,1:]

array([[2, 3],
[5, 6],
[8, 9]])

1
c[0:,:1]

array([[1],
[4],
[7]])

1
c[0:,:2]

array([[1, 2],
[4, 5],
[7, 8]])

1
c[0:1]

array([[1, 2, 3]])

1
c[1:]

array([[4, 5, 6],
[7, 8, 9]])

1
c[1:,1:]

array([[5, 6],
[8, 9]])

1
c[0:,1:2]

array([[2],
[5],
[8]])

1
c[0:,1:]

array([[2, 3],
[5, 6],
[8, 9]])

1
c[0:,1:3]

array([[2, 3],
[5, 6],
[8, 9]])

1
c[0:2,1:3]

array([[2, 3],
[5, 6]])

用随机(小)数初始化数组

1
2
3
import numpy as np

np.random.randn(10)

array([-2.98186555, 0.51843113, -0.74563428, -1.84832264, -0.54354513,
1.23002927, 0.99527109, -0.49284623, 0.20869479, -0.46934356])

用随机整数初始化数组

1
np.random.randint(10)

2

1
np.random.randint(10, size=20).reshape(4,5)

array([[3, 2, 5, 1, 3],
[1, 6, 2, 8, 4],
[0, 4, 8, 7, 0],
[4, 4, 8, 5, 3]])

数组运算

1
a = np.random.randint(10, size=20).reshape(4,5)
1
b = np.random.randint(10, size=20).reshape(4,5)
1
a

array([[7, 5, 8, 9, 8],
[0, 9, 5, 4, 0],
[4, 7, 3, 7, 6],
[3, 7, 9, 1, 6]])

1
b

array([[0, 9, 9, 5, 4],
[7, 3, 1, 5, 7],
[3, 6, 5, 5, 9],
[3, 7, 3, 6, 9]])

数组相加

1
a + b

array([[ 7, 14, 17, 14, 12],
[ 7, 12, 6, 9, 7],
[ 7, 13, 8, 12, 15],
[ 6, 14, 12, 7, 15]])

数组相减

1
a - b

array([[ 7, -4, -1, 4, 4],
[-7, 6, 4, -1, -7],
[ 1, 1, -2, 2, -3],
[ 0, 0, 6, -5, -3]])

数组相乘

1
a * b

array([[ 0, 45, 72, 45, 32],
[ 0, 27, 5, 20, 0],
[12, 42, 15, 35, 54],
[ 9, 49, 27, 6, 54]])

数组常用函数

1
a

array([[7, 5, 8, 9, 8],
[0, 9, 5, 4, 0],
[4, 7, 3, 7, 6],
[3, 7, 9, 1, 6]])

获取数组中所有唯一数

1
np.unique(a)

array([0, 1, 3, 4, 5, 6, 7, 8, 9])

对数组每一列求和

1
sum(a)

array([14, 28, 25, 21, 20])

对数组指定行求和

1
sum(a[0])

37

对数组指定列求和

1
sum(a[:,0])

14

获取数组最大值

1
a.max()

9

获取数组指定行最大值

1
max(a[0])

9

获取数组指定列最大值

1
max(a[:,0])

7

使用pickle序列化array

1
2
3
4
import pickle
import numpy as np

x = np.arange(10)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

将数组以二进制的方式写入文件

1
f = open('x.pkl','wb')

写入文件

1
pickle.dump(x, f)

查看文件目录

1
!ls

Array的input和output.ipynb x.pkl
array.ipynb 数组与矩阵运算.ipynb

以二进制方式打开文件

1
f = open('x.pkl', 'rb')

读取文件内容

1
pickle.load(f)
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

矩阵Matrix

1
np.mat([[1,2,3],[4,5,6]])

matrix([[1, 2, 3],
[4, 5, 6]])

1
np.mat(a)

matrix([[7, 5, 8, 9, 8],
[0, 9, 5, 4, 0],
[4, 7, 3, 7, 6],
[3, 7, 9, 1, 6]])

1
a

array([[7, 5, 8, 9, 8],
[0, 9, 5, 4, 0],
[4, 7, 3, 7, 6],
[3, 7, 9, 1, 6]])

矩阵的运算

1
A = np.mat(a)
1
B = np.mat(b)
1
A

matrix([[7, 5, 8, 9, 8],
[0, 9, 5, 4, 0],
[4, 7, 3, 7, 6],
[3, 7, 9, 1, 6]])

1
B

matrix([[0, 9, 9, 5, 4],
[7, 3, 1, 5, 7],
[3, 6, 5, 5, 9],
[3, 7, 3, 6, 9]])

矩阵相加

1
A + B

matrix([[ 7, 14, 17, 14, 12],
[ 7, 12, 6, 9, 7],
[ 7, 13, 8, 12, 15],
[ 6, 14, 12, 7, 15]])

矩阵相减

1
A - B

matrix([[ 7, -4, -1, 4, 4],
[-7, 6, 4, -1, -7],
[ 1, 1, -2, 2, -3],
[ 0, 0, 6, -5, -3]])

矩阵相乘

1
2
c = np.mat(np.random.randint(10, size=20).reshape(4,5))
d = np.mat(np.random.randint(10, size=20).reshape(5,4))
1
c

matrix([[8, 5, 2, 1, 8],
[9, 5, 8, 5, 4],
[5, 0, 6, 9, 8],
[3, 9, 7, 9, 6]])

1
d

matrix([[1, 8, 8, 5],
[3, 4, 5, 3],
[9, 3, 5, 2],
[8, 6, 9, 2],
[4, 9, 8, 1]])

1
c * d

matrix([[ 81, 168, 172, 69],
[152, 182, 214, 90],
[163, 184, 215, 63],
[189, 189, 233, 80]])

使用numpy进行输入输出(文件操作)

使用np.save写入文件

1
np.save('one_array', x)
1
!ls
Array的input和output.ipynb   x.pkl
array.ipynb                  数组与矩阵运算.ipynb
one_array.npy

使用np.load读取文件内容

1
np.load('one_array.npy')

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

1
y = np.arange(20)

array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19])

使用np.savez将多个数组保存到一个文件中

1
np.savez('two_array.npz', a=x, b=y)
1
!ls

Array的input和output.ipynb two_array.npz
array.ipynb x.pkl
one_array.npy 数组与矩阵运算.ipynb

利用np.load读取文件内容

1
c = np.load('two_array.npz')

利用索引名读取对应数组内容

1
c['a']

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

1
c['b']

array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19])

Pandas

Series

Series初始化

1
2
3
4
import numpy as np
import pandas as pd

s1 = pd.Series([1,2,3,4])
1
s1

0 1
1 2
2 3
3 4
dtype: int64

values

1
s1.values

array([1, 2, 3, 4])

index

1
s1.index

RangeIndex(start=0, stop=4, step=1)

1
s2 = pd.Series(np.arange(10))
1
s2

0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
dtype: int64

指定index

1
s3 = pd.Series({'1':1,'2':2,'3':3})
1
s3

1 1
2 2
3 3
dtype: int64

1
s3.index

Index([‘1’, ‘2’, ‘3’], dtype=’object’)

1
s3.values

array([1, 2, 3])

1
s4 = pd.Series([1,2,3,4], index=['A','B','C','D'])
1
s4

A 1
B 2
C 3
D 4
dtype: int64

1
s4.values

array([1, 2, 3, 4])

1
s4.index

Index([‘A’, ‘B’, ‘C’, ‘D’], dtype=’object’)

访问Series元素

1
s4['A']

1

1
s4[s4>2]

C 3
D 4
dtype: int64

1
s4

A 1
B 2
C 3
D 4
dtype: int64

1
s4.to_dict()

{‘A’: 1, ‘B’: 2, ‘C’: 3, ‘D’: 4}

1
s5 = pd.Series(s4.to_dict())
1
s5

A 1
B 2
C 3
D 4
dtype: int64

新增index

1
2
index_1=['A','B','C','D','E']
s6 = pd.Series(s5, index=index_1)
1
s6

A 1.0
B 2.0
C 3.0
D 4.0
E NaN
dtype: float64

对Series元素进行空值判定

1
pd.isnull(s6)

A False
B False
C False
D False
E True
dtype: bool

对Series元素进行非空判定

1
pd.notnull(s6)

A True
B True
C True
D True
E False
dtype: bool

指定Series名称

1
s6.name = 'demo'
1
s6

A 1.0
B 2.0
C 3.0
D 4.0
E NaN
Name: demo, dtype: float64

指定Series索引名称

1
s6.index.name = 'index demo'
1
s6

index demo
A 1.0
B 2.0
C 3.0
D 4.0
E NaN
Name: demo, dtype: float64

1
s6.index

Index([‘A’, ‘B’, ‘C’, ‘D’, ‘E’], dtype=’object’, name=’index demo’)

1
s6.values

array([ 1., 2., 3., 4., nan])

1
s1 = pd.Series(data['Country'], index=['A','B','C'])
1
s1

0 Belgium
1 India
2 Brazil
dtype: object

1
s1.values

array([‘Belgium’, ‘India’, ‘Brazil’], dtype=object)

1
s1.index

Index([‘A’, ‘B’, ‘C’], dtype=’object’)

Reindex on Series

1
2
3
4
5
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

s1 = pd.Series([1,2,3,4], index=['A','B','C','D'])
1
s1

A 1
B 2
C 3
D 4
dtype: int64

添加index

1
s1.reindex(['A','B','C','D','E'])

A 1.0
B 2.0
C 3.0
D 4.0
E NaN
dtype: float64

用固定数值填充新的index

1
s1.reindex(['A','B','C','D','E'], fill_value=10)

A 1
B 2
C 3
D 4
E 10
dtype: int64

1
s2 = pd.Series(['A','B','C'], index=[1,5,10])
1
s2

1 A
5 B
10 C
dtype: object

用 range() 扩充index

1
s2.reindex(index=range(15))

0 NaN
1 A
2 NaN
3 NaN
4 NaN
5 B
6 NaN
7 NaN
8 NaN
9 NaN
10 C
11 NaN
12 NaN
13 NaN
14 NaN
dtype: object

用ffill填充新增index

1
s2.reindex(index=range(15), method='ffill')

0 NaN
1 A
2 A
3 A
4 A
5 B
6 B
7 B
8 B
9 B
10 C
11 C
12 C
13 C
14 C
dtype: object

对Series切片

1
s1

A 1
B 2
C 3
D 4
dtype: int64

1
s1.reindex(index=['A','B'])

A 1
B> 2
dtype: int64

1
s1
A    1
B    2
C    3
D    4
dtype: int64

对Series进行drop index

1
s1.drop('A')

B 2
C 3
D 4
dtype: int64

Series的简单数学计算

1
2
3
4
5
6
7
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

s1 = Series([1,2,3], index=['A','B','C'])

s1

A 1
B 2
C 3
dtype: int64

1
2
s2 = Series([4,5,6,7], index=['B','C','D','E'])
s2

B 4
C 5
D 6
E 7
dtype: int64

1
s1 + s2

A NaN
B 6.0
C 8.0
D NaN
E NaN
dtype: float64

Series的排序

1
2
3
4
5
6
7
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

s1 = Series(np.random.randn(10))

s1

0 0.035039
1 1.087393
2 -0.342811
3 -0.557990
4 0.548988
5 0.051965
6 2.983233
7 0.757732
8 -0.391369
9 0.850908
dtype: float64

1
s1.values

array([ 0.03503903, 1.08739272, -0.34281109, -0.5579896 , 0.54898768,
0.05196506, 2.98323291, 0.75773189, -0.39136856, 0.85090778])

1
s1.index

RangeIndex(start=0, stop=10, step=1)

按columns排序

1
2
s2 = s1.sort_values(ascending=False)
s2
6    2.983233
1    1.087393
9    0.850908
7    0.757732
4    0.548988
5    0.051965
0    0.035039
2   -0.342811
8   -0.391369
3   -0.557990
dtype: float64

按index排序

1
s2.sort_index()

0 0.035039
1 1.087393
2 -0.342811
3 -0.557990
4 0.548988
5 0.051965
6 2.983233
7 0.757732
8 -0.391369
9 0.850908
dtype: float64

Nan in Series

1
2
3
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
1
n = np.nan
1
n

nan

1
type(n)

float

1
2
m = 1
m + n

nan

1
s1 = Series([1,2,np.nan,3,4],index=['A','B','C','D','E'])
1
s1

A 1.0
B 2.0
C NaN
D 3.0
E 4.0
dtype: float64

空值判断

1
s1.isnull()

A False
B False
C True
D False
E False
dtype: bool

删除nan的行

1
s1.dropna()

A 1.0
B 2.0
D 3.0
E 4.0
dtype: float64

DataFrame

1
2
3
4
5
6
7
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

import webbrowser
link = 'https://www.tiobe.com/tiobe-index/'
webbrowser.open(link)

读取剪贴板内容,并生成DataFrame

1
2
3
df = pd.read_clipboard()

df
Jul 2019Jul 2018ChangeProgramming LanguageRatingsChange.1
011NaNJava15.058%-1.08%
122NaNC14.211%-0.45%
234changePython9.260%+2.90%
343changeC++6.705%-0.91%
456changeC#4.365%+0.57%
565changeVisual Basic .NET4.208%-0.04%
678changeJavaScript2.304%-0.53%
787changePHP2.167%-0.67%
899NaNSQL1.977%-0.36%
91010NaNObjective-C1.686%+0.23%

Type

1
type(df)

pandas.core.frame.DataFrame

Columns

1
df.columns
Index(['Jul 2019', 'Jul 2018', 'Change', 'Programming Language', 'Ratings',
       'Change.1'],
      dtype='object')

访问DataFrame列

1
df.Ratings

0 15.058%
1 14.211%
2 9.260%
3 6.705%
4 4.365%
5 4.208%
6 2.304%
7 2.167%
8 1.977%
9 1.686%
Name: Ratings, dtype: object

复制DataFrame

1
df_new = DataFrame(df, columns=['Programming Language','Jul 2018'])
1
df_new
Programming LanguageJul 2018
0Java1
1C2
2Python4
3C++3
4C#6
5Visual Basic .NET5
6JavaScript8
7PHP7
8SQL9
9Objective-C10

向DataFrame中增加新列

1
df_new = DataFrame(df, columns=['Programming Language','Jul 2018', 'Sep 2018'])
1
df_new
Programming LanguageJul 2018Sep 2018
0Java1NaN
1C2NaN
2Python4NaN
3C++3NaN
4C#6NaN
5Visual Basic .NET5NaN
6JavaScript8NaN
7PHP7NaN
8SQL9NaN
9Objective-C10NaN

向新增列填充数据

方法1:range()

1
df_new['Sep 2018'] = range(0,10)
1
df_new
Programming LanguageJul 2018Sep 2018
0Java10
1C21
2Python42
3C++33
4C#64
5Visual Basic .NET55
6JavaScript86
7PHP77
8SQL98
9Objective-C109

方法2:np.range()

1
df_new['Sep 2018'] = np.arange(0,10)
1
df_new
Programming LanguageJul 2018Sep 2018
0Java10
1C21
2Python42
3C++33
4C#64
5Visual Basic .NET55
6JavaScript86
7PHP77
8SQL98
9Objective-C109

方法3:np.arange()

1
df_new['Sep 2018'] = pd.Series(np.arange(0,10))
1
df_new
Programming LanguageJul 2018Sep 2018
0Java10
1C21
2Python42
3C++33
4C#64
5Visual Basic .NET55
6JavaScript86
7PHP77
8SQL98
9Objective-C109

向新增列中填充数值

1
df_new['Sep 2018'] = pd.Series([100,200],index=[1,2])
1
df_new
Programming LanguageJul 2018Sep 2018
0Java1NaN
1C2100.0
2Python4200.0
3C++3NaN
4C#6NaN
5Visual Basic .NET5NaN
6JavaScript8NaN
7PHP7NaN
8SQL9NaN
9Objective-C10NaN
1
2
3
4
5
6
7
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

data = {'Country': ['Belgium','India', 'Brazil'],
'Capital':['Brussels', 'New Delhi', 'Brasilia'],
'Population':[11190846, 1303171035, 207847528]}
1
df1 = pd.DataFrame(data)
1
df1
CountryCapitalPopulation
0BelgiumBrussels11190846
1IndiaNew Delhi1303171035
2BrazilBrasilia207847528

访问DataFrame某一列

1
df1['Country']

0 Belgium
1 India
2 Brazil
Name: Country, dtype: object

1
cou = df1['Country']
1
type(cou)

pandas.core.series.Series

遍历DataFrame

1
df1.iterrows()

<generator object DataFrame.iterrows at 0x118d895e8>

1
2
3
for row in df1.iterrows():
print(row[0]), print(row[1])
break

0
Country Belgium
Capital Brussels
Population 11190846
Name: 0, dtype: object

1
df1
CountryCapitalPopulation
0BelgiumBrussels11190846
1IndiaNew Delhi1303171035
2BrazilBrasilia207847528
1
data

{‘Country’: [‘Belgium’, ‘India’, ‘Brazil’],
‘Capital’: [‘Brussels’, ‘New Delhi’, ‘Brasilia’],
‘Population’: [11190846, 1303171035, 207847528]}

Series => DataFrame

1
s1 = pd.Series(data['Capital'])
1
s2 = pd.Series(data['Country'])
1
s3 = pd.Series(data['Population'])
1
df_new = pd.DataFrame([s2,s1,s3],index=['Country','Capital','Population'])
1
df_new
012
CountryBelgiumIndiaBrazil
CapitalBrusselsNew DelhiBrasilia
Population111908461303171035207847528

行转列

1
df_new2 = df_new.T
1
df_new2
CountryCapitalPopulation
0BelgiumBrussels11190846
1IndiaNew Delhi1303171035
2BrazilBrasilia207847528
1
df1
CountryCapitalPopulation
0BelgiumBrussels11190846
1IndiaNew Delhi1303171035
2BrazilBrasilia207847528

Nan in DataFrame

1
dframe = DataFrame([[1,2,3],[np.nan,5,6],[7,np.nan,9],[np.nan,np.nan,np.nan]])
1
dframe
012
01.02.03.0
1NaN5.06.0
27.0NaN9.0
3NaNNaNNaN

空值判断

1
dframe.isnull()
012
0FalseFalseFalse
1TrueFalseFalse
2FalseTrueFalse
3TrueTrueTrue

非空判断

1
dframe.notnull()
012
0TrueTrueTrue
1FalseTrueTrue
2TrueFalseTrue
3FalseFalseFalse

删除存在nan的行

1
df1 = dframe.dropna(axis=0)
1
df1
012
01.02.03.0

删除存在nan的列

1
df2 = dframe.dropna(axis=1)
1
df2
0
1
2
3

仅删除所有值都为nan的行

1
df3 = dframe.dropna(axis=0, how='all')
1
df3
012
01.02.03.0
1NaN5.06.0
27.0NaN9.0
1
dframe2 = DataFrame([[1,2,3,np.nan],[2,np.nan,5,6],[np.nan,7,np.nan,9],[1,np.nan,np.nan,np.nan]])
1
dframe2
0123
01.02.03.0NaN
12.0NaN5.06.0
2NaN7.0NaN9.0
31.0NaNNaNNaN

删除大于thresh阈值的行

1
df4 = dframe2.dropna(thresh=2)
1
df4
0123
01.02.03.0NaN
12.0NaN5.06.0
2NaN7.0NaN9.0

利用fillna对空值进行填充

1
dframe2.fillna(value=1)
0123
01.02.03.01.0
12.01.05.06.0
21.07.01.09.0
31.01.01.01.0

利用fillna按列级别填充不同数值

1
dframe2.fillna(value={0:0,1:1,2:2,3:5})
0123
01.02.03.05.0
12.01.05.06.0
20.07.02.09.0
31.01.02.05.0

DataFrame I/O操作

打开网页

1
2
3
4
5
6
7
8
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

import webbrowser

link = 'http://pandas.pydata.org/pandas-docs/version/0.20/io.html'
webbrowser.open(link)

True

读取剪贴板内容

1
df1 = pd.read_clipboard()
1
df1
Format TypeData DescriptionReaderWriter
0textCSVread_csvto_csv
1textJSONread_jsonto_json
2textHTMLread_htmlto_html
3textLocal clipboardread_clipboardto_clipboard
4binaryMS Excelread_excelto_excel
5binaryHDF5 Formatread_hdfto_hdf
6binaryFeather Formatread_featherto_feather
7binaryMsgpackread_msgpackto_msgpack
8binaryStataread_statato_stata
9binarySASread_sas
10binaryPython Pickle Formatread_pickleto_pickle
11SQLSQLread_sqlto_sql
12SQLGoogle Big Queryread_gbqto_gbq

将内容粘贴到剪贴板

1
df1.to_clipboard()

写入CSV (不写入索引值)

1
df1.to_csv('df1.csv', index=False)
1
!ls

2-3 Array的创建及访问.ipynb
2-4 数组与矩阵运算.ipynb
2-5 Array的input和output.ipynb
3-1 Pandas Series.ipynb
3-2 Pandas DataFrame.ipynb
3-3 深入理解Series和Dataframe.ipynb
3-4 Pandas-Dataframe-IO操作.ipynb
df1.csv
one_array.npy
two_array.npz
x.pkl

预览CSV文件内容

1
!more df1.csv

Format Type,Data Description,Reader,Writer
text,CSV,read_csv,to_csv
text,JSON,read_json,to_json
text,HTML,read_html,to_html
text,Local clipboard,read_clipboard,to_clipboard
binary,MS Excel,read_excel,to_excel
binary,HDF5 Format,read_hdf,to_hdf
binary,Feather Format,read_feather,to_feather
binary,Msgpack,read_msgpack,to_msgpack
binary,Stata,read_stata,to_stata
binary,SAS,read_sas,
binary,Python Pickle Format,read_pickle,to_pickle
SQL,SQL,read_sql,to_sql
SQL,Google Big Query,read_gbq,to_gbq
[7mdf1.csv (END)[m[K

Selecting & Indexing

1
2
3
4
5
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

imdb = pd.read_csv('/Users/kwang/jupyter-projects/movie_metadata.csv')
1
imdb.shape

(5043, 28)

查看DataFrame前5行内容

1
imdb.head()
colordirector_namenum_critic_for_reviewsdurationdirector_facebook_likesactor_3_facebook_likesactor_2_nameactor_1_facebook_likesgrossgenres...num_user_for_reviewslanguagecountrycontent_ratingbudgettitle_yearactor_2_facebook_likesimdb_scoreaspect_ratiomovie_facebook_likes
0ColorJames Cameron723.0178.00.0855.0Joel David Moore1000.0760505847.0Action|Adventure|Fantasy|Sci-Fi...3054.0EnglishUSAPG-13237000000.02009.0936.07.91.7833000
1ColorGore Verbinski302.0169.0563.01000.0Orlando Bloom40000.0309404152.0Action|Adventure|Fantasy...1238.0EnglishUSAPG-13300000000.02007.05000.07.12.350
2ColorSam Mendes602.0148.00.0161.0Rory Kinnear11000.0200074175.0Action|Adventure|Thriller...994.0EnglishUKPG-13245000000.02015.0393.06.82.3585000
3ColorChristopher Nolan813.0164.022000.023000.0Christian Bale27000.0448130642.0Action|Thriller...2701.0EnglishUSAPG-13250000000.02012.023000.08.52.35164000
4NaNDoug WalkerNaNNaN131.0NaNRob Walker131.0NaNDocumentary...NaNNaNNaNNaNNaNNaN12.07.1NaN0

5 rows × 28 columns

查看DataFrame后5行内容

1
imdb.tail()
colordirector_namenum_critic_for_reviewsdurationdirector_facebook_likesactor_3_facebook_likesactor_2_nameactor_1_facebook_likesgrossgenres...num_user_for_reviewslanguagecountrycontent_ratingbudgettitle_yearactor_2_facebook_likesimdb_scoreaspect_ratiomovie_facebook_likes
5038ColorScott Smith1.087.02.0318.0Daphne Zuniga637.0NaNComedy|Drama...6.0EnglishCanadaNaNNaN2013.0470.07.7NaN84
5039ColorNaN43.043.0NaN319.0Valorie Curry841.0NaNCrime|Drama|Mystery|Thriller...359.0EnglishUSATV-14NaNNaN593.07.516.0032000
5040ColorBenjamin Roberds13.076.00.00.0Maxwell Moody0.0NaNDrama|Horror|Thriller...3.0EnglishUSANaN1400.02013.00.06.3NaN16
5041ColorDaniel Hsia14.0100.00.0489.0Daniel Henney946.010443.0Comedy|Drama|Romance...9.0EnglishUSAPG-13NaN2012.0719.06.32.35660
5042ColorJon Gunn43.090.016.016.0Brian Herzlinger86.085222.0Documentary...84.0EnglishUSAPG1100.02004.023.06.61.85456

5 rows × 28 columns

根据指定列生成新的DataFrame

1
sub_df = imdb[['director_name', 'movie_title', 'imdb_score']]
1
sub_df.head()
director_namemovie_titleimdb_score
0James CameronAvatar7.9
1Gore VerbinskiPirates of the Caribbean: At World's End7.1
2Sam MendesSpectre6.8
3Christopher NolanThe Dark Knight Rises8.5
4Doug WalkerStar Wars: Episode VII - The Force Awakens ...7.1

根据index定位内容(iloc)

1
temp_df = sub_df.iloc[10:20,0:2]
1
temp_df
director_namemovie_title
10Zack SnyderBatman v Superman: Dawn of Justice
11Bryan SingerSuperman Returns
12Marc ForsterQuantum of Solace
13Gore VerbinskiPirates of the Caribbean: Dead Man's Chest
14Gore VerbinskiThe Lone Ranger
15Zack SnyderMan of Steel
16Andrew AdamsonThe Chronicles of Narnia: Prince Caspian
17Joss WhedonThe Avengers
18Rob MarshallPirates of the Caribbean: On Stranger Tides
19Barry SonnenfeldMen in Black 3
1
temp_df.iloc[2:4,:]
director_namemovie_title
12Marc ForsterQuantum of Solace
13Gore VerbinskiPirates of the Caribbean: Dead Man's Chest

根据label定位内容(loc)

1
temp_df.loc[15:17,:'director_name']
director_name
15Zack Snyder
16Andrew Adamson
17Joss Whedon

Reindex on DataFrame

1
df1 = DataFrame(np.random.rand(25).reshape([5,5]), index=['A','B','D','E','F'], columns=['c1','c2','c3','c4','c5'])
1
df1
c1c2c3c4c5
A0.6593690.0967240.8801280.4031680.282619
B0.0030700.6617470.1210260.2628810.557213
D0.9726930.9621490.4110360.8492160.704441
E0.9697160.4254150.9435070.5893440.670303
F0.3157320.3712350.9438800.6495410.495330

插入新的行(index)

1
df1.reindex(index=['A','B','C','D','E','F'])
c1c2c3c4c5
A0.6593690.0967240.8801280.4031680.282619
B0.0030700.6617470.1210260.2628810.557213
CNaNNaNNaNNaNNaN
D0.9726930.9621490.4110360.8492160.704441
E0.9697160.4254150.9435070.5893440.670303
F0.3157320.3712350.9438800.6495410.495330

插入新的列 (columns)

1
df1.reindex(columns=['c1','c2','c3','c4','c5','c6'])
c1c2c3c4c5c6
A0.6593690.0967240.8801280.4031680.282619NaN
B0.0030700.6617470.1210260.2628810.557213NaN
D0.9726930.9621490.4110360.8492160.704441NaN
E0.9697160.4254150.9435070.5893440.670303NaN
F0.3157320.3712350.9438800.6495410.495330NaN

同时新增index和columns

1
df1.reindex(index=['A','B','C','D','E','F'], columns=['c1','c2','c3','c4','c5','c6'])
c1c2c3c4c5c6
A0.6593690.0967240.8801280.4031680.282619NaN
B0.0030700.6617470.1210260.2628810.557213NaN
CNaNNaNNaNNaNNaNNaN
D0.9726930.9621490.4110360.8492160.704441NaN
E0.9697160.4254150.9435070.5893440.670303NaN
F0.3157320.3712350.9438800.6495410.495330NaN

对DataFrame切片

1
df1.reindex(index=['A','B'])
c1c2c3c4c5
A0.6593690.0967240.8801280.4031680.282619
B0.0030700.6617470.1210260.2628810.557213

对DataFrame进行drop index

1
df1.drop('A',axis=0)
c1c2c3c4c5
B0.0030700.6617470.1210260.2628810.557213
D0.9726930.9621490.4110360.8492160.704441
E0.9697160.4254150.9435070.5893440.670303
F0.3157320.3712350.9438800.6495410.495330

对DataFrame进行drop columns

1
df1.drop('c2',axis=1)
c1c3c4c5
A0.6593690.8801280.4031680.282619
B0.0030700.1210260.2628810.557213
D0.9726930.4110360.8492160.704441
E0.9697160.9435070.5893440.670303
F0.3157320.9438800.6495410.495330

多级index

1
2
3
4
5
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

s1 = Series(np.random.randn(6),index=[[1,1,1,2,2,2],['a','b','c','a','b','c']])
1
s1

1 a -0.828759
b 1.100893
c 0.705300
2 a 0.801019
b 0.676878
c -0.585011
dtype: float64

1
s1[1]['a']

-0.8287592467933553

1
s1[:,'a']

1 -0.828759
2 0.801019
dtype: float64

多级index Series => DataFrame

1
2
3
df1 = s1.unstack()

df1
abc
1-0.8287591.1008930.705300
20.8010190.676878-0.585011

DataFrame => 多级index Series

1
df1.unstack()

a 1 -0.828759
2 0.801019
b 1 1.100893
2 0.676878
c 1 0.705300
2 -0.585011
dtype: float64

1
df1.T
12
a-0.8287590.801019
b1.1008930.676878
c0.705300-0.585011
1
df1.T.unstack()

1 a -0.828759
b 1.100893
c 0.705300
2 a 0.801019
b 0.676878
c -0.585011
dtype: float64

初始化DataFrame (多级index & columns)

1
2
3
df = DataFrame(np.arange(16).reshape(4,4), index=[['a','a','b','b'],[1,2,1,2]], columns=[['BJ','BJ','SH','GZ'],[8,9,8,8]])

df
BJSHGZ
8988
a10123
24567
b1891011
212131415

访问多级DataFrame元素

1
df['BJ']
89
a101
245
b189
21213
1
df['BJ'][8]

a 1 0
2 4
b 1 8
2 12
Name: 8, dtype: int64

1
df.columns

MultiIndex(levels=[[‘BJ’, ‘GZ’, ‘SH’], [8, 9]],
codes=[[0, 0, 2, 1], [0, 1, 0, 0]])

创建多级索引

1
2
3
4
data = Series(np.random.randn(10),index=[['x','x','x','x','x','x','y','y','y','y'],
['a','a','a','b','b','b','c','c','d','d'],
[1,2,3,1,2,3,1,2,2,3]])
data

x a 1 0.673769
2 -1.781723
3 -2.232971
b 1 -0.832687
2 -0.141850
3 0.704339
y c 1 1.186975
2 -1.106244
d 2 -0.166537
3 0.695451
dtype: float64

检索多级索引

选取外层索引
1
data['x']

a 1 0.673769
2 -1.781723
3 -2.232971
b 1 -0.832687
2 -0.141850
3 0.704339
dtype: float64

1
data['x':'y']

x a 1 0.673769
2 -1.781723
3 -2.232971
b 1 -0.832687
2 -0.141850
3 0.704339
y c 1 1.186975
2 -1.106244
d 2 -0.166537
3 0.695451
dtype: float64

选取内层索引
1
data['x','a',1]

0.6737686360875892

解除多级索引

1
data

x a 1 0.673769
2 -1.781723
3 -2.232971
b 1 -0.832687
2 -0.141850
3 0.704339
y c 1 1.186975
2 -1.106244
d 2 -0.166537
3 0.695451
dtype: float64

默认解除最内层索引,即level=-1
1
data.unstack()
123
xa0.673769-1.781723-2.232971
b-0.832687-0.1418500.704339
yc1.186975-1.106244NaN
dNaN-0.1665370.695451
解除其他层索引
1
data.unstack(level=-2)
abcd
x10.673769-0.832687NaNNaN
2-1.781723-0.141850NaNNaN
3-2.2329710.704339NaNNaN
y1NaNNaN1.186975NaN
2NaNNaN-1.106244-0.166537
3NaNNaNNaN0.695451

转化为多级索引

1
data

x a 1 0.673769
2 -1.781723
3 -2.232971
b 1 -0.832687
2 -0.141850
3 0.704339
y c 1 1.186975
2 -1.106244
d 2 -0.166537
3 0.695451
dtype: float64

将中间层索引转换为内层索引
1
data.unstack(level=-2).stack()

x 1 a 0.673769
b -0.832687
2 a -1.781723
b -0.141850
3 a -2.232971
b 0.704339
y 1 c 1.186975
2 c -1.106244
d -0.166537
3 d 0.695451
dtype: float64

为不同层索引指定名称(name)

1
data

x a 1 0.673769
2 -1.781723
3 -2.232971
b 1 -0.832687
2 -0.141850
3 0.704339
y c 1 1.186975
2 -1.106244
d 2 -0.166537
3 0.695451
dtype: float64

1
2
3
df = data.unstack()

df
123
xa0.673769-1.781723-2.232971
b-0.832687-0.1418500.704339
yc1.186975-1.106244NaN
dNaN-0.1665370.695451
1
2
3
4
df.index.names = ['outer','inner'] # 指定两层索引的名称
df.columns.name = 'hello'

df
hello123
outerinner
xa0.673769-1.781723-2.232971
b-0.832687-0.1418500.704339
yc1.186975-1.106244NaN
dNaN-0.1665370.695451

Mapping

通过Series为DataFrame新增列

1
2
3
4
5
6
7
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

df1 = DataFrame({"城市":["北京","上海","广州"],"人口":[1000,2000,1500]})

df1
城市人口
0北京1000
1上海2000
2广州1500
1
2
3
df1["GDP"] = Series([1200,1800,1600])

df1
城市人口GDP
0北京10001200
1上海20001800
2广州15001600

通过Map为DataFrame新增列

1
2
3
df2 = DataFrame({"城市":["北京","上海","广州"],"人口":[1000,2000,1500]})

df2
城市人口
0北京1000
1上海2000
2广州1500
1
2
3
4
5
gdp_map = {"北京": 1200, "广州": 1400, "上海": 1800}

df2['GDP'] = df2['城市'].map(gdp_map)

df2
城市人口GDP
0北京10001200
1上海20001800
2广州15001400

Replace

1
2
3
s1 = Series(np.arange(10))

s1

0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
dtype: int64

1
s1.replace(1, np.nan)

0 0.0
1 NaN
2 2.0
3 3.0
4 4.0
5 5.0
6 6.0
7 7.0
8 8.0
9 9.0
dtype: float64

1
s1.replace([1,2,3],[10,20,30])

0 0
1 10
2 20
3 30
4 4
5 5
6 6
7 7
8 8
9 9
dtype: int64

DataFrame的简单数学计算

1
2
df1 = DataFrame(np.arange(4).reshape(2,2), index=['A','B'], columns=['BJ','SH'])
df1
BJSH
A01
B23
1
2
3
df2 = DataFrame(np.arange(9).reshape(3,3), index=['A','B','C'], columns=['BJ','SH','GZ'])

df2
BJSHGZ
A012
B345
C678
1
df1 + df2
BJGZSH
A0.0NaN2.0
B5.0NaN7.0
CNaNNaNNaN
1
2
df3 = DataFrame([[1,2,3],[4,5,np.nan],[7,8,9]], index=['A','B','C'], columns=['c1','c2','c3'])
df3
c1c2c3
A123.0
B45NaN
C789.0

对列进行求和

1
df3.sum()

c1 12.0
c2 15.0
c3 12.0
dtype: float64

对行进行求和

1
df3.sum(axis=1)

A 6.0
B 9.0
C 24.0
dtype: float64

求最小值

1
df3.min()

c1 1.0
c2 2.0
c3 3.0
dtype: float64

求最大值

1
df3.max()

c1 7.0
c2 8.0
c3 9.0
dtype: float64

1
df3
c1c2c3
A123.0
B45NaN
C789.0
1
df3.describe()
c1c2c3
count3.03.02.000000
mean4.05.06.000000
std3.03.04.242641
min1.02.03.000000
25%2.53.54.500000
50%4.05.06.000000
75%5.56.57.500000
max7.08.09.000000

DataFrame排序

1
2
df1 = DataFrame(np.random.randn(40).reshape(8,5), columns=['A','B','C','D','E'])
df1
ABCDE
00.428384-0.242118-0.311413-2.032690-1.519783
1-1.763709-0.299535-1.712632-0.662375-0.434309
2-1.7997460.937149-0.3873390.222759-0.040818
30.2396741.813621-0.010476-0.0133001.411217
4-0.9801560.8897081.3222671.848655-0.210548
51.655132-0.5614051.369573-0.4854371.772758
6-0.032585-2.0388271.5600830.6388740.626102
7-0.153819-1.3860770.779540-1.4419961.106010

按column排序

1
2
df2 = df1.sort_values('A', ascending=False)
df2
ABCDE
51.655132-0.5614051.369573-0.4854371.772758
00.428384-0.242118-0.311413-2.032690-1.519783
30.2396741.813621-0.010476-0.0133001.411217
6-0.032585-2.0388271.5600830.6388740.626102
7-0.153819-1.3860770.779540-1.4419961.106010
4-0.9801560.8897081.3222671.848655-0.210548
1-1.763709-0.299535-1.712632-0.662375-0.434309
2-1.7997460.937149-0.3873390.222759-0.040818

按index排序

1
df2.sort_index()
ABCDE
00.428384-0.242118-0.311413-2.032690-1.519783
1-1.763709-0.299535-1.712632-0.662375-0.434309
2-1.7997460.937149-0.3873390.222759-0.040818
30.2396741.813621-0.010476-0.0133001.411217
4-0.9801560.8897081.3222671.848655-0.210548
51.655132-0.5614051.369573-0.4854371.772758
6-0.032585-2.0388271.5600830.6388740.626102
7-0.153819-1.3860770.779540-1.4419961.106010

Rename index

1
2
3
4
5
6
7
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

df1 = DataFrame(np.arange(9).reshape(3,3), index=['BJ','SH','GZ'], columns=['A','B','C'])

df1
ABC
BJ012
SH345
GZ678
1
2
3
df1.index = Series(['bj','sh','gz'])

df1
ABC
bj012
sh345
gz678
1
2
df1.index = df1.index.map(str.upper)
df1
ABC
BJ012
SH345
GZ678
1
df1.rename(index=str.lower, columns=str.lower)
abc
bj012
sh345
gz678
1
df1
ABC
BJ012
SH345
GZ678
1
df1.rename(index={'BJ':'beijing'}, columns={'A':'a'})
aBC
beijing012
SH345
GZ678

数字列表 => 字符串列表

1
2
list1 = [1,2,3,4]
list2 = ['1','2','3','4']
1
[str(x) for x in list1]

[‘1’, ‘2’, ‘3’, ‘4’]

1
list(map(str, list1))

[‘1’, ‘2’, ‘3’, ‘4’]

利用自定义map函数重命名index

1
2
def test_map(x):
return x + '_ABC'
1
df1.index.map(test_map)

Index([‘BJ_ABC’, ‘SH_ABC’, ‘GZ_ABC’], dtype=’object’)

1
df1.rename(index=test_map)
ABC
BJ_ABC012
SH_ABC345
GZ_ABC678

DataFrame合并 (merge) 操作

1
2
3
4
5
6
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

df1 = pd.DataFrame({'key':['X','Y','Z','X'], 'data_set_1':[1,2,3,4]})
df1
keydata_set_1
0X1
1Y2
2Z3
3X4
1
2
df2 = pd.DataFrame({'key':['X','B','C'],'data_set_2':[5,6,7]})
df2
keydata_set_2
0X5
1B6
2C7
1
pd.merge(df1,df2,on=None)
keydata_set_1data_set_2
0X15
1X45

指定关联列(默认inner连接)

1
pd.merge(df1,df2,on='key',how='inner')
keydata_set_1data_set_2
0X15
1X45

左连接

1
pd.merge(df1,df2,on='key',how='left')
keydata_set_1data_set_2
0X15.0
1Y2NaN
2Z3NaN
3X45.0

右连接

1
pd.merge(df1,df2,on='key',how='right')
keydata_set_1data_set_2
0X1.05
1X4.05
2BNaN6
3CNaN7

Concatenate

Array Concatenate

1
2
3
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
1
2
arr1 = np.arange(9).reshape(3,3)
arr1

array([[0, 1, 2],
[3, 4, 5],
[6, 7, 8]])

1
2
arr2 = np.arange(9).reshape(3,3)
arr2

array([[0, 1, 2],
[3, 4, 5],
[6, 7, 8]])

1
np.concatenate([arr1,arr2])

array([[0, 1, 2],
[3, 4, 5],
[6, 7, 8],
[0, 1, 2],
[3, 4, 5],
[6, 7, 8]])

1
np.concatenate([arr1,arr2],axis=1)

array([[0, 1, 2, 0, 1, 2],
[3, 4, 5, 3, 4, 5],
[6, 7, 8, 6, 7, 8]])

Series Concatenate

1
2
s1 = Series([1,2,3], index=['X','Y','Z'])
s1

X 1
Y 2
Z 3
dtype: int64

1
2
s2 = Series([4,5], index=['A','B'])
s2

A 4
B 5
dtype: int64

1
pd.concat([s1,s2])

X 1
Y 2
Z 3
A 4
B 5
dtype: int64

1
pd.concat([s1,s2], axis=1)
01
ANaN4.0
BNaN5.0
X1.0NaN
Y2.0NaN
Z3.0NaN

DataFrame Concatenate

1
2
df1 = DataFrame(np.random.randn(4,3), columns=['X','Y','Z'])
df1
XYZ
0-2.150094-0.1729150.391376
1-2.9431841.4426290.934177
2-0.015998-1.2047390.912159
3-0.131997-0.1524301.080225
1
2
df2 = DataFrame(np.random.randn(3,3), columns=['X','Y','A'])
df2
XYA
0-0.6407850.4048441.140704
10.3315780.4904771.158403
21.542735-0.6937171.928202
1
pd.concat([df1,df2])
AXYZ
0NaN-2.150094-0.1729150.391376
1NaN-2.9431841.4426290.934177
2NaN-0.015998-1.2047390.912159
3NaN-0.131997-0.1524301.080225
01.140704-0.6407850.404844NaN
11.1584030.3315780.490477NaN
21.9282021.542735-0.693717NaN

Combine

Series Combine

1
2
s1 = Series([2,np.nan,4,np.nan], index=['A','B','C','D'])
s1

A 2.0
B NaN
C 4.0
D NaN
dtype: float64

1
2
s2 = Series([1,2,3,4], index=['A','B','C','D'])
s2

A 1
B 2
C 3
D 4
dtype: int64

combine_first - 用后者填充前者中的空值 (如有)
1
s1.combine_first(s2)

A 2.0
B 2.0
C 4.0
D 4.0
dtype: float64

DataFrame Combine

1
2
3
4
5
6
df1 = pd.DataFrame({
'X':[1,np.nan,3,np.nan],
'Y':[5,np.nan,7,np.nan],
'Z':[9,np.nan,11,np.nan]
})
df1
XYZ
01.05.09.0
1NaNNaNNaN
23.07.011.0
3NaNNaNNaN
1
2
3
4
5
df2 = pd.DataFrame({
'Z':[np.nan,10,np.nan,12],
'A':[1,2,3,4]
})
df2
ZA
0NaN1
110.02
2NaN3
312.04
1
df1.combine_first(df2)
AXYZ
01.01.05.09.0
12.0NaNNaN10.0
23.03.07.011.0
34.0NaNNaN12.0

利用apply对数据进行处理

1
2
3
4
5
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

df1 = pd.read_csv("apply_demo.csv")
1
df1.head()
timedata
01473411962Symbol: APPL Seqno: 0 Price: 1623
11473411962Symbol: APPL Seqno: 0 Price: 1623
21473411963Symbol: APPL Seqno: 0 Price: 1623
31473411963Symbol: APPL Seqno: 0 Price: 1623
41473411963Symbol: APPL Seqno: 1 Price: 1649
1
df1.size

7978

1
s1 = Series(['a']*7978)
1
df1['A'] = s1
1
df1.head()
timedataA
01473411962Symbol: APPL Seqno: 0 Price: 1623a
11473411962Symbol: APPL Seqno: 0 Price: 1623a
21473411963Symbol: APPL Seqno: 0 Price: 1623a
31473411963Symbol: APPL Seqno: 0 Price: 1623a
41473411963Symbol: APPL Seqno: 1 Price: 1649a

利用apply批量处理某一列数据

1
2
3
df1['A'] = df1['A'].apply(str.upper)

df1.head()
timedataA
01473411962Symbol: APPL Seqno: 0 Price: 1623A
11473411962Symbol: APPL Seqno: 0 Price: 1623A
21473411963Symbol: APPL Seqno: 0 Price: 1623A
31473411963Symbol: APPL Seqno: 0 Price: 1623A
41473411963Symbol: APPL Seqno: 1 Price: 1649A
1
2
l1 = df1['data'][0].strip().split(' ')
l1

[‘Symbol:’, ‘APPL’, ‘Seqno:’, ‘0’, ‘Price:’, ‘1623’]

1
l1[1], l1[3], l1[5]

(‘APPL’, ‘0’, ‘1623’)

编写自定义函数,对列中的字符串进行提取

1
2
3
def foo(line):
items = line.strip().split(' ')
return Series([items[1], items[3], items[5]])
1
2
3
4
5
df_temp = df1['data'].apply(foo)

df_temp = df_temp.rename(columns={0:"Symbol",1:"Seqno",2:"Price"})

df_temp.head()
SymbolSeqnoPrice
0APPL01623
1APPL01623
2APPL01623
3APPL01623
4APPL11649
1
df_new = df1.combine_first(df_temp)
1
df_new.head()
APriceSeqnoSymboldatatime
0A1623.00.0APPLSymbol: APPL Seqno: 0 Price: 16231473411962
1A1623.00.0APPLSymbol: APPL Seqno: 0 Price: 16231473411962
2A1623.00.0APPLSymbol: APPL Seqno: 0 Price: 16231473411963
3A1623.00.0APPLSymbol: APPL Seqno: 0 Price: 16231473411963
4A1649.01.0APPLSymbol: APPL Seqno: 1 Price: 16491473411963
1
2
3
4
del df_new['data']
del df_new['A']

df_new.head()
PriceSeqnoSymboltime
01623.00.0APPL1473411962
11623.00.0APPL1473411962
21623.00.0APPL1473411963
31623.00.0APPL1473411963
41649.01.0APPL1473411963
1
df_new.to_csv("demo_duplicate.csv")

利用drop_duplicates去重

1
2
3
4
5
6
7
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

df = pd.read_csv('demo_duplicate.csv')

df.head()
Unnamed: 0PriceSeqnoSymboltime
001623.00.0APPL1473411962
111623.00.0APPL1473411962
221623.00.0APPL1473411963
331623.00.0APPL1473411963
441649.01.0APPL1473411963
1
2
3
del df['Unnamed: 0']

df.head(10)
PriceSeqnoSymboltime
01623.00.0APPL1473411962
11623.00.0APPL1473411962
21623.00.0APPL1473411963
31623.00.0APPL1473411963
41649.01.0APPL1473411963
51649.01.0APPL1473411963
61649.01.0APPL1473411964
71649.01.0APPL1473411964
81642.02.0APPL1473411964
91642.02.0APPL1473411964
1
df.size

15956

1
len(df)

3989

1
len(df['Seqno'].unique())

1000

1
df['Seqno'].duplicated()

0 False
1 True
2 True
3 True
4 False
5 True
6 True
7 True
8 False
9 True
10 True
11 True
12 False
13 True
14 True
15 True
16 False
17 True
18 True
19 True
20 False
21 True
22 True
23 True
24 False
25 True
26 True
27 True
28 False
29 True

3959 True
3960 True
3961 False
3962 True
3963 True
3964 True
3965 False
3966 True
3967 True
3968 True
3969 False
3970 True
3971 True
3972 True
3973 False
3974 True
3975 True
3976 True
3977 False
3978 True
3979 True
3980 True
3981 False
3982 True
3983 True
3984 True
3985 False
3986 True
3987 True
3988 True
Name: Seqno, Length: 3989, dtype: bool

1
df.drop_duplicates(['Seqno'], keep='first')
PriceSeqnoSymboltime
01623.00.0APPL1473411962
41649.01.0APPL1473411963
81642.02.0APPL1473411964
121636.03.0APPL1473411965
161669.04.0APPL1473411966
201639.05.0APPL1473411967
241611.06.0APPL1473411968
281660.07.0APPL1473411969
321657.08.0APPL1473411970
361509.09.0APPL1473411971
401514.010.0APPL1473411972
441676.011.0APPL1473411973
481596.012.0APPL1473411974
521527.013.0APPL1473411975
561643.014.0APPL1473411976
601632.015.0APPL1473411977
641595.016.0APPL1473411978
681565.017.0APPL1473411979
721521.018.0APPL1473411980
761501.019.0APPL1473411981
801579.020.0APPL1473411982
841511.021.0APPL1473411983
881513.022.0APPL1473411984
921687.023.0APPL1473411985
961539.024.0APPL1473411986
1001682.025.0APPL1473411987
1041534.026.0APPL1473411988
1081628.027.0APPL1473411989
1121582.028.0APPL1473411990
1161540.029.0APPL1473411991
...............
38691649.0970.0APPL1473412932
38731696.0971.0APPL1473412933
38771596.0972.0APPL1473412934
38811664.0973.0APPL1473412935
38851539.0974.0APPL1473412936
38891654.0975.0APPL1473412937
38931573.0976.0APPL1473412938
38971695.0977.0APPL1473412939
39011584.0978.0APPL1473412940
39051607.0979.0APPL1473412941
39091676.0980.0APPL1473412942
39131665.0981.0APPL1473412943
39171540.0982.0APPL1473412944
39211546.0983.0APPL1473412945
39251646.0984.0APPL1473412946
39291648.0985.0APPL1473412947
39331647.0986.0APPL1473412948
39371602.0987.0APPL1473412949
39411562.0988.0APPL1473412950
39451629.0989.0APPL1473412951
39491683.0990.0APPL1473412952
39531649.0991.0APPL1473412953
39571698.0992.0APPL1473412954
39611566.0993.0APPL1473412955
39651597.0994.0APPL1473412956
39691641.0995.0APPL1473412957
39731581.0996.0APPL1473412958
39771674.0997.0APPL1473412959
39811680.0998.0APPL1473412960
39851509.0999.0APPL1473412961

1000 rows × 4 columns

时间序列操作

1
2
3
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
1
2
from datetime import datetime
t1 = datetime(2019,7,12)
1
t1

datetime.datetime(2019, 7, 12, 0, 0)

1
2
3
4
5
6
7
date_list = [
datetime(2016,9,1),
datetime(2016,9,10),
datetime(2017,9,1),
datetime(2017,9,20),
datetime(2017,10,1)
]
1
date_list

[datetime.datetime(2016, 9, 1, 0, 0),
datetime.datetime(2016, 9, 10, 0, 0),
datetime.datetime(2017, 9, 1, 0, 0),
datetime.datetime(2017, 9, 20, 0, 0),
datetime.datetime(2017, 10, 1, 0, 0)]

1
2
3
s1 = Series(np.random.rand(5),index=date_list)

s1

2016-09-01 0.192942
2016-09-10 0.821524
2017-09-01 0.503137
2017-09-20 0.896424
2017-10-01 0.267054
dtype: float64

1
s1.values

array([0.19294241, 0.82152442, 0.5031373 , 0.8964235 , 0.26705384])

1
s1.index

DatetimeIndex([‘2016-09-01’, ‘2016-09-10’, ‘2017-09-01’, ‘2017-09-20’,
‘2017-10-01’],
dtype=’datetime64[ns]’, freq=None)

根据索引位置访问

1
s1[0]

0.19294241160815995

根据时间序列访问

1
s1[datetime(2016,9,1)]

0.19294241160815995

根据时间访问

1
s1['2016-09-01']

0.19294241160815995

1
s1['20160901']

0.19294241160815995

返回整月数据

1
s1['2016-09']

2016-09-01 0.192942
2016-09-10 0.821524
dtype: float64

返回整年数据

1
s1['2016']

2016-09-01 0.192942
2016-09-10 0.821524
dtype: float64

利用pd.date_range() 生成时间序列

1
2
date_list_new = pd.date_range('2016-01-01', periods=100)
date_list_new

DatetimeIndex([‘2016-01-01’, ‘2016-01-02’, ‘2016-01-03’, ‘2016-01-04’,
‘2016-01-05’, ‘2016-01-06’, ‘2016-01-07’, ‘2016-01-08’,
‘2016-01-09’, ‘2016-01-10’, ‘2016-01-11’, ‘2016-01-12’,
‘2016-01-13’, ‘2016-01-14’, ‘2016-01-15’, ‘2016-01-16’,
‘2016-01-17’, ‘2016-01-18’, ‘2016-01-19’, ‘2016-01-20’,
‘2016-01-21’, ‘2016-01-22’, ‘2016-01-23’, ‘2016-01-24’,
‘2016-01-25’, ‘2016-01-26’, ‘2016-01-27’, ‘2016-01-28’,
‘2016-01-29’, ‘2016-01-30’, ‘2016-01-31’, ‘2016-02-01’,
‘2016-02-02’, ‘2016-02-03’, ‘2016-02-04’, ‘2016-02-05’,
‘2016-02-06’, ‘2016-02-07’, ‘2016-02-08’, ‘2016-02-09’,
‘2016-02-10’, ‘2016-02-11’, ‘2016-02-12’, ‘2016-02-13’,
‘2016-02-14’, ‘2016-02-15’, ‘2016-02-16’, ‘2016-02-17’,
‘2016-02-18’, ‘2016-02-19’, ‘2016-02-20’, ‘2016-02-21’,
‘2016-02-22’, ‘2016-02-23’, ‘2016-02-24’, ‘2016-02-25’,
‘2016-02-26’, ‘2016-02-27’, ‘2016-02-28’, ‘2016-02-29’,
‘2016-03-01’, ‘2016-03-02’, ‘2016-03-03’, ‘2016-03-04’,
‘2016-03-05’, ‘2016-03-06’, ‘2016-03-07’, ‘2016-03-08’,
‘2016-03-09’, ‘2016-03-10’, ‘2016-03-11’, ‘2016-03-12’,
‘2016-03-13’, ‘2016-03-14’, ‘2016-03-15’, ‘2016-03-16’,
‘2016-03-17’, ‘2016-03-18’, ‘2016-03-19’, ‘2016-03-20’,
‘2016-03-21’, ‘2016-03-22’, ‘2016-03-23’, ‘2016-03-24’,
‘2016-03-25’, ‘2016-03-26’, ‘2016-03-27’, ‘2016-03-28’,
‘2016-03-29’, ‘2016-03-30’, ‘2016-03-31’, ‘2016-04-01’,
‘2016-04-02’, ‘2016-04-03’, ‘2016-04-04’, ‘2016-04-05’,
‘2016-04-06’, ‘2016-04-07’, ‘2016-04-08’, ‘2016-04-09’],
dtype=’datetime64[ns]’, freq=’D’)

从指定时间的(周一)开始生成时间序列

1
2
date_list_new = pd.date_range('2016-01-01', periods=10, freq='W-MON')
date_list_new

DatetimeIndex([‘2016-01-04’, ‘2016-01-11’, ‘2016-01-18’, ‘2016-01-25’,
‘2016-02-01’, ‘2016-02-08’, ‘2016-02-15’, ‘2016-02-22’,
‘2016-02-29’, ‘2016-03-07’],
dtype=’datetime64[ns]’, freq=’W-MON’)

1
2
date_list_new = pd.date_range('2016-01-01', periods=10, freq='5H')
date_list_new

DatetimeIndex([‘2016-01-01 00:00:00’, ‘2016-01-01 05:00:00’,
‘2016-01-01 10:00:00’, ‘2016-01-01 15:00:00’,
‘2016-01-01 20:00:00’, ‘2016-01-02 01:00:00’,
‘2016-01-02 06:00:00’, ‘2016-01-02 11:00:00’,
‘2016-01-02 16:00:00’, ‘2016-01-02 21:00:00’],
dtype=’datetime64[ns]’, freq=’5H’)

1
2
s2 = Series(np.random.rand(10), index=date_list_new)
s2

2016-01-01 00:00:00 0.312884
2016-01-01 05:00:00 0.337516
2016-01-01 10:00:00 0.911010
2016-01-01 15:00:00 0.055390
2016-01-01 20:00:00 0.810770
2016-01-02 01:00:00 0.313263
2016-01-02 06:00:00 0.025229
2016-01-02 11:00:00 0.229192
2016-01-02 16:00:00 0.509811
2016-01-02 21:00:00 0.169681
Freq: 5H, dtype: float64

时间序列数据的采样和画图

1
2
3
4
5
6
7
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from datetime import datetime

t_range = pd.date_range('2016-01-01','2016-12-31')
t_range

DatetimeIndex([‘2016-01-01’, ‘2016-01-02’, ‘2016-01-03’, ‘2016-01-04’,
‘2016-01-05’, ‘2016-01-06’, ‘2016-01-07’, ‘2016-01-08’,
‘2016-01-09’, ‘2016-01-10’,

‘2016-12-22’, ‘2016-12-23’, ‘2016-12-24’, ‘2016-12-25’,
‘2016-12-26’, ‘2016-12-27’, ‘2016-12-28’, ‘2016-12-29’,
‘2016-12-30’, ‘2016-12-31’],
dtype=’datetime64[ns]’, length=366, freq=’D’)

1
2
3
s1 = Series(np.random.rand(len(t_range)), index=t_range)

s1['2016-01'].mean()

0.48349408856972315

按月份采样,计算平均值

1
2
3
s1_month = s1.resample('M').mean()

s1_month.index

DatetimeIndex([‘2016-01-31’, ‘2016-02-29’, ‘2016-03-31’, ‘2016-04-30’,
‘2016-05-31’, ‘2016-06-30’, ‘2016-07-31’, ‘2016-08-31’,
‘2016-09-30’, ‘2016-10-31’, ‘2016-11-30’, ‘2016-12-31’],
dtype=’datetime64[ns]’, freq=’M’)

按小时采样,并(向后)填充数据

1
s1.resample('H').ffill()

2016-01-01 00:00:00 0.196584
2016-01-01 01:00:00 0.196584
2016-01-01 02:00:00 0.196584
2016-01-01 03:00:00 0.196584
2016-01-01 04:00:00 0.196584
2016-01-01 05:00:00 0.196584
2016-01-01 06:00:00 0.196584
2016-01-01 07:00:00 0.196584
2016-01-01 08:00:00 0.196584
2016-01-01 09:00:00 0.196584
2016-01-01 10:00:00 0.196584
2016-01-01 11:00:00 0.196584
2016-01-01 12:00:00 0.196584
2016-01-01 13:00:00 0.196584
2016-01-01 14:00:00 0.196584
2016-01-01 15:00:00 0.196584
2016-01-01 16:00:00 0.196584
2016-01-01 17:00:00 0.196584
2016-01-01 18:00:00 0.196584
2016-01-01 19:00:00 0.196584
2016-01-01 20:00:00 0.196584
2016-01-01 21:00:00 0.196584
2016-01-01 22:00:00 0.196584
2016-01-01 23:00:00 0.196584
2016-01-02 00:00:00 0.209433
2016-01-02 01:00:00 0.209433
2016-01-02 02:00:00 0.209433
2016-01-02 03:00:00 0.209433
2016-01-02 04:00:00 0.209433
2016-01-02 05:00:00 0.209433

2016-12-29 19:00:00 0.287915
2016-12-29 20:00:00 0.287915
2016-12-29 21:00:00 0.287915
2016-12-29 22:00:00 0.287915
2016-12-29 23:00:00 0.287915
2016-12-30 00:00:00 0.023052
2016-12-30 01:00:00 0.023052
2016-12-30 02:00:00 0.023052
2016-12-30 03:00:00 0.023052
2016-12-30 04:00:00 0.023052
2016-12-30 05:00:00 0.023052
2016-12-30 06:00:00 0.023052
2016-12-30 07:00:00 0.023052
2016-12-30 08:00:00 0.023052
2016-12-30 09:00:00 0.023052
2016-12-30 10:00:00 0.023052
2016-12-30 11:00:00 0.023052
2016-12-30 12:00:00 0.023052
2016-12-30 13:00:00 0.023052
2016-12-30 14:00:00 0.023052
2016-12-30 15:00:00 0.023052
2016-12-30 16:00:00 0.023052
2016-12-30 17:00:00 0.023052
2016-12-30 18:00:00 0.023052
2016-12-30 19:00:00 0.023052
2016-12-30 20:00:00 0.023052
2016-12-30 21:00:00 0.023052
2016-12-30 22:00:00 0.023052
2016-12-30 23:00:00 0.023052
2016-12-31 00:00:00 0.515360
Freq: H, Length: 8761, dtype: float64

基于时间序列画图

1
2
3
t_range = pd.date_range('2016-01-01','2016-12-31',freq='H')

t_range

DatetimeIndex([‘2016-01-01 00:00:00’, ‘2016-01-01 01:00:00’,
‘2016-01-01 02:00:00’, ‘2016-01-01 03:00:00’,
‘2016-01-01 04:00:00’, ‘2016-01-01 05:00:00’,
‘2016-01-01 06:00:00’, ‘2016-01-01 07:00:00’,
‘2016-01-01 08:00:00’, ‘2016-01-01 09:00:00’,

‘2016-12-30 15:00:00’, ‘2016-12-30 16:00:00’,
‘2016-12-30 17:00:00’, ‘2016-12-30 18:00:00’,
‘2016-12-30 19:00:00’, ‘2016-12-30 20:00:00’,
‘2016-12-30 21:00:00’, ‘2016-12-30 22:00:00’,
‘2016-12-30 23:00:00’, ‘2016-12-31 00:00:00’],
dtype=’datetime64[ns]’, length=8761, freq=’H’)

1
2
3
4
5
stock_df = DataFrame(index=t_range)
stock_df['BABA'] = np.random.randint(80,160, size=len(t_range))
stock_df['TENCENT'] = np.random.randint(30,50, size=len(t_range))

stock_df.head()
BABATENCENT
2016-01-01 00:00:0013443
2016-01-01 01:00:009835
2016-01-01 02:00:0015032
2016-01-01 03:00:009637
2016-01-01 04:00:0014040
1
stock_df.plot()

1
2
import matplotlib.pyplot as plt
plt.show()
1
2
3
4
5
weekly_df = DataFrame()
weekly_df['BABA'] = stock_df['BABA'].resample('W').mean()
weekly_df['TENCENT'] = stock_df['TENCENT'].resample('W').mean()

weekly_df.head()
BABATENCENT
2016-01-03118.77777838.638889
2016-01-10120.69047640.077381
2016-01-17120.67857139.452381
2016-01-24119.83333339.386905
2016-01-31119.57738139.416667
1
weekly_df.plot()

数据分箱Binning

对list进行分箱

1
2
3
4
5
6
7
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

score_list = np.random.randint(25,100,size=20)

score_list

array([95, 75, 43, 49, 26, 95, 28, 55, 93, 97, 91, 46, 39, 88, 32, 97, 93,
80, 57, 70])

1
2
3
4
5
bins = [0,59,70,80,100]

score_cat = pd.cut(score_list, bins)

pd.value_counts(score_cat)

(0, 59] 9
(80, 100] 8
(70, 80] 2
(59, 70] 1
dtype: int64

对DataFrame进行分箱

1
2
3
4
df = DataFrame()
df['score'] = score_list

df
score
095
175
243
349
426
595
628
755
893
997
1091
1146
1239
1388
1432
1597
1693
1780
1857
1970

利用 pd.util.testing.rands() 随机生成指定位数的字符串

1
2
3
 df['student'] = [pd.util.testing.rands(3) for i in range(20)]

df
scorestudent
095PqQ
175ds2
243iUV
349JjY
426FK3
595b9b
628DyV
755103
893RUC
997VqI
1091sjo
1146cCl
1239TeO
1388xap
1432GfG
1597FNg
1693pk0
1780ziG
18575k0
1970ozP

通过 pd.cut() 的 labels 参数,对数据进行分类打标签

1
2
3
df['Categories'] = pd.cut(df['score'], bins, labels=['Low', 'OK', 'Good', 'Great'])

df
scorestudentCategories
095PqQGreat
175ds2Good
243iUVLow
349JjYLow
426FK3Low
595b9bGreat
628DyVLow
755103Low
893RUCGreat
997VqIGreat
1091sjoGreat
1146cClLow
1239TeOLow
1388xapGreat
1432GfGLow
1597FNgGreat
1693pk0Great
1780ziGGood
18575k0Low
1970ozPOK

数据分组技术GroupBy

1
2
3
4
5
6
7
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

df = pd.read_csv('city_weather.csv')

df.head()
datecitytemperaturewind
003/01/2016BJ85
117/01/2016BJ122
231/01/2016BJ192
314/02/2016BJ-33
428/02/2016BJ192
1
2
3
g = df.groupby(df['city'])

g

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x119b0d6d8>

获取分组内容

1
2
3
4
5
g.groups

df_bj = g.get_group('BJ')

df_bj
datecitytemperaturewind
003/01/2016BJ85
117/01/2016BJ122
231/01/2016BJ192
314/02/2016BJ-33
428/02/2016BJ192
513/03/2016BJ53
1
df_bj.mean()

temperature 10.000000
wind 2.833333
dtype: float64

对分组取平均值

1
g.mean()
temperaturewind
city
BJ10.0002.833333
GZ8.7504.000000
SH4.6253.625000
SZ5.0002.500000

对分组取最小值

1
g.min()
datetemperaturewind
city
BJ03/01/2016-32
GZ14/08/2016-12
SH03/07/2016-102
SZ11/09/2016-101

groupby对象 => list => dict

1
dict(list(g))['BJ']
datecitytemperaturewind
003/01/2016BJ85
117/01/2016BJ122
231/01/2016BJ192
314/02/2016BJ-33
428/02/2016BJ192
513/03/2016BJ53

for循环遍历groupby变量

1
2
3
for name, group_df in g:
print(name)
print(group_df)

BJ
date city temperature wind
0 03/01/2016 BJ 8 5
1 17/01/2016 BJ 12 2
2 31/01/2016 BJ 19 2
3 14/02/2016 BJ -3 3
4 28/02/2016 BJ 19 2
5 13/03/2016 BJ 5 3
GZ
date city temperature wind
14 17/07/2016 GZ 10 2
15 31/07/2016 GZ -1 5
16 14/08/2016 GZ 1 5
17 28/08/2016 GZ 25 4
SH
date city temperature wind
6 27/03/2016 SH -4 4
7 10/04/2016 SH 19 3
8 24/04/2016 SH 20 3
9 08/05/2016 SH 17 3
10 22/05/2016 SH 4 2
11 05/06/2016 SH -10 4
12 19/06/2016 SH 0 5
13 03/07/2016 SH -9 5
SZ
date city temperature wind
18 11/09/2016 SZ 20 1
19 25/09/2016 SZ -10 4

数据聚合技术Aggregation

1
2
3
4
5
6
7
8
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

df = pd.read_csv('city_weather.csv')
g = df.groupby('city')

g

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x11b921cc0>

1
g.max()
datetemperaturewind
city
BJ31/01/2016195
GZ31/07/2016255
SH27/03/2016205
SZ25/09/2016204
1
g.count()
datetemperaturewind
city
BJ666
GZ444
SH888
SZ222
1
g.describe()
temperaturewind
countmeanstdmin25%50%75%maxcountmeanstdmin25%50%75%max
city
BJ6.010.0008.532292-3.05.7510.017.2519.06.02.8333331.1690452.02.002.53.005.0
GZ4.08.75011.842719-1.00.505.513.7525.04.04.0000001.4142142.03.504.55.005.0
SH8.04.62512.489281-10.0-5.252.017.5020.08.03.6250001.0606602.03.003.54.255.0
SZ2.05.00021.213203-10.0-2.505.012.5020.02.02.5000002.1213201.01.752.53.254.0
1
g.agg('min')
datetemperaturewind
city
BJ03/01/2016-32
GZ14/08/2016-12
SH03/07/2016-102
SZ11/09/2016-101

自定义聚合函数

1
2
def foo(attr):
return attr.max() - attr.min()
1
g.agg(foo)
temperaturewind
city
BJ223
GZ263
SH303
SZ303

对多个列进行groupby

1
2
3
g_new = df.groupby(['city','wind'])

g_new

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x11ba83208>

1
g_new.groups

{(‘BJ’, 2): Int64Index([1, 2, 4], dtype=’int64’),
(‘BJ’, 3): Int64Index([3, 5], dtype=’int64’),
(‘BJ’, 5): Int64Index([0], dtype=’int64’),
(‘GZ’, 2): Int64Index([14], dtype=’int64’),
(‘GZ’, 4): Int64Index([17], dtype=’int64’),
(‘GZ’, 5): Int64Index([15, 16], dtype=’int64’),
(‘SH’, 2): Int64Index([10], dtype=’int64’),
(‘SH’, 3): Int64Index([7, 8, 9], dtype=’int64’),
(‘SH’, 4): Int64Index([6, 11], dtype=’int64’),
(‘SH’, 5): Int64Index([12, 13], dtype=’int64’),
(‘SZ’, 1): Int64Index([18], dtype=’int64’),
(‘SZ’, 4): Int64Index([19], dtype=’int64’)}

1
g_new.get_group(('BJ',2))
datecitytemperaturewind
117/01/2016BJ122
231/01/2016BJ192
428/02/2016BJ192

for循环遍历多列groupby变量

1
2
3
for (name_1, name_2), group in g_new:
print(name_1,name_2)
print(group)

BJ 2
date city temperature wind
1 17/01/2016 BJ 12 2
2 31/01/2016 BJ 19 2
4 28/02/2016 BJ 19 2
BJ 3
date city temperature wind
3 14/02/2016 BJ -3 3
5 13/03/2016 BJ 5 3
BJ 5
date city temperature wind
0 03/01/2016 BJ 8 5
GZ 2
date city temperature wind
14 17/07/2016 GZ 10 2
GZ 4
date city temperature wind
17 28/08/2016 GZ 25 4
GZ 5
date city temperature wind
15 31/07/2016 GZ -1 5
16 14/08/2016 GZ 1 5
SH 2
date city temperature wind
10 22/05/2016 SH 4 2
SH 3
date city temperature wind
7 10/04/2016 SH 19 3
8 24/04/2016 SH 20 3
9 08/05/2016 SH 17 3
SH 4
date city temperature wind
6 27/03/2016 SH -4 4
11 05/06/2016 SH -10 4
SH 5
date city temperature wind
12 19/06/2016 SH 0 5
13 03/07/2016 SH -9 5
SZ 1
date city temperature wind
18 11/09/2016 SZ 20 1
SZ 4
date city temperature wind
19 25/09/2016 SZ -10 4

透视表

1
2
3
4
5
6
7
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

df = pd.read_excel('sales-funnel.xlsx')

df
AccountNameRepManagerProductQuantityPriceStatus
0714466Trantow-BarrowsCraig BookerDebra HenleyCPU130000presented
1714466Trantow-BarrowsCraig BookerDebra HenleySoftware110000presented
2714466Trantow-BarrowsCraig BookerDebra HenleyMaintenance25000pending
3737550Fritsch, Russel and AndersonCraig BookerDebra HenleyCPU135000declined
4146832Kiehn-SpinkaDaniel HiltonDebra HenleyCPU265000won
5218895Kulas IncDaniel HiltonDebra HenleyCPU240000pending
6218895Kulas IncDaniel HiltonDebra HenleySoftware110000presented
7412290Jerde-HilpertJohn SmithDebra HenleyMaintenance25000pending
8740150Barton LLCJohn SmithDebra HenleyCPU135000declined
9141962Herman LLCCedric MossFred AndersonCPU265000won
10163416Purdy-KundeCedric MossFred AndersonCPU130000presented
11239344Stokes LLCCedric MossFred AndersonMaintenance15000pending
12239344Stokes LLCCedric MossFred AndersonSoftware110000presented
13307599Kassulke, Ondricka and MetzWendy YuleFred AndersonMaintenance37000won
14688981Keeling LLCWendy YuleFred AndersonCPU5100000won
15729833Koepp LtdWendy YuleFred AndersonCPU265000declined
16729833Koepp LtdWendy YuleFred AndersonMonitor25000presented
1
df.size

136

1
len(df)

17

1
pd.pivot_table(df, index=['Name','Rep','Manager'])
AccountPriceQuantity
NameRepManager
Barton LLCJohn SmithDebra Henley740150350001.000000
Fritsch, Russel and AndersonCraig BookerDebra Henley737550350001.000000
Herman LLCCedric MossFred Anderson141962650002.000000
Jerde-HilpertJohn SmithDebra Henley41229050002.000000
Kassulke, Ondricka and MetzWendy YuleFred Anderson30759970003.000000
Keeling LLCWendy YuleFred Anderson6889811000005.000000
Kiehn-SpinkaDaniel HiltonDebra Henley146832650002.000000
Koepp LtdWendy YuleFred Anderson729833350002.000000
Kulas IncDaniel HiltonDebra Henley218895250001.500000
Purdy-KundeCedric MossFred Anderson163416300001.000000
Stokes LLCCedric MossFred Anderson23934475001.000000
Trantow-BarrowsCraig BookerDebra Henley714466150001.333333
1
pd.pivot_table(df, index=['Manager','Rep'], values=['Price', 'Quantity'], aggfunc='sum')
PriceQuantity
ManagerRep
Debra HenleyCraig Booker800005
Daniel Hilton1150005
John Smith400003
Fred AndersonCedric Moss1100005
Wendy Yule17700012
1
pd.pivot_table(df, index=['Manager','Rep'], values=['Price'], columns=['Product'], fill_value=0, aggfunc='sum')
Price
ProductCPUMaintenanceMonitorSoftware
ManagerRep
Debra HenleyCraig Booker650005000010000
Daniel Hilton1050000010000
John Smith35000500000
Fred AndersonCedric Moss950005000010000
Wendy Yule165000700050000

分组和透视功能

1
2
3
4
5
6
7
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

df = pd.read_csv('usa_flights.csv')

df.head()
flight_dateunique_carrierflight_numorigindestarr_delaycancelleddistancecarrier_delayweather_delaylate_aircraft_delaynas_delaysecurity_delayactual_elapsed_time
002/01/2015 0:00AA1JFKLAX-19.002475NaNNaNNaNNaNNaN381.0
103/01/2015 0:00AA1JFKLAX-39.002475NaNNaNNaNNaNNaN358.0
204/01/2015 0:00AA1JFKLAX-12.002475NaNNaNNaNNaNNaN385.0
305/01/2015 0:00AA1JFKLAX-8.002475NaNNaNNaNNaNNaN389.0
406/01/2015 0:00AA1JFKLAX25.0024750.00.00.025.00.0424.0
1
df.shape

(201664, 14)

1
df.tail()
flight_dateunique_carrierflight_numorigindestarr_delaycancelleddistancecarrier_delayweather_delaylate_aircraft_delaynas_delaysecurity_delayactual_elapsed_time
20165910/01/2015 0:00NK188OAKLAS-16.00407NaNNaNNaNNaNNaN77.0
20166011/01/2015 0:00NK188OAKLAS-4.00407NaNNaNNaNNaNNaN87.0
20166112/01/2015 0:00NK188OAKLAS-7.00407NaNNaNNaNNaNNaN82.0
20166213/01/2015 0:00NK188OAKLAS23.004073.00.00.020.00.0103.0
20166314/01/2015 0:00NK188OAKLAS-7.00407NaNNaNNaNNaNNaN82.0

获取延误时间Top 10的航班

1
df.sort_values('arr_delay', ascending=False)[:10]
flight_dateunique_carrierflight_numorigindestarr_delaycancelleddistancecarrier_delayweather_delaylate_aircraft_delaynas_delaysecurity_delayactual_elapsed_time
1107311/01/2015 0:00AA1595AUSDFW1444.001901444.00.00.00.00.059.0
1021413/01/2015 0:00AA1487OMADFW1392.005831392.00.00.00.00.0117.0
1243003/01/2015 0:00AA1677MEMDFW1384.004321380.00.00.04.00.0104.0
844304/01/2015 0:00AA1279OMADFW1237.005831222.00.015.00.00.0102.0
1032805/01/2015 0:00AA1495EGEDFW1187.007211019.00.0168.00.00.0127.0
3657004/01/2015 0:00DL1435MIAMSP1174.0015011174.00.00.00.00.0231.0
3649504/01/2015 0:00DL1367ROCATL1138.007491112.00.00.026.00.0171.0
5907214/01/2015 0:00DL1687SANMSP1084.0015321070.00.00.014.00.0240.0
3217305/01/2015 0:00AA970LASLAX1042.002361033.00.09.00.00.066.0
5648812/01/2015 0:00DL2117ATLCOS1016.0011841016.00.00.00.00.0193.0

计算延误和没有延误所占比例

1
df['cancelled'].value_counts()

0 196873
1 4791
Name: cancelled, dtype: int64

1
2
3
df['delayed'] = df['arr_delay'].apply(lambda x:x>0)

df.head()
flight_dateunique_carrierflight_numorigindestarr_delaycancelleddistancecarrier_delayweather_delaylate_aircraft_delaynas_delaysecurity_delayactual_elapsed_timedelayed
002/01/2015 0:00AA1JFKLAX-19.002475NaNNaNNaNNaNNaN381.0False
103/01/2015 0:00AA1JFKLAX-39.002475NaNNaNNaNNaNNaN358.0False
204/01/2015 0:00AA1JFKLAX-12.002475NaNNaNNaNNaNNaN385.0False
305/01/2015 0:00AA1JFKLAX-8.002475NaNNaNNaNNaNNaN389.0False
406/01/2015 0:00AA1JFKLAX25.0024750.00.00.025.00.0424.0True
1
2
3
delay_data = df['delayed'].value_counts()

delay_data

False 103037
True 98627
Name: delayed, dtype: int64

1
delay_data[1]/(delay_data[0] + delay_data[1])

0.4890659711202793

每个航空公司延误情况

1
2
3
delay_group = df.groupby(['unique_carrier','delayed'])

delay_group

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x118b616d8>

1
delay_group.size()

unique_carrier delayed
AA False 8912
True 9841
AS False 3527
True 2104
B6 False 4832
True 4401
DL False 17719
True 9803
EV False 10596
True 11371
F9 False 1103
True 1848
HA False 1351
True 1354
MQ False 4692
True 8060
NK False 1550
True 2133
OO False 9977
True 10804
UA False 7885
True 8624
US False 7850
True 6353
VX False 1254
True 781
WN False 21789
True 21150
dtype: int64

1
2
3
df_delay = delay_group.size().unstack()

df_delay
delayedFalseTrue
unique_carrier
AA89129841
AS35272104
B648324401
DL177199803
EV1059611371
F911031848
HA13511354
MQ46928060
NK15502133
OO997710804
UA78858624
US78506353
VX1254781
WN2178921150
1
2
3
import matplotlib.pyplot as plt

df_delay.plot(kind='barh',stacked=True,figsize=[16,6],colormap='winter')

透视表

1
2
3
flight_by_carrier = df.pivot_table(index=['flight_date'],columns=['unique_carrier'],values=['flight_num'],aggfunc='count')

flight_by_carrier.head()
flight_num
unique_carrierAAASB6DLEVF9HAMQNKOOUAUSVXWN
flight_date
02/01/2015 0:0015454777592271182425422410462871763142011771763518
03/01/2015 0:001453449711203117441922029372851681123310281603328
04/01/2015 0:0015344587592258183324920610272841731128311581693403
05/01/2015 0:0015324337542212181126420910392881737143211571743506
06/01/2015 0:001400415692205416862492029662791527129410031523396

Matplotlib

简单绘图之plot

1
2
3
4
5
import numpy as np
import matplotlib.pyplot as plt

a = [1,2,3]
plt.plot(a)

plt.show() 开始绘图

1
plt.show()
1
2
b = [4,5,6]
plt.plot(a,b)

魔法命令,可直接显示图像

1
%matplotlib inline

显示运行时间

1
%timeit np.arange(10)

594 ns ± 9.52 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)

1
plt.plot(a,b,'*')

1
plt.plot(a,b,'r--')

一张图中画多条线

1
2
3
4
c = [10,8,6]
d = [1,8,3]

plt.plot(a,b,'r--',c,d,'b*')

画三角函数,并设置label,label,title

1
2
t = np.arange(0.0,2.0,0.1)
t.size

20

1
2
s = np.sin(t * np.pi)
s.size

20

1
s

array([ 0.00000000e+00, 3.09016994e-01, 5.87785252e-01, 8.09016994e-01,
9.51056516e-01, 1.00000000e+00, 9.51056516e-01, 8.09016994e-01,
5.87785252e-01, 3.09016994e-01, 1.22464680e-16, -3.09016994e-01,
-5.87785252e-01, -8.09016994e-01, -9.51056516e-01, -1.00000000e+00,
-9.51056516e-01, -8.09016994e-01, -5.87785252e-01, -3.09016994e-01])

1
2
3
4
5
6
plt.plot(t,s,'r--',label='aaaa')
plt.plot(t*2,s,'--',label='bbbb')
plt.xlabel('this is x')
plt.ylabel('this is y')
plt.title('this is a demo')
plt.legend()

简单绘图之subplot

1
2
3
4
5
6
7
8
9
10
11
12
13
14
import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(0,5)
y1 = np.sin(np.pi*x)
y2 = np.sin(np.pi*x*2)

plt.plot(x, y1, 'b--', label='sin(pi*x)')
plt.ylabel('y1 value')
plt.plot(x, y2, 'r--', label='sin(pi*2x)')
plt.ylabel('y2 value')
plt.xlabel('x value')
plt.title('this is x-y value')
plt.legend()

1
2
3
4
5
6
7
8
9
10
11
plt.subplot(2,2,1)
plt.plot(x, y1, 'b--')
plt.ylabel('y1')
plt.subplot(2,2,2)
plt.plot(x, y2, 'r--')
plt.ylabel('y2')
plt.xlabel('x')
plt.subplot(2,2,3)
plt.plot(x, y1, 'r*')
plt.subplot(2,2,4)
plt.plot(x, y1, 'b*')

subplots

1
figure, ax = plt.subplots(2,2)

1
2
3
4
ax[0][0].plot(x,y1)
ax[0][1].plot(x,y2)

figure

Series

1
2
3
4
5
6
7
import numpy as np
import pandas as pd
from pandas import Series
import matplotlib.pyplot as plt

s1 = Series(np.random.randn(100).cumsum())
s2 = Series(np.random.randn(100).cumsum())
1
s1.plot(style='--')

1
2
3
s1.plot(kind='line', grid=True, label='S1', title='this is Series')
s2.plot(label='S2', style='--')
plt.legend()

1
2
3
figure, ax = plt.subplots(2,1)

figure

1
2
3
s1[0:10].plot(ax=ax[0],label='S1',kind='bar')
s2.plot(ax=ax[1],label='S2')
figure

DataFrame

1
2
3
4
5
6
7
8
9
10
11
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt

df = DataFrame(
np.random.randint(1,10,40).reshape(10,4),
columns=['A','B','C','D']
)

df
ABCD
02198
12391
22592
34186
41989
55465
67218
71418
88129
92454

横向柱状图

1
df.plot(kind='barh')

Stacked柱状图

1
df.plot(kind='bar',stacked=True)

区域填充

1
df.plot(kind='area')

指定index画图

1
2
3
a = df.iloc[5]

a.plot()

按index画图(方法1)

1
2
3
4
for i in df.index:
df.iloc[i].plot(label=str(i))

plt.legend()

按index画图(方法2)

1
df.T.plot()

按column画图

1
df['A'].plot()

直方图

1
2
3
4
5
6
7
8
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt

s = Series(np.random.randn(1000))

plt.hist(s, rwidth=0.9)

array([ 1., 9., 42., 129., 243., 257., 198., 93., 24., 4.]),
array([-3.75513064, -3.04405274, -2.33297485, -1.62189695, -0.91081905,
-0.19974115, 0.51133675, 1.22241465, 1.93349255, 2.64457045,
3.35564835])

1
plt.hist(s, rwidth=0.9, bins=20, color='r')

array([ 1., 0., 1., 8., 15., 27., 46., 83., 103., 140., 143.,
114., 120., 78., 61., 32., 15., 9., 3., 1.]),
array([-3.75513064, -3.39959169, -3.04405274, -2.68851379, -2.33297485,
-1.9774359 , -1.62189695, -1.266358 , -0.91081905, -0.5552801 ,
-0.19974115, 0.1557978 , 0.51133675, 0.8668757 , 1.22241465,
1.5779536 , 1.93349255, 2.2890315 , 2.64457045, 3.0001094 ,
3.35564835])

密度图

1
s.plot(kind='kde')

Seaborn

散点图

方法1:matplotlib

1
2
3
4
5
6
7
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

iris = pd.read_csv('iris.csv')
iris.head()
SepalLengthSepalWidthPetalLengthPetalWidthName
05.13.51.40.2Iris-setosa
14.93.01.40.2Iris-setosa
24.73.21.30.2Iris-setosa
34.63.11.50.2Iris-setosa
45.03.61.40.2Iris-setosa
1
iris.Name.unique()

array([‘Iris-setosa’, ‘Iris-versicolor’, ‘Iris-virginica’], dtype=object)

1
2
3
color_map = dict(zip(iris.Name.unique(),['blue','green','red']))

color_map

{‘Iris-setosa’: ‘blue’, ‘Iris-versicolor’: ‘green’, ‘Iris-virginica’: ‘red’}

1
2
3
4
5
6
7
8
9
for species, group in iris.groupby('Name'):
plt.scatter(group['PetalLength'], group['SepalLength'],
color=color_map[species],
alpha=0.3, edgecolor=None,
label=species)

plt.legend(frameon=True, title='Name')
plt.xlabel('petalLength')
plt.ylabel('sepalLength')

方法2:Seaborn

1
sns.lmplot('PetalLength','SepalLength', iris, hue='Name', fit_reg=False)

直方图(matplotlib)

1
2
s1 = Series(np.random.randn(1000))
plt.hist(s1)

(array([ 5., 36., 96., 196., 278., 227., 111., 37., 9., 5.]),
array([-3.17567133, -2.48638745, -1.79710358, -1.1078197 , -0.41853582,
0.27074806, 0.96003194, 1.64931581, 2.33859969, 3.02788357,
3.71716745]),
<a list of 10 Patch objects>)

密度图(matplotlib)

1
s1.plot(kind='kde')

直方图(Seaborn)

1
sns.distplot(s1, hist=True, kde=False, rug=True)

密度图(Seaborn)

1
sns.kdeplot(s1, shade=True, color='r')

轴须图(Seaborn)

1
sns.rugplot(s1)

热力图(Seaborn)

1
2
3
4
5
6
7
8
9
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import seaborn as sns

df = sns.load_dataset('flights')

df.head()
yearmonthpassengers
01949January112
11949February118
21949March132
31949April129
41949May121
1
df.shape

(144, 3)

生成透视表

1
2
df = df.pivot(index='month', columns='year', values='passengers')
df
year194919501951195219531954195519561957195819591960
month
January112115145171196204242284315340360417
February118126150180196188233277301318342391
March132141178193236235267317356362406419
April129135163181235227269313348348396461
May121125172183229234270318355363420472
June135149178218243264315374422435472535
July148170199230264302364413465491548622
August148170199242272293347405467505559606
September136158184209237259312355404404463508
October119133162191211229274306347359407461
November104114146172180203237271305310362390
December118140166194201229278306336337405432

线形图(Matplotlib)

1
df.plot()

热力图(Seaborn)

1
sns.heatmap(df, annot=True, fmt='d')

柱状图(Matplotlib)

1
s.plot(kind='bar')

柱状图(Seaborn)

1
2
3
 s = df.sum()

s.index

Int64Index([1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959,
1960],
dtype=’int64’, name=’year’)

1
s.values

array([1520, 1676, 2042, 2364, 2700, 2867, 3408, 3939, 4421, 4572, 5140,
5714])

1
sns.barplot(x=s.index, y=s.values)

Seaborn设置图形效果

1
2
3
4
5
6
7
8
9
10
11
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

x = np.linspace(0,14,100)
y1 = np.sin(x)
y2 = np.sin(x+2)*1.25

def sinplot():
plt.plot(x, y1)
plt.plot(x, y2)
1
sinplot()

设置Seaborn显示风格

1
2
3
4
5
import seaborn as sns
style = ['darkgrid', 'dark', 'white', 'whitegrid', 'ticks']

sns.set_style(style[0])
sinplot()

1
2
sns.set_style(style[3])
sinplot()

1
sns.axes_style()

{‘axes.facecolor’: ‘white’,
‘axes.edgecolor’: ‘.8’,
‘axes.grid’: True,
‘axes.axisbelow’: True,
‘axes.labelcolor’: ‘.15’,
‘figure.facecolor’: ‘white’,
‘grid.color’: ‘.8’,
‘grid.linestyle’: ‘-‘,
‘text.color’: ‘.15’,
‘xtick.color’: ‘.15’,
‘ytick.color’: ‘.15’,
‘xtick.direction’: ‘out’,
‘ytick.direction’: ‘out’,
‘lines.solid_capstyle’: ‘round’,
‘patch.edgecolor’: ‘w’,
‘image.cmap’: ‘rocket’,
‘font.family’: [‘sans-serif’],
‘font.sans-serif’: [‘Arial’,
‘DejaVu Sans’,
‘Liberation Sans’,
‘Bitstream Vera Sans’,
‘sans-serif’],
‘patch.force_edgecolor’: True,
‘xtick.bottom’: False,
‘xtick.top’: False,
‘ytick.left’: False,
‘ytick.right’: False,
‘axes.spines.left’: True,
‘axes.spines.bottom’: True,
‘axes.spines.right’: True,
‘axes.spines.top’: True}

自定义配置

1
2
sns.set_style(style[0],{'grid.color': 'green'})
sinplot()

清空自定义设置

1
2
sns.set()
sinplot()

plotting_context() & set_context()

1
2
3
4
context = ['paper', 'notebook', 'talk', 'poster']

sns.set_context(context[1], rc={'grid.linewidth': 3.0})
sinplot()

1
2
sns.set_context(context[3])
sinplot()

显示context详细设置

1
sns.plotting_context()

{‘font.size’: 24.0,
‘axes.labelsize’: 24.0,
‘axes.titlesize’: 24.0,
‘xtick.labelsize’: 22.0,
‘ytick.labelsize’: 22.0,
‘legend.fontsize’: 22.0,
‘axes.linewidth’: 2.5,
‘grid.linewidth’: 2.0,
‘lines.linewidth’: 3.0,
‘lines.markersize’: 12.0,
‘patch.linewidth’: 2.0,
‘xtick.major.width’: 2.5,
‘ytick.major.width’: 2.5,
‘xtick.minor.width’: 2.0,
‘ytick.minor.width’: 2.0,
‘xtick.major.size’: 12.0,
‘ytick.major.size’: 12.0,
‘xtick.minor.size’: 8.0,
‘ytick.minor.size’: 8.0}

股票分析

1
2
3
4
5
6
7
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_datareader as pdr
from datetime import datetime
1
2
3
alibaba = pdr.get_data_yahoo('BABA')

alibaba.head()
HighLowOpenCloseVolumeAdj Close
Date
2014-09-1999.69999789.94999792.69999793.88999927187940093.889999
2014-09-2292.94999789.50000092.69999789.8899996665780089.889999
2014-09-2390.48000386.62000388.94000287.1699983900980087.169998
2014-09-2490.57000087.22000188.47000190.5700003208800090.570000
2014-09-2591.50000088.50000091.08999688.9199982859800088.919998
1
alibaba.shape

(1211, 6)

1
alibaba.tail()
HighLowOpenCloseVolumeAdj Close
Date
2019-07-08170.279999168.350006170.089996168.44999716531000168.449997
2019-07-09168.979996166.500000167.085007168.80000315782800168.800003
2019-07-10171.369995166.860001171.240005166.92999316675600166.929993
2019-07-11168.423996165.000000167.559998166.55000314796400166.550003
2019-07-12170.199997167.649994168.000000169.07000718093300169.070007
1
alibaba.describe()
HighLowOpenCloseVolumeAdj Close
count1211.0000001211.0000001211.0000001211.0000001.211000e+031211.000000
mean126.715532123.620233125.280824125.1753511.760189e+07125.175351
std44.69880043.68618344.26311344.2088171.260813e+0744.208817
min58.65000257.20000157.29999957.3899993.775300e+0657.389999
25%85.12999783.37500084.59499784.4949991.107115e+0784.494999
50%108.820000106.510002107.809998107.8300021.489790e+07107.830002
75%173.044998169.508499171.739998171.5900042.063065e+07171.590004
max211.699997207.509995209.949997210.8600012.718794e+08210.860001
1
alibaba.info()

<class ‘pandas.core.frame.DataFrame’>
DatetimeIndex: 1211 entries, 2014-09-19 to 2019-07-12
Data columns (total 6 columns):
High 1211 non-null float64
Low 1211 non-null float64
Open 1211 non-null float64
Close 1211 non-null float64
Volume 1211 non-null int64
Adj Close 1211 non-null float64
dtypes: float64(5), int64(1)
memory usage: 66.2 KB

1
2
3
4
5
6
7
8
start = datetime(2015,9,20)
alibaba = pdr.get_data_yahoo('BABA', start=start)
amazon = pdr.get_data_yahoo('AMZN', start=start)

alibaba.to_csv('/Users/kwang/jupyter-projects/BABA.csv')
amazon.to_csv('/Users/kwang/jupyter-projects/AMZN.csv')

alibaba.head()
HighLowOpenCloseVolumeAdj Close
Date
2015-09-2166.40000262.95999965.37999763.9000022235510063.900002
2015-09-2263.27000061.58000262.93999961.9000021489790061.900002
2015-09-2362.29999959.68000061.95999960.0000002268460060.000000
2015-09-2460.34000058.20999959.41999859.9199982064570059.919998
2015-09-2560.84000058.91999860.63000159.2400021700910059.240002
1
alibaba['Adj Close'].plot(legend=True)

1
alibaba['Volume'].plot(legend=True)

1
2
alibaba['Adj Close'].plot()
amazon['Adj Close'].plot()

1
2
3
alibaba['high-low'] = alibaba['High'] - alibaba['Low']

alibaba.head()
HighLowOpenCloseVolumeAdj Closehigh-low
Date
2015-09-2166.40000262.95999965.37999763.9000022235510063.9000023.440002
2015-09-2263.27000061.58000262.93999961.9000021489790061.9000021.689999
2015-09-2362.29999959.68000061.95999960.0000002268460060.0000002.619999
2015-09-2460.34000058.20999959.41999859.9199982064570059.9199982.130001
2015-09-2560.84000058.91999860.63000159.2400021700910059.2400021.920002
1
alibaba['high-low'].plot()

1
2
3
alibaba['daily-return'] = alibaba['Adj Close'].pct_change()

alibaba['daily-return'].plot(figsize=(20,4),linestyle='--',marker='o')

1
alibaba['daily-return'].plot(kind='hist')

1
sns.distplot(alibaba['daily-return'].dropna(),bins=100,color='purple')

1
2
3
4
5
6
7
start = datetime(2015,1,1)
company = ['AAPL', 'GOOG', 'MSFT', 'AMZN', 'FB']
top_tech_df = pdr.get_data_yahoo(company, start=start)['Adj Close']

top_tech_df.to_csv('/Users/kwang/jupyter-projects/top5.csv')

top_tech_df.head()
SymbolsAAPLAMZNFBGOOGMSFT
Date
2014-12-31102.110046310.35000678.019997524.95874041.982117
2015-01-02101.138702308.51998978.449997523.37310842.262299
2015-01-0598.289474302.19000277.190002512.46301341.873657
2015-01-0698.298729295.29000976.150002500.58563241.259060
2015-01-0799.677094298.42001376.150002499.72799741.783279
1
2
3
top_tech_dr = top_tech_df.pct_change()

top_tech_dr.head()
SymbolsAAPLAMZNFBGOOGMSFT
Date
2014-12-31NaNNaNNaNNaNNaN
2015-01-02-0.009513-0.0058970.005511-0.0030200.006674
2015-01-05-0.028171-0.020517-0.016061-0.020846-0.009196
2015-01-060.000094-0.022833-0.013473-0.023177-0.014677
2015-01-070.0140220.0106000.000000-0.0017130.012706
1
top_tech_df.plot()

1
top_tech_df[['AAPL','FB','MSFT']].plot()

散点图

1
sns.jointplot('AMZN','GOOG',top_tech_dr, kind='scatter')

1
sns.jointplot('MSFT','FB',top_tech_dr, kind='scatter')

1
sns.pairplot(top_tech_dr.dropna())

分位数

1
top_tech_dr['AAPL'].quantile(0.05)

-0.02510024760167844

95%的信心daily-return为-2%

1
top_tech_dr['MSFT'].quantile(0.05)

-0.021409700990842318

1
2
vips = pdr.get_data_yahoo('VIPS', start=start)['Adj Close']
vips.plot()

1
vips.pct_change().quantile(0.2)

-0.02329943700527643