Python 数据科学学习笔记 - 钢钢更新

发表于 2019-07-12 更新于 2019-09-03 分类于 Python 本文字数： 27k 阅读时长 ≈ 1:40

Numpy

数组Array

初始化数组

import numpy as np

list_1 = [1,2,3,4]
array_1 = np.array(list_1)

array([1, 2, 3, 4])

多维数组

1 2	list_2 = [5,6,7,8] array_2 = np.array([list_1, list_2])

array([[1, 2, 3, 4],
[5, 6, 7, 8]])

获取数组信息

1	array_2.shape

(2, 4)

获取数组类型

1	array_2.dtype

dtype(‘int64’)

获取数组尺寸

1	array_2.size

8

1	array_4 = np.array([[1.0,2,3],[4.0,5,6]])

array([[1., 2., 3.],
[4., 5., 6.]])

1	array_4.dtype

dtype(‘float64’)

1	array_5 = np.arange(1,10,2)

array([1, 3, 5, 7, 9])

初始化元素为0的数组

1	np.zeros(5)

array([0., 0., 0., 0., 0.])

1	np.zeros([2,3])

array([[0., 0., 0.],
[0., 0., 0.]])

初始化数组

1	a = np.arange(1,10)

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

数组切片

a[1]

2

a[1:5]

array([2, 3, 4, 5])

1	b = np.array([[1,2,3],[4,5,6]])

array([[1, 2, 3],
[4, 5, 6]])

b[1][0]

4

b[1,0]

4

1	c = np.array([[1,2,3],[4,5,6],[7,8,9]])

array([[1, 2, 3],
[4, 5, 6],
[7, 8, 9]])

c[:2,1:]

array([[2, 3],
[5, 6]])

c[1:]

array([[4, 5, 6],
[7, 8, 9]])

c[0:,1:]

array([[2, 3],
[5, 6],
[8, 9]])

c[0:,:1]

array([[1],
[4],
[7]])

c[0:,:2]

array([[1, 2],
[4, 5],
[7, 8]])

c[0:1]

array([[1, 2, 3]])

c[1:]

array([[4, 5, 6],
[7, 8, 9]])

c[1:,1:]

array([[5, 6],
[8, 9]])

c[0:,1:2]

array([[2],
[5],
[8]])

c[0:,1:]

array([[2, 3],
[5, 6],
[8, 9]])

c[0:,1:3]

array([[2, 3],
[5, 6],
[8, 9]])

1	c[0:2,1:3]

array([[2, 3],
[5, 6]])

用随机(小)数初始化数组

1
2
3

import numpy as np

np.random.randn(10)

array([-2.98186555, 0.51843113, -0.74563428, -1.84832264, -0.54354513,
1.23002927, 0.99527109, -0.49284623, 0.20869479, -0.46934356])

用随机整数初始化数组

1	np.random.randint(10)

2

1	np.random.randint(10, size=20).reshape(4,5)

array([[3, 2, 5, 1, 3],
[1, 6, 2, 8, 4],
[0, 4, 8, 7, 0],
[4, 4, 8, 5, 3]])

数组运算

1	a = np.random.randint(10, size=20).reshape(4,5)

1	b = np.random.randint(10, size=20).reshape(4,5)

array([[7, 5, 8, 9, 8],
[0, 9, 5, 4, 0],
[4, 7, 3, 7, 6],
[3, 7, 9, 1, 6]])

array([[0, 9, 9, 5, 4],
[7, 3, 1, 5, 7],
[3, 6, 5, 5, 9],
[3, 7, 3, 6, 9]])

数组相加

a + b

array([[ 7, 14, 17, 14, 12],
[ 7, 12, 6, 9, 7],
[ 7, 13, 8, 12, 15],
[ 6, 14, 12, 7, 15]])

数组相减

a - b

array([[ 7, -4, -1, 4, 4],
[-7, 6, 4, -1, -7],
[ 1, 1, -2, 2, -3],
[ 0, 0, 6, -5, -3]])

数组相乘

a * b

array([[ 0, 45, 72, 45, 32],
[ 0, 27, 5, 20, 0],
[12, 42, 15, 35, 54],
[ 9, 49, 27, 6, 54]])

数组常用函数

array([[7, 5, 8, 9, 8],
[0, 9, 5, 4, 0],
[4, 7, 3, 7, 6],
[3, 7, 9, 1, 6]])

获取数组中所有唯一数

1	np.unique(a)

array([0, 1, 3, 4, 5, 6, 7, 8, 9])

对数组每一列求和

sum(a)

array([14, 28, 25, 21, 20])

对数组指定行求和

sum(a[0])

37

对数组指定列求和

1	sum(a[:,0])

14

获取数组最大值

a.max()

9

获取数组指定行最大值

max(a[0])

9

获取数组指定列最大值

1	max(a[:,0])

7

使用pickle序列化array

import pickle
import numpy as np

x = np.arange(10)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

将数组以二进制的方式写入文件

1	f = open('x.pkl','wb')

写入文件

1	pickle.dump(x, f)

查看文件目录

!ls

Array的input和output.ipynb x.pkl
array.ipynb 数组与矩阵运算.ipynb

以二进制方式打开文件

1	f = open('x.pkl', 'rb')

读取文件内容

1	pickle.load(f)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

矩阵Matrix

1	np.mat([[1,2,3],[4,5,6]])

matrix([[1, 2, 3],
[4, 5, 6]])

np.mat(a)

matrix([[7, 5, 8, 9, 8],
[0, 9, 5, 4, 0],
[4, 7, 3, 7, 6],
[3, 7, 9, 1, 6]])

array([[7, 5, 8, 9, 8],
[0, 9, 5, 4, 0],
[4, 7, 3, 7, 6],
[3, 7, 9, 1, 6]])

矩阵的运算

1	A = np.mat(a)

1	B = np.mat(b)

matrix([[7, 5, 8, 9, 8],
[0, 9, 5, 4, 0],
[4, 7, 3, 7, 6],
[3, 7, 9, 1, 6]])

matrix([[0, 9, 9, 5, 4],
[7, 3, 1, 5, 7],
[3, 6, 5, 5, 9],
[3, 7, 3, 6, 9]])

矩阵相加

A + B

matrix([[ 7, 14, 17, 14, 12],
[ 7, 12, 6, 9, 7],
[ 7, 13, 8, 12, 15],
[ 6, 14, 12, 7, 15]])

矩阵相减

A - B

matrix([[ 7, -4, -1, 4, 4],
[-7, 6, 4, -1, -7],
[ 1, 1, -2, 2, -3],
[ 0, 0, 6, -5, -3]])

矩阵相乘

1 2	c = np.mat(np.random.randint(10, size=20).reshape(4,5)) d = np.mat(np.random.randint(10, size=20).reshape(5,4))

matrix([[8, 5, 2, 1, 8],
[9, 5, 8, 5, 4],
[5, 0, 6, 9, 8],
[3, 9, 7, 9, 6]])

matrix([[1, 8, 8, 5],
[3, 4, 5, 3],
[9, 3, 5, 2],
[8, 6, 9, 2],
[4, 9, 8, 1]])

c * d

matrix([[ 81, 168, 172, 69],
[152, 182, 214, 90],
[163, 184, 215, 63],
[189, 189, 233, 80]])

使用numpy进行输入输出（文件操作）

使用np.save写入文件

1	np.save('one_array', x)

!ls

Array的input和output.ipynb   x.pkl

array.ipynb                  数组与矩阵运算.ipynb
one_array.npy

使用np.load读取文件内容

1	np.load('one_array.npy')

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

1	y = np.arange(20)

array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19])

使用np.savez将多个数组保存到一个文件中

1	np.savez('two_array.npz', a=x, b=y)

!ls

Array的input和output.ipynb two_array.npz
array.ipynb x.pkl
one_array.npy 数组与矩阵运算.ipynb

利用np.load读取文件内容

1	c = np.load('two_array.npz')

利用索引名读取对应数组内容

c['a']

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

c['b']

array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19])

Pandas

Series

Series初始化

import numpy as np
import pandas as pd

s1 = pd.Series([1,2,3,4])

s1

0 1
1 2
2 3
3 4
dtype: int64

values

s1.values

array([1, 2, 3, 4])

index

s1.index

RangeIndex(start=0, stop=4, step=1)

1	s2 = pd.Series(np.arange(10))

s2

0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
dtype: int64

指定index

1	s3 = pd.Series({'1':1,'2':2,'3':3})

s3

1 1
2 2
3 3
dtype: int64

s3.index

Index([‘1’, ‘2’, ‘3’], dtype=’object’)

s3.values

array([1, 2, 3])

1	s4 = pd.Series([1,2,3,4], index=['A','B','C','D'])

s4

A 1
B 2
C 3
D 4
dtype: int64

s4.values

array([1, 2, 3, 4])

s4.index

Index([‘A’, ‘B’, ‘C’, ‘D’], dtype=’object’)

访问Series元素

s4['A']

1

s4[s4>2]

C 3
D 4
dtype: int64

s4

A 1
B 2
C 3
D 4
dtype: int64

1	s4.to_dict()

{‘A’: 1, ‘B’: 2, ‘C’: 3, ‘D’: 4}

1	s5 = pd.Series(s4.to_dict())

s5

A 1
B 2
C 3
D 4
dtype: int64

新增index

1 2	index_1=['A','B','C','D','E'] s6 = pd.Series(s5, index=index_1)

s6

A 1.0
B 2.0
C 3.0
D 4.0
E NaN
dtype: float64

对Series元素进行空值判定

1	pd.isnull(s6)

A False
B False
C False
D False
E True
dtype: bool

对Series元素进行非空判定

1	pd.notnull(s6)

A True
B True
C True
D True
E False
dtype: bool

指定Series名称

1	s6.name = 'demo'

s6

A 1.0
B 2.0
C 3.0
D 4.0
E NaN
Name: demo, dtype: float64

指定Series索引名称

1	s6.index.name = 'index demo'

s6

index demo
A 1.0
B 2.0
C 3.0
D 4.0
E NaN
Name: demo, dtype: float64

s6.index

Index([‘A’, ‘B’, ‘C’, ‘D’, ‘E’], dtype=’object’, name=’index demo’)

s6.values

array([ 1., 2., 3., 4., nan])

1	s1 = pd.Series(data['Country'], index=['A','B','C'])

s1

0 Belgium
1 India
2 Brazil
dtype: object

s1.values

array([‘Belgium’, ‘India’, ‘Brazil’], dtype=object)

s1.index

Index([‘A’, ‘B’, ‘C’], dtype=’object’)

Reindex on Series

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

s1 = pd.Series([1,2,3,4], index=['A','B','C','D'])

s1

A 1
B 2
C 3
D 4
dtype: int64

添加index

1	s1.reindex(['A','B','C','D','E'])

A 1.0
B 2.0
C 3.0
D 4.0
E NaN
dtype: float64

用固定数值填充新的index

1	s1.reindex(['A','B','C','D','E'], fill_value=10)

A 1
B 2
C 3
D 4
E 10
dtype: int64

1	s2 = pd.Series(['A','B','C'], index=[1,5,10])

s2

1 A
5 B
10 C
dtype: object

用 range() 扩充index

1	s2.reindex(index=range(15))

0 NaN
1 A
2 NaN
3 NaN
4 NaN
5 B
6 NaN
7 NaN
8 NaN
9 NaN
10 C
11 NaN
12 NaN
13 NaN
14 NaN
dtype: object

用ffill填充新增index

1	s2.reindex(index=range(15), method='ffill')

0 NaN
1 A
2 A
3 A
4 A
5 B
6 B
7 B
8 B
9 B
10 C
11 C
12 C
13 C
14 C
dtype: object

对Series切片

s1

A 1
B 2
C 3
D 4
dtype: int64

1	s1.reindex(index=['A','B'])

A 1
B> 2
dtype: int64

s1

A    1
B    2
C    3
D    4
dtype: int64

对Series进行drop index

1	s1.drop('A')

B 2
C 3
D 4
dtype: int64

Series的简单数学计算

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

s1 = Series([1,2,3], index=['A','B','C'])

s1

A 1
B 2
C 3
dtype: int64

1 2	s2 = Series([4,5,6,7], index=['B','C','D','E']) s2

B 4
C 5
D 6
E 7
dtype: int64

s1 + s2

A NaN
B 6.0
C 8.0
D NaN
E NaN
dtype: float64

Series的排序

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

s1 = Series(np.random.randn(10))

s1

0 0.035039
1 1.087393
2 -0.342811
3 -0.557990
4 0.548988
5 0.051965
6 2.983233
7 0.757732
8 -0.391369
9 0.850908
dtype: float64

s1.values

array([ 0.03503903, 1.08739272, -0.34281109, -0.5579896 , 0.54898768,
0.05196506, 2.98323291, 0.75773189, -0.39136856, 0.85090778])

s1.index

RangeIndex(start=0, stop=10, step=1)

按columns排序

1 2	s2 = s1.sort_values(ascending=False) s2

6    2.983233
1    1.087393
9    0.850908
7    0.757732
4    0.548988
5    0.051965
0    0.035039
2   -0.342811
8   -0.391369
3   -0.557990
dtype: float64

按index排序

1	s2.sort_index()

0 0.035039
1 1.087393
2 -0.342811
3 -0.557990
4 0.548988
5 0.051965
6 2.983233
7 0.757732
8 -0.391369
9 0.850908
dtype: float64

Nan in Series

1
2
3

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

1	n = np.nan

nan

type(n)

float

1 2	m = 1 m + n

nan

1	s1 = Series([1,2,np.nan,3,4],index=['A','B','C','D','E'])

s1

A 1.0
B 2.0
C NaN
D 3.0
E 4.0
dtype: float64

空值判断

1	s1.isnull()

A False
B False
C True
D False
E False
dtype: bool

删除nan的行

1	s1.dropna()

A 1.0
B 2.0
D 3.0
E 4.0
dtype: float64

DataFrame

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

import webbrowser
link = 'https://www.tiobe.com/tiobe-index/'
webbrowser.open(link)

读取剪贴板内容，并生成DataFrame

1
2
3

df = pd.read_clipboard()

df

	Jul 2019	Jul 2018	Change	Programming Language	Ratings	Change.1
0	1	1	NaN	Java	15.058%	-1.08%
1	2	2	NaN	C	14.211%	-0.45%
2	3	4	change	Python	9.260%	+2.90%
3	4	3	change	C++	6.705%	-0.91%
4	5	6	change	C#	4.365%	+0.57%
5	6	5	change	Visual Basic .NET	4.208%	-0.04%
6	7	8	change	JavaScript	2.304%	-0.53%
7	8	7	change	PHP	2.167%	-0.67%
8	9	9	NaN	SQL	1.977%	-0.36%
9	10	10	NaN	Objective-C	1.686%	+0.23%

Type

type(df)

pandas.core.frame.DataFrame

Columns

1	df.columns

Index(['Jul 2019', 'Jul 2018', 'Change', 'Programming Language', 'Ratings',

       'Change.1'],
      dtype='object')

访问DataFrame列

1	df.Ratings

0 15.058%
1 14.211%
2 9.260%
3 6.705%
4 4.365%
5 4.208%
6 2.304%
7 2.167%
8 1.977%
9 1.686%
Name: Ratings, dtype: object

复制DataFrame

1	df_new = DataFrame(df, columns=['Programming Language','Jul 2018'])

df_new

	Programming Language	Jul 2018
0	Java	1
1	C	2
2	Python	4
3	C++	3
4	C#	6
5	Visual Basic .NET	5
6	JavaScript	8
7	PHP	7
8	SQL	9
9	Objective-C	10

向DataFrame中增加新列

1	df_new = DataFrame(df, columns=['Programming Language','Jul 2018', 'Sep 2018'])

df_new

	Programming Language	Jul 2018	Sep 2018
0	Java	1	NaN
1	C	2	NaN
2	Python	4	NaN
3	C++	3	NaN
4	C#	6	NaN
5	Visual Basic .NET	5	NaN
6	JavaScript	8	NaN
7	PHP	7	NaN
8	SQL	9	NaN
9	Objective-C	10	NaN

向新增列填充数据

方法1：range()

1	df_new['Sep 2018'] = range(0,10)

df_new

	Programming Language	Jul 2018	Sep 2018
0	Java	1	0
1	C	2	1
2	Python	4	2
3	C++	3	3
4	C#	6	4
5	Visual Basic .NET	5	5
6	JavaScript	8	6
7	PHP	7	7
8	SQL	9	8
9	Objective-C	10	9

方法2：np.range()

1	df_new['Sep 2018'] = np.arange(0,10)

df_new

	Programming Language	Jul 2018	Sep 2018
0	Java	1	0
1	C	2	1
2	Python	4	2
3	C++	3	3
4	C#	6	4
5	Visual Basic .NET	5	5
6	JavaScript	8	6
7	PHP	7	7
8	SQL	9	8
9	Objective-C	10	9

方法3：np.arange()

1	df_new['Sep 2018'] = pd.Series(np.arange(0,10))

df_new

	Programming Language	Jul 2018	Sep 2018
0	Java	1	0
1	C	2	1
2	Python	4	2
3	C++	3	3
4	C#	6	4
5	Visual Basic .NET	5	5
6	JavaScript	8	6
7	PHP	7	7
8	SQL	9	8
9	Objective-C	10	9

向新增列中填充数值

1	df_new['Sep 2018'] = pd.Series([100,200],index=[1,2])

df_new

	Programming Language	Jul 2018	Sep 2018
0	Java	1	NaN
1	C	2	100.0
2	Python	4	200.0
3	C++	3	NaN
4	C#	6	NaN
5	Visual Basic .NET	5	NaN
6	JavaScript	8	NaN
7	PHP	7	NaN
8	SQL	9	NaN
9	Objective-C	10	NaN

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

data = {'Country': ['Belgium','India', 'Brazil'], 
       'Capital':['Brussels', 'New Delhi', 'Brasilia'],
       'Population':[11190846, 1303171035, 207847528]}

1	df1 = pd.DataFrame(data)

df1

	Country	Capital	Population
0	Belgium	Brussels	11190846
1	India	New Delhi	1303171035
2	Brazil	Brasilia	207847528

访问DataFrame某一列

1	df1['Country']

0 Belgium
1 India
2 Brazil
Name: Country, dtype: object

1	cou = df1['Country']

type(cou)

pandas.core.series.Series

遍历DataFrame

1	df1.iterrows()

<generator object DataFrame.iterrows at 0x118d895e8>

1
2
3

for row in df1.iterrows():
    print(row[0]), print(row[1])
    break

0
Country Belgium
Capital Brussels
Population 11190846
Name: 0, dtype: object

df1

	Country	Capital	Population
0	Belgium	Brussels	11190846
1	India	New Delhi	1303171035
2	Brazil	Brasilia	207847528

data

{‘Country’: [‘Belgium’, ‘India’, ‘Brazil’],
‘Capital’: [‘Brussels’, ‘New Delhi’, ‘Brasilia’],
‘Population’: [11190846, 1303171035, 207847528]}

Series => DataFrame

1	s1 = pd.Series(data['Capital'])

1	s2 = pd.Series(data['Country'])

1	s3 = pd.Series(data['Population'])

1	df_new = pd.DataFrame([s2,s1,s3],index=['Country','Capital','Population'])

df_new

	0	1	2
Country	Belgium	India	Brazil
Capital	Brussels	New Delhi	Brasilia
Population	11190846	1303171035	207847528

行转列

1	df_new2 = df_new.T

df_new2

	Country	Capital	Population
0	Belgium	Brussels	11190846
1	India	New Delhi	1303171035
2	Brazil	Brasilia	207847528

df1

	Country	Capital	Population
0	Belgium	Brussels	11190846
1	India	New Delhi	1303171035
2	Brazil	Brasilia	207847528

Nan in DataFrame

1	dframe = DataFrame([[1,2,3],[np.nan,5,6],[7,np.nan,9],[np.nan,np.nan,np.nan]])

dframe

	0	1	2
0	1.0	2.0	3.0
1	NaN	5.0	6.0
2	7.0	NaN	9.0
3	NaN	NaN	NaN

空值判断

1	dframe.isnull()

	0	1	2
0	False	False	False
1	True	False	False
2	False	True	False
3	True	True	True

非空判断

1	dframe.notnull()

	0	1	2
0	True	True	True
1	False	True	True
2	True	False	True
3	False	False	False

删除存在nan的行

1	df1 = dframe.dropna(axis=0)

df1

	0	1	2
0	1.0	2.0	3.0

删除存在nan的列

1	df2 = dframe.dropna(axis=1)

df2


0
1
2
3

仅删除所有值都为nan的行

1	df3 = dframe.dropna(axis=0, how='all')

df3

	0	1	2
0	1.0	2.0	3.0
1	NaN	5.0	6.0
2	7.0	NaN	9.0

1	dframe2 = DataFrame([[1,2,3,np.nan],[2,np.nan,5,6],[np.nan,7,np.nan,9],[1,np.nan,np.nan,np.nan]])

dframe2

	0	1	2	3
0	1.0	2.0	3.0	NaN
1	2.0	NaN	5.0	6.0
2	NaN	7.0	NaN	9.0
3	1.0	NaN	NaN	NaN

删除大于thresh阈值的行

1	df4 = dframe2.dropna(thresh=2)

df4

	0	1	2	3
0	1.0	2.0	3.0	NaN
1	2.0	NaN	5.0	6.0
2	NaN	7.0	NaN	9.0

利用fillna对空值进行填充

1	dframe2.fillna(value=1)

	0	1	2	3
0	1.0	2.0	3.0	1.0
1	2.0	1.0	5.0	6.0
2	1.0	7.0	1.0	9.0
3	1.0	1.0	1.0	1.0

利用fillna按列级别填充不同数值

1	dframe2.fillna(value={0:0,1:1,2:2,3:5})

	0	1	2	3
0	1.0	2.0	3.0	5.0
1	2.0	1.0	5.0	6.0
2	0.0	7.0	2.0	9.0
3	1.0	1.0	2.0	5.0

DataFrame I/O操作

打开网页

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

import webbrowser

link = 'http://pandas.pydata.org/pandas-docs/version/0.20/io.html'
webbrowser.open(link)

True

读取剪贴板内容

1	df1 = pd.read_clipboard()

df1

	Format Type	Data Description	Reader	Writer
0	text	CSV	read_csv	to_csv
1	text	JSON	read_json	to_json
2	text	HTML	read_html	to_html
3	text	Local clipboard	read_clipboard	to_clipboard
4	binary	MS Excel	read_excel	to_excel
5	binary	HDF5 Format	read_hdf	to_hdf
6	binary	Feather Format	read_feather	to_feather
7	binary	Msgpack	read_msgpack	to_msgpack
8	binary	Stata	read_stata	to_stata
9	binary	SAS	read_sas
10	binary	Python Pickle Format	read_pickle	to_pickle
11	SQL	SQL	read_sql	to_sql
12	SQL	Google Big Query	read_gbq	to_gbq

将内容粘贴到剪贴板

1	df1.to_clipboard()

写入CSV (不写入索引值)

1	df1.to_csv('df1.csv', index=False)

!ls

2-3 Array的创建及访问.ipynb
2-4 数组与矩阵运算.ipynb
2-5 Array的input和output.ipynb
3-1 Pandas Series.ipynb
3-2 Pandas DataFrame.ipynb
3-3 深入理解Series和Dataframe.ipynb
3-4 Pandas-Dataframe-IO操作.ipynb
df1.csv
one_array.npy
two_array.npz
x.pkl

预览CSV文件内容

1	!more df1.csv

Format Type,Data Description,Reader,Writer
text,CSV,read_csv,to_csv
text,JSON,read_json,to_json
text,HTML,read_html,to_html
text,Local clipboard,read_clipboard,to_clipboard
binary,MS Excel,read_excel,to_excel
binary,HDF5 Format,read_hdf,to_hdf
binary,Feather Format,read_feather,to_feather
binary,Msgpack,read_msgpack,to_msgpack
binary,Stata,read_stata,to_stata
binary,SAS,read_sas,
binary,Python Pickle Format,read_pickle,to_pickle
SQL,SQL,read_sql,to_sql
SQL,Google Big Query,read_gbq,to_gbq
[7mdf1.csv (END)[m[K

Selecting & Indexing

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

imdb = pd.read_csv('/Users/kwang/jupyter-projects/movie_metadata.csv')

1	imdb.shape

(5043, 28)

查看DataFrame前5行内容

1	imdb.head()

	color	director_name	num_critic_for_reviews	duration	director_facebook_likes	actor_3_facebook_likes	actor_2_name	actor_1_facebook_likes	gross	genres	...	num_user_for_reviews	language	country	content_rating	budget	title_year	actor_2_facebook_likes	imdb_score	aspect_ratio	movie_facebook_likes
0	Color	James Cameron	723.0	178.0	0.0	855.0	Joel David Moore	1000.0	760505847.0	Action\|Adventure\|Fantasy\|Sci-Fi	...	3054.0	English	USA	PG-13	237000000.0	2009.0	936.0	7.9	1.78	33000
1	Color	Gore Verbinski	302.0	169.0	563.0	1000.0	Orlando Bloom	40000.0	309404152.0	Action\|Adventure\|Fantasy	...	1238.0	English	USA	PG-13	300000000.0	2007.0	5000.0	7.1	2.35	0
2	Color	Sam Mendes	602.0	148.0	0.0	161.0	Rory Kinnear	11000.0	200074175.0	Action\|Adventure\|Thriller	...	994.0	English	UK	PG-13	245000000.0	2015.0	393.0	6.8	2.35	85000
3	Color	Christopher Nolan	813.0	164.0	22000.0	23000.0	Christian Bale	27000.0	448130642.0	Action\|Thriller	...	2701.0	English	USA	PG-13	250000000.0	2012.0	23000.0	8.5	2.35	164000
4	NaN	Doug Walker	NaN	NaN	131.0	NaN	Rob Walker	131.0	NaN	Documentary	...	NaN	NaN	NaN	NaN	NaN	NaN	12.0	7.1	NaN	0

5 rows × 28 columns

查看DataFrame后5行内容

1	imdb.tail()

	color	director_name	num_critic_for_reviews	duration	director_facebook_likes	actor_3_facebook_likes	actor_2_name	actor_1_facebook_likes	gross	genres	...	num_user_for_reviews	language	country	content_rating	budget	title_year	actor_2_facebook_likes	imdb_score	aspect_ratio	movie_facebook_likes
5038	Color	Scott Smith	1.0	87.0	2.0	318.0	Daphne Zuniga	637.0	NaN	Comedy\|Drama	...	6.0	English	Canada	NaN	NaN	2013.0	470.0	7.7	NaN	84
5039	Color	NaN	43.0	43.0	NaN	319.0	Valorie Curry	841.0	NaN	Crime\|Drama\|Mystery\|Thriller	...	359.0	English	USA	TV-14	NaN	NaN	593.0	7.5	16.00	32000
5040	Color	Benjamin Roberds	13.0	76.0	0.0	0.0	Maxwell Moody	0.0	NaN	Drama\|Horror\|Thriller	...	3.0	English	USA	NaN	1400.0	2013.0	0.0	6.3	NaN	16
5041	Color	Daniel Hsia	14.0	100.0	0.0	489.0	Daniel Henney	946.0	10443.0	Comedy\|Drama\|Romance	...	9.0	English	USA	PG-13	NaN	2012.0	719.0	6.3	2.35	660
5042	Color	Jon Gunn	43.0	90.0	16.0	16.0	Brian Herzlinger	86.0	85222.0	Documentary	...	84.0	English	USA	PG	1100.0	2004.0	23.0	6.6	1.85	456

5 rows × 28 columns

根据指定列生成新的DataFrame

1	sub_df = imdb[['director_name', 'movie_title', 'imdb_score']]

1	sub_df.head()

	director_name	movie_title	imdb_score
0	James Cameron	Avatar	7.9
1	Gore Verbinski	Pirates of the Caribbean: At World's End	7.1
2	Sam Mendes	Spectre	6.8
3	Christopher Nolan	The Dark Knight Rises	8.5
4	Doug Walker	Star Wars: Episode VII - The Force Awakens ...	7.1

根据index定位内容（iloc）

1	temp_df = sub_df.iloc[10:20,0:2]

temp_df

	director_name	movie_title
10	Zack Snyder	Batman v Superman: Dawn of Justice
11	Bryan Singer	Superman Returns
12	Marc Forster	Quantum of Solace
13	Gore Verbinski	Pirates of the Caribbean: Dead Man's Chest
14	Gore Verbinski	The Lone Ranger
15	Zack Snyder	Man of Steel
16	Andrew Adamson	The Chronicles of Narnia: Prince Caspian
17	Joss Whedon	The Avengers
18	Rob Marshall	Pirates of the Caribbean: On Stranger Tides
19	Barry Sonnenfeld	Men in Black 3

1	temp_df.iloc[2:4,:]

	director_name	movie_title
12	Marc Forster	Quantum of Solace
13	Gore Verbinski	Pirates of the Caribbean: Dead Man's Chest

根据label定位内容（loc）

1	temp_df.loc[15:17,:'director_name']

	director_name
15	Zack Snyder
16	Andrew Adamson
17	Joss Whedon

Reindex on DataFrame

1	df1 = DataFrame(np.random.rand(25).reshape([5,5]), index=['A','B','D','E','F'], columns=['c1','c2','c3','c4','c5'])

df1

	c1	c2	c3	c4	c5
A	0.659369	0.096724	0.880128	0.403168	0.282619
B	0.003070	0.661747	0.121026	0.262881	0.557213
D	0.972693	0.962149	0.411036	0.849216	0.704441
E	0.969716	0.425415	0.943507	0.589344	0.670303
F	0.315732	0.371235	0.943880	0.649541	0.495330

插入新的行(index)

1	df1.reindex(index=['A','B','C','D','E','F'])

	c1	c2	c3	c4	c5
A	0.659369	0.096724	0.880128	0.403168	0.282619
B	0.003070	0.661747	0.121026	0.262881	0.557213
C	NaN	NaN	NaN	NaN	NaN
D	0.972693	0.962149	0.411036	0.849216	0.704441
E	0.969716	0.425415	0.943507	0.589344	0.670303
F	0.315732	0.371235	0.943880	0.649541	0.495330

插入新的列 (columns)

1	df1.reindex(columns=['c1','c2','c3','c4','c5','c6'])

	c1	c2	c3	c4	c5	c6
A	0.659369	0.096724	0.880128	0.403168	0.282619	NaN
B	0.003070	0.661747	0.121026	0.262881	0.557213	NaN
D	0.972693	0.962149	0.411036	0.849216	0.704441	NaN
E	0.969716	0.425415	0.943507	0.589344	0.670303	NaN
F	0.315732	0.371235	0.943880	0.649541	0.495330	NaN

同时新增index和columns

1	df1.reindex(index=['A','B','C','D','E','F'], columns=['c1','c2','c3','c4','c5','c6'])

	c1	c2	c3	c4	c5	c6
A	0.659369	0.096724	0.880128	0.403168	0.282619	NaN
B	0.003070	0.661747	0.121026	0.262881	0.557213	NaN
C	NaN	NaN	NaN	NaN	NaN	NaN
D	0.972693	0.962149	0.411036	0.849216	0.704441	NaN
E	0.969716	0.425415	0.943507	0.589344	0.670303	NaN
F	0.315732	0.371235	0.943880	0.649541	0.495330	NaN

对DataFrame切片

1	df1.reindex(index=['A','B'])

	c1	c2	c3	c4	c5
A	0.659369	0.096724	0.880128	0.403168	0.282619
B	0.003070	0.661747	0.121026	0.262881	0.557213

对DataFrame进行drop index

1	df1.drop('A',axis=0)

	c1	c2	c3	c4	c5
B	0.003070	0.661747	0.121026	0.262881	0.557213
D	0.972693	0.962149	0.411036	0.849216	0.704441
E	0.969716	0.425415	0.943507	0.589344	0.670303
F	0.315732	0.371235	0.943880	0.649541	0.495330

对DataFrame进行drop columns

1	df1.drop('c2',axis=1)

	c1	c3	c4	c5
A	0.659369	0.880128	0.403168	0.282619
B	0.003070	0.121026	0.262881	0.557213
D	0.972693	0.411036	0.849216	0.704441
E	0.969716	0.943507	0.589344	0.670303
F	0.315732	0.943880	0.649541	0.495330

多级index

import numpy as np
import pandas as pd
from pandas import Series, DataFrame
 
s1 = Series(np.random.randn(6),index=[[1,1,1,2,2,2],['a','b','c','a','b','c']])

s1

1 a -0.828759
b 1.100893
c 0.705300
2 a 0.801019
b 0.676878
c -0.585011
dtype: float64

1	s1[1]['a']

-0.8287592467933553

s1[:,'a']

1 -0.828759
2 0.801019
dtype: float64

多级index Series => DataFrame

1
2
3

df1 = s1.unstack()

df1

	a	b	c
1	-0.828759	1.100893	0.705300
2	0.801019	0.676878	-0.585011

DataFrame => 多级index Series

1	df1.unstack()

a 1 -0.828759
2 0.801019
b 1 1.100893
2 0.676878
c 1 0.705300
2 -0.585011
dtype: float64

df1.T

	1	2
a	-0.828759	0.801019
b	1.100893	0.676878
c	0.705300	-0.585011

1	df1.T.unstack()

1 a -0.828759
b 1.100893
c 0.705300
2 a 0.801019
b 0.676878
c -0.585011
dtype: float64

初始化DataFrame (多级index & columns)

1
2
3

df = DataFrame(np.arange(16).reshape(4,4), index=[['a','a','b','b'],[1,2,1,2]], columns=[['BJ','BJ','SH','GZ'],[8,9,8,8]])

df

		BJ		SH	GZ
		8	9	8	8
a	1	0	1	2	3
a	2	4	5	6	7
b	1	8	9	10	11
b	2	12	13	14	15

访问多级DataFrame元素

df['BJ']

		8	9
a	1	0	1
a	2	4	5
b	1	8	9
b	2	12	13

1	df['BJ'][8]

a 1 0
2 4
b 1 8
2 12
Name: 8, dtype: int64

1	df.columns

MultiIndex(levels=[[‘BJ’, ‘GZ’, ‘SH’], [8, 9]],
codes=[[0, 0, 2, 1], [0, 1, 0, 0]])

创建多级索引

data = Series(np.random.randn(10),index=[['x','x','x','x','x','x','y','y','y','y'],
                                         ['a','a','a','b','b','b','c','c','d','d'],
                                         [1,2,3,1,2,3,1,2,2,3]])
data

x a 1 0.673769
2 -1.781723
3 -2.232971
b 1 -0.832687
2 -0.141850
3 0.704339
y c 1 1.186975
2 -1.106244
d 2 -0.166537
3 0.695451
dtype: float64

检索多级索引

选取外层索引

data['x']

a 1 0.673769
2 -1.781723
3 -2.232971
b 1 -0.832687
2 -0.141850
3 0.704339
dtype: float64

1	data['x':'y']

x a 1 0.673769
2 -1.781723
3 -2.232971
b 1 -0.832687
2 -0.141850
3 0.704339
y c 1 1.186975
2 -1.106244
d 2 -0.166537
3 0.695451
dtype: float64

选取内层索引

1	data['x','a',1]

0.6737686360875892

解除多级索引

data

x a 1 0.673769
2 -1.781723
3 -2.232971
b 1 -0.832687
2 -0.141850
3 0.704339
y c 1 1.186975
2 -1.106244
d 2 -0.166537
3 0.695451
dtype: float64

默认解除最内层索引，即level=-1

1	data.unstack()

		1	2	3
x	a	0.673769	-1.781723	-2.232971
x	b	-0.832687	-0.141850	0.704339
y	c	1.186975	-1.106244	NaN
y	d	NaN	-0.166537	0.695451

解除其他层索引

1	data.unstack(level=-2)

		a	b	c	d
x	1	0.673769	-0.832687	NaN	NaN
	2	-1.781723	-0.141850	NaN	NaN
	3	-2.232971	0.704339	NaN	NaN
y	1	NaN	NaN	1.186975	NaN
	2	NaN	NaN	-1.106244	-0.166537
	3	NaN	NaN	NaN	0.695451

转化为多级索引

data

x a 1 0.673769
2 -1.781723
3 -2.232971
b 1 -0.832687
2 -0.141850
3 0.704339
y c 1 1.186975
2 -1.106244
d 2 -0.166537
3 0.695451
dtype: float64

将中间层索引转换为内层索引

1	data.unstack(level=-2).stack()

x 1 a 0.673769
b -0.832687
2 a -1.781723
b -0.141850
3 a -2.232971
b 0.704339
y 1 c 1.186975
2 c -1.106244
d -0.166537
3 d 0.695451
dtype: float64

为不同层索引指定名称(name)

data

x a 1 0.673769
2 -1.781723
3 -2.232971
b 1 -0.832687
2 -0.141850
3 0.704339
y c 1 1.186975
2 -1.106244
d 2 -0.166537
3 0.695451
dtype: float64

1
2
3

df = data.unstack()

df

		1	2	3
x	a	0.673769	-1.781723	-2.232971
x	b	-0.832687	-0.141850	0.704339
y	c	1.186975	-1.106244	NaN
y	d	NaN	-0.166537	0.695451

df.index.names = ['outer','inner'] # 指定两层索引的名称
df.columns.name = 'hello'

df

	hello	1	2	3
outer	inner
x	a	0.673769	-1.781723	-2.232971
x	b	-0.832687	-0.141850	0.704339
y	c	1.186975	-1.106244	NaN
y	d	NaN	-0.166537	0.695451

Mapping

通过Series为DataFrame新增列

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

df1 = DataFrame({"城市":["北京","上海","广州"],"人口":[1000,2000,1500]})

df1

	城市	人口
0	北京	1000
1	上海	2000
2	广州	1500

1
2
3

df1["GDP"] = Series([1200,1800,1600])

df1

	城市	人口	GDP
0	北京	1000	1200
1	上海	2000	1800
2	广州	1500	1600

通过Map为DataFrame新增列

1
2
3

df2 = DataFrame({"城市":["北京","上海","广州"],"人口":[1000,2000,1500]})

df2

	城市	人口
0	北京	1000
1	上海	2000
2	广州	1500

gdp_map = {"北京": 1200, "广州": 1400, "上海": 1800}

df2['GDP'] = df2['城市'].map(gdp_map)

df2

	城市	人口	GDP
0	北京	1000	1200
1	上海	2000	1800
2	广州	1500	1400

Replace

1
2
3

s1 = Series(np.arange(10))

s1

0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
dtype: int64

1	s1.replace(1, np.nan)

0 0.0
1 NaN
2 2.0
3 3.0
4 4.0
5 5.0
6 6.0
7 7.0
8 8.0
9 9.0
dtype: float64

1	s1.replace([1,2,3],[10,20,30])

0 0
1 10
2 20
3 30
4 4
5 5
6 6
7 7
8 8
9 9
dtype: int64

DataFrame的简单数学计算

1 2	df1 = DataFrame(np.arange(4).reshape(2,2), index=['A','B'], columns=['BJ','SH']) df1

	BJ	SH
A	0	1
B	2	3

1
2
3

df2 = DataFrame(np.arange(9).reshape(3,3), index=['A','B','C'], columns=['BJ','SH','GZ'])

df2

	BJ	SH	GZ
A	0	1	2
B	3	4	5
C	6	7	8

df1 + df2

	BJ	GZ	SH
A	0.0	NaN	2.0
B	5.0	NaN	7.0
C	NaN	NaN	NaN

1 2	df3 = DataFrame([[1,2,3],[4,5,np.nan],[7,8,9]], index=['A','B','C'], columns=['c1','c2','c3']) df3

	c1	c2	c3
A	1	2	3.0
B	4	5	NaN
C	7	8	9.0

对列进行求和

df3.sum()

c1 12.0
c2 15.0
c3 12.0
dtype: float64

对行进行求和

1	df3.sum(axis=1)

A 6.0
B 9.0
C 24.0
dtype: float64

求最小值

df3.min()

c1 1.0
c2 2.0
c3 3.0
dtype: float64

求最大值

df3.max()

c1 7.0
c2 8.0
c3 9.0
dtype: float64

df3

	c1	c2	c3
A	1	2	3.0
B	4	5	NaN
C	7	8	9.0

1	df3.describe()

	c1	c2	c3
count	3.0	3.0	2.000000
mean	4.0	5.0	6.000000
std	3.0	3.0	4.242641
min	1.0	2.0	3.000000
25%	2.5	3.5	4.500000
50%	4.0	5.0	6.000000
75%	5.5	6.5	7.500000
max	7.0	8.0	9.000000

DataFrame排序

1 2	df1 = DataFrame(np.random.randn(40).reshape(8,5), columns=['A','B','C','D','E']) df1

	A	B	C	D	E
0	0.428384	-0.242118	-0.311413	-2.032690	-1.519783
1	-1.763709	-0.299535	-1.712632	-0.662375	-0.434309
2	-1.799746	0.937149	-0.387339	0.222759	-0.040818
3	0.239674	1.813621	-0.010476	-0.013300	1.411217
4	-0.980156	0.889708	1.322267	1.848655	-0.210548
5	1.655132	-0.561405	1.369573	-0.485437	1.772758
6	-0.032585	-2.038827	1.560083	0.638874	0.626102
7	-0.153819	-1.386077	0.779540	-1.441996	1.106010

按column排序

1 2	df2 = df1.sort_values('A', ascending=False) df2

	A	B	C	D	E
5	1.655132	-0.561405	1.369573	-0.485437	1.772758
0	0.428384	-0.242118	-0.311413	-2.032690	-1.519783
3	0.239674	1.813621	-0.010476	-0.013300	1.411217
6	-0.032585	-2.038827	1.560083	0.638874	0.626102
7	-0.153819	-1.386077	0.779540	-1.441996	1.106010
4	-0.980156	0.889708	1.322267	1.848655	-0.210548
1	-1.763709	-0.299535	-1.712632	-0.662375	-0.434309
2	-1.799746	0.937149	-0.387339	0.222759	-0.040818

按index排序

1	df2.sort_index()

	A	B	C	D	E
0	0.428384	-0.242118	-0.311413	-2.032690	-1.519783
1	-1.763709	-0.299535	-1.712632	-0.662375	-0.434309
2	-1.799746	0.937149	-0.387339	0.222759	-0.040818
3	0.239674	1.813621	-0.010476	-0.013300	1.411217
4	-0.980156	0.889708	1.322267	1.848655	-0.210548
5	1.655132	-0.561405	1.369573	-0.485437	1.772758
6	-0.032585	-2.038827	1.560083	0.638874	0.626102
7	-0.153819	-1.386077	0.779540	-1.441996	1.106010

Rename index

import numpy as np
import pandas as pd
from pandas import Series, DataFrame
 
df1 = DataFrame(np.arange(9).reshape(3,3), index=['BJ','SH','GZ'], columns=['A','B','C'])

df1

	A	B	C
BJ	0	1	2
SH	3	4	5
GZ	6	7	8

1
2
3

df1.index = Series(['bj','sh','gz'])

df1

	A	B	C
bj	0	1	2
sh	3	4	5
gz	6	7	8

1 2	df1.index = df1.index.map(str.upper) df1

	A	B	C
BJ	0	1	2
SH	3	4	5
GZ	6	7	8

1	df1.rename(index=str.lower, columns=str.lower)

	a	b	c
bj	0	1	2
sh	3	4	5
gz	6	7	8

df1

	A	B	C
BJ	0	1	2
SH	3	4	5
GZ	6	7	8

1	df1.rename(index={'BJ':'beijing'}, columns={'A':'a'})

	a	B	C
beijing	0	1	2
SH	3	4	5
GZ	6	7	8

数字列表 => 字符串列表

1 2	list1 = [1,2,3,4] list2 = ['1','2','3','4']

1	[str(x) for x in list1]

[‘1’, ‘2’, ‘3’, ‘4’]

1	list(map(str, list1))

[‘1’, ‘2’, ‘3’, ‘4’]

利用自定义map函数重命名index

1 2	def test_map(x): return x + '_ABC'

1	df1.index.map(test_map)

Index([‘BJ_ABC’, ‘SH_ABC’, ‘GZ_ABC’], dtype=’object’)

1	df1.rename(index=test_map)

	A	B	C
BJ_ABC	0	1	2
SH_ABC	3	4	5
GZ_ABC	6	7	8

DataFrame合并 (merge) 操作

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

df1 = pd.DataFrame({'key':['X','Y','Z','X'], 'data_set_1':[1,2,3,4]})
df1

	key	data_set_1
0	X	1
1	Y	2
2	Z	3
3	X	4

1 2	df2 = pd.DataFrame({'key':['X','B','C'],'data_set_2':[5,6,7]}) df2

	key	data_set_2
0	X	5
1	B	6
2	C	7

1	pd.merge(df1,df2,on=None)

	key	data_set_1	data_set_2
0	X	1	5
1	X	4	5

指定关联列（默认inner连接）

1	pd.merge(df1,df2,on='key',how='inner')

	key	data_set_1	data_set_2
0	X	1	5
1	X	4	5

左连接

1	pd.merge(df1,df2,on='key',how='left')

	key	data_set_1	data_set_2
0	X	1	5.0
1	Y	2	NaN
2	Z	3	NaN
3	X	4	5.0

右连接

1	pd.merge(df1,df2,on='key',how='right')

	key	data_set_1	data_set_2
0	X	1.0	5
1	X	4.0	5
2	B	NaN	6
3	C	NaN	7

Concatenate

Array Concatenate

1
2
3

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

1 2	arr1 = np.arange(9).reshape(3,3) arr1

array([[0, 1, 2],
[3, 4, 5],
[6, 7, 8]])

1 2	arr2 = np.arange(9).reshape(3,3) arr2

array([[0, 1, 2],
[3, 4, 5],
[6, 7, 8]])

1	np.concatenate([arr1,arr2])

array([[0, 1, 2],
[3, 4, 5],
[6, 7, 8],
[0, 1, 2],
[3, 4, 5],
[6, 7, 8]])

1	np.concatenate([arr1,arr2],axis=1)

array([[0, 1, 2, 0, 1, 2],
[3, 4, 5, 3, 4, 5],
[6, 7, 8, 6, 7, 8]])

Series Concatenate

1 2	s1 = Series([1,2,3], index=['X','Y','Z']) s1

X 1
Y 2
Z 3
dtype: int64

1 2	s2 = Series([4,5], index=['A','B']) s2

A 4
B 5
dtype: int64

1	pd.concat([s1,s2])

X 1
Y 2
Z 3
A 4
B 5
dtype: int64

1	pd.concat([s1,s2], axis=1)

	0	1
A	NaN	4.0
B	NaN	5.0
X	1.0	NaN
Y	2.0	NaN
Z	3.0	NaN

DataFrame Concatenate

1 2	df1 = DataFrame(np.random.randn(4,3), columns=['X','Y','Z']) df1

	X	Y	Z
0	-2.150094	-0.172915	0.391376
1	-2.943184	1.442629	0.934177
2	-0.015998	-1.204739	0.912159
3	-0.131997	-0.152430	1.080225

1 2	df2 = DataFrame(np.random.randn(3,3), columns=['X','Y','A']) df2

	X	Y	A
0	-0.640785	0.404844	1.140704
1	0.331578	0.490477	1.158403
2	1.542735	-0.693717	1.928202

1	pd.concat([df1,df2])

	A	X	Y	Z
0	NaN	-2.150094	-0.172915	0.391376
1	NaN	-2.943184	1.442629	0.934177
2	NaN	-0.015998	-1.204739	0.912159
3	NaN	-0.131997	-0.152430	1.080225
0	1.140704	-0.640785	0.404844	NaN
1	1.158403	0.331578	0.490477	NaN
2	1.928202	1.542735	-0.693717	NaN

Combine

Series Combine

1 2	s1 = Series([2,np.nan,4,np.nan], index=['A','B','C','D']) s1

A 2.0
B NaN
C 4.0
D NaN
dtype: float64

1 2	s2 = Series([1,2,3,4], index=['A','B','C','D']) s2

A 1
B 2
C 3
D 4
dtype: int64

combine_first - 用后者填充前者中的空值 (如有)

1	s1.combine_first(s2)

A 2.0
B 2.0
C 4.0
D 4.0
dtype: float64

DataFrame Combine

df1 = pd.DataFrame({
    'X':[1,np.nan,3,np.nan],
    'Y':[5,np.nan,7,np.nan],
    'Z':[9,np.nan,11,np.nan]
})
df1

	X	Y	Z
0	1.0	5.0	9.0
1	NaN	NaN	NaN
2	3.0	7.0	11.0
3	NaN	NaN	NaN

df2 = pd.DataFrame({
    'Z':[np.nan,10,np.nan,12],
    'A':[1,2,3,4]
})
df2

	Z	A
0	NaN	1
1	10.0	2
2	NaN	3
3	12.0	4

1	df1.combine_first(df2)

	A	X	Y	Z
0	1.0	1.0	5.0	9.0
1	2.0	NaN	NaN	10.0
2	3.0	3.0	7.0	11.0
3	4.0	NaN	NaN	12.0

利用apply对数据进行处理

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

df1 = pd.read_csv("apply_demo.csv")

1	df1.head()

	time	data
0	1473411962	Symbol: APPL Seqno: 0 Price: 1623
1	1473411962	Symbol: APPL Seqno: 0 Price: 1623
2	1473411963	Symbol: APPL Seqno: 0 Price: 1623
3	1473411963	Symbol: APPL Seqno: 0 Price: 1623
4	1473411963	Symbol: APPL Seqno: 1 Price: 1649

df1.size

7978

1	s1 = Series(['a']*7978)

1	df1['A'] = s1

1	df1.head()

	time	data	A
0	1473411962	Symbol: APPL Seqno: 0 Price: 1623	a
1	1473411962	Symbol: APPL Seqno: 0 Price: 1623	a
2	1473411963	Symbol: APPL Seqno: 0 Price: 1623	a
3	1473411963	Symbol: APPL Seqno: 0 Price: 1623	a
4	1473411963	Symbol: APPL Seqno: 1 Price: 1649	a

利用apply批量处理某一列数据

1
2
3

df1['A'] = df1['A'].apply(str.upper)
 
df1.head()

	time	data	A
0	1473411962	Symbol: APPL Seqno: 0 Price: 1623	A
1	1473411962	Symbol: APPL Seqno: 0 Price: 1623	A
2	1473411963	Symbol: APPL Seqno: 0 Price: 1623	A
3	1473411963	Symbol: APPL Seqno: 0 Price: 1623	A
4	1473411963	Symbol: APPL Seqno: 1 Price: 1649	A

1 2	l1 = df1['data'][0].strip().split(' ') l1

[‘Symbol:’, ‘APPL’, ‘Seqno:’, ‘0’, ‘Price:’, ‘1623’]

1	l1[1], l1[3], l1[5]

(‘APPL’, ‘0’, ‘1623’)

编写自定义函数，对列中的字符串进行提取

1
2
3

def foo(line):
    items = line.strip().split(' ')
    return Series([items[1], items[3], items[5]])

df_temp = df1['data'].apply(foo)

df_temp = df_temp.rename(columns={0:"Symbol",1:"Seqno",2:"Price"})

df_temp.head()

	Symbol	Seqno	Price
0	APPL	0	1623
1	APPL	0	1623
2	APPL	0	1623
3	APPL	0	1623
4	APPL	1	1649

1	df_new = df1.combine_first(df_temp)

1	df_new.head()

	A	Price	Seqno	Symbol	data	time
0	A	1623.0	0.0	APPL	Symbol: APPL Seqno: 0 Price: 1623	1473411962
1	A	1623.0	0.0	APPL	Symbol: APPL Seqno: 0 Price: 1623	1473411962
2	A	1623.0	0.0	APPL	Symbol: APPL Seqno: 0 Price: 1623	1473411963
3	A	1623.0	0.0	APPL	Symbol: APPL Seqno: 0 Price: 1623	1473411963
4	A	1649.0	1.0	APPL	Symbol: APPL Seqno: 1 Price: 1649	1473411963

del df_new['data']
del df_new['A']
 
df_new.head()

	Price	Seqno	Symbol	time
0	1623.0	0.0	APPL	1473411962
1	1623.0	0.0	APPL	1473411962
2	1623.0	0.0	APPL	1473411963
3	1623.0	0.0	APPL	1473411963
4	1649.0	1.0	APPL	1473411963

1	df_new.to_csv("demo_duplicate.csv")

利用drop_duplicates去重

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

df = pd.read_csv('demo_duplicate.csv')

df.head()

	Unnamed: 0	Price	Seqno	Symbol	time
0	0	1623.0	0.0	APPL	1473411962
1	1	1623.0	0.0	APPL	1473411962
2	2	1623.0	0.0	APPL	1473411963
3	3	1623.0	0.0	APPL	1473411963
4	4	1649.0	1.0	APPL	1473411963

1
2
3

del df['Unnamed: 0']

df.head(10)

	Price	Seqno	Symbol	time
0	1623.0	0.0	APPL	1473411962
1	1623.0	0.0	APPL	1473411962
2	1623.0	0.0	APPL	1473411963
3	1623.0	0.0	APPL	1473411963
4	1649.0	1.0	APPL	1473411963
5	1649.0	1.0	APPL	1473411963
6	1649.0	1.0	APPL	1473411964
7	1649.0	1.0	APPL	1473411964
8	1642.0	2.0	APPL	1473411964
9	1642.0	2.0	APPL	1473411964

df.size

15956

len(df)

3989

1	len(df['Seqno'].unique())

1000

1	df['Seqno'].duplicated()

0 False
1 True
2 True
3 True
4 False
5 True
6 True
7 True
8 False
9 True
10 True
11 True
12 False
13 True
14 True
15 True
16 False
17 True
18 True
19 True
20 False
21 True
22 True
23 True
24 False
25 True
26 True
27 True
28 False
29 True
…
3959 True
3960 True
3961 False
3962 True
3963 True
3964 True
3965 False
3966 True
3967 True
3968 True
3969 False
3970 True
3971 True
3972 True
3973 False
3974 True
3975 True
3976 True
3977 False
3978 True
3979 True
3980 True
3981 False
3982 True
3983 True
3984 True
3985 False
3986 True
3987 True
3988 True
Name: Seqno, Length: 3989, dtype: bool

1	df.drop_duplicates(['Seqno'], keep='first')

	Price	Seqno	Symbol	time
0	1623.0	0.0	APPL	1473411962
4	1649.0	1.0	APPL	1473411963
8	1642.0	2.0	APPL	1473411964
12	1636.0	3.0	APPL	1473411965
16	1669.0	4.0	APPL	1473411966
20	1639.0	5.0	APPL	1473411967
24	1611.0	6.0	APPL	1473411968
28	1660.0	7.0	APPL	1473411969
32	1657.0	8.0	APPL	1473411970
36	1509.0	9.0	APPL	1473411971
40	1514.0	10.0	APPL	1473411972
44	1676.0	11.0	APPL	1473411973
48	1596.0	12.0	APPL	1473411974
52	1527.0	13.0	APPL	1473411975
56	1643.0	14.0	APPL	1473411976
60	1632.0	15.0	APPL	1473411977
64	1595.0	16.0	APPL	1473411978
68	1565.0	17.0	APPL	1473411979
72	1521.0	18.0	APPL	1473411980
76	1501.0	19.0	APPL	1473411981
80	1579.0	20.0	APPL	1473411982
84	1511.0	21.0	APPL	1473411983
88	1513.0	22.0	APPL	1473411984
92	1687.0	23.0	APPL	1473411985
96	1539.0	24.0	APPL	1473411986
100	1682.0	25.0	APPL	1473411987
104	1534.0	26.0	APPL	1473411988
108	1628.0	27.0	APPL	1473411989
112	1582.0	28.0	APPL	1473411990
116	1540.0	29.0	APPL	1473411991
...	...	...	...	...
3869	1649.0	970.0	APPL	1473412932
3873	1696.0	971.0	APPL	1473412933
3877	1596.0	972.0	APPL	1473412934
3881	1664.0	973.0	APPL	1473412935
3885	1539.0	974.0	APPL	1473412936
3889	1654.0	975.0	APPL	1473412937
3893	1573.0	976.0	APPL	1473412938
3897	1695.0	977.0	APPL	1473412939
3901	1584.0	978.0	APPL	1473412940
3905	1607.0	979.0	APPL	1473412941
3909	1676.0	980.0	APPL	1473412942
3913	1665.0	981.0	APPL	1473412943
3917	1540.0	982.0	APPL	1473412944
3921	1546.0	983.0	APPL	1473412945
3925	1646.0	984.0	APPL	1473412946
3929	1648.0	985.0	APPL	1473412947
3933	1647.0	986.0	APPL	1473412948
3937	1602.0	987.0	APPL	1473412949
3941	1562.0	988.0	APPL	1473412950
3945	1629.0	989.0	APPL	1473412951
3949	1683.0	990.0	APPL	1473412952
3953	1649.0	991.0	APPL	1473412953
3957	1698.0	992.0	APPL	1473412954
3961	1566.0	993.0	APPL	1473412955
3965	1597.0	994.0	APPL	1473412956
3969	1641.0	995.0	APPL	1473412957
3973	1581.0	996.0	APPL	1473412958
3977	1674.0	997.0	APPL	1473412959
3981	1680.0	998.0	APPL	1473412960
3985	1509.0	999.0	APPL	1473412961

1000 rows × 4 columns

时间序列操作

1
2
3

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

1 2	from datetime import datetime t1 = datetime(2019,7,12)

t1

datetime.datetime(2019, 7, 12, 0, 0)

date_list = [
    datetime(2016,9,1),
    datetime(2016,9,10),
    datetime(2017,9,1),
    datetime(2017,9,20),
    datetime(2017,10,1)
]

date_list

[datetime.datetime(2016, 9, 1, 0, 0),
datetime.datetime(2016, 9, 10, 0, 0),
datetime.datetime(2017, 9, 1, 0, 0),
datetime.datetime(2017, 9, 20, 0, 0),
datetime.datetime(2017, 10, 1, 0, 0)]

1
2
3

s1 = Series(np.random.rand(5),index=date_list)

s1

2016-09-01 0.192942
2016-09-10 0.821524
2017-09-01 0.503137
2017-09-20 0.896424
2017-10-01 0.267054
dtype: float64

s1.values

array([0.19294241, 0.82152442, 0.5031373 , 0.8964235 , 0.26705384])

s1.index

DatetimeIndex([‘2016-09-01’, ‘2016-09-10’, ‘2017-09-01’, ‘2017-09-20’,
‘2017-10-01’],
dtype=’datetime64[ns]’, freq=None)

根据索引位置访问

s1[0]

0.19294241160815995

根据时间序列访问

1	s1[datetime(2016,9,1)]

0.19294241160815995

根据时间访问

1	s1['2016-09-01']

0.19294241160815995

1	s1['20160901']

0.19294241160815995

返回整月数据

1	s1['2016-09']

2016-09-01 0.192942
2016-09-10 0.821524
dtype: float64

返回整年数据

1	s1['2016']

2016-09-01 0.192942
2016-09-10 0.821524
dtype: float64

利用pd.date_range() 生成时间序列

1 2	date_list_new = pd.date_range('2016-01-01', periods=100) date_list_new

DatetimeIndex([‘2016-01-01’, ‘2016-01-02’, ‘2016-01-03’, ‘2016-01-04’,
‘2016-01-05’, ‘2016-01-06’, ‘2016-01-07’, ‘2016-01-08’,
‘2016-01-09’, ‘2016-01-10’, ‘2016-01-11’, ‘2016-01-12’,
‘2016-01-13’, ‘2016-01-14’, ‘2016-01-15’, ‘2016-01-16’,
‘2016-01-17’, ‘2016-01-18’, ‘2016-01-19’, ‘2016-01-20’,
‘2016-01-21’, ‘2016-01-22’, ‘2016-01-23’, ‘2016-01-24’,
‘2016-01-25’, ‘2016-01-26’, ‘2016-01-27’, ‘2016-01-28’,
‘2016-01-29’, ‘2016-01-30’, ‘2016-01-31’, ‘2016-02-01’,
‘2016-02-02’, ‘2016-02-03’, ‘2016-02-04’, ‘2016-02-05’,
‘2016-02-06’, ‘2016-02-07’, ‘2016-02-08’, ‘2016-02-09’,
‘2016-02-10’, ‘2016-02-11’, ‘2016-02-12’, ‘2016-02-13’,
‘2016-02-14’, ‘2016-02-15’, ‘2016-02-16’, ‘2016-02-17’,
‘2016-02-18’, ‘2016-02-19’, ‘2016-02-20’, ‘2016-02-21’,
‘2016-02-22’, ‘2016-02-23’, ‘2016-02-24’, ‘2016-02-25’,
‘2016-02-26’, ‘2016-02-27’, ‘2016-02-28’, ‘2016-02-29’,
‘2016-03-01’, ‘2016-03-02’, ‘2016-03-03’, ‘2016-03-04’,
‘2016-03-05’, ‘2016-03-06’, ‘2016-03-07’, ‘2016-03-08’,
‘2016-03-09’, ‘2016-03-10’, ‘2016-03-11’, ‘2016-03-12’,
‘2016-03-13’, ‘2016-03-14’, ‘2016-03-15’, ‘2016-03-16’,
‘2016-03-17’, ‘2016-03-18’, ‘2016-03-19’, ‘2016-03-20’,
‘2016-03-21’, ‘2016-03-22’, ‘2016-03-23’, ‘2016-03-24’,
‘2016-03-25’, ‘2016-03-26’, ‘2016-03-27’, ‘2016-03-28’,
‘2016-03-29’, ‘2016-03-30’, ‘2016-03-31’, ‘2016-04-01’,
‘2016-04-02’, ‘2016-04-03’, ‘2016-04-04’, ‘2016-04-05’,
‘2016-04-06’, ‘2016-04-07’, ‘2016-04-08’, ‘2016-04-09’],
dtype=’datetime64[ns]’, freq=’D’)

从指定时间的（周一）开始生成时间序列

1 2	date_list_new = pd.date_range('2016-01-01', periods=10, freq='W-MON') date_list_new

DatetimeIndex([‘2016-01-04’, ‘2016-01-11’, ‘2016-01-18’, ‘2016-01-25’,
‘2016-02-01’, ‘2016-02-08’, ‘2016-02-15’, ‘2016-02-22’,
‘2016-02-29’, ‘2016-03-07’],
dtype=’datetime64[ns]’, freq=’W-MON’)

1 2	date_list_new = pd.date_range('2016-01-01', periods=10, freq='5H') date_list_new

DatetimeIndex([‘2016-01-01 00:00:00’, ‘2016-01-01 05:00:00’,
‘2016-01-01 10:00:00’, ‘2016-01-01 15:00:00’,
‘2016-01-01 20:00:00’, ‘2016-01-02 01:00:00’,
‘2016-01-02 06:00:00’, ‘2016-01-02 11:00:00’,
‘2016-01-02 16:00:00’, ‘2016-01-02 21:00:00’],
dtype=’datetime64[ns]’, freq=’5H’)

1 2	s2 = Series(np.random.rand(10), index=date_list_new) s2

2016-01-01 00:00:00 0.312884
2016-01-01 05:00:00 0.337516
2016-01-01 10:00:00 0.911010
2016-01-01 15:00:00 0.055390
2016-01-01 20:00:00 0.810770
2016-01-02 01:00:00 0.313263
2016-01-02 06:00:00 0.025229
2016-01-02 11:00:00 0.229192
2016-01-02 16:00:00 0.509811
2016-01-02 21:00:00 0.169681
Freq: 5H, dtype: float64

时间序列数据的采样和画图

import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from datetime import datetime

t_range = pd.date_range('2016-01-01','2016-12-31')
t_range

DatetimeIndex([‘2016-01-01’, ‘2016-01-02’, ‘2016-01-03’, ‘2016-01-04’,
‘2016-01-05’, ‘2016-01-06’, ‘2016-01-07’, ‘2016-01-08’,
‘2016-01-09’, ‘2016-01-10’,
…
‘2016-12-22’, ‘2016-12-23’, ‘2016-12-24’, ‘2016-12-25’,
‘2016-12-26’, ‘2016-12-27’, ‘2016-12-28’, ‘2016-12-29’,
‘2016-12-30’, ‘2016-12-31’],
dtype=’datetime64[ns]’, length=366, freq=’D’)

1
2
3

s1 = Series(np.random.rand(len(t_range)), index=t_range)

s1['2016-01'].mean()

0.48349408856972315

按月份采样，计算平均值

1
2
3

s1_month = s1.resample('M').mean()

s1_month.index

DatetimeIndex([‘2016-01-31’, ‘2016-02-29’, ‘2016-03-31’, ‘2016-04-30’,
‘2016-05-31’, ‘2016-06-30’, ‘2016-07-31’, ‘2016-08-31’,
‘2016-09-30’, ‘2016-10-31’, ‘2016-11-30’, ‘2016-12-31’],
dtype=’datetime64[ns]’, freq=’M’)

按小时采样，并(向后)填充数据

1	s1.resample('H').ffill()

2016-01-01 00:00:00 0.196584
2016-01-01 01:00:00 0.196584
2016-01-01 02:00:00 0.196584
2016-01-01 03:00:00 0.196584
2016-01-01 04:00:00 0.196584
2016-01-01 05:00:00 0.196584
2016-01-01 06:00:00 0.196584
2016-01-01 07:00:00 0.196584
2016-01-01 08:00:00 0.196584
2016-01-01 09:00:00 0.196584
2016-01-01 10:00:00 0.196584
2016-01-01 11:00:00 0.196584
2016-01-01 12:00:00 0.196584
2016-01-01 13:00:00 0.196584
2016-01-01 14:00:00 0.196584
2016-01-01 15:00:00 0.196584
2016-01-01 16:00:00 0.196584
2016-01-01 17:00:00 0.196584
2016-01-01 18:00:00 0.196584
2016-01-01 19:00:00 0.196584
2016-01-01 20:00:00 0.196584
2016-01-01 21:00:00 0.196584
2016-01-01 22:00:00 0.196584
2016-01-01 23:00:00 0.196584
2016-01-02 00:00:00 0.209433
2016-01-02 01:00:00 0.209433
2016-01-02 02:00:00 0.209433
2016-01-02 03:00:00 0.209433
2016-01-02 04:00:00 0.209433
2016-01-02 05:00:00 0.209433
…
2016-12-29 19:00:00 0.287915
2016-12-29 20:00:00 0.287915
2016-12-29 21:00:00 0.287915
2016-12-29 22:00:00 0.287915
2016-12-29 23:00:00 0.287915
2016-12-30 00:00:00 0.023052
2016-12-30 01:00:00 0.023052
2016-12-30 02:00:00 0.023052
2016-12-30 03:00:00 0.023052
2016-12-30 04:00:00 0.023052
2016-12-30 05:00:00 0.023052
2016-12-30 06:00:00 0.023052
2016-12-30 07:00:00 0.023052
2016-12-30 08:00:00 0.023052
2016-12-30 09:00:00 0.023052
2016-12-30 10:00:00 0.023052
2016-12-30 11:00:00 0.023052
2016-12-30 12:00:00 0.023052
2016-12-30 13:00:00 0.023052
2016-12-30 14:00:00 0.023052
2016-12-30 15:00:00 0.023052
2016-12-30 16:00:00 0.023052
2016-12-30 17:00:00 0.023052
2016-12-30 18:00:00 0.023052
2016-12-30 19:00:00 0.023052
2016-12-30 20:00:00 0.023052
2016-12-30 21:00:00 0.023052
2016-12-30 22:00:00 0.023052
2016-12-30 23:00:00 0.023052
2016-12-31 00:00:00 0.515360
Freq: H, Length: 8761, dtype: float64

基于时间序列画图

1
2
3

t_range = pd.date_range('2016-01-01','2016-12-31',freq='H')

t_range

DatetimeIndex([‘2016-01-01 00:00:00’, ‘2016-01-01 01:00:00’,
‘2016-01-01 02:00:00’, ‘2016-01-01 03:00:00’,
‘2016-01-01 04:00:00’, ‘2016-01-01 05:00:00’,
‘2016-01-01 06:00:00’, ‘2016-01-01 07:00:00’,
‘2016-01-01 08:00:00’, ‘2016-01-01 09:00:00’,
…
‘2016-12-30 15:00:00’, ‘2016-12-30 16:00:00’,
‘2016-12-30 17:00:00’, ‘2016-12-30 18:00:00’,
‘2016-12-30 19:00:00’, ‘2016-12-30 20:00:00’,
‘2016-12-30 21:00:00’, ‘2016-12-30 22:00:00’,
‘2016-12-30 23:00:00’, ‘2016-12-31 00:00:00’],
dtype=’datetime64[ns]’, length=8761, freq=’H’)

stock_df = DataFrame(index=t_range)
stock_df['BABA'] = np.random.randint(80,160, size=len(t_range))
stock_df['TENCENT'] = np.random.randint(30,50, size=len(t_range))

stock_df.head()

	BABA	TENCENT
2016-01-01 00:00:00	134	43
2016-01-01 01:00:00	98	35
2016-01-01 02:00:00	150	32
2016-01-01 03:00:00	96	37
2016-01-01 04:00:00	140	40

1	stock_df.plot()

1 2	import matplotlib.pyplot as plt plt.show()

weekly_df = DataFrame()
weekly_df['BABA'] = stock_df['BABA'].resample('W').mean()
weekly_df['TENCENT'] = stock_df['TENCENT'].resample('W').mean()

weekly_df.head()

	BABA	TENCENT
2016-01-03	118.777778	38.638889
2016-01-10	120.690476	40.077381
2016-01-17	120.678571	39.452381
2016-01-24	119.833333	39.386905
2016-01-31	119.577381	39.416667

1	weekly_df.plot()

数据分箱Binning

对list进行分箱

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

score_list = np.random.randint(25,100,size=20)

score_list

array([95, 75, 43, 49, 26, 95, 28, 55, 93, 97, 91, 46, 39, 88, 32, 97, 93,
80, 57, 70])

bins = [0,59,70,80,100]

score_cat = pd.cut(score_list, bins)

pd.value_counts(score_cat)

(0, 59] 9
(80, 100] 8
(70, 80] 2
(59, 70] 1
dtype: int64

对DataFrame进行分箱

df = DataFrame()
df['score'] = score_list

df

	score
0	95
1	75
2	43
3	49
4	26
5	95
6	28
7	55
8	93
9	97
10	91
11	46
12	39
13	88
14	32
15	97
16	93
17	80
18	57
19	70

利用 pd.util.testing.rands() 随机生成指定位数的字符串

1
2
3

 df['student'] = [pd.util.testing.rands(3) for i in range(20)]

df

	score	student
0	95	PqQ
1	75	ds2
2	43	iUV
3	49	JjY
4	26	FK3
5	95	b9b
6	28	DyV
7	55	103
8	93	RUC
9	97	VqI
10	91	sjo
11	46	cCl
12	39	TeO
13	88	xap
14	32	GfG
15	97	FNg
16	93	pk0
17	80	ziG
18	57	5k0
19	70	ozP

通过 pd.cut() 的 labels 参数，对数据进行分类打标签

1
2
3

df['Categories'] = pd.cut(df['score'], bins, labels=['Low', 'OK', 'Good', 'Great'])

df

	score	student	Categories
0	95	PqQ	Great
1	75	ds2	Good
2	43	iUV	Low
3	49	JjY	Low
4	26	FK3	Low
5	95	b9b	Great
6	28	DyV	Low
7	55	103	Low
8	93	RUC	Great
9	97	VqI	Great
10	91	sjo	Great
11	46	cCl	Low
12	39	TeO	Low
13	88	xap	Great
14	32	GfG	Low
15	97	FNg	Great
16	93	pk0	Great
17	80	ziG	Good
18	57	5k0	Low
19	70	ozP	OK

数据分组技术GroupBy

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

 df = pd.read_csv('city_weather.csv')

df.head()

	date	city	temperature	wind
0	03/01/2016	BJ	8	5
1	17/01/2016	BJ	12	2
2	31/01/2016	BJ	19	2
3	14/02/2016	BJ	-3	3
4	28/02/2016	BJ	19	2

1
2
3

g = df.groupby(df['city'])

g

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x119b0d6d8>

获取分组内容

g.groups

df_bj = g.get_group('BJ')

df_bj

	date	city	temperature	wind
0	03/01/2016	BJ	8	5
1	17/01/2016	BJ	12	2
2	31/01/2016	BJ	19	2
3	14/02/2016	BJ	-3	3
4	28/02/2016	BJ	19	2
5	13/03/2016	BJ	5	3

1	df_bj.mean()

temperature 10.000000
wind 2.833333
dtype: float64

对分组取平均值

g.mean()

	temperature	wind
city
BJ	10.000	2.833333
GZ	8.750	4.000000
SH	4.625	3.625000
SZ	5.000	2.500000

对分组取最小值

g.min()

	date	temperature	wind
city
BJ	03/01/2016	-3	2
GZ	14/08/2016	-1	2
SH	03/07/2016	-10	2
SZ	11/09/2016	-10	1

groupby对象 => list => dict

1	dict(list(g))['BJ']

	date	city	temperature	wind
0	03/01/2016	BJ	8	5
1	17/01/2016	BJ	12	2
2	31/01/2016	BJ	19	2
3	14/02/2016	BJ	-3	3
4	28/02/2016	BJ	19	2
5	13/03/2016	BJ	5	3

for循环遍历groupby变量

1
2
3

for name, group_df in g:
    print(name)
    print(group_df)

BJ
date city temperature wind
0 03/01/2016 BJ 8 5
1 17/01/2016 BJ 12 2
2 31/01/2016 BJ 19 2
3 14/02/2016 BJ -3 3
4 28/02/2016 BJ 19 2
5 13/03/2016 BJ 5 3
GZ
date city temperature wind
14 17/07/2016 GZ 10 2
15 31/07/2016 GZ -1 5
16 14/08/2016 GZ 1 5
17 28/08/2016 GZ 25 4
SH
date city temperature wind
6 27/03/2016 SH -4 4
7 10/04/2016 SH 19 3
8 24/04/2016 SH 20 3
9 08/05/2016 SH 17 3
10 22/05/2016 SH 4 2
11 05/06/2016 SH -10 4
12 19/06/2016 SH 0 5
13 03/07/2016 SH -9 5
SZ
date city temperature wind
18 11/09/2016 SZ 20 1
19 25/09/2016 SZ -10 4

数据聚合技术Aggregation

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

df = pd.read_csv('city_weather.csv')
g = df.groupby('city')

g

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x11b921cc0>

g.max()

	date	temperature	wind
city
BJ	31/01/2016	19	5
GZ	31/07/2016	25	5
SH	27/03/2016	20	5
SZ	25/09/2016	20	4

g.count()

	date	temperature	wind
city
BJ	6	6	6
GZ	4	4	4
SH	8	8	8
SZ	2	2	2

1	g.describe()

	temperature								wind
	count	mean	std	min	25%	50%	75%	max	count	mean	std	min	25%	50%	75%	max
city
BJ	6.0	10.000	8.532292	-3.0	5.75	10.0	17.25	19.0	6.0	2.833333	1.169045	2.0	2.00	2.5	3.00	5.0
GZ	4.0	8.750	11.842719	-1.0	0.50	5.5	13.75	25.0	4.0	4.000000	1.414214	2.0	3.50	4.5	5.00	5.0
SH	8.0	4.625	12.489281	-10.0	-5.25	2.0	17.50	20.0	8.0	3.625000	1.060660	2.0	3.00	3.5	4.25	5.0
SZ	2.0	5.000	21.213203	-10.0	-2.50	5.0	12.50	20.0	2.0	2.500000	2.121320	1.0	1.75	2.5	3.25	4.0

1	g.agg('min')

	date	temperature	wind
city
BJ	03/01/2016	-3	2
GZ	14/08/2016	-1	2
SH	03/07/2016	-10	2
SZ	11/09/2016	-10	1

自定义聚合函数

1 2	def foo(attr): return attr.max() - attr.min()

1	g.agg(foo)

	temperature	wind
city
BJ	22	3
GZ	26	3
SH	30	3
SZ	30	3

对多个列进行groupby

1
2
3

g_new = df.groupby(['city','wind'])

g_new

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x11ba83208>

1	g_new.groups

{(‘BJ’, 2): Int64Index([1, 2, 4], dtype=’int64’),
(‘BJ’, 3): Int64Index([3, 5], dtype=’int64’),
(‘BJ’, 5): Int64Index([0], dtype=’int64’),
(‘GZ’, 2): Int64Index([14], dtype=’int64’),
(‘GZ’, 4): Int64Index([17], dtype=’int64’),
(‘GZ’, 5): Int64Index([15, 16], dtype=’int64’),
(‘SH’, 2): Int64Index([10], dtype=’int64’),
(‘SH’, 3): Int64Index([7, 8, 9], dtype=’int64’),
(‘SH’, 4): Int64Index([6, 11], dtype=’int64’),
(‘SH’, 5): Int64Index([12, 13], dtype=’int64’),
(‘SZ’, 1): Int64Index([18], dtype=’int64’),
(‘SZ’, 4): Int64Index([19], dtype=’int64’)}

1	g_new.get_group(('BJ',2))

	date	city	temperature	wind
1	17/01/2016	BJ	12	2
2	31/01/2016	BJ	19	2
4	28/02/2016	BJ	19	2

for循环遍历多列groupby变量

1
2
3

for (name_1, name_2), group in g_new:
    print(name_1,name_2)
    print(group)

BJ 2
date city temperature wind
1 17/01/2016 BJ 12 2
2 31/01/2016 BJ 19 2
4 28/02/2016 BJ 19 2
BJ 3
date city temperature wind
3 14/02/2016 BJ -3 3
5 13/03/2016 BJ 5 3
BJ 5
date city temperature wind
0 03/01/2016 BJ 8 5
GZ 2
date city temperature wind
14 17/07/2016 GZ 10 2
GZ 4
date city temperature wind
17 28/08/2016 GZ 25 4
GZ 5
date city temperature wind
15 31/07/2016 GZ -1 5
16 14/08/2016 GZ 1 5
SH 2
date city temperature wind
10 22/05/2016 SH 4 2
SH 3
date city temperature wind
7 10/04/2016 SH 19 3
8 24/04/2016 SH 20 3
9 08/05/2016 SH 17 3
SH 4
date city temperature wind
6 27/03/2016 SH -4 4
11 05/06/2016 SH -10 4
SH 5
date city temperature wind
12 19/06/2016 SH 0 5
13 03/07/2016 SH -9 5
SZ 1
date city temperature wind
18 11/09/2016 SZ 20 1
SZ 4
date city temperature wind
19 25/09/2016 SZ -10 4

透视表

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

df = pd.read_excel('sales-funnel.xlsx')

df

	Account	Name	Rep	Manager	Product	Quantity	Price	Status
0	714466	Trantow-Barrows	Craig Booker	Debra Henley	CPU	1	30000	presented
1	714466	Trantow-Barrows	Craig Booker	Debra Henley	Software	1	10000	presented
2	714466	Trantow-Barrows	Craig Booker	Debra Henley	Maintenance	2	5000	pending
3	737550	Fritsch, Russel and Anderson	Craig Booker	Debra Henley	CPU	1	35000	declined
4	146832	Kiehn-Spinka	Daniel Hilton	Debra Henley	CPU	2	65000	won
5	218895	Kulas Inc	Daniel Hilton	Debra Henley	CPU	2	40000	pending
6	218895	Kulas Inc	Daniel Hilton	Debra Henley	Software	1	10000	presented
7	412290	Jerde-Hilpert	John Smith	Debra Henley	Maintenance	2	5000	pending
8	740150	Barton LLC	John Smith	Debra Henley	CPU	1	35000	declined
9	141962	Herman LLC	Cedric Moss	Fred Anderson	CPU	2	65000	won
10	163416	Purdy-Kunde	Cedric Moss	Fred Anderson	CPU	1	30000	presented
11	239344	Stokes LLC	Cedric Moss	Fred Anderson	Maintenance	1	5000	pending
12	239344	Stokes LLC	Cedric Moss	Fred Anderson	Software	1	10000	presented
13	307599	Kassulke, Ondricka and Metz	Wendy Yule	Fred Anderson	Maintenance	3	7000	won
14	688981	Keeling LLC	Wendy Yule	Fred Anderson	CPU	5	100000	won
15	729833	Koepp Ltd	Wendy Yule	Fred Anderson	CPU	2	65000	declined
16	729833	Koepp Ltd	Wendy Yule	Fred Anderson	Monitor	2	5000	presented

df.size

136

len(df)

17

1	pd.pivot_table(df, index=['Name','Rep','Manager'])

			Account	Price	Quantity
Name	Rep	Manager
Barton LLC	John Smith	Debra Henley	740150	35000	1.000000
Fritsch, Russel and Anderson	Craig Booker	Debra Henley	737550	35000	1.000000
Herman LLC	Cedric Moss	Fred Anderson	141962	65000	2.000000
Jerde-Hilpert	John Smith	Debra Henley	412290	5000	2.000000
Kassulke, Ondricka and Metz	Wendy Yule	Fred Anderson	307599	7000	3.000000
Keeling LLC	Wendy Yule	Fred Anderson	688981	100000	5.000000
Kiehn-Spinka	Daniel Hilton	Debra Henley	146832	65000	2.000000
Koepp Ltd	Wendy Yule	Fred Anderson	729833	35000	2.000000
Kulas Inc	Daniel Hilton	Debra Henley	218895	25000	1.500000
Purdy-Kunde	Cedric Moss	Fred Anderson	163416	30000	1.000000
Stokes LLC	Cedric Moss	Fred Anderson	239344	7500	1.000000
Trantow-Barrows	Craig Booker	Debra Henley	714466	15000	1.333333

1	pd.pivot_table(df, index=['Manager','Rep'], values=['Price', 'Quantity'], aggfunc='sum')

		Price	Quantity
Manager	Rep
Debra Henley	Craig Booker	80000	5
	Daniel Hilton	115000	5
	John Smith	40000	3
Fred Anderson	Cedric Moss	110000	5
Fred Anderson	Wendy Yule	177000	12

1	pd.pivot_table(df, index=['Manager','Rep'], values=['Price'], columns=['Product'], fill_value=0, aggfunc='sum')

		Price
	Product	CPU	Maintenance	Monitor	Software
Manager	Rep
Debra Henley	Craig Booker	65000	5000	0	10000
	Daniel Hilton	105000	0	0	10000
	John Smith	35000	5000	0	0
Fred Anderson	Cedric Moss	95000	5000	0	10000
Fred Anderson	Wendy Yule	165000	7000	5000	0

分组和透视功能

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

df = pd.read_csv('usa_flights.csv')

df.head()

	flight_date	unique_carrier	flight_num	origin	dest	arr_delay	distance	carrier_delay	weather_delay	late_aircraft_delay	nas_delay	security_delay	actual_elapsed_time
0	02/01/2015 0:00	AA	1	JFK	LAX	-19.0	2475	NaN	NaN	NaN	NaN	NaN	381.0
1	03/01/2015 0:00	AA	1	JFK	LAX	-39.0	2475	NaN	NaN	NaN	NaN	NaN	358.0
2	04/01/2015 0:00	AA	1	JFK	LAX	-12.0	2475	NaN	NaN	NaN	NaN	NaN	385.0
3	05/01/2015 0:00	AA	1	JFK	LAX	-8.0	2475	NaN	NaN	NaN	NaN	NaN	389.0
4	06/01/2015 0:00	AA	1	JFK	LAX	25.0	2475	0.0	0.0	0.0	25.0	0.0	424.0

df.shape

(201664, 14)

df.tail()

	flight_date	unique_carrier	flight_num	origin	dest	arr_delay	distance	carrier_delay	weather_delay	late_aircraft_delay	nas_delay	security_delay	actual_elapsed_time
201659	10/01/2015 0:00	NK	188	OAK	LAS	-16.0	407	NaN	NaN	NaN	NaN	NaN	77.0
201660	11/01/2015 0:00	NK	188	OAK	LAS	-4.0	407	NaN	NaN	NaN	NaN	NaN	87.0
201661	12/01/2015 0:00	NK	188	OAK	LAS	-7.0	407	NaN	NaN	NaN	NaN	NaN	82.0
201662	13/01/2015 0:00	NK	188	OAK	LAS	23.0	407	3.0	0.0	0.0	20.0	0.0	103.0
201663	14/01/2015 0:00	NK	188	OAK	LAS	-7.0	407	NaN	NaN	NaN	NaN	NaN	82.0

获取延误时间Top 10的航班

1	df.sort_values('arr_delay', ascending=False)[:10]

	flight_date	unique_carrier	flight_num	origin	dest	arr_delay	distance	carrier_delay	late_aircraft_delay	nas_delay	actual_elapsed_time
11073	11/01/2015 0:00	AA	1595	AUS	DFW	1444.0	190	1444.0	0.0	0.0	59.0
10214	13/01/2015 0:00	AA	1487	OMA	DFW	1392.0	583	1392.0	0.0	0.0	117.0
12430	03/01/2015 0:00	AA	1677	MEM	DFW	1384.0	432	1380.0	0.0	4.0	104.0
8443	04/01/2015 0:00	AA	1279	OMA	DFW	1237.0	583	1222.0	15.0	0.0	102.0
10328	05/01/2015 0:00	AA	1495	EGE	DFW	1187.0	721	1019.0	168.0	0.0	127.0
36570	04/01/2015 0:00	DL	1435	MIA	MSP	1174.0	1501	1174.0	0.0	0.0	231.0
36495	04/01/2015 0:00	DL	1367	ROC	ATL	1138.0	749	1112.0	0.0	26.0	171.0
59072	14/01/2015 0:00	DL	1687	SAN	MSP	1084.0	1532	1070.0	0.0	14.0	240.0
32173	05/01/2015 0:00	AA	970	LAS	LAX	1042.0	236	1033.0	9.0	0.0	66.0
56488	12/01/2015 0:00	DL	2117	ATL	COS	1016.0	1184	1016.0	0.0	0.0	193.0

计算延误和没有延误所占比例

1	df['cancelled'].value_counts()

0 196873
1 4791
Name: cancelled, dtype: int64

1
2
3

df['delayed'] = df['arr_delay'].apply(lambda x:x>0)

df.head()

	flight_date	unique_carrier	flight_num	origin	dest	arr_delay	distance	carrier_delay	weather_delay	late_aircraft_delay	nas_delay	security_delay	actual_elapsed_time	delayed
0	02/01/2015 0:00	AA	1	JFK	LAX	-19.0	2475	NaN	NaN	NaN	NaN	NaN	381.0	False
1	03/01/2015 0:00	AA	1	JFK	LAX	-39.0	2475	NaN	NaN	NaN	NaN	NaN	358.0	False
2	04/01/2015 0:00	AA	1	JFK	LAX	-12.0	2475	NaN	NaN	NaN	NaN	NaN	385.0	False
3	05/01/2015 0:00	AA	1	JFK	LAX	-8.0	2475	NaN	NaN	NaN	NaN	NaN	389.0	False
4	06/01/2015 0:00	AA	1	JFK	LAX	25.0	2475	0.0	0.0	0.0	25.0	0.0	424.0	True

1
2
3

delay_data = df['delayed'].value_counts()

delay_data

False 103037
True 98627
Name: delayed, dtype: int64

1	delay_data[1]/(delay_data[0] + delay_data[1])

0.4890659711202793

每个航空公司延误情况

1
2
3

delay_group = df.groupby(['unique_carrier','delayed'])

delay_group

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x118b616d8>

1	delay_group.size()

unique_carrier delayed
AA False 8912
True 9841
AS False 3527
True 2104
B6 False 4832
True 4401
DL False 17719
True 9803
EV False 10596
True 11371
F9 False 1103
True 1848
HA False 1351
True 1354
MQ False 4692
True 8060
NK False 1550
True 2133
OO False 9977
True 10804
UA False 7885
True 8624
US False 7850
True 6353
VX False 1254
True 781
WN False 21789
True 21150
dtype: int64

1
2
3

df_delay = delay_group.size().unstack()

df_delay

delayed	False	True
unique_carrier
AA	8912	9841
AS	3527	2104
B6	4832	4401
DL	17719	9803
EV	10596	11371
F9	1103	1848
HA	1351	1354
MQ	4692	8060
NK	1550	2133
OO	9977	10804
UA	7885	8624
US	7850	6353
VX	1254	781
WN	21789	21150

1
2
3

import matplotlib.pyplot as plt

df_delay.plot(kind='barh',stacked=True,figsize=[16,6],colormap='winter')

透视表

1
2
3

flight_by_carrier = df.pivot_table(index=['flight_date'],columns=['unique_carrier'],values=['flight_num'],aggfunc='count')

flight_by_carrier.head()

	flight_num
unique_carrier	AA	AS	B6	DL	EV	F9	HA	MQ	NK	OO	UA	US	VX	WN
flight_date
02/01/2015 0:00	1545	477	759	2271	1824	254	224	1046	287	1763	1420	1177	176	3518
03/01/2015 0:00	1453	449	711	2031	1744	192	202	937	285	1681	1233	1028	160	3328
04/01/2015 0:00	1534	458	759	2258	1833	249	206	1027	284	1731	1283	1158	169	3403
05/01/2015 0:00	1532	433	754	2212	1811	264	209	1039	288	1737	1432	1157	174	3506
06/01/2015 0:00	1400	415	692	2054	1686	249	202	966	279	1527	1294	1003	152	3396

Matplotlib

简单绘图之plot

import numpy as np
import matplotlib.pyplot as plt

a = [1,2,3]
plt.plot(a)

plt.show() 开始绘图

1	plt.show()

1 2	b = [4,5,6] plt.plot(a,b)

魔法命令，可直接显示图像

1	%matplotlib inline

显示运行时间

1	%timeit np.arange(10)

594 ns ± 9.52 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)

1	plt.plot(a,b,'*')

1	plt.plot(a,b,'r--')

一张图中画多条线

c = [10,8,6]
d = [1,8,3]

plt.plot(a,b,'r--',c,d,'b*')

画三角函数，并设置label，label，title

1 2	t = np.arange(0.0,2.0,0.1) t.size

20

1 2	s = np.sin(t * np.pi) s.size

20

array([ 0.00000000e+00, 3.09016994e-01, 5.87785252e-01, 8.09016994e-01,
9.51056516e-01, 1.00000000e+00, 9.51056516e-01, 8.09016994e-01,
5.87785252e-01, 3.09016994e-01, 1.22464680e-16, -3.09016994e-01,
-5.87785252e-01, -8.09016994e-01, -9.51056516e-01, -1.00000000e+00,
-9.51056516e-01, -8.09016994e-01, -5.87785252e-01, -3.09016994e-01])

plt.plot(t,s,'r--',label='aaaa')
plt.plot(t*2,s,'--',label='bbbb')
plt.xlabel('this is x')
plt.ylabel('this is y')
plt.title('this is a demo')
plt.legend()

简单绘图之subplot

import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(0,5)
y1 = np.sin(np.pi*x)
y2 = np.sin(np.pi*x*2)

plt.plot(x, y1, 'b--', label='sin(pi*x)')
plt.ylabel('y1 value')
plt.plot(x, y2, 'r--', label='sin(pi*2x)')
plt.ylabel('y2 value')
plt.xlabel('x value')
plt.title('this is x-y value')
plt.legend()

plt.subplot(2,2,1)
plt.plot(x, y1, 'b--')
plt.ylabel('y1')
plt.subplot(2,2,2)
plt.plot(x, y2, 'r--')
plt.ylabel('y2')
plt.xlabel('x')
plt.subplot(2,2,3)
plt.plot(x, y1, 'r*')
plt.subplot(2,2,4)
plt.plot(x, y1, 'b*')

subplots

1	figure, ax = plt.subplots(2,2)

ax[0][0].plot(x,y1)
ax[0][1].plot(x,y2)

figure

Series

import numpy as np
import pandas as pd
from pandas import Series
import matplotlib.pyplot as plt

s1 = Series(np.random.randn(100).cumsum())
s2 = Series(np.random.randn(100).cumsum())

1	s1.plot(style='--')

1
2
3

s1.plot(kind='line', grid=True, label='S1', title='this is Series')
s2.plot(label='S2', style='--')
plt.legend()

1
2
3

figure, ax = plt.subplots(2,1)

figure

1
2
3

s1[0:10].plot(ax=ax[0],label='S1',kind='bar')
s2.plot(ax=ax[1],label='S2')
figure

DataFrame

import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt

df = DataFrame(
    np.random.randint(1,10,40).reshape(10,4),
    columns=['A','B','C','D']
)

df

	A	B	C	D
0	2	1	9	8
1	2	3	9	1
2	2	5	9	2
3	4	1	8	6
4	1	9	8	9
5	5	4	6	5
6	7	2	1	8
7	1	4	1	8
8	8	1	2	9
9	2	4	5	4

横向柱状图

1	df.plot(kind='barh')

Stacked柱状图

1	df.plot(kind='bar',stacked=True)

区域填充

1	df.plot(kind='area')

指定index画图

1
2
3

a = df.iloc[5]

a.plot()

按index画图（方法1）

for i in df.index:
    df.iloc[i].plot(label=str(i))

plt.legend()

按index画图（方法2）

1	df.T.plot()

按column画图

1	df['A'].plot()

直方图

import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt

s = Series(np.random.randn(1000))

plt.hist(s, rwidth=0.9)

array([ 1., 9., 42., 129., 243., 257., 198., 93., 24., 4.]),
array([-3.75513064, -3.04405274, -2.33297485, -1.62189695, -0.91081905,
-0.19974115, 0.51133675, 1.22241465, 1.93349255, 2.64457045,
3.35564835])

1	plt.hist(s, rwidth=0.9, bins=20, color='r')

array([ 1., 0., 1., 8., 15., 27., 46., 83., 103., 140., 143.,
114., 120., 78., 61., 32., 15., 9., 3., 1.]),
array([-3.75513064, -3.39959169, -3.04405274, -2.68851379, -2.33297485,
-1.9774359 , -1.62189695, -1.266358 , -0.91081905, -0.5552801 ,
-0.19974115, 0.1557978 , 0.51133675, 0.8668757 , 1.22241465,
1.5779536 , 1.93349255, 2.2890315 , 2.64457045, 3.0001094 ,
3.35564835])

密度图

1	s.plot(kind='kde')

Seaborn

散点图

方法1：matplotlib

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

iris = pd.read_csv('iris.csv')
iris.head()

	SepalLength	SepalWidth	PetalLength	PetalWidth	Name
0	5.1	3.5	1.4	0.2	Iris-setosa
1	4.9	3.0	1.4	0.2	Iris-setosa
2	4.7	3.2	1.3	0.2	Iris-setosa
3	4.6	3.1	1.5	0.2	Iris-setosa
4	5.0	3.6	1.4	0.2	Iris-setosa

1	iris.Name.unique()

array([‘Iris-setosa’, ‘Iris-versicolor’, ‘Iris-virginica’], dtype=object)

1
2
3

color_map = dict(zip(iris.Name.unique(),['blue','green','red']))

color_map

{‘Iris-setosa’: ‘blue’, ‘Iris-versicolor’: ‘green’, ‘Iris-virginica’: ‘red’}

for species, group in iris.groupby('Name'):
    plt.scatter(group['PetalLength'], group['SepalLength'],
               color=color_map[species],
               alpha=0.3, edgecolor=None,
               label=species)

plt.legend(frameon=True, title='Name')
plt.xlabel('petalLength')
plt.ylabel('sepalLength')

方法2：Seaborn

1	sns.lmplot('PetalLength','SepalLength', iris, hue='Name', fit_reg=False)

直方图（matplotlib）

1 2	s1 = Series(np.random.randn(1000)) plt.hist(s1)

(array([ 5., 36., 96., 196., 278., 227., 111., 37., 9., 5.]),
array([-3.17567133, -2.48638745, -1.79710358, -1.1078197 , -0.41853582,
0.27074806, 0.96003194, 1.64931581, 2.33859969, 3.02788357,
3.71716745]),
<a list of 10 Patch objects>)

密度图（matplotlib）

1	s1.plot(kind='kde')

直方图（Seaborn）

1	sns.distplot(s1, hist=True, kde=False, rug=True)

密度图（Seaborn）

1	sns.kdeplot(s1, shade=True, color='r')

轴须图（Seaborn）

1	sns.rugplot(s1)

热力图（Seaborn）

import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import seaborn as sns

df = sns.load_dataset('flights')

df.head()

	year	month	passengers
0	1949	January	112
1	1949	February	118
2	1949	March	132
3	1949	April	129
4	1949	May	121

df.shape

(144, 3)

生成透视表

1 2	df = df.pivot(index='month', columns='year', values='passengers') df

year	1949	1950	1951	1952	1953	1954	1955	1956	1957	1958	1959	1960
month
January	112	115	145	171	196	204	242	284	315	340	360	417
February	118	126	150	180	196	188	233	277	301	318	342	391
March	132	141	178	193	236	235	267	317	356	362	406	419
April	129	135	163	181	235	227	269	313	348	348	396	461
May	121	125	172	183	229	234	270	318	355	363	420	472
June	135	149	178	218	243	264	315	374	422	435	472	535
July	148	170	199	230	264	302	364	413	465	491	548	622
August	148	170	199	242	272	293	347	405	467	505	559	606
September	136	158	184	209	237	259	312	355	404	404	463	508
October	119	133	162	191	211	229	274	306	347	359	407	461
November	104	114	146	172	180	203	237	271	305	310	362	390
December	118	140	166	194	201	229	278	306	336	337	405	432

线形图（Matplotlib）

df.plot()

热力图（Seaborn）

1	sns.heatmap(df, annot=True, fmt='d')

柱状图（Matplotlib）

1	s.plot(kind='bar')

柱状图（Seaborn）

1
2
3

 s = df.sum()

s.index

Int64Index([1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959,
1960],
dtype=’int64’, name=’year’)

s.values

array([1520, 1676, 2042, 2364, 2700, 2867, 3408, 3939, 4421, 4572, 5140,
5714])

1	sns.barplot(x=s.index, y=s.values)

Seaborn设置图形效果

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

x = np.linspace(0,14,100)
y1 = np.sin(x)
y2 = np.sin(x+2)*1.25

def sinplot():
    plt.plot(x, y1)
    plt.plot(x, y2)

sinplot()

设置Seaborn显示风格

import seaborn as sns
style = ['darkgrid', 'dark', 'white', 'whitegrid', 'ticks']

sns.set_style(style[0])
sinplot()

1 2	sns.set_style(style[3]) sinplot()

1	sns.axes_style()

{‘axes.facecolor’: ‘white’,
‘axes.edgecolor’: ‘.8’,
‘axes.grid’: True,
‘axes.axisbelow’: True,
‘axes.labelcolor’: ‘.15’,
‘figure.facecolor’: ‘white’,
‘grid.color’: ‘.8’,
‘grid.linestyle’: ‘-‘,
‘text.color’: ‘.15’,
‘xtick.color’: ‘.15’,
‘ytick.color’: ‘.15’,
‘xtick.direction’: ‘out’,
‘ytick.direction’: ‘out’,
‘lines.solid_capstyle’: ‘round’,
‘patch.edgecolor’: ‘w’,
‘image.cmap’: ‘rocket’,
‘font.family’: [‘sans-serif’],
‘font.sans-serif’: [‘Arial’,
‘DejaVu Sans’,
‘Liberation Sans’,
‘Bitstream Vera Sans’,
‘sans-serif’],
‘patch.force_edgecolor’: True,
‘xtick.bottom’: False,
‘xtick.top’: False,
‘ytick.left’: False,
‘ytick.right’: False,
‘axes.spines.left’: True,
‘axes.spines.bottom’: True,
‘axes.spines.right’: True,
‘axes.spines.top’: True}

自定义配置

1 2	sns.set_style(style[0],{'grid.color': 'green'}) sinplot()

清空自定义设置

1 2	sns.set() sinplot()

plotting_context() & set_context()

context = ['paper', 'notebook', 'talk', 'poster']

sns.set_context(context[1], rc={'grid.linewidth': 3.0})
sinplot()

1 2	sns.set_context(context[3]) sinplot()

显示context详细设置

1	sns.plotting_context()

{‘font.size’: 24.0,
‘axes.labelsize’: 24.0,
‘axes.titlesize’: 24.0,
‘xtick.labelsize’: 22.0,
‘ytick.labelsize’: 22.0,
‘legend.fontsize’: 22.0,
‘axes.linewidth’: 2.5,
‘grid.linewidth’: 2.0,
‘lines.linewidth’: 3.0,
‘lines.markersize’: 12.0,
‘patch.linewidth’: 2.0,
‘xtick.major.width’: 2.5,
‘ytick.major.width’: 2.5,
‘xtick.minor.width’: 2.0,
‘ytick.minor.width’: 2.0,
‘xtick.major.size’: 12.0,
‘ytick.major.size’: 12.0,
‘xtick.minor.size’: 8.0,
‘ytick.minor.size’: 8.0}

股票分析

import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_datareader as pdr
from datetime import datetime

1
2
3

alibaba = pdr.get_data_yahoo('BABA')

alibaba.head()

	High	Low	Open	Close	Volume	Adj Close
Date
2014-09-19	99.699997	89.949997	92.699997	93.889999	271879400	93.889999
2014-09-22	92.949997	89.500000	92.699997	89.889999	66657800	89.889999
2014-09-23	90.480003	86.620003	88.940002	87.169998	39009800	87.169998
2014-09-24	90.570000	87.220001	88.470001	90.570000	32088000	90.570000
2014-09-25	91.500000	88.500000	91.089996	88.919998	28598000	88.919998

1	alibaba.shape

(1211, 6)

1	alibaba.tail()

	High	Low	Open	Close	Volume	Adj Close
Date
2019-07-08	170.279999	168.350006	170.089996	168.449997	16531000	168.449997
2019-07-09	168.979996	166.500000	167.085007	168.800003	15782800	168.800003
2019-07-10	171.369995	166.860001	171.240005	166.929993	16675600	166.929993
2019-07-11	168.423996	165.000000	167.559998	166.550003	14796400	166.550003
2019-07-12	170.199997	167.649994	168.000000	169.070007	18093300	169.070007

1	alibaba.describe()

	High	Low	Open	Close	Volume	Adj Close
count	1211.000000	1211.000000	1211.000000	1211.000000	1.211000e+03	1211.000000
mean	126.715532	123.620233	125.280824	125.175351	1.760189e+07	125.175351
std	44.698800	43.686183	44.263113	44.208817	1.260813e+07	44.208817
min	58.650002	57.200001	57.299999	57.389999	3.775300e+06	57.389999
25%	85.129997	83.375000	84.594997	84.494999	1.107115e+07	84.494999
50%	108.820000	106.510002	107.809998	107.830002	1.489790e+07	107.830002
75%	173.044998	169.508499	171.739998	171.590004	2.063065e+07	171.590004
max	211.699997	207.509995	209.949997	210.860001	2.718794e+08	210.860001

1	alibaba.info()

<class ‘pandas.core.frame.DataFrame’>
DatetimeIndex: 1211 entries, 2014-09-19 to 2019-07-12
Data columns (total 6 columns):
High 1211 non-null float64
Low 1211 non-null float64
Open 1211 non-null float64
Close 1211 non-null float64
Volume 1211 non-null int64
Adj Close 1211 non-null float64
dtypes: float64(5), int64(1)
memory usage: 66.2 KB

start = datetime(2015,9,20)
alibaba = pdr.get_data_yahoo('BABA', start=start)
amazon = pdr.get_data_yahoo('AMZN', start=start)

alibaba.to_csv('/Users/kwang/jupyter-projects/BABA.csv')
amazon.to_csv('/Users/kwang/jupyter-projects/AMZN.csv')

alibaba.head()

	High	Low	Open	Close	Volume	Adj Close
Date
2015-09-21	66.400002	62.959999	65.379997	63.900002	22355100	63.900002
2015-09-22	63.270000	61.580002	62.939999	61.900002	14897900	61.900002
2015-09-23	62.299999	59.680000	61.959999	60.000000	22684600	60.000000
2015-09-24	60.340000	58.209999	59.419998	59.919998	20645700	59.919998
2015-09-25	60.840000	58.919998	60.630001	59.240002	17009100	59.240002

1	alibaba['Adj Close'].plot(legend=True)

1	alibaba['Volume'].plot(legend=True)

1 2	alibaba['Adj Close'].plot() amazon['Adj Close'].plot()

1
2
3

alibaba['high-low'] = alibaba['High'] - alibaba['Low']

alibaba.head()

	High	Low	Open	Close	Volume	Adj Close	high-low
Date
2015-09-21	66.400002	62.959999	65.379997	63.900002	22355100	63.900002	3.440002
2015-09-22	63.270000	61.580002	62.939999	61.900002	14897900	61.900002	1.689999
2015-09-23	62.299999	59.680000	61.959999	60.000000	22684600	60.000000	2.619999
2015-09-24	60.340000	58.209999	59.419998	59.919998	20645700	59.919998	2.130001
2015-09-25	60.840000	58.919998	60.630001	59.240002	17009100	59.240002	1.920002

1	alibaba['high-low'].plot()

1
2
3

alibaba['daily-return'] = alibaba['Adj Close'].pct_change()

alibaba['daily-return'].plot(figsize=(20,4),linestyle='--',marker='o')

1	alibaba['daily-return'].plot(kind='hist')

1	sns.distplot(alibaba['daily-return'].dropna(),bins=100,color='purple')

start = datetime(2015,1,1)
company = ['AAPL', 'GOOG', 'MSFT', 'AMZN', 'FB']
top_tech_df = pdr.get_data_yahoo(company, start=start)['Adj Close']

top_tech_df.to_csv('/Users/kwang/jupyter-projects/top5.csv')

top_tech_df.head()

Symbols	AAPL	AMZN	FB	GOOG	MSFT
Date
2014-12-31	102.110046	310.350006	78.019997	524.958740	41.982117
2015-01-02	101.138702	308.519989	78.449997	523.373108	42.262299
2015-01-05	98.289474	302.190002	77.190002	512.463013	41.873657
2015-01-06	98.298729	295.290009	76.150002	500.585632	41.259060
2015-01-07	99.677094	298.420013	76.150002	499.727997	41.783279

1
2
3

top_tech_dr = top_tech_df.pct_change()

top_tech_dr.head()

Symbols	AAPL	AMZN	FB	GOOG	MSFT
Date
2014-12-31	NaN	NaN	NaN	NaN	NaN
2015-01-02	-0.009513	-0.005897	0.005511	-0.003020	0.006674
2015-01-05	-0.028171	-0.020517	-0.016061	-0.020846	-0.009196
2015-01-06	0.000094	-0.022833	-0.013473	-0.023177	-0.014677
2015-01-07	0.014022	0.010600	0.000000	-0.001713	0.012706

1	top_tech_df.plot()

1	top_tech_df[['AAPL','FB','MSFT']].plot()

散点图

1	sns.jointplot('AMZN','GOOG',top_tech_dr, kind='scatter')

1	sns.jointplot('MSFT','FB',top_tech_dr, kind='scatter')

1	sns.pairplot(top_tech_dr.dropna())

分位数

1	top_tech_dr['AAPL'].quantile(0.05)

-0.02510024760167844

95%的信心daily-return为-2%

1	top_tech_dr['MSFT'].quantile(0.05)

-0.021409700990842318

1 2	vips = pdr.get_data_yahoo('VIPS', start=start)['Adj Close'] vips.plot()

1	vips.pct_change().quantile(0.2)

-0.02329943700527643

	score
0	95
1	75
2	43
3	49
4	26
5	95
6	28
7	55
8	93
9	97
10	91
11	46
12	39
13	88
14	32
15	97
16	93
17	80
18	57
19	70

	score	student
0	95	PqQ
1	75	ds2
2	43	iUV
3	49	JjY
4	26	FK3
5	95	b9b
6	28	DyV
7	55	103
8	93	RUC
9	97	VqI
10	91	sjo
11	46	cCl
12	39	TeO
13	88	xap
14	32	GfG
15	97	FNg
16	93	pk0
17	80	ziG
18	57	5k0
19	70	ozP

	score
0	95
1	75
2	43
3	49
4	26
5	95
6	28
7	55
8	93
9	97
10	91
11	46
12	39
13	88
14	32
15	97
16	93
17	80
18	57
19	70

	score	student
0	95	PqQ
1	75	ds2
2	43	iUV
3	49	JjY
4	26	FK3
5	95	b9b
6	28	DyV
7	55	103
8	93	RUC
9	97	VqI
10	91	sjo
11	46	cCl
12	39	TeO
13	88	xap
14	32	GfG
15	97	FNg
16	93	pk0
17	80	ziG
18	57	5k0
19	70	ozP

	score
0	95
1	75
2	43
3	49
4	26
5	95
6	28
7	55
8	93
9	97
10	91
11	46
12	39
13	88
14	32
15	97
16	93
17	80
18	57
19	70

	score	student
0	95	PqQ
1	75	ds2
2	43	iUV
3	49	JjY
4	26	FK3
5	95	b9b
6	28	DyV
7	55	103
8	93	RUC
9	97	VqI
10	91	sjo
11	46	cCl
12	39	TeO
13	88	xap
14	32	GfG
15	97	FNg
16	93	pk0
17	80	ziG
18	57	5k0
19	70	ozP