Python 数据科学 学习笔记 - 钢钢更新

Numpy

数组Array

初始化数组

1
2
3
4
import numpy as np

list_1 = [1,2,3,4]
array_1 = np.array(list_1)

array([1, 2, 3, 4])

多维数组

1
2
list_2 = [5,6,7,8]
array_2 = np.array([list_1, list_2])

array([[1, 2, 3, 4],
[5, 6, 7, 8]])

获取数组信息

1
array_2.shape

(2, 4)

获取数组类型

1
array_2.dtype

dtype('int64')

获取数组尺寸

1
array_2.size

8

1
array_4 = np.array([[1.0,2,3],[4.0,5,6]])

array([[1., 2., 3.],
[4., 5., 6.]])

1
array_4.dtype

dtype('float64')

1
array_5 = np.arange(1,10,2)

array([1, 3, 5, 7, 9])

初始化元素为0的数组

1
np.zeros(5)

array([0., 0., 0., 0., 0.])

1
np.zeros([2,3])

array([[0., 0., 0.],
[0., 0., 0.]])

初始化数组

1
a = np.arange(1,10)

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

数组切片

1
a[1]

2

1
a[1:5]

array([2, 3, 4, 5])

1
b = np.array([[1,2,3],[4,5,6]])

array([[1, 2, 3],
[4, 5, 6]])

1
b[1][0]

4

1
b[1,0]

4

1
c = np.array([[1,2,3],[4,5,6],[7,8,9]])

array([[1, 2, 3],
[4, 5, 6],
[7, 8, 9]])

1
c[:2,1:]

array([[2, 3],
[5, 6]])

1
c[1:]

array([[4, 5, 6],
[7, 8, 9]])

1
c[0:,1:]

array([[2, 3],
[5, 6],
[8, 9]])

1
c[0:,:1]

array([[1],
[4],
[7]])

1
c[0:,:2]

array([[1, 2],
[4, 5],
[7, 8]])

1
c[0:1]

array([[1, 2, 3]])

1
c[1:]

array([[4, 5, 6],
[7, 8, 9]])

1
c[1:,1:]

array([[5, 6],
[8, 9]])

1
c[0:,1:2]

array([[2],
[5],
[8]])

1
c[0:,1:]

array([[2, 3],
[5, 6],
[8, 9]])

1
c[0:,1:3]

array([[2, 3],
[5, 6],
[8, 9]])

1
c[0:2,1:3]

array([[2, 3],
[5, 6]])

用随机(小)数初始化数组

1
2
3
import numpy as np

np.random.randn(10)

array([-2.98186555, 0.51843113, -0.74563428, -1.84832264, -0.54354513,
1.23002927, 0.99527109, -0.49284623, 0.20869479, -0.46934356])

用随机整数初始化数组

1
np.random.randint(10)

2

1
np.random.randint(10, size=20).reshape(4,5)

array([[3, 2, 5, 1, 3],
[1, 6, 2, 8, 4],
[0, 4, 8, 7, 0],
[4, 4, 8, 5, 3]])

数组运算

1
a = np.random.randint(10, size=20).reshape(4,5)
1
b = np.random.randint(10, size=20).reshape(4,5)
1
a

array([[7, 5, 8, 9, 8],
[0, 9, 5, 4, 0],
[4, 7, 3, 7, 6],
[3, 7, 9, 1, 6]])

1
b

array([[0, 9, 9, 5, 4],
[7, 3, 1, 5, 7],
[3, 6, 5, 5, 9],
[3, 7, 3, 6, 9]])

数组相加

1
a + b

array([[ 7, 14, 17, 14, 12],
[ 7, 12, 6, 9, 7],
[ 7, 13, 8, 12, 15],
[ 6, 14, 12, 7, 15]])

数组相减

1
a - b

array([[ 7, -4, -1, 4, 4],
[-7, 6, 4, -1, -7],
[ 1, 1, -2, 2, -3],
[ 0, 0, 6, -5, -3]])

数组相乘

1
a * b

array([[ 0, 45, 72, 45, 32],
[ 0, 27, 5, 20, 0],
[12, 42, 15, 35, 54],
[ 9, 49, 27, 6, 54]])

数组常用函数

1
a

array([[7, 5, 8, 9, 8],
[0, 9, 5, 4, 0],
[4, 7, 3, 7, 6],
[3, 7, 9, 1, 6]])

获取数组中所有唯一数

1
np.unique(a)

array([0, 1, 3, 4, 5, 6, 7, 8, 9])

对数组每一列求和

1
sum(a)

array([14, 28, 25, 21, 20])

对数组指定行求和

1
sum(a[0])

37

对数组指定列求和

1
sum(a[:,0])

14

获取数组最大值

1
a.max()

9

获取数组指定行最大值

1
max(a[0])

9

获取数组指定列最大值

1
max(a[:,0])

7

使用pickle序列化array

1
2
3
4
import pickle
import numpy as np

x = np.arange(10)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

将数组以二进制的方式写入文件

1
f = open('x.pkl','wb')

写入文件

1
pickle.dump(x, f)

查看文件目录

1
!ls

Array的input和output.ipynb x.pkl
array.ipynb 数组与矩阵运算.ipynb

以二进制方式打开文件

1
f = open('x.pkl', 'rb')

读取文件内容

1
pickle.load(f)
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

矩阵Matrix

1
np.mat([[1,2,3],[4,5,6]])

matrix([[1, 2, 3],
[4, 5, 6]])

1
np.mat(a)

matrix([[7, 5, 8, 9, 8],
[0, 9, 5, 4, 0],
[4, 7, 3, 7, 6],
[3, 7, 9, 1, 6]])

1
a

array([[7, 5, 8, 9, 8],
[0, 9, 5, 4, 0],
[4, 7, 3, 7, 6],
[3, 7, 9, 1, 6]])

矩阵的运算

1
A = np.mat(a)
1
B = np.mat(b)
1
A

matrix([[7, 5, 8, 9, 8],
[0, 9, 5, 4, 0],
[4, 7, 3, 7, 6],
[3, 7, 9, 1, 6]])

1
B

matrix([[0, 9, 9, 5, 4],
[7, 3, 1, 5, 7],
[3, 6, 5, 5, 9],
[3, 7, 3, 6, 9]])

矩阵相加

1
A + B

matrix([[ 7, 14, 17, 14, 12],
[ 7, 12, 6, 9, 7],
[ 7, 13, 8, 12, 15],
[ 6, 14, 12, 7, 15]])

矩阵相减

1
A - B

matrix([[ 7, -4, -1, 4, 4],
[-7, 6, 4, -1, -7],
[ 1, 1, -2, 2, -3],
[ 0, 0, 6, -5, -3]])

矩阵相乘

1
2
c = np.mat(np.random.randint(10, size=20).reshape(4,5))
d = np.mat(np.random.randint(10, size=20).reshape(5,4))
1
c

matrix([[8, 5, 2, 1, 8],
[9, 5, 8, 5, 4],
[5, 0, 6, 9, 8],
[3, 9, 7, 9, 6]])

1
d

matrix([[1, 8, 8, 5],
[3, 4, 5, 3],
[9, 3, 5, 2],
[8, 6, 9, 2],
[4, 9, 8, 1]])

1
c * d

matrix([[ 81, 168, 172, 69],
[152, 182, 214, 90],
[163, 184, 215, 63],
[189, 189, 233, 80]])

使用numpy进行输入输出(文件操作)

使用np.save写入文件

1
np.save('one_array', x)
1
!ls
Array的input和output.ipynb   x.pkl
array.ipynb                  数组与矩阵运算.ipynb
one_array.npy

使用np.load读取文件内容

1
np.load('one_array.npy')

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

1
y = np.arange(20)

array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19])

使用np.savez将多个数组保存到一个文件中

1
np.savez('two_array.npz', a=x, b=y)
1
!ls

Array的input和output.ipynb two_array.npz
array.ipynb x.pkl
one_array.npy 数组与矩阵运算.ipynb

利用np.load读取文件内容

1
c = np.load('two_array.npz')

利用索引名读取对应数组内容

1
c['a']

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

1
c['b']

array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19])

Pandas

Series

Series初始化

1
2
3
4
import numpy as np
import pandas as pd

s1 = pd.Series([1,2,3,4])
1
s1

0 1
1 2
2 3
3 4
dtype: int64

values

1
s1.values

array([1, 2, 3, 4])

index

1
s1.index

RangeIndex(start=0, stop=4, step=1)

1
s2 = pd.Series(np.arange(10))
1
s2

0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
dtype: int64

指定index

1
s3 = pd.Series({'1':1,'2':2,'3':3})
1
s3

1 1
2 2
3 3
dtype: int64

1
s3.index

Index(['1', '2', '3'], dtype='object')

1
s3.values

array([1, 2, 3])

1
s4 = pd.Series([1,2,3,4], index=['A','B','C','D'])
1
s4

A 1
B 2
C 3
D 4
dtype: int64

1
s4.values

array([1, 2, 3, 4])

1
s4.index

Index([‘A’, ‘B’, ‘C’, ‘D’], dtype=’object’)

访问Series元素

1
s4['A']

1

1
s4[s4>2]

C 3
D 4
dtype: int64

1
s4

A 1
B 2
C 3
D 4
dtype: int64

1
s4.to_dict()

{'A': 1, 'B': 2, 'C': 3, 'D': 4}

1
s5 = pd.Series(s4.to_dict())
1
s5

A 1
B 2
C 3
D 4
dtype: int64

新增index

1
2
index_1=['A','B','C','D','E']
s6 = pd.Series(s5, index=index_1)
1
s6

A 1.0
B 2.0
C 3.0
D 4.0
E NaN
dtype: float64

对Series元素进行空值判定

1
pd.isnull(s6)

A False
B False
C False
D False
E True
dtype: bool

对Series元素进行非空判定

1
pd.notnull(s6)

A True
B True
C True
D True
E False
dtype: bool

指定Series名称

1
s6.name = 'demo'
1
s6

A 1.0
B 2.0
C 3.0
D 4.0
E NaN
Name: demo, dtype: float64

指定Series索引名称

1
s6.index.name = 'index demo'
1
s6

index demo
A 1.0
B 2.0
C 3.0
D 4.0
E NaN
Name: demo, dtype: float64

1
s6.index

Index(['A', 'B', 'C', 'D', 'E'], dtype='object', name='index demo')

1
s6.values

array([ 1., 2., 3., 4., nan])

1
s1 = pd.Series(data['Country'], index=['A','B','C'])
1
s1

0 Belgium
1 India
2 Brazil
dtype: object

1
s1.values

array([‘Belgium’, ‘India’, ‘Brazil’], dtype=object)

1
s1.index

Index([‘A’, ‘B’, ‘C’], dtype=’object’)

Reindex on Series

1
2
3
4
5
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

s1 = pd.Series([1,2,3,4], index=['A','B','C','D'])
1
s1

A 1
B 2
C 3
D 4
dtype: int64

添加index

1
s1.reindex(['A','B','C','D','E'])

A 1.0
B 2.0
C 3.0
D 4.0
E NaN
dtype: float64

用固定数值填充新的index

1
s1.reindex(['A','B','C','D','E'], fill_value=10)

A 1
B 2
C 3
D 4
E 10
dtype: int64

1
s2 = pd.Series(['A','B','C'], index=[1,5,10])
1
s2

1 A
5 B
10 C
dtype: object

用 range() 扩充index

1
s2.reindex(index=range(15))

0 NaN
1 A
2 NaN
3 NaN
4 NaN
5 B
6 NaN
7 NaN
8 NaN
9 NaN
10 C
11 NaN
12 NaN
13 NaN
14 NaN
dtype: object

用ffill填充新增index

1
s2.reindex(index=range(15), method='ffill')

0 NaN
1 A
2 A
3 A
4 A
5 B
6 B
7 B
8 B
9 B
10 C
11 C
12 C
13 C
14 C
dtype: object

对Series切片

1
s1

A 1
B 2
C 3
D 4
dtype: int64

1
s1.reindex(index=['A','B'])

A 1
B    2
dtype: int64

1
s1
A    1
B    2
C    3
D    4
dtype: int64

对Series进行drop index

1
s1.drop('A')

B 2
C 3
D 4
dtype: int64

Series的简单数学计算

1
2
3
4
5
6
7
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

s1 = Series([1,2,3], index=['A','B','C'])

s1

A 1
B 2
C 3
dtype: int64

1
2
s2 = Series([4,5,6,7], index=['B','C','D','E'])
s2

B 4
C 5
D 6
E 7
dtype: int64

1
s1 + s2

A NaN
B 6.0
C 8.0
D NaN
E NaN
dtype: float64

Series的排序

1
2
3
4
5
6
7
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

s1 = Series(np.random.randn(10))

s1

0 0.035039
1 1.087393
2 -0.342811
3 -0.557990
4 0.548988
5 0.051965
6 2.983233
7 0.757732
8 -0.391369
9 0.850908
dtype: float64

1
s1.values

array([ 0.03503903, 1.08739272, -0.34281109, -0.5579896 , 0.54898768,
0.05196506, 2.98323291, 0.75773189, -0.39136856, 0.85090778])

1
s1.index

RangeIndex(start=0, stop=10, step=1)

按values排序

1
2
s2 = s1.sort_values(ascending=False)
s2
6    2.983233
1    1.087393
9    0.850908
7    0.757732
4    0.548988
5    0.051965
0    0.035039
2   -0.342811
8   -0.391369
3   -0.557990
dtype: float64

按index排序

1
s2.sort_index()

0 0.035039
1 1.087393
2 -0.342811
3 -0.557990
4 0.548988
5 0.051965
6 2.983233
7 0.757732
8 -0.391369
9 0.850908
dtype: float64

Nan in Series

1
2
3
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
1
n = np.nan
1
n

nan

1
type(n)

float

1
2
m = 1
m + n

nan

1
s1 = Series([1,2,np.nan,3,4],index=['A','B','C','D','E'])
1
s1

A 1.0
B 2.0
C NaN
D 3.0
E 4.0
dtype: float64

空值判断

1
s1.isnull()

A False
B False
C True
D False
E False
dtype: bool

删除nan的行

1
s1.dropna()

A 1.0
B 2.0
D 3.0
E 4.0
dtype: float64

DataFrame

1
2
3
4
5
6
7
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

import webbrowser
link = 'https://www.tiobe.com/tiobe-index/'
webbrowser.open(link)

读取剪贴板内容,并生成DataFrame

1
2
3
df = pd.read_clipboard()

df








































































































Jul 2019 Jul 2018 Change Programming Language Ratings Change.1
0 1 1 NaN Java 15.058% -1.08%
1 2 2 NaN C 14.211% -0.45%
2 3 4 change Python 9.260% +2.90%
3 4 3 change C++ 6.705% -0.91%
4 5 6 change C# 4.365% +0.57%
5 6 5 change Visual Basic .NET 4.208% -0.04%
6 7 8 change JavaScript 2.304% -0.53%
7 8 7 change PHP 2.167% -0.67%
8 9 9 NaN SQL 1.977% -0.36%
9 10 10 NaN Objective-C 1.686% +0.23%

Type

1
type(df)

pandas.core.frame.DataFrame

Columns

1
df.columns
Index(['Jul 2019', 'Jul 2018', 'Change', 'Programming Language', 'Ratings',
       'Change.1'],
      dtype='object')

访问DataFrame列

1
df.Ratings

0 15.058%
1 14.211%
2 9.260%
3 6.705%
4 4.365%
5 4.208%
6 2.304%
7 2.167%
8 1.977%
9 1.686%
Name: Ratings, dtype: object

复制DataFrame

1
df_new = DataFrame(df, columns=['Programming Language','Jul 2018'])
1
df_new




























































Programming Language Jul 2018
0 Java 1
1 C 2
2 Python 4
3 C++ 3
4 C# 6
5 Visual Basic .NET 5
6 JavaScript 8
7 PHP 7
8 SQL 9
9 Objective-C 10

向DataFrame中增加新列

1
df_new = DataFrame(df, columns=['Programming Language','Jul 2018', 'Sep 2018'])
1
df_new







































































Programming Language Jul 2018 Sep 2018
0 Java 1 NaN
1 C 2 NaN
2 Python 4 NaN
3 C++ 3 NaN
4 C# 6 NaN
5 Visual Basic .NET 5 NaN
6 JavaScript 8 NaN
7 PHP 7 NaN
8 SQL 9 NaN
9 Objective-C 10 NaN

向新增列填充数据

方法1:range()

1
df_new['Sep 2018'] = range(0,10)
1
df_new







































































Programming Language Jul 2018 Sep 2018
0 Java 1 0
1 C 2 1
2 Python 4 2
3 C++ 3 3
4 C# 6 4
5 Visual Basic .NET 5 5
6 JavaScript 8 6
7 PHP 7 7
8 SQL 9 8
9 Objective-C 10 9

方法2:np.arange()

1
df_new['Sep 2018'] = np.arange(0,10)
1
df_new







































































Programming Language Jul 2018 Sep 2018
0 Java 1 0
1 C 2 1
2 Python 4 2
3 C++ 3 3
4 C# 6 4
5 Visual Basic .NET 5 5
6 JavaScript 8 6
7 PHP 7 7
8 SQL 9 8
9 Objective-C 10 9

方法3:pd.Series()

1
df_new['Sep 2018'] = pd.Series(np.arange(0,10))
1
df_new







































































Programming Language Jul 2018 Sep 2018
0 Java 1 0
1 C 2 1
2 Python 4 2
3 C++ 3 3
4 C# 6 4
5 Visual Basic .NET 5 5
6 JavaScript 8 6
7 PHP 7 7
8 SQL 9 8
9 Objective-C 10 9

向新增列中填充数值

1
df_new['Sep 2018'] = pd.Series([100,200],index=[1,2])
1
df_new







































































Programming Language Jul 2018 Sep 2018
0 Java 1 NaN
1 C 2 100.0
2 Python 4 200.0
3 C++ 3 NaN
4 C# 6 NaN
5 Visual Basic .NET 5 NaN
6 JavaScript 8 NaN
7 PHP 7 NaN
8 SQL 9 NaN
9 Objective-C 10 NaN
1
2
3
4
5
6
7
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

data = {'Country': ['Belgium','India', 'Brazil'],
'Capital':['Brussels', 'New Delhi', 'Brasilia'],
'Population':[11190846, 1303171035, 207847528]}
1
df1 = pd.DataFrame(data)
1
df1





























Country Capital Population
0 Belgium Brussels 11190846
1 India New Delhi 1303171035
2 Brazil Brasilia 207847528

访问DataFrame某一列

1
df1['Country']

0 Belgium
1 India
2 Brazil
Name: Country, dtype: object

1
cou = df1['Country']
1
type(cou)

pandas.core.series.Series

遍历DataFrame

1
df1.iterrows()
1
2
3
# Iterate the DataFrame as (index, row-Series) pairs; print just the
# first pair, then stop (demonstration of iterrows()).
for row in df1.iterrows():
    print(row[0]), print(row[1])
    break

0
Country Belgium
Capital Brussels
Population 11190846
Name: 0, dtype: object

1
df1





























Country Capital Population
0 Belgium Brussels 11190846
1 India New Delhi 1303171035
2 Brazil Brasilia 207847528
1
data

{‘Country’: [‘Belgium’, ‘India’, ‘Brazil’],
‘Capital’: [‘Brussels’, ‘New Delhi’, ‘Brasilia’],
‘Population’: [11190846, 1303171035, 207847528]}

Series => DataFrame

1
s1 = pd.Series(data['Capital'])
1
s2 = pd.Series(data['Country'])
1
s3 = pd.Series(data['Population'])
1
df_new = pd.DataFrame([s2,s1,s3],index=['Country','Capital','Population'])
1
df_new





























0 1 2
Country Belgium India Brazil
Capital Brussels New Delhi Brasilia
Population 11190846 1303171035 207847528

行转列

1
df_new2 = df_new.T
1
df_new2





























Country Capital Population
0 Belgium Brussels 11190846
1 India New Delhi 1303171035
2 Brazil Brasilia 207847528
1
df1





























Country Capital Population
0 Belgium Brussels 11190846
1 India New Delhi 1303171035
2 Brazil Brasilia 207847528

Nan in DataFrame

1
dframe = DataFrame([[1,2,3],[np.nan,5,6],[7,np.nan,9],[np.nan,np.nan,np.nan]])
1
dframe



































0 1 2
0 1.0 2.0 3.0
1 NaN 5.0 6.0
2 7.0 NaN 9.0
3 NaN NaN NaN

空值判断

1
dframe.isnull()



































0 1 2
0 False False False
1 True False False
2 False True False
3 True True True

非空判断

1
dframe.notnull()



































0 1 2
0 True True True
1 False True True
2 True False True
3 False False False

删除存在nan的行

1
df1 = dframe.dropna(axis=0)
1
df1

















0 1 2
0 1.0 2.0 3.0

删除存在nan的列

1
df2 = dframe.dropna(axis=1)
1
df2




















0
1
2
3

仅删除所有值都为nan的行

1
df3 = dframe.dropna(axis=0, how='all')
1
df3





























0 1 2
0 1.0 2.0 3.0
1 NaN 5.0 6.0
2 7.0 NaN 9.0
1
dframe2 = DataFrame([[1,2,3,np.nan],[2,np.nan,5,6],[np.nan,7,np.nan,9],[1,np.nan,np.nan,np.nan]])
1
dframe2








































0 1 2 3
0 1.0 2.0 3.0 NaN
1 2.0 NaN 5.0 6.0
2 NaN 7.0 NaN 9.0
3 1.0 NaN NaN NaN

删除非空值个数小于thresh阈值的行(即保留至少含thresh个非NaN值的行)

1
df4 = dframe2.dropna(thresh=2)
1
df4

































0 1 2 3
0 1.0 2.0 3.0 NaN
1 2.0 NaN 5.0 6.0
2 NaN 7.0 NaN 9.0

利用fillna对空值进行填充

1
dframe2.fillna(value=1)








































0 1 2 3
0 1.0 2.0 3.0 1.0
1 2.0 1.0 5.0 6.0
2 1.0 7.0 1.0 9.0
3 1.0 1.0 1.0 1.0

利用fillna按列级别填充不同数值

1
dframe2.fillna(value={0:0,1:1,2:2,3:5})








































0 1 2 3
0 1.0 2.0 3.0 5.0
1 2.0 1.0 5.0 6.0
2 0.0 7.0 2.0 9.0
3 1.0 1.0 2.0 5.0

DataFrame I/O操作

打开网页

1
2
3
4
5
6
7
8
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

import webbrowser

link = 'http://pandas.pydata.org/pandas-docs/version/0.20/io.html'
webbrowser.open(link)

True

读取剪贴板内容

1
df1 = pd.read_clipboard()
1
df1







































































































Format Type Data Description Reader Writer
0 text CSV read_csv to_csv
1 text JSON read_json to_json
2 text HTML read_html to_html
3 text Local clipboard read_clipboard to_clipboard
4 binary MS Excel read_excel to_excel
5 binary HDF5 Format read_hdf to_hdf
6 binary Feather Format read_feather to_feather
7 binary Msgpack read_msgpack to_msgpack
8 binary Stata read_stata to_stata
9 binary SAS read_sas
10 binary Python Pickle Format read_pickle to_pickle
11 SQL SQL read_sql to_sql
12 SQL Google Big Query read_gbq to_gbq

将内容粘贴到剪贴板

1
df1.to_clipboard()

写入CSV (不写入索引值)

1
df1.to_csv('df1.csv', index=False)
1
!ls

2-3 Array的创建及访问.ipynb
2-4 数组与矩阵运算.ipynb
2-5 Array的input和output.ipynb
3-1 Pandas Series.ipynb
3-2 Pandas DataFrame.ipynb
3-3 深入理解Series和Dataframe.ipynb
3-4 Pandas-Dataframe-IO操作.ipynb
df1.csv
one_array.npy
two_array.npz
x.pkl

预览CSV文件内容

1
!more df1.csv

Format Type,Data Description,Reader,Writer
text,CSV,read_csv,to_csv
text,JSON,read_json,to_json
text,HTML,read_html,to_html
text,Local clipboard,read_clipboard,to_clipboard
binary,MS Excel,read_excel,to_excel
binary,HDF5 Format,read_hdf,to_hdf
binary,Feather Format,read_feather,to_feather
binary,Msgpack,read_msgpack,to_msgpack
binary,Stata,read_stata,to_stata
binary,SAS,read_sas,
binary,Python Pickle Format,read_pickle,to_pickle
SQL,SQL,read_sql,to_sql
SQL,Google Big Query,read_gbq,to_gbq
[7mdf1.csv (END)[m[K

Selecting & Indexing

1
2
3
4
5
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

imdb = pd.read_csv('/Users/kwang/jupyter-projects/movie_metadata.csv')
1
imdb.shape

(5043, 28)

查看DataFrame前5行内容

1
imdb.head()





















































































































































color director_name num_critic_for_reviews duration director_facebook_likes actor_3_facebook_likes actor_2_name actor_1_facebook_likes gross genres num_user_for_reviews language country content_rating budget title_year actor_2_facebook_likes imdb_score aspect_ratio movie_facebook_likes
0 Color James Cameron 723.0 178.0 0.0 855.0 Joel David Moore 1000.0 760505847.0 Action|Adventure|Fantasy|Sci-Fi 3054.0 English USA PG-13 237000000.0 2009.0 936.0 7.9 1.78 33000
1 Color Gore Verbinski 302.0 169.0 563.0 1000.0 Orlando Bloom 40000.0 309404152.0 Action|Adventure|Fantasy 1238.0 English USA PG-13 300000000.0 2007.0 5000.0 7.1 2.35 0
2 Color Sam Mendes 602.0 148.0 0.0 161.0 Rory Kinnear 11000.0 200074175.0 Action|Adventure|Thriller 994.0 English UK PG-13 245000000.0 2015.0 393.0 6.8 2.35 85000
3 Color Christopher Nolan 813.0 164.0 22000.0 23000.0 Christian Bale 27000.0 448130642.0 Action|Thriller 2701.0 English USA PG-13 250000000.0 2012.0 23000.0 8.5 2.35 164000
4 NaN Doug Walker NaN NaN 131.0 NaN Rob Walker 131.0 NaN Documentary NaN NaN NaN NaN NaN NaN 12.0 7.1 NaN 0

5 rows × 28 columns






#### 查看DataFrame后5行内容
1
imdb.tail()























































































































































color director_name num_critic_for_reviews duration director_facebook_likes actor_3_facebook_likes actor_2_name actor_1_facebook_likes gross genres num_user_for_reviews language country content_rating budget title_year actor_2_facebook_likes imdb_score aspect_ratio movie_facebook_likes
5038 Color Scott Smith 1.0 87.0 2.0 318.0 Daphne Zuniga 637.0 NaN Comedy|Drama 6.0 English Canada NaN NaN 2013.0 470.0 7.7 NaN 84
5039 Color NaN 43.0 43.0 NaN 319.0 Valorie Curry 841.0 NaN Crime|Drama|Mystery|Thriller 359.0 English USA TV-14 NaN NaN 593.0 7.5 16.00 32000
5040 Color Benjamin Roberds 13.0 76.0 0.0 0.0 Maxwell Moody 0.0 NaN Drama|Horror|Thriller 3.0 English USA NaN 1400.0 2013.0 0.0 6.3 NaN 16
5041 Color Daniel Hsia 14.0 100.0 0.0 489.0 Daniel Henney 946.0 10443.0 Comedy|Drama|Romance 9.0 English USA PG-13 NaN 2012.0 719.0 6.3 2.35 660
5042 Color Jon Gunn 43.0 90.0 16.0 16.0 Brian Herzlinger 86.0 85222.0 Documentary 84.0 English USA PG 1100.0 2004.0 23.0 6.6 1.85 456

5 rows × 28 columns





#### 根据指定列生成新的DataFrame
1
sub_df = imdb[['director_name', 'movie_title', 'imdb_score']]


1
sub_df.head()











































director_name movie_title imdb_score
0 James Cameron Avatar 7.9
1 Gore Verbinski Pirates of the Caribbean: At World’s End 7.1
2 Sam Mendes Spectre 6.8
3 Christopher Nolan The Dark Knight Rises 8.5
4 Doug Walker Star Wars: Episode VII - The Force Awakens … 7.1

根据index定位内容(iloc)

1
temp_df = sub_df.iloc[10:20,0:2]
1
temp_df




























































director_name movie_title
10 Zack Snyder Batman v Superman: Dawn of Justice
11 Bryan Singer Superman Returns
12 Marc Forster Quantum of Solace
13 Gore Verbinski Pirates of the Caribbean: Dead Man’s Chest
14 Gore Verbinski The Lone Ranger
15 Zack Snyder Man of Steel
16 Andrew Adamson The Chronicles of Narnia: Prince Caspian
17 Joss Whedon The Avengers
18 Rob Marshall Pirates of the Caribbean: On Stranger Tides
19 Barry Sonnenfeld Men in Black 3
1
temp_df.iloc[2:4,:]




















director_name movie_title
12 Marc Forster Quantum of Solace
13 Gore Verbinski Pirates of the Caribbean: Dead Man’s Chest

根据label定位内容(loc)

1
temp_df.loc[15:17,:'director_name']





















director_name
15 Zack Snyder
16 Andrew Adamson
17 Joss Whedon

Reindex on DataFrame

1
df1 = DataFrame(np.random.rand(25).reshape([5,5]), index=['A','B','D','E','F'], columns=['c1','c2','c3','c4','c5'])
1
df1





















































c1 c2 c3 c4 c5
A 0.659369 0.096724 0.880128 0.403168 0.282619
B 0.003070 0.661747 0.121026 0.262881 0.557213
D 0.972693 0.962149 0.411036 0.849216 0.704441
E 0.969716 0.425415 0.943507 0.589344 0.670303
F 0.315732 0.371235 0.943880 0.649541 0.495330

插入新的行(index)

1
df1.reindex(index=['A','B','C','D','E','F'])





























































c1 c2 c3 c4 c5
A 0.659369 0.096724 0.880128 0.403168 0.282619
B 0.003070 0.661747 0.121026 0.262881 0.557213
C NaN NaN NaN NaN NaN
D 0.972693 0.962149 0.411036 0.849216 0.704441
E 0.969716 0.425415 0.943507 0.589344 0.670303
F 0.315732 0.371235 0.943880 0.649541 0.495330

插入新的列 (columns)

1
df1.reindex(columns=['c1','c2','c3','c4','c5','c6'])



























































c1 c2 c3 c4 c5 c6
A 0.659369 0.096724 0.880128 0.403168 0.282619 NaN
B 0.003070 0.661747 0.121026 0.262881 0.557213 NaN
D 0.972693 0.962149 0.411036 0.849216 0.704441 NaN
E 0.969716 0.425415 0.943507 0.589344 0.670303 NaN
F 0.315732 0.371235 0.943880 0.649541 0.495330 NaN

同时新增index和columns

1
df1.reindex(index=['A','B','C','D','E','F'], columns=['c1','c2','c3','c4','c5','c6'])




































































c1 c2 c3 c4 c5 c6
A 0.659369 0.096724 0.880128 0.403168 0.282619 NaN
B 0.003070 0.661747 0.121026 0.262881 0.557213 NaN
C NaN NaN NaN NaN NaN NaN
D 0.972693 0.962149 0.411036 0.849216 0.704441 NaN
E 0.969716 0.425415 0.943507 0.589344 0.670303 NaN
F 0.315732 0.371235 0.943880 0.649541 0.495330 NaN

对DataFrame切片

1
df1.reindex(index=['A','B'])





























c1 c2 c3 c4 c5
A 0.659369 0.096724 0.880128 0.403168 0.282619
B 0.003070 0.661747 0.121026 0.262881 0.557213

对DataFrame进行drop index

1
df1.drop('A',axis=0)













































c1 c2 c3 c4 c5
B 0.003070 0.661747 0.121026 0.262881 0.557213
D 0.972693 0.962149 0.411036 0.849216 0.704441
E 0.969716 0.425415 0.943507 0.589344 0.670303
F 0.315732 0.371235 0.943880 0.649541 0.495330

对DataFrame进行drop columns

1
df1.drop('c2',axis=1)















































c1 c3 c4 c5
A 0.659369 0.880128 0.403168 0.282619
B 0.003070 0.121026 0.262881 0.557213
D 0.972693 0.411036 0.849216 0.704441
E 0.969716 0.943507 0.589344 0.670303
F 0.315732 0.943880 0.649541 0.495330

多级index

1
2
3
4
5
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

s1 = Series(np.random.randn(6),index=[[1,1,1,2,2,2],['a','b','c','a','b','c']])
1
s1

1 a -0.828759
b 1.100893
c 0.705300
2 a 0.801019
b 0.676878
c -0.585011
dtype: float64

1
s1[1]['a']

-0.8287592467933553

1
s1[:,'a']

1 -0.828759
2 0.801019
dtype: float64

多级index Series => DataFrame

1
2
3
df1 = s1.unstack()

df1























a b c
1 -0.828759 1.100893 0.705300
2 0.801019 0.676878 -0.585011

DataFrame => 多级index Series

1
df1.unstack()

a 1 -0.828759
2 0.801019
b 1 1.100893
2 0.676878
c 1 0.705300
2 -0.585011
dtype: float64

1
df1.T

























1 2
a -0.828759 0.801019
b 1.100893 0.676878
c 0.705300 -0.585011
1
df1.T.unstack()

1 a -0.828759
b 1.100893
c 0.705300
2 a 0.801019
b 0.676878
c -0.585011
dtype: float64

初始化DataFrame (多级index & columns)

1
2
3
df = DataFrame(np.arange(16).reshape(4,4), index=[['a','a','b','b'],[1,2,1,2]], columns=[['BJ','BJ','SH','GZ'],[8,9,8,8]])

df


















































BJ SH GZ
8 9 8 8
a 1 0 1 2 3
2 4 5 6 7
b 1 8 9 10 11
2 12 13 14 15

访问多级DataFrame元素

1
df['BJ']

































8 9
a 1 0 1
2 4 5
b 1 8 9
2 12 13
1
df['BJ'][8]

a 1 0
2 4
b 1 8
2 12
Name: 8, dtype: int64

1
df.columns

MultiIndex(levels=[[‘BJ’, ‘GZ’, ‘SH’], [8, 9]],
codes=[[0, 0, 2, 1], [0, 1, 0, 0]])

创建多级索引

1
2
3
4
data = Series(np.random.randn(10),index=[['x','x','x','x','x','x','y','y','y','y'],
['a','a','a','b','b','b','c','c','d','d'],
[1,2,3,1,2,3,1,2,2,3]])
data

x a 1 0.673769
2 -1.781723
3 -2.232971
b 1 -0.832687
2 -0.141850
3 0.704339
y c 1 1.186975
2 -1.106244
d 2 -0.166537
3 0.695451
dtype: float64

检索多级索引

选取外层索引
1
data['x']

a 1 0.673769
2 -1.781723
3 -2.232971
b 1 -0.832687
2 -0.141850
3 0.704339
dtype: float64

1
data['x':'y']

x a 1 0.673769
2 -1.781723
3 -2.232971
b 1 -0.832687
2 -0.141850
3 0.704339
y c 1 1.186975
2 -1.106244
d 2 -0.166537
3 0.695451
dtype: float64

选取内层索引
1
data['x','a',1]

0.6737686360875892

解除多级索引

1
data

x a 1 0.673769
2 -1.781723
3 -2.232971
b 1 -0.832687
2 -0.141850
3 0.704339
y c 1 1.186975
2 -1.106244
d 2 -0.166537
3 0.695451
dtype: float64

默认解除最内层索引,即level=-1
1
data.unstack()






































1 2 3
x a 0.673769 -1.781723 -2.232971
b -0.832687 -0.141850 0.704339
y c 1.186975 -1.106244 NaN
d NaN -0.166537 0.695451
解除其他层索引
1
data.unstack(level=-2)

























































a b c d
x 1 0.673769 -0.832687 NaN NaN
2 -1.781723 -0.141850 NaN NaN
3 -2.232971 0.704339 NaN NaN
y 1 NaN NaN 1.186975 NaN
2 NaN NaN -1.106244 -0.166537
3 NaN NaN NaN 0.695451

转化为多级索引

1
data

x a 1 0.673769
2 -1.781723
3 -2.232971
b 1 -0.832687
2 -0.141850
3 0.704339
y c 1 1.186975
2 -1.106244
d 2 -0.166537
3 0.695451
dtype: float64

将中间层索引转换为内层索引
1
data.unstack(level=-2).stack()

x 1 a 0.673769
b -0.832687
2 a -1.781723
b -0.141850
3 a -2.232971
b 0.704339
y 1 c 1.186975
2 c -1.106244
d -0.166537
3 d 0.695451
dtype: float64

为不同层索引指定名称(name)

1
data

x a 1 0.673769
2 -1.781723
3 -2.232971
b 1 -0.832687
2 -0.141850
3 0.704339
y c 1 1.186975
2 -1.106244
d 2 -0.166537
3 0.695451
dtype: float64

1
2
3
df = data.unstack()

df






































1 2 3
x a 0.673769 -1.781723 -2.232971
b -0.832687 -0.141850 0.704339
y c 1.186975 -1.106244 NaN
d NaN -0.166537 0.695451
1
2
3
4
df.index.names = ['outer','inner'] # 指定两层索引的名称
df.columns.name = 'hello'

df













































hello 1 2 3
outer inner
x a 0.673769 -1.781723 -2.232971
b -0.832687 -0.141850 0.704339
y c 1.186975 -1.106244 NaN
d NaN -0.166537 0.695451

Mapping

通过Series为DataFrame新增列

1
2
3
4
5
6
7
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

df1 = DataFrame({"城市":["北京","上海","广州"],"人口":[1000,2000,1500]})

df1

























城市 人口
0 北京 1000
1 上海 2000
2 广州 1500
1
2
3
df1["GDP"] = Series([1200,1800,1600])

df1





























城市 人口 GDP
0 北京 1000 1200
1 上海 2000 1800
2 广州 1500 1600

通过Map为DataFrame新增列

1
2
3
df2 = DataFrame({"城市":["北京","上海","广州"],"人口":[1000,2000,1500]})

df2

























城市 人口
0 北京 1000
1 上海 2000
2 广州 1500
1
2
3
4
5
gdp_map = {"北京": 1200, "广州": 1400, "上海": 1800}

df2['GDP'] = df2['城市'].map(gdp_map)

df2





























城市 人口 GDP
0 北京 1000 1200
1 上海 2000 1800
2 广州 1500 1400

Replace

1
2
3
s1 = Series(np.arange(10))

s1

0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
dtype: int64

1
s1.replace(1, np.nan)

0 0.0
1 NaN
2 2.0
3 3.0
4 4.0
5 5.0
6 6.0
7 7.0
8 8.0
9 9.0
dtype: float64

1
s1.replace([1,2,3],[10,20,30])

0 0
1 10
2 20
3 30
4 4
5 5
6 6
7 7
8 8
9 9
dtype: int64

DataFrame的简单数学计算

1
2
df1 = DataFrame(np.arange(4).reshape(2,2), index=['A','B'], columns=['BJ','SH'])
df1




















BJ SH
A 0 1
B 2 3
1
2
3
df2 = DataFrame(np.arange(9).reshape(3,3), index=['A','B','C'], columns=['BJ','SH','GZ'])

df2





























BJ SH GZ
A 0 1 2
B 3 4 5
C 6 7 8
1
df1 + df2





























BJ GZ SH
A 0.0 NaN 2.0
B 5.0 NaN 7.0
C NaN NaN NaN
1
2
df3 = DataFrame([[1,2,3],[4,5,np.nan],[7,8,9]], index=['A','B','C'], columns=['c1','c2','c3'])
df3





























c1 c2 c3
A 1 2 3.0
B 4 5 NaN
C 7 8 9.0

对列进行求和

1
df3.sum()

c1 12.0
c2 15.0
c3 12.0
dtype: float64

对行进行求和

1
df3.sum(axis=1)

A 6.0
B 9.0
C 24.0
dtype: float64

求最小值

1
df3.min()

c1 1.0
c2 2.0
c3 3.0
dtype: float64

求最大值

1
df3.max()

c1 7.0
c2 8.0
c3 9.0
dtype: float64

1
df3





























c1 c2 c3
A 1 2 3.0
B 4 5 NaN
C 7 8 9.0
1
df3.describe()



























































c1 c2 c3
count 3.0 3.0 2.000000
mean 4.0 5.0 6.000000
std 3.0 3.0 4.242641
min 1.0 2.0 3.000000
25% 2.5 3.5 4.500000
50% 4.0 5.0 6.000000
75% 5.5 6.5 7.500000
max 7.0 8.0 9.000000

DataFrame排序

1
2
df1 = DataFrame(np.random.randn(40).reshape(8,5), columns=['A','B','C','D','E'])
df1













































































A B C D E
0 0.428384 -0.242118 -0.311413 -2.032690 -1.519783
1 -1.763709 -0.299535 -1.712632 -0.662375 -0.434309
2 -1.799746 0.937149 -0.387339 0.222759 -0.040818
3 0.239674 1.813621 -0.010476 -0.013300 1.411217
4 -0.980156 0.889708 1.322267 1.848655 -0.210548
5 1.655132 -0.561405 1.369573 -0.485437 1.772758
6 -0.032585 -2.038827 1.560083 0.638874 0.626102
7 -0.153819 -1.386077 0.779540 -1.441996 1.106010

按column排序

1
2
df2 = df1.sort_values('A', ascending=False)
df2













































































A B C D E
5 1.655132 -0.561405 1.369573 -0.485437 1.772758
0 0.428384 -0.242118 -0.311413 -2.032690 -1.519783
3 0.239674 1.813621 -0.010476 -0.013300 1.411217
6 -0.032585 -2.038827 1.560083 0.638874 0.626102
7 -0.153819 -1.386077 0.779540 -1.441996 1.106010
4 -0.980156 0.889708 1.322267 1.848655 -0.210548
1 -1.763709 -0.299535 -1.712632 -0.662375 -0.434309
2 -1.799746 0.937149 -0.387339 0.222759 -0.040818

按index排序

1
df2.sort_index()













































































A B C D E
0 0.428384 -0.242118 -0.311413 -2.032690 -1.519783
1 -1.763709 -0.299535 -1.712632 -0.662375 -0.434309
2 -1.799746 0.937149 -0.387339 0.222759 -0.040818
3 0.239674 1.813621 -0.010476 -0.013300 1.411217
4 -0.980156 0.889708 1.322267 1.848655 -0.210548
5 1.655132 -0.561405 1.369573 -0.485437 1.772758
6 -0.032585 -2.038827 1.560083 0.638874 0.626102
7 -0.153819 -1.386077 0.779540 -1.441996 1.106010

Rename index

1
2
3
4
5
6
7
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

df1 = DataFrame(np.arange(9).reshape(3,3), index=['BJ','SH','GZ'], columns=['A','B','C'])

df1





























A B C
BJ 0 1 2
SH 3 4 5
GZ 6 7 8
1
2
3
df1.index = Series(['bj','sh','gz'])

df1





























A B C
bj 0 1 2
sh 3 4 5
gz 6 7 8
1
2
df1.index = df1.index.map(str.upper)
df1





























A B C
BJ 0 1 2
SH 3 4 5
GZ 6 7 8
1
df1.rename(index=str.lower, columns=str.lower)





























a b c
bj 0 1 2
sh 3 4 5
gz 6 7 8
1
df1





























A B C
BJ 0 1 2
SH 3 4 5
GZ 6 7 8
1
df1.rename(index={'BJ':'beijing'}, columns={'A':'a'})





























a B C
beijing 0 1 2
SH 3 4 5
GZ 6 7 8

数字列表 => 字符串列表

1
2
list1 = [1,2,3,4]
list2 = ['1','2','3','4']
1
[str(x) for x in list1]

[‘1’, ‘2’, ‘3’, ‘4’]

1
list(map(str, list1))

[‘1’, ‘2’, ‘3’, ‘4’]

利用自定义map函数重命名index

1
2
def test_map(x):
return x + '_ABC'
1
df1.index.map(test_map)

Index([‘BJ_ABC’, ‘SH_ABC’, ‘GZ_ABC’], dtype=’object’)

1
df1.rename(index=test_map)





























A B C
BJ_ABC 0 1 2
SH_ABC 3 4 5
GZ_ABC 6 7 8

DataFrame合并 (merge) 操作

1
2
3
4
5
6
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

df1 = pd.DataFrame({'key':['X','Y','Z','X'], 'data_set_1':[1,2,3,4]})
df1






























key data_set_1
0 X 1
1 Y 2
2 Z 3
3 X 4
1
2
df2 = pd.DataFrame({'key':['X','B','C'],'data_set_2':[5,6,7]})
df2

























key data_set_2
0 X 5
1 B 6
2 C 7
1
pd.merge(df1,df2,on=None)























key data_set_1 data_set_2
0 X 1 5
1 X 4 5

指定关联列(默认inner连接)

1
pd.merge(df1,df2,on='key',how='inner')























key data_set_1 data_set_2
0 X 1 5
1 X 4 5

左连接

1
pd.merge(df1,df2,on='key',how='left')



































key data_set_1 data_set_2
0 X 1 5.0
1 Y 2 NaN
2 Z 3 NaN
3 X 4 5.0

右连接

1
pd.merge(df1,df2,on='key',how='right')



































key data_set_1 data_set_2
0 X 1.0 5
1 X 4.0 5
2 B NaN 6
3 C NaN 7

Concatenate

Array Concatenate

1
2
3
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
1
2
arr1 = np.arange(9).reshape(3,3)
arr1

array([[0, 1, 2],
[3, 4, 5],
[6, 7, 8]])

1
2
arr2 = np.arange(9).reshape(3,3)
arr2

array([[0, 1, 2],
[3, 4, 5],
[6, 7, 8]])

1
np.concatenate([arr1,arr2])

array([[0, 1, 2],
[3, 4, 5],
[6, 7, 8],
[0, 1, 2],
[3, 4, 5],
[6, 7, 8]])

1
np.concatenate([arr1,arr2],axis=1)

array([[0, 1, 2, 0, 1, 2],
[3, 4, 5, 3, 4, 5],
[6, 7, 8, 6, 7, 8]])

Series Concatenate

1
2
s1 = Series([1,2,3], index=['X','Y','Z'])
s1

X 1
Y 2
Z 3
dtype: int64

1
2
s2 = Series([4,5], index=['A','B'])
s2

A 4
B 5
dtype: int64

1
pd.concat([s1,s2])

X 1
Y 2
Z 3
A 4
B 5
dtype: int64

1
pd.concat([s1,s2], axis=1)



































0 1
A NaN 4.0
B NaN 5.0
X 1.0 NaN
Y 2.0 NaN
Z 3.0 NaN

DataFrame Concatenate

1
2
df1 = DataFrame(np.random.randn(4,3), columns=['X','Y','Z'])
df1



































X Y Z
0 -2.150094 -0.172915 0.391376
1 -2.943184 1.442629 0.934177
2 -0.015998 -1.204739 0.912159
3 -0.131997 -0.152430 1.080225
1
2
df2 = DataFrame(np.random.randn(3,3), columns=['X','Y','A'])
df2





























X Y A
0 -0.640785 0.404844 1.140704
1 0.331578 0.490477 1.158403
2 1.542735 -0.693717 1.928202
1
pd.concat([df1,df2])





























































A X Y Z
0 NaN -2.150094 -0.172915 0.391376
1 NaN -2.943184 1.442629 0.934177
2 NaN -0.015998 -1.204739 0.912159
3 NaN -0.131997 -0.152430 1.080225
0 1.140704 -0.640785 0.404844 NaN
1 1.158403 0.331578 0.490477 NaN
2 1.928202 1.542735 -0.693717 NaN

Combine

Series Combine

1
2
s1 = Series([2,np.nan,4,np.nan], index=['A','B','C','D'])
s1

A 2.0
B NaN
C 4.0
D NaN
dtype: float64

1
2
s2 = Series([1,2,3,4], index=['A','B','C','D'])
s2

A 1
B 2
C 3
D 4
dtype: int64

combine_first - 用后者填充前者中的空值 (如有)
1
s1.combine_first(s2)

A 2.0
B 2.0
C 4.0
D 4.0
dtype: float64

DataFrame Combine

1
2
3
4
5
6
df1 = pd.DataFrame({
'X':[1,np.nan,3,np.nan],
'Y':[5,np.nan,7,np.nan],
'Z':[9,np.nan,11,np.nan]
})
df1



































X Y Z
0 1.0 5.0 9.0
1 NaN NaN NaN
2 3.0 7.0 11.0
3 NaN NaN NaN
1
2
3
4
5
df2 = pd.DataFrame({
'Z':[np.nan,10,np.nan,12],
'A':[1,2,3,4]
})
df2






























Z A
0 NaN 1
1 10.0 2
2 NaN 3
3 12.0 4
1
df1.combine_first(df2)








































A X Y Z
0 1.0 1.0 5.0 9.0
1 2.0 NaN NaN 10.0
2 3.0 3.0 7.0 11.0
3 4.0 NaN NaN 12.0

利用apply对数据进行处理

1
2
3
4
5
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

df1 = pd.read_csv("apply_demo.csv")
1
df1.head()



































time data
0 1473411962 Symbol: APPL Seqno: 0 Price: 1623
1 1473411962 Symbol: APPL Seqno: 0 Price: 1623
2 1473411963 Symbol: APPL Seqno: 0 Price: 1623
3 1473411963 Symbol: APPL Seqno: 0 Price: 1623
4 1473411963 Symbol: APPL Seqno: 1 Price: 1649
1
df1.size

7978

1
s1 = Series(['a']*7978)
1
df1['A'] = s1
1
df1.head()









































time data A
0 1473411962 Symbol: APPL Seqno: 0 Price: 1623 a
1 1473411962 Symbol: APPL Seqno: 0 Price: 1623 a
2 1473411963 Symbol: APPL Seqno: 0 Price: 1623 a
3 1473411963 Symbol: APPL Seqno: 0 Price: 1623 a
4 1473411963 Symbol: APPL Seqno: 1 Price: 1649 a

利用apply批量处理某一列数据

1
2
3
df1['A'] = df1['A'].apply(str.upper)

df1.head()









































time data A
0 1473411962 Symbol: APPL Seqno: 0 Price: 1623 A
1 1473411962 Symbol: APPL Seqno: 0 Price: 1623 A
2 1473411963 Symbol: APPL Seqno: 0 Price: 1623 A
3 1473411963 Symbol: APPL Seqno: 0 Price: 1623 A
4 1473411963 Symbol: APPL Seqno: 1 Price: 1649 A
1
2
l1 = df1['data'][0].strip().split(' ')
l1

[‘Symbol:’, ‘APPL’, ‘Seqno:’, ‘0’, ‘Price:’, ‘1623’]

1
l1[1], l1[3], l1[5]

(‘APPL’, ‘0’, ‘1623’)

编写自定义函数,对列中的字符串进行提取

1
2
3
def foo(line):
    """Parse one 'Symbol: X Seqno: N Price: P' record into a 3-item Series.

    Splits the stripped line on single spaces and returns the value tokens
    at positions 1, 3 and 5 (symbol, seqno, price) — all as strings.
    Intended for use with ``df['data'].apply(foo)``.
    """
    # NOTE: extraction had dropped the body's indentation; restored here.
    items = line.strip().split(' ')
    return Series([items[1], items[3], items[5]])
1
2
3
4
5
df_temp = df1['data'].apply(foo)

df_temp = df_temp.rename(columns={0:"Symbol",1:"Seqno",2:"Price"})

df_temp.head()









































Symbol Seqno Price
0 APPL 0 1623
1 APPL 0 1623
2 APPL 0 1623
3 APPL 0 1623
4 APPL 1 1649
1
df_new = df1.combine_first(df_temp)
1
df_new.head()



























































A Price Seqno Symbol data time
0 A 1623.0 0.0 APPL Symbol: APPL Seqno: 0 Price: 1623 1473411962
1 A 1623.0 0.0 APPL Symbol: APPL Seqno: 0 Price: 1623 1473411962
2 A 1623.0 0.0 APPL Symbol: APPL Seqno: 0 Price: 1623 1473411963
3 A 1623.0 0.0 APPL Symbol: APPL Seqno: 0 Price: 1623 1473411963
4 A 1649.0 1.0 APPL Symbol: APPL Seqno: 1 Price: 1649 1473411963
1
2
3
4
del df_new['data']
del df_new['A']

df_new.head()















































Price Seqno Symbol time
0 1623.0 0.0 APPL 1473411962
1 1623.0 0.0 APPL 1473411962
2 1623.0 0.0 APPL 1473411963
3 1623.0 0.0 APPL 1473411963
4 1649.0 1.0 APPL 1473411963
1
df_new.to_csv("demo_duplicate.csv")

利用drop_duplicates去重

1
2
3
4
5
6
7
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

df = pd.read_csv('demo_duplicate.csv')

df.head()





















































Unnamed: 0 Price Seqno Symbol time
0 0 1623.0 0.0 APPL 1473411962
1 1 1623.0 0.0 APPL 1473411962
2 2 1623.0 0.0 APPL 1473411963
3 3 1623.0 0.0 APPL 1473411963
4 4 1649.0 1.0 APPL 1473411963
1
2
3
del df['Unnamed: 0']

df.head(10)


















































































Price Seqno Symbol time
0 1623.0 0.0 APPL 1473411962
1 1623.0 0.0 APPL 1473411962
2 1623.0 0.0 APPL 1473411963
3 1623.0 0.0 APPL 1473411963
4 1649.0 1.0 APPL 1473411963
5 1649.0 1.0 APPL 1473411963
6 1649.0 1.0 APPL 1473411964
7 1649.0 1.0 APPL 1473411964
8 1642.0 2.0 APPL 1473411964
9 1642.0 2.0 APPL 1473411964
1
df.size

15956

1
len(df)

3989

1
len(df['Seqno'].unique())

1000

1
df['Seqno'].duplicated()

0 False
1 True
2 True
3 True
4 False
5 True
6 True
7 True
8 False
9 True
10 True
11 True
12 False
13 True
14 True
15 True
16 False
17 True
18 True
19 True
20 False
21 True
22 True
23 True
24 False
25 True
26 True
27 True
28 False
29 True

3959 True
3960 True
3961 False
3962 True
3963 True
3964 True
3965 False
3966 True
3967 True
3968 True
3969 False
3970 True
3971 True
3972 True
3973 False
3974 True
3975 True
3976 True
3977 False
3978 True
3979 True
3980 True
3981 False
3982 True
3983 True
3984 True
3985 False
3986 True
3987 True
3988 True
Name: Seqno, Length: 3989, dtype: bool

1
df.drop_duplicates(['Seqno'], keep='first')























































































































































































































































































































































































































































Price Seqno Symbol time
0 1623.0 0.0 APPL 1473411962
4 1649.0 1.0 APPL 1473411963
8 1642.0 2.0 APPL 1473411964
12 1636.0 3.0 APPL 1473411965
16 1669.0 4.0 APPL 1473411966
20 1639.0 5.0 APPL 1473411967
24 1611.0 6.0 APPL 1473411968
28 1660.0 7.0 APPL 1473411969
32 1657.0 8.0 APPL 1473411970
36 1509.0 9.0 APPL 1473411971
40 1514.0 10.0 APPL 1473411972
44 1676.0 11.0 APPL 1473411973
48 1596.0 12.0 APPL 1473411974
52 1527.0 13.0 APPL 1473411975
56 1643.0 14.0 APPL 1473411976
60 1632.0 15.0 APPL 1473411977
64 1595.0 16.0 APPL 1473411978
68 1565.0 17.0 APPL 1473411979
72 1521.0 18.0 APPL 1473411980
76 1501.0 19.0 APPL 1473411981
80 1579.0 20.0 APPL 1473411982
84 1511.0 21.0 APPL 1473411983
88 1513.0 22.0 APPL 1473411984
92 1687.0 23.0 APPL 1473411985
96 1539.0 24.0 APPL 1473411986
100 1682.0 25.0 APPL 1473411987
104 1534.0 26.0 APPL 1473411988
108 1628.0 27.0 APPL 1473411989
112 1582.0 28.0 APPL 1473411990
116 1540.0 29.0 APPL 1473411991
3869 1649.0 970.0 APPL 1473412932
3873 1696.0 971.0 APPL 1473412933
3877 1596.0 972.0 APPL 1473412934
3881 1664.0 973.0 APPL 1473412935
3885 1539.0 974.0 APPL 1473412936
3889 1654.0 975.0 APPL 1473412937
3893 1573.0 976.0 APPL 1473412938
3897 1695.0 977.0 APPL 1473412939
3901 1584.0 978.0 APPL 1473412940
3905 1607.0 979.0 APPL 1473412941
3909 1676.0 980.0 APPL 1473412942
3913 1665.0 981.0 APPL 1473412943
3917 1540.0 982.0 APPL 1473412944
3921 1546.0 983.0 APPL 1473412945
3925 1646.0 984.0 APPL 1473412946
3929 1648.0 985.0 APPL 1473412947
3933 1647.0 986.0 APPL 1473412948
3937 1602.0 987.0 APPL 1473412949
3941 1562.0 988.0 APPL 1473412950
3945 1629.0 989.0 APPL 1473412951
3949 1683.0 990.0 APPL 1473412952
3953 1649.0 991.0 APPL 1473412953
3957 1698.0 992.0 APPL 1473412954
3961 1566.0 993.0 APPL 1473412955
3965 1597.0 994.0 APPL 1473412956
3969 1641.0 995.0 APPL 1473412957
3973 1581.0 996.0 APPL 1473412958
3977 1674.0 997.0 APPL 1473412959
3981 1680.0 998.0 APPL 1473412960
3985 1509.0 999.0 APPL 1473412961

1000 rows × 4 columns





### 时间序列操作


1
2
3
import numpy as np
import pandas as pd
from pandas import Series, DataFrame


1
2
from datetime import datetime
t1 = datetime(2019,7,12)


1
t1


> datetime.datetime(2019, 7, 12, 0, 0)




1
2
3
4
5
6
7
date_list = [
datetime(2016,9,1),
datetime(2016,9,10),
datetime(2017,9,1),
datetime(2017,9,20),
datetime(2017,10,1)
]


1
date_list


> [datetime.datetime(2016, 9, 1, 0, 0),
datetime.datetime(2016, 9, 10, 0, 0),
datetime.datetime(2017, 9, 1, 0, 0),
datetime.datetime(2017, 9, 20, 0, 0),
datetime.datetime(2017, 10, 1, 0, 0)]




1
2
3
s1 = Series(np.random.rand(5),index=date_list)

s1


> 2016-09-01 0.192942
2016-09-10 0.821524
2017-09-01 0.503137
2017-09-20 0.896424
2017-10-01 0.267054
dtype: float64




1
s1.values


> array([0.19294241, 0.82152442, 0.5031373 , 0.8964235 , 0.26705384])




1
s1.index


> DatetimeIndex([‘2016-09-01’, ‘2016-09-10’, ‘2017-09-01’, ‘2017-09-20’,
‘2017-10-01’],
dtype=’datetime64[ns]’, freq=None)



#### 根据索引位置访问


1
s1[0]


> 0.19294241160815995



#### 根据时间序列访问


1
s1[datetime(2016,9,1)]


> 0.19294241160815995



#### 根据时间访问


1
s1['2016-09-01']


> 0.19294241160815995




1
s1['20160901']


> 0.19294241160815995



#### 返回整月数据


1
s1['2016-09']


> 2016-09-01 0.192942
2016-09-10 0.821524
dtype: float64



#### 返回整年数据


1
s1['2016']


> 2016-09-01 0.192942
2016-09-10 0.821524
dtype: float64



#### 利用pd.date_range() 生成时间序列


1
2
date_list_new = pd.date_range('2016-01-01', periods=100)
date_list_new


> DatetimeIndex([‘2016-01-01’, ‘2016-01-02’, ‘2016-01-03’, ‘2016-01-04’,
‘2016-01-05’, ‘2016-01-06’, ‘2016-01-07’, ‘2016-01-08’,
‘2016-01-09’, ‘2016-01-10’, ‘2016-01-11’, ‘2016-01-12’,
‘2016-01-13’, ‘2016-01-14’, ‘2016-01-15’, ‘2016-01-16’,
‘2016-01-17’, ‘2016-01-18’, ‘2016-01-19’, ‘2016-01-20’,
‘2016-01-21’, ‘2016-01-22’, ‘2016-01-23’, ‘2016-01-24’,
‘2016-01-25’, ‘2016-01-26’, ‘2016-01-27’, ‘2016-01-28’,
‘2016-01-29’, ‘2016-01-30’, ‘2016-01-31’, ‘2016-02-01’,
‘2016-02-02’, ‘2016-02-03’, ‘2016-02-04’, ‘2016-02-05’,
‘2016-02-06’, ‘2016-02-07’, ‘2016-02-08’, ‘2016-02-09’,
‘2016-02-10’, ‘2016-02-11’, ‘2016-02-12’, ‘2016-02-13’,
‘2016-02-14’, ‘2016-02-15’, ‘2016-02-16’, ‘2016-02-17’,
‘2016-02-18’, ‘2016-02-19’, ‘2016-02-20’, ‘2016-02-21’,
‘2016-02-22’, ‘2016-02-23’, ‘2016-02-24’, ‘2016-02-25’,
‘2016-02-26’, ‘2016-02-27’, ‘2016-02-28’, ‘2016-02-29’,
‘2016-03-01’, ‘2016-03-02’, ‘2016-03-03’, ‘2016-03-04’,
‘2016-03-05’, ‘2016-03-06’, ‘2016-03-07’, ‘2016-03-08’,
‘2016-03-09’, ‘2016-03-10’, ‘2016-03-11’, ‘2016-03-12’,
‘2016-03-13’, ‘2016-03-14’, ‘2016-03-15’, ‘2016-03-16’,
‘2016-03-17’, ‘2016-03-18’, ‘2016-03-19’, ‘2016-03-20’,
‘2016-03-21’, ‘2016-03-22’, ‘2016-03-23’, ‘2016-03-24’,
‘2016-03-25’, ‘2016-03-26’, ‘2016-03-27’, ‘2016-03-28’,
‘2016-03-29’, ‘2016-03-30’, ‘2016-03-31’, ‘2016-04-01’,
‘2016-04-02’, ‘2016-04-03’, ‘2016-04-04’, ‘2016-04-05’,
‘2016-04-06’, ‘2016-04-07’, ‘2016-04-08’, ‘2016-04-09’],
dtype=’datetime64[ns]’, freq=’D’)



#### 从指定时间的(周一)开始生成时间序列


1
2
date_list_new = pd.date_range('2016-01-01', periods=10, freq='W-MON')
date_list_new


> DatetimeIndex([‘2016-01-04’, ‘2016-01-11’, ‘2016-01-18’, ‘2016-01-25’,
‘2016-02-01’, ‘2016-02-08’, ‘2016-02-15’, ‘2016-02-22’,
‘2016-02-29’, ‘2016-03-07’],
dtype=’datetime64[ns]’, freq=’W-MON’)




1
2
date_list_new = pd.date_range('2016-01-01', periods=10, freq='5H')
date_list_new


> DatetimeIndex([‘2016-01-01 00:00:00’, ‘2016-01-01 05:00:00’,
‘2016-01-01 10:00:00’, ‘2016-01-01 15:00:00’,
‘2016-01-01 20:00:00’, ‘2016-01-02 01:00:00’,
‘2016-01-02 06:00:00’, ‘2016-01-02 11:00:00’,
‘2016-01-02 16:00:00’, ‘2016-01-02 21:00:00’],
dtype=’datetime64[ns]’, freq=’5H’)




1
2
s2 = Series(np.random.rand(10), index=date_list_new)
s2


> 2016-01-01 00:00:00 0.312884
2016-01-01 05:00:00 0.337516
2016-01-01 10:00:00 0.911010
2016-01-01 15:00:00 0.055390
2016-01-01 20:00:00 0.810770
2016-01-02 01:00:00 0.313263
2016-01-02 06:00:00 0.025229
2016-01-02 11:00:00 0.229192
2016-01-02 16:00:00 0.509811
2016-01-02 21:00:00 0.169681
Freq: 5H, dtype: float64





### 时间序列数据的采样和画图


1
2
3
4
5
6
7
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from datetime import datetime

t_range = pd.date_range('2016-01-01','2016-12-31')
t_range


> DatetimeIndex([‘2016-01-01’, ‘2016-01-02’, ‘2016-01-03’, ‘2016-01-04’,
‘2016-01-05’, ‘2016-01-06’, ‘2016-01-07’, ‘2016-01-08’,
‘2016-01-09’, ‘2016-01-10’,

‘2016-12-22’, ‘2016-12-23’, ‘2016-12-24’, ‘2016-12-25’,
‘2016-12-26’, ‘2016-12-27’, ‘2016-12-28’, ‘2016-12-29’,
‘2016-12-30’, ‘2016-12-31’],
dtype=’datetime64[ns]’, length=366, freq=’D’)




1
2
3
s1 = Series(np.random.rand(len(t_range)), index=t_range)

s1['2016-01'].mean()


> 0.48349408856972315



#### 按月份采样,计算平均值


1
2
3
s1_month = s1.resample('M').mean()

s1_month.index


> DatetimeIndex([‘2016-01-31’, ‘2016-02-29’, ‘2016-03-31’, ‘2016-04-30’,
‘2016-05-31’, ‘2016-06-30’, ‘2016-07-31’, ‘2016-08-31’,
‘2016-09-30’, ‘2016-10-31’, ‘2016-11-30’, ‘2016-12-31’],
dtype=’datetime64[ns]’, freq=’M’)



#### 按小时采样,并(向前)填充数据


1
s1.resample('H').ffill()


> 2016-01-01 00:00:00 0.196584
2016-01-01 01:00:00 0.196584
2016-01-01 02:00:00 0.196584
2016-01-01 03:00:00 0.196584
2016-01-01 04:00:00 0.196584
2016-01-01 05:00:00 0.196584
2016-01-01 06:00:00 0.196584
2016-01-01 07:00:00 0.196584
2016-01-01 08:00:00 0.196584
2016-01-01 09:00:00 0.196584
2016-01-01 10:00:00 0.196584
2016-01-01 11:00:00 0.196584
2016-01-01 12:00:00 0.196584
2016-01-01 13:00:00 0.196584
2016-01-01 14:00:00 0.196584
2016-01-01 15:00:00 0.196584
2016-01-01 16:00:00 0.196584
2016-01-01 17:00:00 0.196584
2016-01-01 18:00:00 0.196584
2016-01-01 19:00:00 0.196584
2016-01-01 20:00:00 0.196584
2016-01-01 21:00:00 0.196584
2016-01-01 22:00:00 0.196584
2016-01-01 23:00:00 0.196584
2016-01-02 00:00:00 0.209433
2016-01-02 01:00:00 0.209433
2016-01-02 02:00:00 0.209433
2016-01-02 03:00:00 0.209433
2016-01-02 04:00:00 0.209433
2016-01-02 05:00:00 0.209433

2016-12-29 19:00:00 0.287915
2016-12-29 20:00:00 0.287915
2016-12-29 21:00:00 0.287915
2016-12-29 22:00:00 0.287915
2016-12-29 23:00:00 0.287915
2016-12-30 00:00:00 0.023052
2016-12-30 01:00:00 0.023052
2016-12-30 02:00:00 0.023052
2016-12-30 03:00:00 0.023052
2016-12-30 04:00:00 0.023052
2016-12-30 05:00:00 0.023052
2016-12-30 06:00:00 0.023052
2016-12-30 07:00:00 0.023052
2016-12-30 08:00:00 0.023052
2016-12-30 09:00:00 0.023052
2016-12-30 10:00:00 0.023052
2016-12-30 11:00:00 0.023052
2016-12-30 12:00:00 0.023052
2016-12-30 13:00:00 0.023052
2016-12-30 14:00:00 0.023052
2016-12-30 15:00:00 0.023052
2016-12-30 16:00:00 0.023052
2016-12-30 17:00:00 0.023052
2016-12-30 18:00:00 0.023052
2016-12-30 19:00:00 0.023052
2016-12-30 20:00:00 0.023052
2016-12-30 21:00:00 0.023052
2016-12-30 22:00:00 0.023052
2016-12-30 23:00:00 0.023052
2016-12-31 00:00:00 0.515360
Freq: H, Length: 8761, dtype: float64



#### 基于时间序列画图


1
2
3
t_range = pd.date_range('2016-01-01','2016-12-31',freq='H')

t_range


> DatetimeIndex([‘2016-01-01 00:00:00’, ‘2016-01-01 01:00:00’,
‘2016-01-01 02:00:00’, ‘2016-01-01 03:00:00’,
‘2016-01-01 04:00:00’, ‘2016-01-01 05:00:00’,
‘2016-01-01 06:00:00’, ‘2016-01-01 07:00:00’,
‘2016-01-01 08:00:00’, ‘2016-01-01 09:00:00’,

‘2016-12-30 15:00:00’, ‘2016-12-30 16:00:00’,
‘2016-12-30 17:00:00’, ‘2016-12-30 18:00:00’,
‘2016-12-30 19:00:00’, ‘2016-12-30 20:00:00’,
‘2016-12-30 21:00:00’, ‘2016-12-30 22:00:00’,
‘2016-12-30 23:00:00’, ‘2016-12-31 00:00:00’],
dtype=’datetime64[ns]’, length=8761, freq=’H’)




1
2
3
4
5
stock_df = DataFrame(index=t_range)
stock_df['BABA'] = np.random.randint(80,160, size=len(t_range))
stock_df['TENCENT'] = np.random.randint(30,50, size=len(t_range))

stock_df.head()





































BABA TENCENT
2016-01-01 00:00:00 134 43
2016-01-01 01:00:00 98 35
2016-01-01 02:00:00 150 32
2016-01-01 03:00:00 96 37
2016-01-01 04:00:00 140 40
1
stock_df.plot()

1
2
import matplotlib.pyplot as plt
plt.show()
1
2
3
4
5
weekly_df = DataFrame()
weekly_df['BABA'] = stock_df['BABA'].resample('W').mean()
weekly_df['TENCENT'] = stock_df['TENCENT'].resample('W').mean()

weekly_df.head()



































BABA TENCENT
2016-01-03 118.777778 38.638889
2016-01-10 120.690476 40.077381
2016-01-17 120.678571 39.452381
2016-01-24 119.833333 39.386905
2016-01-31 119.577381 39.416667
1
weekly_df.plot()

数据分箱Binning

对list进行分箱

1
2
3
4
5
6
7
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

score_list = np.random.randint(25,100,size=20)

score_list

array([95, 75, 43, 49, 26, 95, 28, 55, 93, 97, 91, 46, 39, 88, 32, 97, 93,
80, 57, 70])

1
2
3
4
5
bins = [0,59,70,80,100]

score_cat = pd.cut(score_list, bins)

pd.value_counts(score_cat)

(0, 59] 9
(80, 100] 8
(70, 80] 2
(59, 70] 1
dtype: int64

对DataFrame进行分箱

1
2
3
4
df = DataFrame()
df['score'] = score_list

df

























































































score
0 95
1 75
2 43
3 49
4 26
5 95
6 28
7 55
8 93
9 97
10 91
11 46
12 39
13 88
14 32
15 97
16 93
17 80
18 57
19 70

利用 pd.util.testing.rands() 随机生成指定位数的字符串

1
2
3
 df['student'] = [pd.util.testing.rands(3) for i in range(20)]

df














































































































score student
0 95 PqQ
1 75 ds2
2 43 iUV
3 49 JjY
4 26 FK3
5 95 b9b
6 28 DyV
7 55 103
8 93 RUC
9 97 VqI
10 91 sjo
11 46 cCl
12 39 TeO
13 88 xap
14 32 GfG
15 97 FNg
16 93 pk0
17 80 ziG
18 57 5k0
19 70 ozP

通过 pd.cut() 的 labels 参数,对数据进行分类打标签

1
2
3
df['Categories'] = pd.cut(df['score'], bins, labels=['Low', 'OK', 'Good', 'Great'])

df



































































































































score student Categories
0 95 PqQ Great
1 75 ds2 Good
2 43 iUV Low
3 49 JjY Low
4 26 FK3 Low
5 95 b9b Great
6 28 DyV Low
7 55 103 Low
8 93 RUC Great
9 97 VqI Great
10 91 sjo Great
11 46 cCl Low
12 39 TeO Low
13 88 xap Great
14 32 GfG Low
15 97 FNg Great
16 93 pk0 Great
17 80 ziG Good
18 57 5k0 Low
19 70 ozP OK

数据分组技术GroupBy

1
2
3
4
5
6
7
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

df = pd.read_csv('city_weather.csv')

df.head()















































date city temperature wind
0 03/01/2016 BJ 8 5
1 17/01/2016 BJ 12 2
2 31/01/2016 BJ 19 2
3 14/02/2016 BJ -3 3
4 28/02/2016 BJ 19 2
1
2
3
g = df.groupby(df['city'])

g

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x119b0d6d8>

获取分组内容

1
2
3
4
5
g.groups

df_bj = g.get_group('BJ')

df_bj






















































date city temperature wind
0 03/01/2016 BJ 8 5
1 17/01/2016 BJ 12 2
2 31/01/2016 BJ 19 2
3 14/02/2016 BJ -3 3
4 28/02/2016 BJ 19 2
5 13/03/2016 BJ 5 3
1
df_bj.mean()

temperature 10.000000
wind 2.833333
dtype: float64

对分组取平均值

1
g.mean()



































temperature wind
city
BJ 10.000 2.833333
GZ 8.750 4.000000
SH 4.625 3.625000
SZ 5.000 2.500000

对分组取最小值

1
g.min()









































date temperature wind
city
BJ 03/01/2016 -3 2
GZ 14/08/2016 -1 2
SH 03/07/2016 -10 2
SZ 11/09/2016 -10 1

groupby对象 => list => dict

1
dict(list(g))['BJ']






















































date city temperature wind
0 03/01/2016 BJ 8 5
1 17/01/2016 BJ 12 2
2 31/01/2016 BJ 19 2
3 14/02/2016 BJ -3 3
4 28/02/2016 BJ 19 2
5 13/03/2016 BJ 5 3

for循环遍历groupby变量

1
2
3
for name, group_df in g:
print(name)
print(group_df)

BJ
date city temperature wind
0 03/01/2016 BJ 8 5
1 17/01/2016 BJ 12 2
2 31/01/2016 BJ 19 2
3 14/02/2016 BJ -3 3
4 28/02/2016 BJ 19 2
5 13/03/2016 BJ 5 3
GZ
date city temperature wind
14 17/07/2016 GZ 10 2
15 31/07/2016 GZ -1 5
16 14/08/2016 GZ 1 5
17 28/08/2016 GZ 25 4
SH
date city temperature wind
6 27/03/2016 SH -4 4
7 10/04/2016 SH 19 3
8 24/04/2016 SH 20 3
9 08/05/2016 SH 17 3
10 22/05/2016 SH 4 2
11 05/06/2016 SH -10 4
12 19/06/2016 SH 0 5
13 03/07/2016 SH -9 5
SZ
date city temperature wind
18 11/09/2016 SZ 20 1
19 25/09/2016 SZ -10 4

数据聚合技术Aggregation

1
2
3
4
5
6
7
8
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

df = pd.read_csv('city_weather.csv')
g = df.groupby('city')

g

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x11b921cc0>

1
g.max()









































date temperature wind
city
BJ 31/01/2016 19 5
GZ 31/07/2016 25 5
SH 27/03/2016 20 5
SZ 25/09/2016 20 4
1
g.count()









































date temperature wind
city
BJ 6 6 6
GZ 4 4 4
SH 8 8 8
SZ 2 2 2
1
g.describe()




























































































































temperature wind
count mean std min 25% 50% 75% max count mean std min 25% 50% 75% max
city
BJ 6.0 10.000 8.532292 -3.0 5.75 10.0 17.25 19.0 6.0 2.833333 1.169045 2.0 2.00 2.5 3.00 5.0
GZ 4.0 8.750 11.842719 -1.0 0.50 5.5 13.75 25.0 4.0 4.000000 1.414214 2.0 3.50 4.5 5.00 5.0
SH 8.0 4.625 12.489281 -10.0 -5.25 2.0 17.50 20.0 8.0 3.625000 1.060660 2.0 3.00 3.5 4.25 5.0
SZ 2.0 5.000 21.213203 -10.0 -2.50 5.0 12.50 20.0 2.0 2.500000 2.121320 1.0 1.75 2.5 3.25 4.0
1
g.agg('min')









































date temperature wind
city
BJ 03/01/2016 -3 2
GZ 14/08/2016 -1 2
SH 03/07/2016 -10 2
SZ 11/09/2016 -10 1

自定义聚合函数

1
2
def foo(attr):
    """Custom aggregation: return the range (max - min) of a grouped column.

    *attr* is a Series handed in by ``GroupBy.agg``; works for any dtype
    supporting ``max``/``min`` and subtraction.
    """
    # NOTE: extraction had dropped the body's indentation; restored here.
    return attr.max() - attr.min()
1
g.agg(foo)



































temperature wind
city
BJ 22 3
GZ 26 3
SH 30 3
SZ 30 3

对多个列进行groupby

1
2
3
g_new = df.groupby(['city','wind'])

g_new

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x11ba83208>

1
g_new.groups

{(‘BJ’, 2): Int64Index([1, 2, 4], dtype=’int64’),
(‘BJ’, 3): Int64Index([3, 5], dtype=’int64’),
(‘BJ’, 5): Int64Index([0], dtype=’int64’),
(‘GZ’, 2): Int64Index([14], dtype=’int64’),
(‘GZ’, 4): Int64Index([17], dtype=’int64’),
(‘GZ’, 5): Int64Index([15, 16], dtype=’int64’),
(‘SH’, 2): Int64Index([10], dtype=’int64’),
(‘SH’, 3): Int64Index([7, 8, 9], dtype=’int64’),
(‘SH’, 4): Int64Index([6, 11], dtype=’int64’),
(‘SH’, 5): Int64Index([12, 13], dtype=’int64’),
(‘SZ’, 1): Int64Index([18], dtype=’int64’),
(‘SZ’, 4): Int64Index([19], dtype=’int64’)}

1
g_new.get_group(('BJ',2))

































date city temperature wind
1 17/01/2016 BJ 12 2
2 31/01/2016 BJ 19 2
4 28/02/2016 BJ 19 2

for循环遍历多列groupby变量

1
2
3
for (name_1, name_2), group in g_new:
print(name_1,name_2)
print(group)

BJ 2
date city temperature wind
1 17/01/2016 BJ 12 2
2 31/01/2016 BJ 19 2
4 28/02/2016 BJ 19 2
BJ 3
date city temperature wind
3 14/02/2016 BJ -3 3
5 13/03/2016 BJ 5 3
BJ 5
date city temperature wind
0 03/01/2016 BJ 8 5
GZ 2
date city temperature wind
14 17/07/2016 GZ 10 2
GZ 4
date city temperature wind
17 28/08/2016 GZ 25 4
GZ 5
date city temperature wind
15 31/07/2016 GZ -1 5
16 14/08/2016 GZ 1 5
SH 2
date city temperature wind
10 22/05/2016 SH 4 2
SH 3
date city temperature wind
7 10/04/2016 SH 19 3
8 24/04/2016 SH 20 3
9 08/05/2016 SH 17 3
SH 4
date city temperature wind
6 27/03/2016 SH -4 4
11 05/06/2016 SH -10 4
SH 5
date city temperature wind
12 19/06/2016 SH 0 5
13 03/07/2016 SH -9 5
SZ 1
date city temperature wind
18 11/09/2016 SZ 20 1
SZ 4
date city temperature wind
19 25/09/2016 SZ -10 4

透视表

1
2
3
4
5
6
7
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

df = pd.read_excel('sales-funnel.xlsx')

df











































































































































































































Account Name Rep Manager Product Quantity Price Status
0 714466 Trantow-Barrows Craig Booker Debra Henley CPU 1 30000 presented
1 714466 Trantow-Barrows Craig Booker Debra Henley Software 1 10000 presented
2 714466 Trantow-Barrows Craig Booker Debra Henley Maintenance 2 5000 pending
3 737550 Fritsch, Russel and Anderson Craig Booker Debra Henley CPU 1 35000 declined
4 146832 Kiehn-Spinka Daniel Hilton Debra Henley CPU 2 65000 won
5 218895 Kulas Inc Daniel Hilton Debra Henley CPU 2 40000 pending
6 218895 Kulas Inc Daniel Hilton Debra Henley Software 1 10000 presented
7 412290 Jerde-Hilpert John Smith Debra Henley Maintenance 2 5000 pending
8 740150 Barton LLC John Smith Debra Henley CPU 1 35000 declined
9 141962 Herman LLC Cedric Moss Fred Anderson CPU 2 65000 won
10 163416 Purdy-Kunde Cedric Moss Fred Anderson CPU 1 30000 presented
11 239344 Stokes LLC Cedric Moss Fred Anderson Maintenance 1 5000 pending
12 239344 Stokes LLC Cedric Moss Fred Anderson Software 1 10000 presented
13 307599 Kassulke, Ondricka and Metz Wendy Yule Fred Anderson Maintenance 3 7000 won
14 688981 Keeling LLC Wendy Yule Fred Anderson CPU 5 100000 won
15 729833 Koepp Ltd Wendy Yule Fred Anderson CPU 2 65000 declined
16 729833 Koepp Ltd Wendy Yule Fred Anderson Monitor 2 5000 presented
1
df.size

136

1
len(df)

17

1
pd.pivot_table(df, index=['Name','Rep','Manager'])





















































































































Account Price Quantity
Name Rep Manager
Barton LLC John Smith Debra Henley 740150 35000 1.000000
Fritsch, Russel and Anderson Craig Booker Debra Henley 737550 35000 1.000000
Herman LLC Cedric Moss Fred Anderson 141962 65000 2.000000
Jerde-Hilpert John Smith Debra Henley 412290 5000 2.000000
Kassulke, Ondricka and Metz Wendy Yule Fred Anderson 307599 7000 3.000000
Keeling LLC Wendy Yule Fred Anderson 688981 100000 5.000000
Kiehn-Spinka Daniel Hilton Debra Henley 146832 65000 2.000000
Koepp Ltd Wendy Yule Fred Anderson 729833 35000 2.000000
Kulas Inc Daniel Hilton Debra Henley 218895 25000 1.500000
Purdy-Kunde Cedric Moss Fred Anderson 163416 30000 1.000000
Stokes LLC Cedric Moss Fred Anderson 239344 7500 1.000000
Trantow-Barrows Craig Booker Debra Henley 714466 15000 1.333333
1
pd.pivot_table(df, index=['Manager','Rep'], values=['Price', 'Quantity'], aggfunc='sum')












































Price Quantity
Manager Rep
Debra Henley Craig Booker 80000 5
Daniel Hilton 115000 5
John Smith 40000 3
Fred Anderson Cedric Moss 110000 5
Wendy Yule 177000 12
1
pd.pivot_table(df, index=['Manager','Rep'], values=['Price'], columns=['Product'], fill_value=0, aggfunc='sum')































































Price
Product CPU Maintenance Monitor Software
Manager Rep
Debra Henley Craig Booker 65000 5000 0 10000
Daniel Hilton 105000 0 0 10000
John Smith 35000 5000 0 0
Fred Anderson Cedric Moss 95000 5000 0 10000
Wendy Yule 165000 7000 5000 0

分组和透视功能

1
2
3
4
5
6
7
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

df = pd.read_csv('usa_flights.csv')

df.head()











































































































flight_date unique_carrier flight_num origin dest arr_delay cancelled distance carrier_delay weather_delay late_aircraft_delay nas_delay security_delay actual_elapsed_time
0 02/01/2015 0:00 AA 1 JFK LAX -19.0 0 2475 NaN NaN NaN NaN NaN 381.0
1 03/01/2015 0:00 AA 1 JFK LAX -39.0 0 2475 NaN NaN NaN NaN NaN 358.0
2 04/01/2015 0:00 AA 1 JFK LAX -12.0 0 2475 NaN NaN NaN NaN NaN 385.0
3 05/01/2015 0:00 AA 1 JFK LAX -8.0 0 2475 NaN NaN NaN NaN NaN 389.0
4 06/01/2015 0:00 AA 1 JFK LAX 25.0 0 2475 0.0 0.0 0.0 25.0 0.0 424.0
1
df.shape

(201664, 14)

1
df.tail()











































































































flight_date unique_carrier flight_num origin dest arr_delay cancelled distance carrier_delay weather_delay late_aircraft_delay nas_delay security_delay actual_elapsed_time
201659 10/01/2015 0:00 NK 188 OAK LAS -16.0 0 407 NaN NaN NaN NaN NaN 77.0
201660 11/01/2015 0:00 NK 188 OAK LAS -4.0 0 407 NaN NaN NaN NaN NaN 87.0
201661 12/01/2015 0:00 NK 188 OAK LAS -7.0 0 407 NaN NaN NaN NaN NaN 82.0
201662 13/01/2015 0:00 NK 188 OAK LAS 23.0 0 407 3.0 0.0 0.0 20.0 0.0 103.0
201663 14/01/2015 0:00 NK 188 OAK LAS -7.0 0 407 NaN NaN NaN NaN NaN 82.0

获取延误时间Top 10的航班

1
df.sort_values('arr_delay', ascending=False)[:10]
































































































































































































flight_date unique_carrier flight_num origin dest arr_delay cancelled distance carrier_delay weather_delay late_aircraft_delay nas_delay security_delay actual_elapsed_time
11073 11/01/2015 0:00 AA 1595 AUS DFW 1444.0 0 190 1444.0 0.0 0.0 0.0 0.0 59.0
10214 13/01/2015 0:00 AA 1487 OMA DFW 1392.0 0 583 1392.0 0.0 0.0 0.0 0.0 117.0
12430 03/01/2015 0:00 AA 1677 MEM DFW 1384.0 0 432 1380.0 0.0 0.0 4.0 0.0 104.0
8443 04/01/2015 0:00 AA 1279 OMA DFW 1237.0 0 583 1222.0 0.0 15.0 0.0 0.0 102.0
10328 05/01/2015 0:00 AA 1495 EGE DFW 1187.0 0 721 1019.0 0.0 168.0 0.0 0.0 127.0
36570 04/01/2015 0:00 DL 1435 MIA MSP 1174.0 0 1501 1174.0 0.0 0.0 0.0 0.0 231.0
36495 04/01/2015 0:00 DL 1367 ROC ATL 1138.0 0 749 1112.0 0.0 0.0 26.0 0.0 171.0
59072 14/01/2015 0:00 DL 1687 SAN MSP 1084.0 0 1532 1070.0 0.0 0.0 14.0 0.0 240.0
32173 05/01/2015 0:00 AA 970 LAS LAX 1042.0 0 236 1033.0 0.0 9.0 0.0 0.0 66.0
56488 12/01/2015 0:00 DL 2117 ATL COS 1016.0 0 1184 1016.0 0.0 0.0 0.0 0.0 193.0

计算延误和没有延误所占比例

1
df['cancelled'].value_counts()

0 196873
1 4791
Name: cancelled, dtype: int64

1
2
3
df['delayed'] = df['arr_delay'].apply(lambda x:x>0)

df.head()

















































































































flight_date unique_carrier flight_num origin dest arr_delay cancelled distance carrier_delay weather_delay late_aircraft_delay nas_delay security_delay actual_elapsed_time delayed
0 02/01/2015 0:00 AA 1 JFK LAX -19.0 0 2475 NaN NaN NaN NaN NaN 381.0 False
1 03/01/2015 0:00 AA 1 JFK LAX -39.0 0 2475 NaN NaN NaN NaN NaN 358.0 False
2 04/01/2015 0:00 AA 1 JFK LAX -12.0 0 2475 NaN NaN NaN NaN NaN 385.0 False
3 05/01/2015 0:00 AA 1 JFK LAX -8.0 0 2475 NaN NaN NaN NaN NaN 389.0 False
4 06/01/2015 0:00 AA 1 JFK LAX 25.0 0 2475 0.0 0.0 0.0 25.0 0.0 424.0 True
1
2
3
delay_data = df['delayed'].value_counts()

delay_data

False 103037
True 98627
Name: delayed, dtype: int64

1
delay_data[1]/(delay_data[0] + delay_data[1])

0.4890659711202793

每个航空公司延误情况

1
2
3
delay_group = df.groupby(['unique_carrier','delayed'])

delay_group

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x118b616d8>

1
delay_group.size()

unique_carrier delayed
AA False 8912
True 9841
AS False 3527
True 2104
B6 False 4832
True 4401
DL False 17719
True 9803
EV False 10596
True 11371
F9 False 1103
True 1848
HA False 1351
True 1354
MQ False 4692
True 8060
NK False 1550
True 2133
OO False 9977
True 10804
UA False 7885
True 8624
US False 7850
True 6353
VX False 1254
True 781
WN False 21789
True 21150
dtype: int64

1
2
3
df_delay = delay_group.size().unstack()

df_delay





















































































delayed False True
unique_carrier
AA 8912 9841
AS 3527 2104
B6 4832 4401
DL 17719 9803
EV 10596 11371
F9 1103 1848
HA 1351 1354
MQ 4692 8060
NK 1550 2133
OO 9977 10804
UA 7885 8624
US 7850 6353
VX 1254 781
WN 21789 21150
1
2
3
import matplotlib.pyplot as plt

df_delay.plot(kind='barh',stacked=True,figsize=[16,6],colormap='winter')

透视表

1
2
3
flight_by_carrier = df.pivot_table(index=['flight_date'],columns=['unique_carrier'],values=['flight_num'],aggfunc='count')

flight_by_carrier.head()
































































































































flight_num
unique_carrier AA AS B6 DL EV F9 HA MQ NK OO UA US VX WN
flight_date
02/01/2015 0:00 1545 477 759 2271 1824 254 224 1046 287 1763 1420 1177 176 3518
03/01/2015 0:00 1453 449 711 2031 1744 192 202 937 285 1681 1233 1028 160 3328
04/01/2015 0:00 1534 458 759 2258 1833 249 206 1027 284 1731 1283 1158 169 3403
05/01/2015 0:00 1532 433 754 2212 1811 264 209 1039 288 1737 1432 1157 174 3506
06/01/2015 0:00 1400 415 692 2054 1686 249 202 966 279 1527 1294 1003 152 3396

Matplotlib

简单绘图之plot

1
2
3
4
5
import numpy as np
import matplotlib.pyplot as plt

a = [1,2,3]
plt.plot(a)

plt.show() 开始绘图

1
plt.show()
1
2
b = [4,5,6]
plt.plot(a,b)

魔法命令,可直接显示图像

1
%matplotlib inline

显示运行时间

1
%timeit np.arange(10)

594 ns ± 9.52 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)

1
plt.plot(a,b,'*')

1
plt.plot(a,b,'r--')

一张图中画多条线

1
2
3
4
c = [10,8,6]
d = [1,8,3]

plt.plot(a,b,'r--',c,d,'b*')

画三角函数,并设置xlabel,ylabel,title

1
2
t = np.arange(0.0,2.0,0.1)
t.size

20

1
2
s = np.sin(t * np.pi)
s.size

20

1
s

array([ 0.00000000e+00, 3.09016994e-01, 5.87785252e-01, 8.09016994e-01,
9.51056516e-01, 1.00000000e+00, 9.51056516e-01, 8.09016994e-01,
5.87785252e-01, 3.09016994e-01, 1.22464680e-16, -3.09016994e-01,
-5.87785252e-01, -8.09016994e-01, -9.51056516e-01, -1.00000000e+00,
-9.51056516e-01, -8.09016994e-01, -5.87785252e-01, -3.09016994e-01])

1
2
3
4
5
6
plt.plot(t,s,'r--',label='aaaa')
plt.plot(t*2,s,'--',label='bbbb')
plt.xlabel('this is x')
plt.ylabel('this is y')
plt.title('this is a demo')
plt.legend()

简单绘图之subplot

1
2
3
4
5
6
7
8
9
10
11
12
13
14
import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(0,5)
y1 = np.sin(np.pi*x)
y2 = np.sin(np.pi*x*2)

plt.plot(x, y1, 'b--', label='sin(pi*x)')
plt.ylabel('y1 value')
plt.plot(x, y2, 'r--', label='sin(pi*2x)')
plt.ylabel('y2 value')
plt.xlabel('x value')
plt.title('this is x-y value')
plt.legend()

1
2
3
4
5
6
7
8
9
10
11
plt.subplot(2,2,1)
plt.plot(x, y1, 'b--')
plt.ylabel('y1')
plt.subplot(2,2,2)
plt.plot(x, y2, 'r--')
plt.ylabel('y2')
plt.xlabel('x')
plt.subplot(2,2,3)
plt.plot(x, y1, 'r*')
plt.subplot(2,2,4)
plt.plot(x, y1, 'b*')

subplots

1
figure, ax = plt.subplots(2,2)

1
2
3
4
ax[0][0].plot(x,y1)
ax[0][1].plot(x,y2)

figure

Series

1
2
3
4
5
6
7
import numpy as np
import pandas as pd
from pandas import Series
import matplotlib.pyplot as plt

s1 = Series(np.random.randn(100).cumsum())
s2 = Series(np.random.randn(100).cumsum())
1
s1.plot(style='--')

1
2
3
s1.plot(kind='line', grid=True, label='S1', title='this is Series')
s2.plot(label='S2', style='--')
plt.legend()

1
2
3
figure, ax = plt.subplots(2,1)

figure

1
2
3
s1[0:10].plot(ax=ax[0],label='S1',kind='bar')
s2.plot(ax=ax[1],label='S2')
figure

DataFrame

1
2
3
4
5
6
7
8
9
10
11
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt

df = DataFrame(
np.random.randint(1,10,40).reshape(10,4),
columns=['A','B','C','D']
)

df


















































































A B C D
0 2 1 9 8
1 2 3 9 1
2 2 5 9 2
3 4 1 8 6
4 1 9 8 9
5 5 4 6 5
6 7 2 1 8
7 1 4 1 8
8 8 1 2 9
9 2 4 5 4

横向柱状图

1
df.plot(kind='barh')

Stacked柱状图

1
df.plot(kind='bar',stacked=True)

区域填充

1
df.plot(kind='area')

指定index画图

1
2
3
a = df.iloc[5]

a.plot()

按index画图(方法1)

1
2
3
4
for i in df.index:
df.iloc[i].plot(label=str(i))

plt.legend()

按index画图(方法2)

1
df.T.plot()

按column画图

1
df['A'].plot()

直方图

1
2
3
4
5
6
7
8
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt

s = Series(np.random.randn(1000))

plt.hist(s, rwidth=0.9)

(array([ 1., 9., 42., 129., 243., 257., 198., 93., 24., 4.]),
array([-3.75513064, -3.04405274, -2.33297485, -1.62189695, -0.91081905,
-0.19974115, 0.51133675, 1.22241465, 1.93349255, 2.64457045,
3.35564835]),
)

1
plt.hist(s, rwidth=0.9, bins=20, color='r')

(array([ 1., 0., 1., 8., 15., 27., 46., 83., 103., 140., 143.,
114., 120., 78., 61., 32., 15., 9., 3., 1.]),
array([-3.75513064, -3.39959169, -3.04405274, -2.68851379, -2.33297485,
-1.9774359 , -1.62189695, -1.266358 , -0.91081905, -0.5552801 ,
-0.19974115, 0.1557978 , 0.51133675, 0.8668757 , 1.22241465,
1.5779536 , 1.93349255, 2.2890315 , 2.64457045, 3.0001094 ,
3.35564835]),
)

密度图

1
s.plot(kind='kde')

Seaborn

散点图

方法1:matplotlib

1
2
3
4
5
6
7
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

iris = pd.read_csv('iris.csv')
iris.head()





















































SepalLength SepalWidth PetalLength PetalWidth Name
0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa
1
iris.Name.unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

1
2
3
color_map = dict(zip(iris.Name.unique(),['blue','green','red']))

color_map

{'Iris-setosa': 'blue', 'Iris-versicolor': 'green', 'Iris-virginica': 'red'}

1
2
3
4
5
6
7
8
9
for species, group in iris.groupby('Name'):
plt.scatter(group['PetalLength'], group['SepalLength'],
color=color_map[species],
alpha=0.3, edgecolor=None,
label=species)

plt.legend(frameon=True, title='Name')
plt.xlabel('petalLength')
plt.ylabel('sepalLength')

方法2:Seaborn

1
sns.lmplot('PetalLength','SepalLength', iris, hue='Name', fit_reg=False)

直方图(matplotlib)

1
2
s1 = Series(np.random.randn(1000))
plt.hist(s1)

(array([ 5., 36., 96., 196., 278., 227., 111., 37., 9., 5.]),
array([-3.17567133, -2.48638745, -1.79710358, -1.1078197 , -0.41853582,
0.27074806, 0.96003194, 1.64931581, 2.33859969, 3.02788357,
3.71716745]),
)

密度图(matplotlib)

1
s1.plot(kind='kde')

直方图(Seaborn)

1
sns.distplot(s1, hist=True, kde=False, rug=True)

密度图(Seaborn)

1
sns.kdeplot(s1, shade=True, color='r')

轴须图(Seaborn)

1
sns.rugplot(s1)

热力图(Seaborn)

1
2
3
4
5
6
7
8
9
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import seaborn as sns

df = sns.load_dataset('flights')

df.head()









































year month passengers
0 1949 January 112
1 1949 February 118
2 1949 March 132
3 1949 April 129
4 1949 May 121
1
df.shape

(144, 3)

生成透视表

1
2
df = df.pivot(index='month', columns='year', values='passengers')
df























































































































































































































year 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960
month
January 112 115 145 171 196 204 242 284 315 340 360 417
February 118 126 150 180 196 188 233 277 301 318 342 391
March 132 141 178 193 236 235 267 317 356 362 406 419
April 129 135 163 181 235 227 269 313 348 348 396 461
May 121 125 172 183 229 234 270 318 355 363 420 472
June 135 149 178 218 243 264 315 374 422 435 472 535
July 148 170 199 230 264 302 364 413 465 491 548 622
August 148 170 199 242 272 293 347 405 467 505 559 606
September 136 158 184 209 237 259 312 355 404 404 463 508
October 119 133 162 191 211 229 274 306 347 359 407 461
November 104 114 146 172 180 203 237 271 305 310 362 390
December 118 140 166 194 201 229 278 306 336 337 405 432

线形图(Matplotlib)

1
df.plot()

热力图(Seaborn)

1
sns.heatmap(df, annot=True, fmt='d')

柱状图(Matplotlib)

1
s.plot(kind='bar')

柱状图(Seaborn)

1
2
3
s = df.sum()

s.index

Int64Index([1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959,
1960],
dtype='int64', name='year')

1
s.values

array([1520, 1676, 2042, 2364, 2700, 2867, 3408, 3939, 4421, 4572, 5140,
5714])

1
sns.barplot(x=s.index, y=s.values)

Seaborn设置图形效果

1
2
3
4
5
6
7
8
9
10
11
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

x = np.linspace(0,14,100)
y1 = np.sin(x)
y2 = np.sin(x+2)*1.25

def sinplot():
plt.plot(x, y1)
plt.plot(x, y2)
1
sinplot()

设置Seaborn显示风格

1
2
3
4
5
import seaborn as sns
style = ['darkgrid', 'dark', 'white', 'whitegrid', 'ticks']

sns.set_style(style[0])
sinplot()

1
2
sns.set_style(style[3])
sinplot()

1
sns.axes_style()

{'axes.facecolor': 'white',
'axes.edgecolor': '.8',
'axes.grid': True,
'axes.axisbelow': True,
'axes.labelcolor': '.15',
'figure.facecolor': 'white',
'grid.color': '.8',
'grid.linestyle': '-',
'text.color': '.15',
'xtick.color': '.15',
'ytick.color': '.15',
'xtick.direction': 'out',
'ytick.direction': 'out',
'lines.solid_capstyle': 'round',
'patch.edgecolor': 'w',
'image.cmap': 'rocket',
'font.family': ['sans-serif'],
'font.sans-serif': ['Arial',
'DejaVu Sans',
'Liberation Sans',
'Bitstream Vera Sans',
'sans-serif'],
'patch.force_edgecolor': True,
'xtick.bottom': False,
'xtick.top': False,
'ytick.left': False,
'ytick.right': False,
'axes.spines.left': True,
'axes.spines.bottom': True,
'axes.spines.right': True,
'axes.spines.top': True}

自定义配置

1
2
sns.set_style(style[0],{'grid.color': 'green'})
sinplot()

清空自定义设置

1
2
sns.set()
sinplot()

plotting_context() & set_context()

1
2
3
4
context = ['paper', 'notebook', 'talk', 'poster']

sns.set_context(context[1], rc={'grid.linewidth': 3.0})
sinplot()

1
2
sns.set_context(context[3])
sinplot()

显示context详细设置

1
sns.plotting_context()

{'font.size': 24.0,
'axes.labelsize': 24.0,
'axes.titlesize': 24.0,
'xtick.labelsize': 22.0,
'ytick.labelsize': 22.0,
'legend.fontsize': 22.0,
'axes.linewidth': 2.5,
'grid.linewidth': 2.0,
'lines.linewidth': 3.0,
'lines.markersize': 12.0,
'patch.linewidth': 2.0,
'xtick.major.width': 2.5,
'ytick.major.width': 2.5,
'xtick.minor.width': 2.0,
'ytick.minor.width': 2.0,
'xtick.major.size': 12.0,
'ytick.major.size': 12.0,
'xtick.minor.size': 8.0,
'ytick.minor.size': 8.0}

股票分析

1
2
3
4
5
6
7
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_datareader as pdr
from datetime import datetime
1
2
3
alibaba = pdr.get_data_yahoo('BABA')

alibaba.head()




































































High Low Open Close Volume Adj Close
Date
2014-09-19 99.699997 89.949997 92.699997 93.889999 271879400 93.889999
2014-09-22 92.949997 89.500000 92.699997 89.889999 66657800 89.889999
2014-09-23 90.480003 86.620003 88.940002 87.169998 39009800 87.169998
2014-09-24 90.570000 87.220001 88.470001 90.570000 32088000 90.570000
2014-09-25 91.500000 88.500000 91.089996 88.919998 28598000 88.919998
1
alibaba.shape

(1211, 6)

1
alibaba.tail()




































































High Low Open Close Volume Adj Close
Date
2019-07-08 170.279999 168.350006 170.089996 168.449997 16531000 168.449997
2019-07-09 168.979996 166.500000 167.085007 168.800003 15782800 168.800003
2019-07-10 171.369995 166.860001 171.240005 166.929993 16675600 166.929993
2019-07-11 168.423996 165.000000 167.559998 166.550003 14796400 166.550003
2019-07-12 170.199997 167.649994 168.000000 169.070007 18093300 169.070007
1
alibaba.describe()






















































































High Low Open Close Volume Adj Close
count 1211.000000 1211.000000 1211.000000 1211.000000 1.211000e+03 1211.000000
mean 126.715532 123.620233 125.280824 125.175351 1.760189e+07 125.175351
std 44.698800 43.686183 44.263113 44.208817 1.260813e+07 44.208817
min 58.650002 57.200001 57.299999 57.389999 3.775300e+06 57.389999
25% 85.129997 83.375000 84.594997 84.494999 1.107115e+07 84.494999
50% 108.820000 106.510002 107.809998 107.830002 1.489790e+07 107.830002
75% 173.044998 169.508499 171.739998 171.590004 2.063065e+07 171.590004
max 211.699997 207.509995 209.949997 210.860001 2.718794e+08 210.860001
1
alibaba.info()


DatetimeIndex: 1211 entries, 2014-09-19 to 2019-07-12
Data columns (total 6 columns):
High 1211 non-null float64
Low 1211 non-null float64
Open 1211 non-null float64
Close 1211 non-null float64
Volume 1211 non-null int64
Adj Close 1211 non-null float64
dtypes: float64(5), int64(1)
memory usage: 66.2 KB

1
2
3
4
5
6
7
8
start = datetime(2015,9,20)
alibaba = pdr.get_data_yahoo('BABA', start=start)
amazon = pdr.get_data_yahoo('AMZN', start=start)

alibaba.to_csv('/Users/kwang/jupyter-projects/BABA.csv')
amazon.to_csv('/Users/kwang/jupyter-projects/AMZN.csv')

alibaba.head()




































































High Low Open Close Volume Adj Close
Date
2015-09-21 66.400002 62.959999 65.379997 63.900002 22355100 63.900002
2015-09-22 63.270000 61.580002 62.939999 61.900002 14897900 61.900002
2015-09-23 62.299999 59.680000 61.959999 60.000000 22684600 60.000000
2015-09-24 60.340000 58.209999 59.419998 59.919998 20645700 59.919998
2015-09-25 60.840000 58.919998 60.630001 59.240002 17009100 59.240002
1
alibaba['Adj Close'].plot(legend=True)

1
alibaba['Volume'].plot(legend=True)

1
2
alibaba['Adj Close'].plot()
amazon['Adj Close'].plot()

1
2
3
alibaba['high-low'] = alibaba['High'] - alibaba['Low']

alibaba.head()











































































High Low Open Close Volume Adj Close high-low
Date
2015-09-21 66.400002 62.959999 65.379997 63.900002 22355100 63.900002 3.440002
2015-09-22 63.270000 61.580002 62.939999 61.900002 14897900 61.900002 1.689999
2015-09-23 62.299999 59.680000 61.959999 60.000000 22684600 60.000000 2.619999
2015-09-24 60.340000 58.209999 59.419998 59.919998 20645700 59.919998 2.130001
2015-09-25 60.840000 58.919998 60.630001 59.240002 17009100 59.240002 1.920002
1
alibaba['high-low'].plot()

1
2
3
alibaba['daily-return'] = alibaba['Adj Close'].pct_change()

alibaba['daily-return'].plot(figsize=(20,4),linestyle='--',marker='o')

1
alibaba['daily-return'].plot(kind='hist')

1
sns.distplot(alibaba['daily-return'].dropna(),bins=100,color='purple')

1
2
3
4
5
6
7
start = datetime(2015,1,1)
company = ['AAPL', 'GOOG', 'MSFT', 'AMZN', 'FB']
top_tech_df = pdr.get_data_yahoo(company, start=start)['Adj Close']

top_tech_df.to_csv('/Users/kwang/jupyter-projects/top5.csv')

top_tech_df.head()





























































Symbols AAPL AMZN FB GOOG MSFT
Date
2014-12-31 102.110046 310.350006 78.019997 524.958740 41.982117
2015-01-02 101.138702 308.519989 78.449997 523.373108 42.262299
2015-01-05 98.289474 302.190002 77.190002 512.463013 41.873657
2015-01-06 98.298729 295.290009 76.150002 500.585632 41.259060
2015-01-07 99.677094 298.420013 76.150002 499.727997 41.783279
1
2
3
top_tech_dr = top_tech_df.pct_change()

top_tech_dr.head()





























































Symbols AAPL AMZN FB GOOG MSFT
Date
2014-12-31 NaN NaN NaN NaN NaN
2015-01-02 -0.009513 -0.005897 0.005511 -0.003020 0.006674
2015-01-05 -0.028171 -0.020517 -0.016061 -0.020846 -0.009196
2015-01-06 0.000094 -0.022833 -0.013473 -0.023177 -0.014677
2015-01-07 0.014022 0.010600 0.000000 -0.001713 0.012706
1
top_tech_df.plot()

1
top_tech_df[['AAPL','FB','MSFT']].plot()

散点图

1
sns.jointplot('AMZN','GOOG',top_tech_dr, kind='scatter')

1
sns.jointplot('MSFT','FB',top_tech_dr, kind='scatter')

1
sns.pairplot(top_tech_dr.dropna())

分位数

1
top_tech_dr['AAPL'].quantile(0.05)

-0.02510024760167844

即有95%的信心认为 daily-return 不会低于约 -2.5%(0.05 分位数)

1
top_tech_dr['MSFT'].quantile(0.05)

-0.021409700990842318

1
2
vips = pdr.get_data_yahoo('VIPS', start=start)['Adj Close']
vips.plot()

1
vips.pct_change().quantile(0.2)

-0.02329943700527643

本文标题:Python 数据科学 学习笔记 - 钢钢更新

文章作者:王方钢 / Kenny Wang

发布时间:2019年07月12日 - 20:07

最后更新:2019年09月03日 - 20:09

原始链接:https://wangfanggang.com/Python/python-ai/

许可协议: 署名-非商业性使用-禁止演绎 4.0 国际 转载请保留原文链接及作者。

王方钢 / Kenny Wang wechat
0%