Numpy 数组Array 初始化数组
import numpy as np
list_1 = [1, 2, 3, 4]
array_1 = np.array(list_1)
array([1, 2, 3, 4])
多维数组 1 2 list_2 = [5 ,6 ,7 ,8 ] array_2 = np.array([list_1, list_2])
array([[1, 2, 3, 4], [5, 6, 7, 8]])
获取数组信息 (2, 4)
获取数组类型 dtype('int64')
获取数组尺寸 8
1 array_4 = np.array([[1.0 ,2 ,3 ],[4.0 ,5 ,6 ]])
array([[1., 2., 3.], [4., 5., 6.]])
dtype('float64')
1 array_5 = np.arange(1 ,10 ,2 )
array([1, 3, 5, 7, 9])
初始化元素为0的数组 array([0., 0., 0., 0., 0.])
array([[0., 0., 0.], [0., 0., 0.]])
初始化数组 array([1, 2, 3, 4, 5, 6, 7, 8, 9])
数组切片 2
array([2, 3, 4, 5])
1 b = np.array([[1 ,2 ,3 ],[4 ,5 ,6 ]])
array([[1, 2, 3], [4, 5, 6]])
4
4
1 c = np.array([[1 ,2 ,3 ],[4 ,5 ,6 ],[7 ,8 ,9 ]])
array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
array([[2, 3], [5, 6]])
array([[4, 5, 6], [7, 8, 9]])
array([[2, 3], [5, 6], [8, 9]])
array([[1], [4], [7]])
array([[1, 2], [4, 5], [7, 8]])
array([[1, 2, 3]])
array([[4, 5, 6], [7, 8, 9]])
array([[5, 6], [8, 9]])
array([[2], [5], [8]])
array([[2, 3], [5, 6], [8, 9]])
array([[2, 3], [5, 6], [8, 9]])
array([[2, 3], [5, 6]])
用随机(小)数初始化数组
import numpy as np
np.random.randn(10)
array([-2.98186555, 0.51843113, -0.74563428, -1.84832264, -0.54354513, 1.23002927, 0.99527109, -0.49284623, 0.20869479, -0.46934356])
用随机整数初始化数组 2
1 np.random.randint(10 , size=20 ).reshape(4 ,5 )
array([[3, 2, 5, 1, 3], [1, 6, 2, 8, 4], [0, 4, 8, 7, 0], [4, 4, 8, 5, 3]])
数组运算 1 a = np.random.randint(10 , size=20 ).reshape(4 ,5 )
1 b = np.random.randint(10 , size=20 ).reshape(4 ,5 )
array([[7, 5, 8, 9, 8], [0, 9, 5, 4, 0], [4, 7, 3, 7, 6], [3, 7, 9, 1, 6]])
array([[0, 9, 9, 5, 4], [7, 3, 1, 5, 7], [3, 6, 5, 5, 9], [3, 7, 3, 6, 9]])
数组相加 array([[ 7, 14, 17, 14, 12], [ 7, 12, 6, 9, 7], [ 7, 13, 8, 12, 15], [ 6, 14, 12, 7, 15]])
数组相减 array([[ 7, -4, -1, 4, 4], [-7, 6, 4, -1, -7], [ 1, 1, -2, 2, -3], [ 0, 0, 6, -5, -3]])
数组相乘 array([[ 0, 45, 72, 45, 32], [ 0, 27, 5, 20, 0], [12, 42, 15, 35, 54], [ 9, 49, 27, 6, 54]])
数组常用函数 array([[7, 5, 8, 9, 8], [0, 9, 5, 4, 0], [4, 7, 3, 7, 6], [3, 7, 9, 1, 6]])
获取数组中所有唯一数 array([0, 1, 3, 4, 5, 6, 7, 8, 9])
对数组每一列求和 array([14, 28, 25, 21, 20])
对数组指定行求和 37
对数组指定列求和 14
获取数组最大值 9
获取数组指定行最大值 9
获取数组指定列最大值 7
使用pickle序列化array
import pickle
import numpy as np
x = np.arange(10)
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
将数组以二进制的方式写入文件 写入文件 查看文件目录 Array的input和output.ipynb x.pkl array.ipynb 数组与矩阵运算.ipynb
以二进制方式打开文件 读取文件内容 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
矩阵Matrix 1 np.mat([[1 ,2 ,3 ],[4 ,5 ,6 ]])
matrix([[1, 2, 3], [4, 5, 6]])
matrix([[7, 5, 8, 9, 8], [0, 9, 5, 4, 0], [4, 7, 3, 7, 6], [3, 7, 9, 1, 6]])
array([[7, 5, 8, 9, 8], [0, 9, 5, 4, 0], [4, 7, 3, 7, 6], [3, 7, 9, 1, 6]])
矩阵的运算 matrix([[7, 5, 8, 9, 8], [0, 9, 5, 4, 0], [4, 7, 3, 7, 6], [3, 7, 9, 1, 6]])
matrix([[0, 9, 9, 5, 4], [7, 3, 1, 5, 7], [3, 6, 5, 5, 9], [3, 7, 3, 6, 9]])
矩阵相加 matrix([[ 7, 14, 17, 14, 12], [ 7, 12, 6, 9, 7], [ 7, 13, 8, 12, 15], [ 6, 14, 12, 7, 15]])
矩阵相减 matrix([[ 7, -4, -1, 4, 4], [-7, 6, 4, -1, -7], [ 1, 1, -2, 2, -3], [ 0, 0, 6, -5, -3]])
矩阵相乘 1 2 c = np.mat(np.random.randint(10 , size=20 ).reshape(4 ,5 )) d = np.mat(np.random.randint(10 , size=20 ).reshape(5 ,4 ))
matrix([[8, 5, 2, 1, 8], [9, 5, 8, 5, 4], [5, 0, 6, 9, 8], [3, 9, 7, 9, 6]])
matrix([[1, 8, 8, 5], [3, 4, 5, 3], [9, 3, 5, 2], [8, 6, 9, 2], [4, 9, 8, 1]])
matrix([[ 81, 168, 172, 69], [152, 182, 214, 90], [163, 184, 215, 63], [189, 189, 233, 80]])
使用numpy进行输入输出(文件操作) 使用np.save写入文件 Array的input和output.ipynb x.pkl
array.ipynb 数组与矩阵运算.ipynb
one_array.npy
使用np.load读取文件内容 1 np.load('one_array.npy' )
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19])
使用np.savez将多个数组保存到一个文件中 1 np.savez('two_array.npz' , a=x, b=y)
Array的input和output.ipynb two_array.npz array.ipynb x.pkl one_array.npy 数组与矩阵运算.ipynb
利用np.load读取文件内容 1 c = np.load('two_array.npz' )
利用索引名读取对应数组内容 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19])
Pandas Series Series初始化
import numpy as np
import pandas as pd
s1 = pd.Series([1, 2, 3, 4])
0 1 1 2 2 3 3 4 dtype: int64
values array([1, 2, 3, 4])
index RangeIndex(start=0, stop=4, step=1)
1 s2 = pd.Series(np.arange(10 ))
0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 dtype: int64
指定index 1 s3 = pd.Series({'1' :1 ,'2' :2 ,'3' :3 })
1 1 2 2 3 3 dtype: int64
Index([‘1’, ‘2’, ‘3’], dtype=’object’)
array([1, 2, 3])
1 s4 = pd.Series([1 ,2 ,3 ,4 ], index=['A' ,'B' ,'C' ,'D' ])
A 1 B 2 C 3 D 4 dtype: int64
array([1, 2, 3, 4])
Index([‘A’, ‘B’, ‘C’, ‘D’], dtype=’object’)
访问Series元素 1
C 3 D 4 dtype: int64
A 1 B 2 C 3 D 4 dtype: int64
{‘A’: 1, ‘B’: 2, ‘C’: 3, ‘D’: 4}
1 s5 = pd.Series(s4.to_dict())
A 1 B 2 C 3 D 4 dtype: int64
新增index 1 2 index_1=['A' ,'B' ,'C' ,'D' ,'E' ] s6 = pd.Series(s5, index=index_1)
A 1.0 B 2.0 C 3.0 D 4.0 E NaN dtype: float64
对Series元素进行空值判定 A False B False C False D False E True dtype: bool
对Series元素进行非空判定 A True B True C True D True E False dtype: bool
指定Series名称 A 1.0 B 2.0 C 3.0 D 4.0 E NaN Name: demo, dtype: float64
指定Series索引名称 1 s6.index.name = 'index demo'
index demo A 1.0 B 2.0 C 3.0 D 4.0 E NaN Name: demo, dtype: float64
Index([‘A’, ‘B’, ‘C’, ‘D’, ‘E’], dtype=’object’, name=’index demo’)
array([ 1., 2., 3., 4., nan])
1 s1 = pd.Series(data['Country' ], index=['A' ,'B' ,'C' ])
0 Belgium 1 India 2 Brazil dtype: object
array([‘Belgium’, ‘India’, ‘Brazil’], dtype=object)
Index([‘A’, ‘B’, ‘C’], dtype=’object’)
Reindex on Series
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
s1 = pd.Series([1, 2, 3, 4], index=['A', 'B', 'C', 'D'])
A 1 B 2 C 3 D 4 dtype: int64
添加index 1 s1.reindex(['A' ,'B' ,'C' ,'D' ,'E' ])
A 1.0 B 2.0 C 3.0 D 4.0 E NaN dtype: float64
用固定数值填充新的index 1 s1.reindex(['A' ,'B' ,'C' ,'D' ,'E' ], fill_value=10 )
A 1 B 2 C 3 D 4 E 10 dtype: int64
1 s2 = pd.Series(['A' ,'B' ,'C' ], index=[1 ,5 ,10 ])
1 A 5 B 10 C dtype: object
用 range() 扩充index 1 s2.reindex(index=range (15 ))
0 NaN 1 A 2 NaN 3 NaN 4 NaN 5 B 6 NaN 7 NaN 8 NaN 9 NaN 10 C 11 NaN 12 NaN 13 NaN 14 NaN dtype: object
用ffill填充新增index 1 s2.reindex(index=range (15 ), method='ffill' )
0 NaN 1 A 2 A 3 A 4 A 5 B 6 B 7 B 8 B 9 B 10 C 11 C 12 C 13 C 14 C dtype: object
对Series切片 A 1 B 2 C 3 D 4 dtype: int64
1 s1.reindex(index=['A' ,'B' ])
A 1 B 2 dtype: int64
A 1
B 2
C 3
D 4
dtype: int64
对Series进行drop index B 2 C 3 D 4 dtype: int64
Series的简单数学计算
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
s1 = Series([1, 2, 3], index=['A', 'B', 'C'])
s1
A 1 B 2 C 3 dtype: int64
1 2 s2 = Series([4 ,5 ,6 ,7 ], index=['B' ,'C' ,'D' ,'E' ]) s2
B 4 C 5 D 6 E 7 dtype: int64
A NaN B 6.0 C 8.0 D NaN E NaN dtype: float64
Series的排序
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
s1 = Series(np.random.randn(10))
s1
0 0.035039 1 1.087393 2 -0.342811 3 -0.557990 4 0.548988 5 0.051965 6 2.983233 7 0.757732 8 -0.391369 9 0.850908 dtype: float64
array([ 0.03503903, 1.08739272, -0.34281109, -0.5579896 , 0.54898768, 0.05196506, 2.98323291, 0.75773189, -0.39136856, 0.85090778])
RangeIndex(start=0, stop=10, step=1)
按values排序
s2 = s1.sort_values(ascending=False)
s2
6 2.983233
1 1.087393
9 0.850908
7 0.757732
4 0.548988
5 0.051965
0 0.035039
2 -0.342811
8 -0.391369
3 -0.557990
dtype: float64
按index排序 0 0.035039 1 1.087393 2 -0.342811 3 -0.557990 4 0.548988 5 0.051965 6 2.983233 7 0.757732 8 -0.391369 9 0.850908 dtype: float64
Nan in Series
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
nan
float
nan
1 s1 = Series([1 ,2 ,np.nan,3 ,4 ],index=['A' ,'B' ,'C' ,'D' ,'E' ])
A 1.0 B 2.0 C NaN D 3.0 E 4.0 dtype: float64
空值判断 A False B False C True D False E False dtype: bool
删除nan的行 A 1.0 B 2.0 D 3.0 E 4.0 dtype: float64
DataFrame
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import webbrowser
link = 'https://www.tiobe.com/tiobe-index/'
webbrowser.open(link)
读取剪贴板内容,并生成DataFrame 1 2 3 df = pd.read_clipboard() df
Jul 2019 Jul 2018 Change Programming Language Ratings Change.1 0 1 1 NaN Java 15.058% -1.08% 1 2 2 NaN C 14.211% -0.45% 2 3 4 change Python 9.260% +2.90% 3 4 3 change C++ 6.705% -0.91% 4 5 6 change C# 4.365% +0.57% 5 6 5 change Visual Basic .NET 4.208% -0.04% 6 7 8 change JavaScript 2.304% -0.53% 7 8 7 change PHP 2.167% -0.67% 8 9 9 NaN SQL 1.977% -0.36% 9 10 10 NaN Objective-C 1.686% +0.23%
Type pandas.core.frame.DataFrame
Columns Index(['Jul 2019', 'Jul 2018', 'Change', 'Programming Language', 'Ratings',
'Change.1'],
dtype='object')
访问DataFrame列 0 15.058% 1 14.211% 2 9.260% 3 6.705% 4 4.365% 5 4.208% 6 2.304% 7 2.167% 8 1.977% 9 1.686% Name: Ratings, dtype: object
复制DataFrame 1 df_new = DataFrame(df, columns=['Programming Language' ,'Jul 2018' ])
Programming Language Jul 2018 0 Java 1 1 C 2 2 Python 4 3 C++ 3 4 C# 6 5 Visual Basic .NET 5 6 JavaScript 8 7 PHP 7 8 SQL 9 9 Objective-C 10
向DataFrame中增加新列 1 df_new = DataFrame(df, columns=['Programming Language' ,'Jul 2018' , 'Sep 2018' ])
Programming Language Jul 2018 Sep 2018 0 Java 1 NaN 1 C 2 NaN 2 Python 4 NaN 3 C++ 3 NaN 4 C# 6 NaN 5 Visual Basic .NET 5 NaN 6 JavaScript 8 NaN 7 PHP 7 NaN 8 SQL 9 NaN 9 Objective-C 10 NaN
向新增列填充数据 方法1:range() 1 df_new['Sep 2018' ] = range (0 ,10 )
Programming Language Jul 2018 Sep 2018 0 Java 1 0 1 C 2 1 2 Python 4 2 3 C++ 3 3 4 C# 6 4 5 Visual Basic .NET 5 5 6 JavaScript 8 6 7 PHP 7 7 8 SQL 9 8 9 Objective-C 10 9
方法2:np.arange()
df_new['Sep 2018'] = np.arange(0, 10)
Programming Language Jul 2018 Sep 2018 0 Java 1 0 1 C 2 1 2 Python 4 2 3 C++ 3 3 4 C# 6 4 5 Visual Basic .NET 5 5 6 JavaScript 8 6 7 PHP 7 7 8 SQL 9 8 9 Objective-C 10 9
方法3:pd.Series()
df_new['Sep 2018'] = pd.Series(np.arange(0, 10))
Programming Language Jul 2018 Sep 2018 0 Java 1 0 1 C 2 1 2 Python 4 2 3 C++ 3 3 4 C# 6 4 5 Visual Basic .NET 5 5 6 JavaScript 8 6 7 PHP 7 7 8 SQL 9 8 9 Objective-C 10 9
向新增列中填充数值 1 df_new['Sep 2018' ] = pd.Series([100 ,200 ],index=[1 ,2 ])
Programming Language Jul 2018 Sep 2018 0 Java 1 NaN 1 C 2 100.0 2 Python 4 200.0 3 C++ 3 NaN 4 C# 6 NaN 5 Visual Basic .NET 5 NaN 6 JavaScript 8 NaN 7 PHP 7 NaN 8 SQL 9 NaN 9 Objective-C 10 NaN
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
data = {'Country': ['Belgium', 'India', 'Brazil'],
        'Capital': ['Brussels', 'New Delhi', 'Brasilia'],
        'Population': [11190846, 1303171035, 207847528]}
1 df1 = pd.DataFrame(data)
Country Capital Population 0 Belgium Brussels 11190846 1 India New Delhi 1303171035 2 Brazil Brasilia 207847528
访问DataFrame某一列 0 Belgium 1 India 2 Brazil Name: Country, dtype: object
pandas.core.series.Series
遍历DataFrame <generator object DataFrame.iterrows at 0x118d895e8>
1 2 3 for row in df1.iterrows(): print (row[0 ]), print (row[1 ]) break
0 Country Belgium Capital Brussels Population 11190846 Name: 0, dtype: object
Country Capital Population 0 Belgium Brussels 11190846 1 India New Delhi 1303171035 2 Brazil Brasilia 207847528
{‘Country’: [‘Belgium’, ‘India’, ‘Brazil’], ‘Capital’: [‘Brussels’, ‘New Delhi’, ‘Brasilia’], ‘Population’: [11190846, 1303171035, 207847528]}
Series => DataFrame 1 s1 = pd.Series(data['Capital' ])
1 s2 = pd.Series(data['Country' ])
1 s3 = pd.Series(data['Population' ])
1 df_new = pd.DataFrame([s2,s1,s3],index=['Country' ,'Capital' ,'Population' ])
0 1 2 Country Belgium India Brazil Capital Brussels New Delhi Brasilia Population 11190846 1303171035 207847528
行转列 Country Capital Population 0 Belgium Brussels 11190846 1 India New Delhi 1303171035 2 Brazil Brasilia 207847528
Country Capital Population 0 Belgium Brussels 11190846 1 India New Delhi 1303171035 2 Brazil Brasilia 207847528
Nan in DataFrame 1 dframe = DataFrame([[1 ,2 ,3 ],[np.nan,5 ,6 ],[7 ,np.nan,9 ],[np.nan,np.nan,np.nan]])
0 1 2 0 1.0 2.0 3.0 1 NaN 5.0 6.0 2 7.0 NaN 9.0 3 NaN NaN NaN
空值判断 0 1 2 0 False False False 1 True False False 2 False True False 3 True True True
非空判断 0 1 2 0 True True True 1 False True True 2 True False True 3 False False False
删除存在nan的行 1 df1 = dframe.dropna(axis=0 )
删除存在nan的列 1 df2 = dframe.dropna(axis=1 )
仅删除所有值都为nan的行 1 df3 = dframe.dropna(axis=0 , how='all' )
0 1 2 0 1.0 2.0 3.0 1 NaN 5.0 6.0 2 7.0 NaN 9.0
1 dframe2 = DataFrame([[1 ,2 ,3 ,np.nan],[2 ,np.nan,5 ,6 ],[np.nan,7 ,np.nan,9 ],[1 ,np.nan,np.nan,np.nan]])
0 1 2 3 0 1.0 2.0 3.0 NaN 1 2.0 NaN 5.0 6.0 2 NaN 7.0 NaN 9.0 3 1.0 NaN NaN NaN
保留非空值个数不少于thresh的行(非空值少于thresh个的行被删除)
df4 = dframe2.dropna(thresh=2)
0 1 2 3 0 1.0 2.0 3.0 NaN 1 2.0 NaN 5.0 6.0 2 NaN 7.0 NaN 9.0
利用fillna对空值进行填充 0 1 2 3 0 1.0 2.0 3.0 1.0 1 2.0 1.0 5.0 6.0 2 1.0 7.0 1.0 9.0 3 1.0 1.0 1.0 1.0
利用fillna按列级别填充不同数值 1 dframe2.fillna(value={0 :0 ,1 :1 ,2 :2 ,3 :5 })
0 1 2 3 0 1.0 2.0 3.0 5.0 1 2.0 1.0 5.0 6.0 2 0.0 7.0 2.0 9.0 3 1.0 1.0 2.0 5.0
DataFrame I/O操作 打开网页
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import webbrowser
link = 'http://pandas.pydata.org/pandas-docs/version/0.20/io.html'
webbrowser.open(link)
True
读取剪贴板内容 1 df1 = pd.read_clipboard()
Format Type Data Description Reader Writer 0 text CSV read_csv to_csv 1 text JSON read_json to_json 2 text HTML read_html to_html 3 text Local clipboard read_clipboard to_clipboard 4 binary MS Excel read_excel to_excel 5 binary HDF5 Format read_hdf to_hdf 6 binary Feather Format read_feather to_feather 7 binary Msgpack read_msgpack to_msgpack 8 binary Stata read_stata to_stata 9 binary SAS read_sas 10 binary Python Pickle Format read_pickle to_pickle 11 SQL SQL read_sql to_sql 12 SQL Google Big Query read_gbq to_gbq
将内容粘贴到剪贴板 写入CSV (不写入索引值) 1 df1.to_csv('df1.csv' , index=False )
2-3 Array的创建及访问.ipynb 2-4 数组与矩阵运算.ipynb 2-5 Array的input和output.ipynb 3-1 Pandas Series.ipynb 3-2 Pandas DataFrame.ipynb 3-3 深入理解Series和Dataframe.ipynb 3-4 Pandas-Dataframe-IO操作.ipynb df1.csv one_array.npy two_array.npz x.pkl
预览CSV文件内容 Format Type,Data Description,Reader,Writer text,CSV,read_csv,to_csv text,JSON,read_json,to_json text,HTML,read_html,to_html text,Local clipboard,read_clipboard,to_clipboard binary,MS Excel,read_excel,to_excel binary,HDF5 Format,read_hdf,to_hdf binary,Feather Format,read_feather,to_feather binary,Msgpack,read_msgpack,to_msgpack binary,Stata,read_stata,to_stata binary,SAS,read_sas, binary,Python Pickle Format,read_pickle,to_pickle SQL,SQL,read_sql,to_sql SQL,Google Big Query,read_gbq,to_gbq [7mdf1.csv (END)[m[K
Selecting & Indexing
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
imdb = pd.read_csv('/Users/kwang/jupyter-projects/movie_metadata.csv')
(5043, 28)
查看DataFrame前5行内容 color director_name num_critic_for_reviews duration director_facebook_likes actor_3_facebook_likes actor_2_name actor_1_facebook_likes gross genres ... num_user_for_reviews language country content_rating budget title_year actor_2_facebook_likes imdb_score aspect_ratio movie_facebook_likes 0 Color James Cameron 723.0 178.0 0.0 855.0 Joel David Moore 1000.0 760505847.0 Action|Adventure|Fantasy|Sci-Fi ... 3054.0 English USA PG-13 237000000.0 2009.0 936.0 7.9 1.78 33000 1 Color Gore Verbinski 302.0 169.0 563.0 1000.0 Orlando Bloom 40000.0 309404152.0 Action|Adventure|Fantasy ... 1238.0 English USA PG-13 300000000.0 2007.0 5000.0 7.1 2.35 0 2 Color Sam Mendes 602.0 148.0 0.0 161.0 Rory Kinnear 11000.0 200074175.0 Action|Adventure|Thriller ... 994.0 English UK PG-13 245000000.0 2015.0 393.0 6.8 2.35 85000 3 Color Christopher Nolan 813.0 164.0 22000.0 23000.0 Christian Bale 27000.0 448130642.0 Action|Thriller ... 2701.0 English USA PG-13 250000000.0 2012.0 23000.0 8.5 2.35 164000 4 NaN Doug Walker NaN NaN 131.0 NaN Rob Walker 131.0 NaN Documentary ... NaN NaN NaN NaN NaN NaN 12.0 7.1 NaN 0
5 rows × 28 columns
查看DataFrame后5行内容 color director_name num_critic_for_reviews duration director_facebook_likes actor_3_facebook_likes actor_2_name actor_1_facebook_likes gross genres ... num_user_for_reviews language country content_rating budget title_year actor_2_facebook_likes imdb_score aspect_ratio movie_facebook_likes 5038 Color Scott Smith 1.0 87.0 2.0 318.0 Daphne Zuniga 637.0 NaN Comedy|Drama ... 6.0 English Canada NaN NaN 2013.0 470.0 7.7 NaN 84 5039 Color NaN 43.0 43.0 NaN 319.0 Valorie Curry 841.0 NaN Crime|Drama|Mystery|Thriller ... 359.0 English USA TV-14 NaN NaN 593.0 7.5 16.00 32000 5040 Color Benjamin Roberds 13.0 76.0 0.0 0.0 Maxwell Moody 0.0 NaN Drama|Horror|Thriller ... 3.0 English USA NaN 1400.0 2013.0 0.0 6.3 NaN 16 5041 Color Daniel Hsia 14.0 100.0 0.0 489.0 Daniel Henney 946.0 10443.0 Comedy|Drama|Romance ... 9.0 English USA PG-13 NaN 2012.0 719.0 6.3 2.35 660 5042 Color Jon Gunn 43.0 90.0 16.0 16.0 Brian Herzlinger 86.0 85222.0 Documentary ... 84.0 English USA PG 1100.0 2004.0 23.0 6.6 1.85 456
5 rows × 28 columns
根据指定列生成新的DataFrame 1 sub_df = imdb[['director_name' , 'movie_title' , 'imdb_score' ]]
director_name movie_title imdb_score 0 James Cameron Avatar 7.9 1 Gore Verbinski Pirates of the Caribbean: At World's End 7.1 2 Sam Mendes Spectre 6.8 3 Christopher Nolan The Dark Knight Rises 8.5 4 Doug Walker Star Wars: Episode VII - The Force Awakens ... 7.1
根据index定位内容(iloc) 1 temp_df = sub_df.iloc[10 :20 ,0 :2 ]
director_name movie_title 10 Zack Snyder Batman v Superman: Dawn of Justice 11 Bryan Singer Superman Returns 12 Marc Forster Quantum of Solace 13 Gore Verbinski Pirates of the Caribbean: Dead Man's Chest 14 Gore Verbinski The Lone Ranger 15 Zack Snyder Man of Steel 16 Andrew Adamson The Chronicles of Narnia: Prince Caspian 17 Joss Whedon The Avengers 18 Rob Marshall Pirates of the Caribbean: On Stranger Tides 19 Barry Sonnenfeld Men in Black 3
director_name movie_title 12 Marc Forster Quantum of Solace 13 Gore Verbinski Pirates of the Caribbean: Dead Man's Chest
根据label定位内容(loc) 1 temp_df.loc[15 :17 ,:'director_name' ]
director_name 15 Zack Snyder 16 Andrew Adamson 17 Joss Whedon
Reindex on DataFrame 1 df1 = DataFrame(np.random.rand(25 ).reshape([5 ,5 ]), index=['A' ,'B' ,'D' ,'E' ,'F' ], columns=['c1' ,'c2' ,'c3' ,'c4' ,'c5' ])
c1 c2 c3 c4 c5 A 0.659369 0.096724 0.880128 0.403168 0.282619 B 0.003070 0.661747 0.121026 0.262881 0.557213 D 0.972693 0.962149 0.411036 0.849216 0.704441 E 0.969716 0.425415 0.943507 0.589344 0.670303 F 0.315732 0.371235 0.943880 0.649541 0.495330
插入新的行(index) 1 df1.reindex(index=['A' ,'B' ,'C' ,'D' ,'E' ,'F' ])
c1 c2 c3 c4 c5 A 0.659369 0.096724 0.880128 0.403168 0.282619 B 0.003070 0.661747 0.121026 0.262881 0.557213 C NaN NaN NaN NaN NaN D 0.972693 0.962149 0.411036 0.849216 0.704441 E 0.969716 0.425415 0.943507 0.589344 0.670303 F 0.315732 0.371235 0.943880 0.649541 0.495330
插入新的列 (columns) 1 df1.reindex(columns=['c1' ,'c2' ,'c3' ,'c4' ,'c5' ,'c6' ])
c1 c2 c3 c4 c5 c6 A 0.659369 0.096724 0.880128 0.403168 0.282619 NaN B 0.003070 0.661747 0.121026 0.262881 0.557213 NaN D 0.972693 0.962149 0.411036 0.849216 0.704441 NaN E 0.969716 0.425415 0.943507 0.589344 0.670303 NaN F 0.315732 0.371235 0.943880 0.649541 0.495330 NaN
同时新增index和columns 1 df1.reindex(index=['A' ,'B' ,'C' ,'D' ,'E' ,'F' ], columns=['c1' ,'c2' ,'c3' ,'c4' ,'c5' ,'c6' ])
c1 c2 c3 c4 c5 c6 A 0.659369 0.096724 0.880128 0.403168 0.282619 NaN B 0.003070 0.661747 0.121026 0.262881 0.557213 NaN C NaN NaN NaN NaN NaN NaN D 0.972693 0.962149 0.411036 0.849216 0.704441 NaN E 0.969716 0.425415 0.943507 0.589344 0.670303 NaN F 0.315732 0.371235 0.943880 0.649541 0.495330 NaN
对DataFrame切片 1 df1.reindex(index=['A' ,'B' ])
c1 c2 c3 c4 c5 A 0.659369 0.096724 0.880128 0.403168 0.282619 B 0.003070 0.661747 0.121026 0.262881 0.557213
对DataFrame进行drop index c1 c2 c3 c4 c5 B 0.003070 0.661747 0.121026 0.262881 0.557213 D 0.972693 0.962149 0.411036 0.849216 0.704441 E 0.969716 0.425415 0.943507 0.589344 0.670303 F 0.315732 0.371235 0.943880 0.649541 0.495330
对DataFrame进行drop columns c1 c3 c4 c5 A 0.659369 0.880128 0.403168 0.282619 B 0.003070 0.121026 0.262881 0.557213 D 0.972693 0.411036 0.849216 0.704441 E 0.969716 0.943507 0.589344 0.670303 F 0.315732 0.943880 0.649541 0.495330
多级index
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
s1 = Series(np.random.randn(6), index=[[1, 1, 1, 2, 2, 2], ['a', 'b', 'c', 'a', 'b', 'c']])
1 a -0.828759 b 1.100893 c 0.705300 2 a 0.801019 b 0.676878 c -0.585011 dtype: float64
-0.8287592467933553
1 -0.828759 2 0.801019 dtype: float64
多级index Series => DataFrame a b c 1 -0.828759 1.100893 0.705300 2 0.801019 0.676878 -0.585011
DataFrame => 多级index Series a 1 -0.828759 2 0.801019 b 1 1.100893 2 0.676878 c 1 0.705300 2 -0.585011 dtype: float64
1 2 a -0.828759 0.801019 b 1.100893 0.676878 c 0.705300 -0.585011
1 a -0.828759 b 1.100893 c 0.705300 2 a 0.801019 b 0.676878 c -0.585011 dtype: float64
初始化DataFrame (多级index & columns) 1 2 3 df = DataFrame(np.arange(16 ).reshape(4 ,4 ), index=[['a' ,'a' ,'b' ,'b' ],[1 ,2 ,1 ,2 ]], columns=[['BJ' ,'BJ' ,'SH' ,'GZ' ],[8 ,9 ,8 ,8 ]]) df
BJ SH GZ 8 9 8 8 a 1 0 1 2 3 2 4 5 6 7 b 1 8 9 10 11 2 12 13 14 15
访问多级DataFrame元素 a 1 0 2 4 b 1 8 2 12 Name: 8, dtype: int64
MultiIndex(levels=[[‘BJ’, ‘GZ’, ‘SH’], [8, 9]], codes=[[0, 0, 2, 1], [0, 1, 0, 0]])
创建多级索引 1 2 3 4 data = Series(np.random.randn(10 ),index=[['x' ,'x' ,'x' ,'x' ,'x' ,'x' ,'y' ,'y' ,'y' ,'y' ], ['a' ,'a' ,'a' ,'b' ,'b' ,'b' ,'c' ,'c' ,'d' ,'d' ], [1 ,2 ,3 ,1 ,2 ,3 ,1 ,2 ,2 ,3 ]]) data
x a 1 0.673769 2 -1.781723 3 -2.232971 b 1 -0.832687 2 -0.141850 3 0.704339 y c 1 1.186975 2 -1.106244 d 2 -0.166537 3 0.695451 dtype: float64
检索多级索引 选取外层索引 a 1 0.673769 2 -1.781723 3 -2.232971 b 1 -0.832687 2 -0.141850 3 0.704339 dtype: float64
x a 1 0.673769 2 -1.781723 3 -2.232971 b 1 -0.832687 2 -0.141850 3 0.704339 y c 1 1.186975 2 -1.106244 d 2 -0.166537 3 0.695451 dtype: float64
选取内层索引 0.6737686360875892
解除多级索引 x a 1 0.673769 2 -1.781723 3 -2.232971 b 1 -0.832687 2 -0.141850 3 0.704339 y c 1 1.186975 2 -1.106244 d 2 -0.166537 3 0.695451 dtype: float64
默认解除最内层索引,即level=-1 1 2 3 x a 0.673769 -1.781723 -2.232971 b -0.832687 -0.141850 0.704339 y c 1.186975 -1.106244 NaN d NaN -0.166537 0.695451
解除其他层索引 a b c d x 1 0.673769 -0.832687 NaN NaN 2 -1.781723 -0.141850 NaN NaN 3 -2.232971 0.704339 NaN NaN y 1 NaN NaN 1.186975 NaN 2 NaN NaN -1.106244 -0.166537 3 NaN NaN NaN 0.695451
转化为多级索引 x a 1 0.673769 2 -1.781723 3 -2.232971 b 1 -0.832687 2 -0.141850 3 0.704339 y c 1 1.186975 2 -1.106244 d 2 -0.166537 3 0.695451 dtype: float64
将中间层索引转换为内层索引 1 data.unstack(level=-2 ).stack()
x 1 a 0.673769 b -0.832687 2 a -1.781723 b -0.141850 3 a -2.232971 b 0.704339 y 1 c 1.186975 2 c -1.106244 d -0.166537 3 d 0.695451 dtype: float64
为不同层索引指定名称(name) x a 1 0.673769 2 -1.781723 3 -2.232971 b 1 -0.832687 2 -0.141850 3 0.704339 y c 1 1.186975 2 -1.106244 d 2 -0.166537 3 0.695451 dtype: float64
1 2 3 x a 0.673769 -1.781723 -2.232971 b -0.832687 -0.141850 0.704339 y c 1.186975 -1.106244 NaN d NaN -0.166537 0.695451
1 2 3 4 df.index.names = ['outer' ,'inner' ] df.columns.name = 'hello' df
hello 1 2 3 outer inner x a 0.673769 -1.781723 -2.232971 b -0.832687 -0.141850 0.704339 y c 1.186975 -1.106244 NaN d NaN -0.166537 0.695451
Mapping 通过Series为DataFrame新增列
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
df1 = DataFrame({"城市": ["北京", "上海", "广州"], "人口": [1000, 2000, 1500]})
df1
城市 人口 0 北京 1000 1 上海 2000 2 广州 1500
1 2 3 df1["GDP" ] = Series([1200 ,1800 ,1600 ]) df1
城市 人口 GDP 0 北京 1000 1200 1 上海 2000 1800 2 广州 1500 1600
通过Map为DataFrame新增列 1 2 3 df2 = DataFrame({"城市" :["北京" ,"上海" ,"广州" ],"人口" :[1000 ,2000 ,1500 ]}) df2
城市 人口 0 北京 1000 1 上海 2000 2 广州 1500
1 2 3 4 5 gdp_map = {"北京" : 1200 , "广州" : 1400 , "上海" : 1800 } df2['GDP' ] = df2['城市' ].map (gdp_map) df2
城市 人口 GDP 0 北京 1000 1200 1 上海 2000 1800 2 广州 1500 1400
Replace 1 2 3 s1 = Series(np.arange(10 )) s1
0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 dtype: int64
0 0.0 1 NaN 2 2.0 3 3.0 4 4.0 5 5.0 6 6.0 7 7.0 8 8.0 9 9.0 dtype: float64
1 s1.replace([1 ,2 ,3 ],[10 ,20 ,30 ])
0 0 1 10 2 20 3 30 4 4 5 5 6 6 7 7 8 8 9 9 dtype: int64
DataFrame的简单数学计算 1 2 df1 = DataFrame(np.arange(4 ).reshape(2 ,2 ), index=['A' ,'B' ], columns=['BJ' ,'SH' ]) df1
1 2 3 df2 = DataFrame(np.arange(9 ).reshape(3 ,3 ), index=['A' ,'B' ,'C' ], columns=['BJ' ,'SH' ,'GZ' ]) df2
BJ GZ SH A 0.0 NaN 2.0 B 5.0 NaN 7.0 C NaN NaN NaN
1 2 df3 = DataFrame([[1 ,2 ,3 ],[4 ,5 ,np.nan],[7 ,8 ,9 ]], index=['A' ,'B' ,'C' ], columns=['c1' ,'c2' ,'c3' ]) df3
对列进行求和 c1 12.0 c2 15.0 c3 12.0 dtype: float64
对行进行求和 A 6.0 B 9.0 C 24.0 dtype: float64
求最小值 c1 1.0 c2 2.0 c3 3.0 dtype: float64
求最大值 c1 7.0 c2 8.0 c3 9.0 dtype: float64
c1 c2 c3 count 3.0 3.0 2.000000 mean 4.0 5.0 6.000000 std 3.0 3.0 4.242641 min 1.0 2.0 3.000000 25% 2.5 3.5 4.500000 50% 4.0 5.0 6.000000 75% 5.5 6.5 7.500000 max 7.0 8.0 9.000000
DataFrame排序 1 2 df1 = DataFrame(np.random.randn(40 ).reshape(8 ,5 ), columns=['A' ,'B' ,'C' ,'D' ,'E' ]) df1
A B C D E 0 0.428384 -0.242118 -0.311413 -2.032690 -1.519783 1 -1.763709 -0.299535 -1.712632 -0.662375 -0.434309 2 -1.799746 0.937149 -0.387339 0.222759 -0.040818 3 0.239674 1.813621 -0.010476 -0.013300 1.411217 4 -0.980156 0.889708 1.322267 1.848655 -0.210548 5 1.655132 -0.561405 1.369573 -0.485437 1.772758 6 -0.032585 -2.038827 1.560083 0.638874 0.626102 7 -0.153819 -1.386077 0.779540 -1.441996 1.106010
按column排序 1 2 df2 = df1.sort_values('A' , ascending=False ) df2
A B C D E 5 1.655132 -0.561405 1.369573 -0.485437 1.772758 0 0.428384 -0.242118 -0.311413 -2.032690 -1.519783 3 0.239674 1.813621 -0.010476 -0.013300 1.411217 6 -0.032585 -2.038827 1.560083 0.638874 0.626102 7 -0.153819 -1.386077 0.779540 -1.441996 1.106010 4 -0.980156 0.889708 1.322267 1.848655 -0.210548 1 -1.763709 -0.299535 -1.712632 -0.662375 -0.434309 2 -1.799746 0.937149 -0.387339 0.222759 -0.040818
按index排序 A B C D E 0 0.428384 -0.242118 -0.311413 -2.032690 -1.519783 1 -1.763709 -0.299535 -1.712632 -0.662375 -0.434309 2 -1.799746 0.937149 -0.387339 0.222759 -0.040818 3 0.239674 1.813621 -0.010476 -0.013300 1.411217 4 -0.980156 0.889708 1.322267 1.848655 -0.210548 5 1.655132 -0.561405 1.369573 -0.485437 1.772758 6 -0.032585 -2.038827 1.560083 0.638874 0.626102 7 -0.153819 -1.386077 0.779540 -1.441996 1.106010
Rename index
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
df1 = DataFrame(np.arange(9).reshape(3, 3), index=['BJ', 'SH', 'GZ'], columns=['A', 'B', 'C'])
df1
1 2 3 df1.index = Series(['bj' ,'sh' ,'gz' ]) df1
1 2 df1.index = df1.index.map (str .upper) df1
1 df1.rename(index=str .lower, columns=str .lower)
1 df1.rename(index={'BJ' :'beijing' }, columns={'A' :'a' })
数字列表 => 字符串列表 1 2 list1 = [1 ,2 ,3 ,4 ] list2 = ['1' ,'2' ,'3' ,'4' ]
[‘1’, ‘2’, ‘3’, ‘4’]
[‘1’, ‘2’, ‘3’, ‘4’]
利用自定义map函数重命名index 1 2 def test_map (x ): return x + '_ABC'
Index([‘BJ_ABC’, ‘SH_ABC’, ‘GZ_ABC’], dtype=’object’)
1 df1.rename(index=test_map)
A B C BJ_ABC 0 1 2 SH_ABC 3 4 5 GZ_ABC 6 7 8
DataFrame合并 (merge) 操作
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
df1 = pd.DataFrame({'key': ['X', 'Y', 'Z', 'X'], 'data_set_1': [1, 2, 3, 4]})
df1
key data_set_1 0 X 1 1 Y 2 2 Z 3 3 X 4
1 2 df2 = pd.DataFrame({'key' :['X' ,'B' ,'C' ],'data_set_2' :[5 ,6 ,7 ]}) df2
1 pd.merge(df1,df2,on=None )
key data_set_1 data_set_2 0 X 1 5 1 X 4 5
指定关联列(默认inner连接) 1 pd.merge(df1,df2,on='key' ,how='inner' )
key data_set_1 data_set_2 0 X 1 5 1 X 4 5
左连接 1 pd.merge(df1,df2,on='key' ,how='left' )
key data_set_1 data_set_2 0 X 1 5.0 1 Y 2 NaN 2 Z 3 NaN 3 X 4 5.0
右连接 1 pd.merge(df1,df2,on='key' ,how='right' )
key data_set_1 data_set_2 0 X 1.0 5 1 X 4.0 5 2 B NaN 6 3 C NaN 7
Concatenate Array Concatenate
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
1 2 arr1 = np.arange(9 ).reshape(3 ,3 ) arr1
array([[0, 1, 2], [3, 4, 5], [6, 7, 8]])
1 2 arr2 = np.arange(9 ).reshape(3 ,3 ) arr2
array([[0, 1, 2], [3, 4, 5], [6, 7, 8]])
1 np.concatenate([arr1,arr2])
array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [0, 1, 2], [3, 4, 5], [6, 7, 8]])
1 np.concatenate([arr1,arr2],axis=1 )
array([[0, 1, 2, 0, 1, 2], [3, 4, 5, 3, 4, 5], [6, 7, 8, 6, 7, 8]])
Series Concatenate 1 2 s1 = Series([1 ,2 ,3 ], index=['X' ,'Y' ,'Z' ]) s1
X 1 Y 2 Z 3 dtype: int64
1 2 s2 = Series([4 ,5 ], index=['A' ,'B' ]) s2
A 4 B 5 dtype: int64
X 1 Y 2 Z 3 A 4 B 5 dtype: int64
1 pd.concat([s1,s2], axis=1 )
0 1 A NaN 4.0 B NaN 5.0 X 1.0 NaN Y 2.0 NaN Z 3.0 NaN
DataFrame Concatenate 1 2 df1 = DataFrame(np.random.randn(4 ,3 ), columns=['X' ,'Y' ,'Z' ]) df1
X Y Z 0 -2.150094 -0.172915 0.391376 1 -2.943184 1.442629 0.934177 2 -0.015998 -1.204739 0.912159 3 -0.131997 -0.152430 1.080225
1 2 df2 = DataFrame(np.random.randn(3 ,3 ), columns=['X' ,'Y' ,'A' ]) df2
X Y A 0 -0.640785 0.404844 1.140704 1 0.331578 0.490477 1.158403 2 1.542735 -0.693717 1.928202
A X Y Z 0 NaN -2.150094 -0.172915 0.391376 1 NaN -2.943184 1.442629 0.934177 2 NaN -0.015998 -1.204739 0.912159 3 NaN -0.131997 -0.152430 1.080225 0 1.140704 -0.640785 0.404844 NaN 1 1.158403 0.331578 0.490477 NaN 2 1.928202 1.542735 -0.693717 NaN
Combine Series Combine 1 2 s1 = Series([2 ,np.nan,4 ,np.nan], index=['A' ,'B' ,'C' ,'D' ]) s1
A 2.0 B NaN C 4.0 D NaN dtype: float64
1 2 s2 = Series([1 ,2 ,3 ,4 ], index=['A' ,'B' ,'C' ,'D' ]) s2
A 1 B 2 C 3 D 4 dtype: int64
combine_first - 用后者填充前者中的空值 (如有) A 2.0 B 2.0 C 4.0 D 4.0 dtype: float64
DataFrame Combine 1 2 3 4 5 6 df1 = pd.DataFrame({ 'X' :[1 ,np.nan,3 ,np.nan], 'Y' :[5 ,np.nan,7 ,np.nan], 'Z' :[9 ,np.nan,11 ,np.nan] }) df1
X Y Z 0 1.0 5.0 9.0 1 NaN NaN NaN 2 3.0 7.0 11.0 3 NaN NaN NaN
1 2 3 4 5 df2 = pd.DataFrame({ 'Z' :[np.nan,10 ,np.nan,12 ], 'A' :[1 ,2 ,3 ,4 ] }) df2
A X Y Z 0 1.0 1.0 5.0 9.0 1 2.0 NaN NaN 10.0 2 3.0 3.0 7.0 11.0 3 4.0 NaN NaN 12.0
利用apply对数据进行处理
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
df1 = pd.read_csv("apply_demo.csv")
time data 0 1473411962 Symbol: APPL Seqno: 0 Price: 1623 1 1473411962 Symbol: APPL Seqno: 0 Price: 1623 2 1473411963 Symbol: APPL Seqno: 0 Price: 1623 3 1473411963 Symbol: APPL Seqno: 0 Price: 1623 4 1473411963 Symbol: APPL Seqno: 1 Price: 1649
7978
time data A 0 1473411962 Symbol: APPL Seqno: 0 Price: 1623 a 1 1473411962 Symbol: APPL Seqno: 0 Price: 1623 a 2 1473411963 Symbol: APPL Seqno: 0 Price: 1623 a 3 1473411963 Symbol: APPL Seqno: 0 Price: 1623 a 4 1473411963 Symbol: APPL Seqno: 1 Price: 1649 a
利用apply批量处理某一列数据 1 2 3 df1['A' ] = df1['A' ].apply(str .upper) df1.head()
time data A 0 1473411962 Symbol: APPL Seqno: 0 Price: 1623 A 1 1473411962 Symbol: APPL Seqno: 0 Price: 1623 A 2 1473411963 Symbol: APPL Seqno: 0 Price: 1623 A 3 1473411963 Symbol: APPL Seqno: 0 Price: 1623 A 4 1473411963 Symbol: APPL Seqno: 1 Price: 1649 A
1 2 l1 = df1['data' ][0 ].strip().split(' ' ) l1
[‘Symbol:’, ‘APPL’, ‘Seqno:’, ‘0’, ‘Price:’, ‘1623’]
(‘APPL’, ‘0’, ‘1623’)
编写自定义函数,对列中的字符串进行提取 1 2 3 def foo (line ): items = line.strip().split(' ' ) return Series([items[1 ], items[3 ], items[5 ]])
1 2 3 4 5 df_temp = df1['data' ].apply(foo) df_temp = df_temp.rename(columns={0 :"Symbol" ,1 :"Seqno" ,2 :"Price" }) df_temp.head()
Symbol Seqno Price 0 APPL 0 1623 1 APPL 0 1623 2 APPL 0 1623 3 APPL 0 1623 4 APPL 1 1649
1 df_new = df1.combine_first(df_temp)
A Price Seqno Symbol data time 0 A 1623.0 0.0 APPL Symbol: APPL Seqno: 0 Price: 1623 1473411962 1 A 1623.0 0.0 APPL Symbol: APPL Seqno: 0 Price: 1623 1473411962 2 A 1623.0 0.0 APPL Symbol: APPL Seqno: 0 Price: 1623 1473411963 3 A 1623.0 0.0 APPL Symbol: APPL Seqno: 0 Price: 1623 1473411963 4 A 1649.0 1.0 APPL Symbol: APPL Seqno: 1 Price: 1649 1473411963
1 2 3 4 del df_new['data' ]del df_new['A' ] df_new.head()
Price Seqno Symbol time 0 1623.0 0.0 APPL 1473411962 1 1623.0 0.0 APPL 1473411962 2 1623.0 0.0 APPL 1473411963 3 1623.0 0.0 APPL 1473411963 4 1649.0 1.0 APPL 1473411963
1 df_new.to_csv("demo_duplicate.csv" )
利用drop_duplicates去重
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
df = pd.read_csv('demo_duplicate.csv')
df.head()
Unnamed: 0 Price Seqno Symbol time 0 0 1623.0 0.0 APPL 1473411962 1 1 1623.0 0.0 APPL 1473411962 2 2 1623.0 0.0 APPL 1473411963 3 3 1623.0 0.0 APPL 1473411963 4 4 1649.0 1.0 APPL 1473411963
1 2 3 del df['Unnamed: 0' ]df.head(10 )
Price Seqno Symbol time 0 1623.0 0.0 APPL 1473411962 1 1623.0 0.0 APPL 1473411962 2 1623.0 0.0 APPL 1473411963 3 1623.0 0.0 APPL 1473411963 4 1649.0 1.0 APPL 1473411963 5 1649.0 1.0 APPL 1473411963 6 1649.0 1.0 APPL 1473411964 7 1649.0 1.0 APPL 1473411964 8 1642.0 2.0 APPL 1473411964 9 1642.0 2.0 APPL 1473411964
15956
3989
1 len (df['Seqno' ].unique())
1000
1 df['Seqno' ].duplicated()
0 False 1 True 2 True 3 True 4 False 5 True 6 True 7 True 8 False 9 True 10 True 11 True 12 False 13 True 14 True 15 True 16 False 17 True 18 True 19 True 20 False 21 True 22 True 23 True 24 False 25 True 26 True 27 True 28 False 29 True … 3959 True 3960 True 3961 False 3962 True 3963 True 3964 True 3965 False 3966 True 3967 True 3968 True 3969 False 3970 True 3971 True 3972 True 3973 False 3974 True 3975 True 3976 True 3977 False 3978 True 3979 True 3980 True 3981 False 3982 True 3983 True 3984 True 3985 False 3986 True 3987 True 3988 True Name: Seqno, Length: 3989, dtype: bool
1 df.drop_duplicates(['Seqno' ], keep='first' )
Price Seqno Symbol time 0 1623.0 0.0 APPL 1473411962 4 1649.0 1.0 APPL 1473411963 8 1642.0 2.0 APPL 1473411964 12 1636.0 3.0 APPL 1473411965 16 1669.0 4.0 APPL 1473411966 20 1639.0 5.0 APPL 1473411967 24 1611.0 6.0 APPL 1473411968 28 1660.0 7.0 APPL 1473411969 32 1657.0 8.0 APPL 1473411970 36 1509.0 9.0 APPL 1473411971 40 1514.0 10.0 APPL 1473411972 44 1676.0 11.0 APPL 1473411973 48 1596.0 12.0 APPL 1473411974 52 1527.0 13.0 APPL 1473411975 56 1643.0 14.0 APPL 1473411976 60 1632.0 15.0 APPL 1473411977 64 1595.0 16.0 APPL 1473411978 68 1565.0 17.0 APPL 1473411979 72 1521.0 18.0 APPL 1473411980 76 1501.0 19.0 APPL 1473411981 80 1579.0 20.0 APPL 1473411982 84 1511.0 21.0 APPL 1473411983 88 1513.0 22.0 APPL 1473411984 92 1687.0 23.0 APPL 1473411985 96 1539.0 24.0 APPL 1473411986 100 1682.0 25.0 APPL 1473411987 104 1534.0 26.0 APPL 1473411988 108 1628.0 27.0 APPL 1473411989 112 1582.0 28.0 APPL 1473411990 116 1540.0 29.0 APPL 1473411991 ... ... ... ... ... 3869 1649.0 970.0 APPL 1473412932 3873 1696.0 971.0 APPL 1473412933 3877 1596.0 972.0 APPL 1473412934 3881 1664.0 973.0 APPL 1473412935 3885 1539.0 974.0 APPL 1473412936 3889 1654.0 975.0 APPL 1473412937 3893 1573.0 976.0 APPL 1473412938 3897 1695.0 977.0 APPL 1473412939 3901 1584.0 978.0 APPL 1473412940 3905 1607.0 979.0 APPL 1473412941 3909 1676.0 980.0 APPL 1473412942 3913 1665.0 981.0 APPL 1473412943 3917 1540.0 982.0 APPL 1473412944 3921 1546.0 983.0 APPL 1473412945 3925 1646.0 984.0 APPL 1473412946 3929 1648.0 985.0 APPL 1473412947 3933 1647.0 986.0 APPL 1473412948 3937 1602.0 987.0 APPL 1473412949 3941 1562.0 988.0 APPL 1473412950 3945 1629.0 989.0 APPL 1473412951 3949 1683.0 990.0 APPL 1473412952 3953 1649.0 991.0 APPL 1473412953 3957 1698.0 992.0 APPL 1473412954 3961 1566.0 993.0 APPL 1473412955 3965 1597.0 994.0 APPL 1473412956 3969 1641.0 995.0 APPL 1473412957 3973 1581.0 996.0 APPL 1473412958 3977 1674.0 997.0 APPL 1473412959 3981 1680.0 998.0 APPL 1473412960 3985 1509.0 999.0 APPL 1473412961
1000 rows × 4 columns
时间序列操作 1 2 3 import numpy as npimport pandas as pdfrom pandas import Series, DataFrame
1 2 from datetime import datetimet1 = datetime(2019 ,7 ,12 )
datetime.datetime(2019, 7, 12, 0, 0)
1 2 3 4 5 6 7 date_list = [ datetime(2016 ,9 ,1 ), datetime(2016 ,9 ,10 ), datetime(2017 ,9 ,1 ), datetime(2017 ,9 ,20 ), datetime(2017 ,10 ,1 ) ]
[datetime.datetime(2016, 9, 1, 0, 0), datetime.datetime(2016, 9, 10, 0, 0), datetime.datetime(2017, 9, 1, 0, 0), datetime.datetime(2017, 9, 20, 0, 0), datetime.datetime(2017, 10, 1, 0, 0)]
1 2 3 s1 = Series(np.random.rand(5 ),index=date_list) s1
2016-09-01 0.192942 2016-09-10 0.821524 2017-09-01 0.503137 2017-09-20 0.896424 2017-10-01 0.267054 dtype: float64
array([0.19294241, 0.82152442, 0.5031373 , 0.8964235 , 0.26705384])
DatetimeIndex([‘2016-09-01’, ‘2016-09-10’, ‘2017-09-01’, ‘2017-09-20’, ‘2017-10-01’], dtype=’datetime64[ns]’, freq=None)
根据索引位置访问 0.19294241160815995
根据时间序列访问 0.19294241160815995
根据时间访问 0.19294241160815995
0.19294241160815995
返回整月数据 2016-09-01 0.192942 2016-09-10 0.821524 dtype: float64
返回整年数据 2016-09-01 0.192942 2016-09-10 0.821524 dtype: float64
利用pd.date_range() 生成时间序列 1 2 date_list_new = pd.date_range('2016-01-01' , periods=100 ) date_list_new
DatetimeIndex([‘2016-01-01’, ‘2016-01-02’, ‘2016-01-03’, ‘2016-01-04’, ‘2016-01-05’, ‘2016-01-06’, ‘2016-01-07’, ‘2016-01-08’, ‘2016-01-09’, ‘2016-01-10’, ‘2016-01-11’, ‘2016-01-12’, ‘2016-01-13’, ‘2016-01-14’, ‘2016-01-15’, ‘2016-01-16’, ‘2016-01-17’, ‘2016-01-18’, ‘2016-01-19’, ‘2016-01-20’, ‘2016-01-21’, ‘2016-01-22’, ‘2016-01-23’, ‘2016-01-24’, ‘2016-01-25’, ‘2016-01-26’, ‘2016-01-27’, ‘2016-01-28’, ‘2016-01-29’, ‘2016-01-30’, ‘2016-01-31’, ‘2016-02-01’, ‘2016-02-02’, ‘2016-02-03’, ‘2016-02-04’, ‘2016-02-05’, ‘2016-02-06’, ‘2016-02-07’, ‘2016-02-08’, ‘2016-02-09’, ‘2016-02-10’, ‘2016-02-11’, ‘2016-02-12’, ‘2016-02-13’, ‘2016-02-14’, ‘2016-02-15’, ‘2016-02-16’, ‘2016-02-17’, ‘2016-02-18’, ‘2016-02-19’, ‘2016-02-20’, ‘2016-02-21’, ‘2016-02-22’, ‘2016-02-23’, ‘2016-02-24’, ‘2016-02-25’, ‘2016-02-26’, ‘2016-02-27’, ‘2016-02-28’, ‘2016-02-29’, ‘2016-03-01’, ‘2016-03-02’, ‘2016-03-03’, ‘2016-03-04’, ‘2016-03-05’, ‘2016-03-06’, ‘2016-03-07’, ‘2016-03-08’, ‘2016-03-09’, ‘2016-03-10’, ‘2016-03-11’, ‘2016-03-12’, ‘2016-03-13’, ‘2016-03-14’, ‘2016-03-15’, ‘2016-03-16’, ‘2016-03-17’, ‘2016-03-18’, ‘2016-03-19’, ‘2016-03-20’, ‘2016-03-21’, ‘2016-03-22’, ‘2016-03-23’, ‘2016-03-24’, ‘2016-03-25’, ‘2016-03-26’, ‘2016-03-27’, ‘2016-03-28’, ‘2016-03-29’, ‘2016-03-30’, ‘2016-03-31’, ‘2016-04-01’, ‘2016-04-02’, ‘2016-04-03’, ‘2016-04-04’, ‘2016-04-05’, ‘2016-04-06’, ‘2016-04-07’, ‘2016-04-08’, ‘2016-04-09’], dtype=’datetime64[ns]’, freq=’D’)
从指定时间的(周一)开始生成时间序列 1 2 date_list_new = pd.date_range('2016-01-01' , periods=10 , freq='W-MON' ) date_list_new
DatetimeIndex([‘2016-01-04’, ‘2016-01-11’, ‘2016-01-18’, ‘2016-01-25’, ‘2016-02-01’, ‘2016-02-08’, ‘2016-02-15’, ‘2016-02-22’, ‘2016-02-29’, ‘2016-03-07’], dtype=’datetime64[ns]’, freq=’W-MON’)
1 2 date_list_new = pd.date_range('2016-01-01' , periods=10 , freq='5H' ) date_list_new
DatetimeIndex([‘2016-01-01 00:00:00’, ‘2016-01-01 05:00:00’, ‘2016-01-01 10:00:00’, ‘2016-01-01 15:00:00’, ‘2016-01-01 20:00:00’, ‘2016-01-02 01:00:00’, ‘2016-01-02 06:00:00’, ‘2016-01-02 11:00:00’, ‘2016-01-02 16:00:00’, ‘2016-01-02 21:00:00’], dtype=’datetime64[ns]’, freq=’5H’)
1 2 s2 = Series(np.random.rand(10 ), index=date_list_new) s2
2016-01-01 00:00:00 0.312884 2016-01-01 05:00:00 0.337516 2016-01-01 10:00:00 0.911010 2016-01-01 15:00:00 0.055390 2016-01-01 20:00:00 0.810770 2016-01-02 01:00:00 0.313263 2016-01-02 06:00:00 0.025229 2016-01-02 11:00:00 0.229192 2016-01-02 16:00:00 0.509811 2016-01-02 21:00:00 0.169681 Freq: 5H, dtype: float64
时间序列数据的采样和画图 1 2 3 4 5 6 7 import numpy as npimport pandas as pdfrom pandas import Series, DataFramefrom datetime import datetimet_range = pd.date_range('2016-01-01' ,'2016-12-31' ) t_range
DatetimeIndex([‘2016-01-01’, ‘2016-01-02’, ‘2016-01-03’, ‘2016-01-04’, ‘2016-01-05’, ‘2016-01-06’, ‘2016-01-07’, ‘2016-01-08’, ‘2016-01-09’, ‘2016-01-10’, … ‘2016-12-22’, ‘2016-12-23’, ‘2016-12-24’, ‘2016-12-25’, ‘2016-12-26’, ‘2016-12-27’, ‘2016-12-28’, ‘2016-12-29’, ‘2016-12-30’, ‘2016-12-31’], dtype=’datetime64[ns]’, length=366, freq=’D’)
1 2 3 s1 = Series(np.random.rand(len (t_range)), index=t_range) s1['2016-01' ].mean()
0.48349408856972315
按月份采样,计算平均值 1 2 3 s1_month = s1.resample('M' ).mean() s1_month.index
DatetimeIndex([‘2016-01-31’, ‘2016-02-29’, ‘2016-03-31’, ‘2016-04-30’, ‘2016-05-31’, ‘2016-06-30’, ‘2016-07-31’, ‘2016-08-31’, ‘2016-09-30’, ‘2016-10-31’, ‘2016-11-30’, ‘2016-12-31’], dtype=’datetime64[ns]’, freq=’M’)
按小时采样,并(向前)填充数据:ffill 用前一个有效观测值向前填充 1 s1.resample('H' ).ffill()
2016-01-01 00:00:00 0.196584 2016-01-01 01:00:00 0.196584 2016-01-01 02:00:00 0.196584 2016-01-01 03:00:00 0.196584 2016-01-01 04:00:00 0.196584 2016-01-01 05:00:00 0.196584 2016-01-01 06:00:00 0.196584 2016-01-01 07:00:00 0.196584 2016-01-01 08:00:00 0.196584 2016-01-01 09:00:00 0.196584 2016-01-01 10:00:00 0.196584 2016-01-01 11:00:00 0.196584 2016-01-01 12:00:00 0.196584 2016-01-01 13:00:00 0.196584 2016-01-01 14:00:00 0.196584 2016-01-01 15:00:00 0.196584 2016-01-01 16:00:00 0.196584 2016-01-01 17:00:00 0.196584 2016-01-01 18:00:00 0.196584 2016-01-01 19:00:00 0.196584 2016-01-01 20:00:00 0.196584 2016-01-01 21:00:00 0.196584 2016-01-01 22:00:00 0.196584 2016-01-01 23:00:00 0.196584 2016-01-02 00:00:00 0.209433 2016-01-02 01:00:00 0.209433 2016-01-02 02:00:00 0.209433 2016-01-02 03:00:00 0.209433 2016-01-02 04:00:00 0.209433 2016-01-02 05:00:00 0.209433 … 2016-12-29 19:00:00 0.287915 2016-12-29 20:00:00 0.287915 2016-12-29 21:00:00 0.287915 2016-12-29 22:00:00 0.287915 2016-12-29 23:00:00 0.287915 2016-12-30 00:00:00 0.023052 2016-12-30 01:00:00 0.023052 2016-12-30 02:00:00 0.023052 2016-12-30 03:00:00 0.023052 2016-12-30 04:00:00 0.023052 2016-12-30 05:00:00 0.023052 2016-12-30 06:00:00 0.023052 2016-12-30 07:00:00 0.023052 2016-12-30 08:00:00 0.023052 2016-12-30 09:00:00 0.023052 2016-12-30 10:00:00 0.023052 2016-12-30 11:00:00 0.023052 2016-12-30 12:00:00 0.023052 2016-12-30 13:00:00 0.023052 2016-12-30 14:00:00 0.023052 2016-12-30 15:00:00 0.023052 2016-12-30 16:00:00 0.023052 2016-12-30 17:00:00 0.023052 2016-12-30 18:00:00 0.023052 2016-12-30 19:00:00 0.023052 2016-12-30 20:00:00 0.023052 2016-12-30 21:00:00 0.023052 2016-12-30 22:00:00 0.023052 2016-12-30 23:00:00 0.023052 2016-12-31 00:00:00 0.515360 Freq: H, Length: 8761, dtype: float64
基于时间序列画图 1 2 3 t_range = pd.date_range('2016-01-01' ,'2016-12-31' ,freq='H' ) t_range
DatetimeIndex([‘2016-01-01 00:00:00’, ‘2016-01-01 01:00:00’, ‘2016-01-01 02:00:00’, ‘2016-01-01 03:00:00’, ‘2016-01-01 04:00:00’, ‘2016-01-01 05:00:00’, ‘2016-01-01 06:00:00’, ‘2016-01-01 07:00:00’, ‘2016-01-01 08:00:00’, ‘2016-01-01 09:00:00’, … ‘2016-12-30 15:00:00’, ‘2016-12-30 16:00:00’, ‘2016-12-30 17:00:00’, ‘2016-12-30 18:00:00’, ‘2016-12-30 19:00:00’, ‘2016-12-30 20:00:00’, ‘2016-12-30 21:00:00’, ‘2016-12-30 22:00:00’, ‘2016-12-30 23:00:00’, ‘2016-12-31 00:00:00’], dtype=’datetime64[ns]’, length=8761, freq=’H’)
1 2 3 4 5 stock_df = DataFrame(index=t_range) stock_df['BABA' ] = np.random.randint(80 ,160 , size=len (t_range)) stock_df['TENCENT' ] = np.random.randint(30 ,50 , size=len (t_range)) stock_df.head()
BABA TENCENT 2016-01-01 00:00:00 134 43 2016-01-01 01:00:00 98 35 2016-01-01 02:00:00 150 32 2016-01-01 03:00:00 96 37 2016-01-01 04:00:00 140 40
1 2 import matplotlib.pyplot as pltplt.show()
1 2 3 4 5 weekly_df = DataFrame() weekly_df['BABA' ] = stock_df['BABA' ].resample('W' ).mean() weekly_df['TENCENT' ] = stock_df['TENCENT' ].resample('W' ).mean() weekly_df.head()
BABA TENCENT 2016-01-03 118.777778 38.638889 2016-01-10 120.690476 40.077381 2016-01-17 120.678571 39.452381 2016-01-24 119.833333 39.386905 2016-01-31 119.577381 39.416667
数据分箱Binning 对list进行分箱 1 2 3 4 5 6 7 import numpy as npimport pandas as pdfrom pandas import Series, DataFramescore_list = np.random.randint(25 ,100 ,size=20 ) score_list
array([95, 75, 43, 49, 26, 95, 28, 55, 93, 97, 91, 46, 39, 88, 32, 97, 93, 80, 57, 70])
1 2 3 4 5 bins = [0 ,59 ,70 ,80 ,100 ] score_cat = pd.cut(score_list, bins) pd.value_counts(score_cat)
(0, 59] 9 (80, 100] 8 (70, 80] 2 (59, 70] 1 dtype: int64
对DataFrame进行分箱 1 2 3 4 df = DataFrame() df['score' ] = score_list df
score 0 95 1 75 2 43 3 49 4 26 5 95 6 28 7 55 8 93 9 97 10 91 11 46 12 39 13 88 14 32 15 97 16 93 17 80 18 57 19 70
利用 pd.util.testing.rands() 随机生成指定位数的字符串 1 2 3 df['student' ] = [pd.util.testing.rands(3 ) for i in range (20 )] df
score student 0 95 PqQ 1 75 ds2 2 43 iUV 3 49 JjY 4 26 FK3 5 95 b9b 6 28 DyV 7 55 103 8 93 RUC 9 97 VqI 10 91 sjo 11 46 cCl 12 39 TeO 13 88 xap 14 32 GfG 15 97 FNg 16 93 pk0 17 80 ziG 18 57 5k0 19 70 ozP
通过 pd.cut() 的 labels 参数,对数据进行分类打标签 1 2 3 df['Categories' ] = pd.cut(df['score' ], bins, labels=['Low' , 'OK' , 'Good' , 'Great' ]) df
score student Categories 0 95 PqQ Great 1 75 ds2 Good 2 43 iUV Low 3 49 JjY Low 4 26 FK3 Low 5 95 b9b Great 6 28 DyV Low 7 55 103 Low 8 93 RUC Great 9 97 VqI Great 10 91 sjo Great 11 46 cCl Low 12 39 TeO Low 13 88 xap Great 14 32 GfG Low 15 97 FNg Great 16 93 pk0 Great 17 80 ziG Good 18 57 5k0 Low 19 70 ozP OK
数据分组技术GroupBy 1 2 3 4 5 6 7 import numpy as npimport pandas as pdfrom pandas import Series, DataFrame df = pd.read_csv('city_weather.csv' ) df.head()
date city temperature wind 0 03/01/2016 BJ 8 5 1 17/01/2016 BJ 12 2 2 31/01/2016 BJ 19 2 3 14/02/2016 BJ -3 3 4 28/02/2016 BJ 19 2
1 2 3 g = df.groupby(df['city' ]) g
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x119b0d6d8>
获取分组内容 1 2 3 4 5 g.groups df_bj = g.get_group('BJ' ) df_bj
date city temperature wind 0 03/01/2016 BJ 8 5 1 17/01/2016 BJ 12 2 2 31/01/2016 BJ 19 2 3 14/02/2016 BJ -3 3 4 28/02/2016 BJ 19 2 5 13/03/2016 BJ 5 3
temperature 10.000000 wind 2.833333 dtype: float64
对分组取平均值 temperature wind city BJ 10.000 2.833333 GZ 8.750 4.000000 SH 4.625 3.625000 SZ 5.000 2.500000
对分组取最小值 date temperature wind city BJ 03/01/2016 -3 2 GZ 14/08/2016 -1 2 SH 03/07/2016 -10 2 SZ 11/09/2016 -10 1
groupby对象 => list => dict date city temperature wind 0 03/01/2016 BJ 8 5 1 17/01/2016 BJ 12 2 2 31/01/2016 BJ 19 2 3 14/02/2016 BJ -3 3 4 28/02/2016 BJ 19 2 5 13/03/2016 BJ 5 3
for循环遍历groupby变量 1 2 3 for name, group_df in g: print (name) print (group_df)
BJ date city temperature wind 0 03/01/2016 BJ 8 5 1 17/01/2016 BJ 12 2 2 31/01/2016 BJ 19 2 3 14/02/2016 BJ -3 3 4 28/02/2016 BJ 19 2 5 13/03/2016 BJ 5 3 GZ date city temperature wind 14 17/07/2016 GZ 10 2 15 31/07/2016 GZ -1 5 16 14/08/2016 GZ 1 5 17 28/08/2016 GZ 25 4 SH date city temperature wind 6 27/03/2016 SH -4 4 7 10/04/2016 SH 19 3 8 24/04/2016 SH 20 3 9 08/05/2016 SH 17 3 10 22/05/2016 SH 4 2 11 05/06/2016 SH -10 4 12 19/06/2016 SH 0 5 13 03/07/2016 SH -9 5 SZ date city temperature wind 18 11/09/2016 SZ 20 1 19 25/09/2016 SZ -10 4
数据聚合技术Aggregation 1 2 3 4 5 6 7 8 import numpy as npimport pandas as pdfrom pandas import Series, DataFramedf = pd.read_csv('city_weather.csv' ) g = df.groupby('city' ) g
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x11b921cc0>
date temperature wind city BJ 31/01/2016 19 5 GZ 31/07/2016 25 5 SH 27/03/2016 20 5 SZ 25/09/2016 20 4
date temperature wind city BJ 6 6 6 GZ 4 4 4 SH 8 8 8 SZ 2 2 2
temperature wind count mean std min 25% 50% 75% max count mean std min 25% 50% 75% max city BJ 6.0 10.000 8.532292 -3.0 5.75 10.0 17.25 19.0 6.0 2.833333 1.169045 2.0 2.00 2.5 3.00 5.0 GZ 4.0 8.750 11.842719 -1.0 0.50 5.5 13.75 25.0 4.0 4.000000 1.414214 2.0 3.50 4.5 5.00 5.0 SH 8.0 4.625 12.489281 -10.0 -5.25 2.0 17.50 20.0 8.0 3.625000 1.060660 2.0 3.00 3.5 4.25 5.0 SZ 2.0 5.000 21.213203 -10.0 -2.50 5.0 12.50 20.0 2.0 2.500000 2.121320 1.0 1.75 2.5 3.25 4.0
date temperature wind city BJ 03/01/2016 -3 2 GZ 14/08/2016 -1 2 SH 03/07/2016 -10 2 SZ 11/09/2016 -10 1
# Custom aggregation function: the spread (max - min) within each group.
def foo(attr):
    """Return the range (maximum minus minimum) of `attr`.

    Works with any object exposing max()/min() methods — e.g. a pandas
    Series or a grouped column — so it can be passed to GroupBy.agg(foo).
    """
    return attr.max() - attr.min()
temperature wind city BJ 22 3 GZ 26 3 SH 30 3 SZ 30 3
对多个列进行groupby 1 2 3 g_new = df.groupby(['city' ,'wind' ]) g_new
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x11ba83208>
{(‘BJ’, 2): Int64Index([1, 2, 4], dtype=’int64’), (‘BJ’, 3): Int64Index([3, 5], dtype=’int64’), (‘BJ’, 5): Int64Index([0], dtype=’int64’), (‘GZ’, 2): Int64Index([14], dtype=’int64’), (‘GZ’, 4): Int64Index([17], dtype=’int64’), (‘GZ’, 5): Int64Index([15, 16], dtype=’int64’), (‘SH’, 2): Int64Index([10], dtype=’int64’), (‘SH’, 3): Int64Index([7, 8, 9], dtype=’int64’), (‘SH’, 4): Int64Index([6, 11], dtype=’int64’), (‘SH’, 5): Int64Index([12, 13], dtype=’int64’), (‘SZ’, 1): Int64Index([18], dtype=’int64’), (‘SZ’, 4): Int64Index([19], dtype=’int64’)}
1 g_new.get_group(('BJ' ,2 ))
date city temperature wind 1 17/01/2016 BJ 12 2 2 31/01/2016 BJ 19 2 4 28/02/2016 BJ 19 2
for循环遍历多列groupby变量 1 2 3 for (name_1, name_2), group in g_new: print (name_1,name_2) print (group)
BJ 2 date city temperature wind 1 17/01/2016 BJ 12 2 2 31/01/2016 BJ 19 2 4 28/02/2016 BJ 19 2 BJ 3 date city temperature wind 3 14/02/2016 BJ -3 3 5 13/03/2016 BJ 5 3 BJ 5 date city temperature wind 0 03/01/2016 BJ 8 5 GZ 2 date city temperature wind 14 17/07/2016 GZ 10 2 GZ 4 date city temperature wind 17 28/08/2016 GZ 25 4 GZ 5 date city temperature wind 15 31/07/2016 GZ -1 5 16 14/08/2016 GZ 1 5 SH 2 date city temperature wind 10 22/05/2016 SH 4 2 SH 3 date city temperature wind 7 10/04/2016 SH 19 3 8 24/04/2016 SH 20 3 9 08/05/2016 SH 17 3 SH 4 date city temperature wind 6 27/03/2016 SH -4 4 11 05/06/2016 SH -10 4 SH 5 date city temperature wind 12 19/06/2016 SH 0 5 13 03/07/2016 SH -9 5 SZ 1 date city temperature wind 18 11/09/2016 SZ 20 1 SZ 4 date city temperature wind 19 25/09/2016 SZ -10 4
透视表 1 2 3 4 5 6 7 import numpy as npimport pandas as pdfrom pandas import Series, DataFramedf = pd.read_excel('sales-funnel.xlsx' ) df
Account Name Rep Manager Product Quantity Price Status 0 714466 Trantow-Barrows Craig Booker Debra Henley CPU 1 30000 presented 1 714466 Trantow-Barrows Craig Booker Debra Henley Software 1 10000 presented 2 714466 Trantow-Barrows Craig Booker Debra Henley Maintenance 2 5000 pending 3 737550 Fritsch, Russel and Anderson Craig Booker Debra Henley CPU 1 35000 declined 4 146832 Kiehn-Spinka Daniel Hilton Debra Henley CPU 2 65000 won 5 218895 Kulas Inc Daniel Hilton Debra Henley CPU 2 40000 pending 6 218895 Kulas Inc Daniel Hilton Debra Henley Software 1 10000 presented 7 412290 Jerde-Hilpert John Smith Debra Henley Maintenance 2 5000 pending 8 740150 Barton LLC John Smith Debra Henley CPU 1 35000 declined 9 141962 Herman LLC Cedric Moss Fred Anderson CPU 2 65000 won 10 163416 Purdy-Kunde Cedric Moss Fred Anderson CPU 1 30000 presented 11 239344 Stokes LLC Cedric Moss Fred Anderson Maintenance 1 5000 pending 12 239344 Stokes LLC Cedric Moss Fred Anderson Software 1 10000 presented 13 307599 Kassulke, Ondricka and Metz Wendy Yule Fred Anderson Maintenance 3 7000 won 14 688981 Keeling LLC Wendy Yule Fred Anderson CPU 5 100000 won 15 729833 Koepp Ltd Wendy Yule Fred Anderson CPU 2 65000 declined 16 729833 Koepp Ltd Wendy Yule Fred Anderson Monitor 2 5000 presented
136
17
1 pd.pivot_table(df, index=['Name' ,'Rep' ,'Manager' ])
Account Price Quantity Name Rep Manager Barton LLC John Smith Debra Henley 740150 35000 1.000000 Fritsch, Russel and Anderson Craig Booker Debra Henley 737550 35000 1.000000 Herman LLC Cedric Moss Fred Anderson 141962 65000 2.000000 Jerde-Hilpert John Smith Debra Henley 412290 5000 2.000000 Kassulke, Ondricka and Metz Wendy Yule Fred Anderson 307599 7000 3.000000 Keeling LLC Wendy Yule Fred Anderson 688981 100000 5.000000 Kiehn-Spinka Daniel Hilton Debra Henley 146832 65000 2.000000 Koepp Ltd Wendy Yule Fred Anderson 729833 35000 2.000000 Kulas Inc Daniel Hilton Debra Henley 218895 25000 1.500000 Purdy-Kunde Cedric Moss Fred Anderson 163416 30000 1.000000 Stokes LLC Cedric Moss Fred Anderson 239344 7500 1.000000 Trantow-Barrows Craig Booker Debra Henley 714466 15000 1.333333
1 pd.pivot_table(df, index=['Manager' ,'Rep' ], values=['Price' , 'Quantity' ], aggfunc='sum' )
Price Quantity Manager Rep Debra Henley Craig Booker 80000 5 Daniel Hilton 115000 5 John Smith 40000 3 Fred Anderson Cedric Moss 110000 5 Wendy Yule 177000 12
1 pd.pivot_table(df, index=['Manager' ,'Rep' ], values=['Price' ], columns=['Product' ], fill_value=0 , aggfunc='sum' )
Price Product CPU Maintenance Monitor Software Manager Rep Debra Henley Craig Booker 65000 5000 0 10000 Daniel Hilton 105000 0 0 10000 John Smith 35000 5000 0 0 Fred Anderson Cedric Moss 95000 5000 0 10000 Wendy Yule 165000 7000 5000 0
分组和透视功能 1 2 3 4 5 6 7 import numpy as npimport pandas as pdfrom pandas import Series, DataFramedf = pd.read_csv('usa_flights.csv' ) df.head()
flight_date unique_carrier flight_num origin dest arr_delay cancelled distance carrier_delay weather_delay late_aircraft_delay nas_delay security_delay actual_elapsed_time 0 02/01/2015 0:00 AA 1 JFK LAX -19.0 0 2475 NaN NaN NaN NaN NaN 381.0 1 03/01/2015 0:00 AA 1 JFK LAX -39.0 0 2475 NaN NaN NaN NaN NaN 358.0 2 04/01/2015 0:00 AA 1 JFK LAX -12.0 0 2475 NaN NaN NaN NaN NaN 385.0 3 05/01/2015 0:00 AA 1 JFK LAX -8.0 0 2475 NaN NaN NaN NaN NaN 389.0 4 06/01/2015 0:00 AA 1 JFK LAX 25.0 0 2475 0.0 0.0 0.0 25.0 0.0 424.0
(201664, 14)
flight_date unique_carrier flight_num origin dest arr_delay cancelled distance carrier_delay weather_delay late_aircraft_delay nas_delay security_delay actual_elapsed_time 201659 10/01/2015 0:00 NK 188 OAK LAS -16.0 0 407 NaN NaN NaN NaN NaN 77.0 201660 11/01/2015 0:00 NK 188 OAK LAS -4.0 0 407 NaN NaN NaN NaN NaN 87.0 201661 12/01/2015 0:00 NK 188 OAK LAS -7.0 0 407 NaN NaN NaN NaN NaN 82.0 201662 13/01/2015 0:00 NK 188 OAK LAS 23.0 0 407 3.0 0.0 0.0 20.0 0.0 103.0 201663 14/01/2015 0:00 NK 188 OAK LAS -7.0 0 407 NaN NaN NaN NaN NaN 82.0
获取延误时间Top 10的航班 1 df.sort_values('arr_delay' , ascending=False )[:10 ]
flight_date unique_carrier flight_num origin dest arr_delay cancelled distance carrier_delay weather_delay late_aircraft_delay nas_delay security_delay actual_elapsed_time 11073 11/01/2015 0:00 AA 1595 AUS DFW 1444.0 0 190 1444.0 0.0 0.0 0.0 0.0 59.0 10214 13/01/2015 0:00 AA 1487 OMA DFW 1392.0 0 583 1392.0 0.0 0.0 0.0 0.0 117.0 12430 03/01/2015 0:00 AA 1677 MEM DFW 1384.0 0 432 1380.0 0.0 0.0 4.0 0.0 104.0 8443 04/01/2015 0:00 AA 1279 OMA DFW 1237.0 0 583 1222.0 0.0 15.0 0.0 0.0 102.0 10328 05/01/2015 0:00 AA 1495 EGE DFW 1187.0 0 721 1019.0 0.0 168.0 0.0 0.0 127.0 36570 04/01/2015 0:00 DL 1435 MIA MSP 1174.0 0 1501 1174.0 0.0 0.0 0.0 0.0 231.0 36495 04/01/2015 0:00 DL 1367 ROC ATL 1138.0 0 749 1112.0 0.0 0.0 26.0 0.0 171.0 59072 14/01/2015 0:00 DL 1687 SAN MSP 1084.0 0 1532 1070.0 0.0 0.0 14.0 0.0 240.0 32173 05/01/2015 0:00 AA 970 LAS LAX 1042.0 0 236 1033.0 0.0 9.0 0.0 0.0 66.0 56488 12/01/2015 0:00 DL 2117 ATL COS 1016.0 0 1184 1016.0 0.0 0.0 0.0 0.0 193.0
计算延误和没有延误所占比例 1 df['cancelled' ].value_counts()
0 196873 1 4791 Name: cancelled, dtype: int64
1 2 3 df['delayed' ] = df['arr_delay' ].apply(lambda x:x>0 ) df.head()
flight_date unique_carrier flight_num origin dest arr_delay cancelled distance carrier_delay weather_delay late_aircraft_delay nas_delay security_delay actual_elapsed_time delayed 0 02/01/2015 0:00 AA 1 JFK LAX -19.0 0 2475 NaN NaN NaN NaN NaN 381.0 False 1 03/01/2015 0:00 AA 1 JFK LAX -39.0 0 2475 NaN NaN NaN NaN NaN 358.0 False 2 04/01/2015 0:00 AA 1 JFK LAX -12.0 0 2475 NaN NaN NaN NaN NaN 385.0 False 3 05/01/2015 0:00 AA 1 JFK LAX -8.0 0 2475 NaN NaN NaN NaN NaN 389.0 False 4 06/01/2015 0:00 AA 1 JFK LAX 25.0 0 2475 0.0 0.0 0.0 25.0 0.0 424.0 True
1 2 3 delay_data = df['delayed' ].value_counts() delay_data
False 103037 True 98627 Name: delayed, dtype: int64
1 delay_data[1 ]/(delay_data[0 ] + delay_data[1 ])
0.4890659711202793
每个航空公司延误情况 1 2 3 delay_group = df.groupby(['unique_carrier' ,'delayed' ]) delay_group
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x118b616d8>
unique_carrier delayed AA False 8912 True 9841 AS False 3527 True 2104 B6 False 4832 True 4401 DL False 17719 True 9803 EV False 10596 True 11371 F9 False 1103 True 1848 HA False 1351 True 1354 MQ False 4692 True 8060 NK False 1550 True 2133 OO False 9977 True 10804 UA False 7885 True 8624 US False 7850 True 6353 VX False 1254 True 781 WN False 21789 True 21150 dtype: int64
1 2 3 df_delay = delay_group.size().unstack() df_delay
delayed False True unique_carrier AA 8912 9841 AS 3527 2104 B6 4832 4401 DL 17719 9803 EV 10596 11371 F9 1103 1848 HA 1351 1354 MQ 4692 8060 NK 1550 2133 OO 9977 10804 UA 7885 8624 US 7850 6353 VX 1254 781 WN 21789 21150
1 2 3 import matplotlib.pyplot as pltdf_delay.plot(kind='barh' ,stacked=True ,figsize=[16 ,6 ],colormap='winter' )
透视表 1 2 3 flight_by_carrier = df.pivot_table(index=['flight_date' ],columns=['unique_carrier' ],values=['flight_num' ],aggfunc='count' ) flight_by_carrier.head()
flight_num unique_carrier AA AS B6 DL EV F9 HA MQ NK OO UA US VX WN flight_date 02/01/2015 0:00 1545 477 759 2271 1824 254 224 1046 287 1763 1420 1177 176 3518 03/01/2015 0:00 1453 449 711 2031 1744 192 202 937 285 1681 1233 1028 160 3328 04/01/2015 0:00 1534 458 759 2258 1833 249 206 1027 284 1731 1283 1158 169 3403 05/01/2015 0:00 1532 433 754 2212 1811 264 209 1039 288 1737 1432 1157 174 3506 06/01/2015 0:00 1400 415 692 2054 1686 249 202 966 279 1527 1294 1003 152 3396
Matplotlib 简单绘图之plot 1 2 3 4 5 import numpy as npimport matplotlib.pyplot as plta = [1 ,2 ,3 ] plt.plot(a)
plt.show() 开始绘图 1 2 b = [4 ,5 ,6 ] plt.plot(a,b)
魔法命令,可直接显示图像 显示运行时间 594 ns ± 9.52 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
一张图中画多条线 1 2 3 4 c = [10 ,8 ,6 ] d = [1 ,8 ,3 ] plt.plot(a,b,'r--' ,c,d,'b*' )
画三角函数,并设置label,label,title 1 2 t = np.arange(0.0 ,2.0 ,0.1 ) t.size
20
1 2 s = np.sin(t * np.pi) s.size
20
array([ 0.00000000e+00, 3.09016994e-01, 5.87785252e-01, 8.09016994e-01, 9.51056516e-01, 1.00000000e+00, 9.51056516e-01, 8.09016994e-01, 5.87785252e-01, 3.09016994e-01, 1.22464680e-16, -3.09016994e-01, -5.87785252e-01, -8.09016994e-01, -9.51056516e-01, -1.00000000e+00, -9.51056516e-01, -8.09016994e-01, -5.87785252e-01, -3.09016994e-01])
1 2 3 4 5 6 plt.plot(t,s,'r--' ,label='aaaa' ) plt.plot(t*2 ,s,'--' ,label='bbbb' ) plt.xlabel('this is x' ) plt.ylabel('this is y' ) plt.title('this is a demo' ) plt.legend()
简单绘图之subplot 1 2 3 4 5 6 7 8 9 10 11 12 13 14 import numpy as npimport matplotlib.pyplot as pltx = np.linspace(0 ,5 ) y1 = np.sin(np.pi*x) y2 = np.sin(np.pi*x*2 ) plt.plot(x, y1, 'b--' , label='sin(pi*x)' ) plt.ylabel('y1 value' ) plt.plot(x, y2, 'r--' , label='sin(pi*2x)' ) plt.ylabel('y2 value' ) plt.xlabel('x value' ) plt.title('this is x-y value' ) plt.legend()
1 2 3 4 5 6 7 8 9 10 11 plt.subplot(2 ,2 ,1 ) plt.plot(x, y1, 'b--' ) plt.ylabel('y1' ) plt.subplot(2 ,2 ,2 ) plt.plot(x, y2, 'r--' ) plt.ylabel('y2' ) plt.xlabel('x' ) plt.subplot(2 ,2 ,3 ) plt.plot(x, y1, 'r*' ) plt.subplot(2 ,2 ,4 ) plt.plot(x, y1, 'b*' )
subplots 1 figure, ax = plt.subplots(2 ,2 )
1 2 3 4 ax[0 ][0 ].plot(x,y1) ax[0 ][1 ].plot(x,y2) figure
Series 1 2 3 4 5 6 7 import numpy as npimport pandas as pdfrom pandas import Seriesimport matplotlib.pyplot as plts1 = Series(np.random.randn(100 ).cumsum()) s2 = Series(np.random.randn(100 ).cumsum())
1 2 3 s1.plot(kind='line' , grid=True , label='S1' , title='this is Series' ) s2.plot(label='S2' , style='--' ) plt.legend()
1 2 3 figure, ax = plt.subplots(2 ,1 ) figure
1 2 3 s1[0 :10 ].plot(ax=ax[0 ],label='S1' ,kind='bar' ) s2.plot(ax=ax[1 ],label='S2' ) figure
DataFrame 1 2 3 4 5 6 7 8 9 10 11 import numpy as npimport pandas as pdfrom pandas import Series, DataFrameimport matplotlib.pyplot as pltdf = DataFrame( np.random.randint(1 ,10 ,40 ).reshape(10 ,4 ), columns=['A' ,'B' ,'C' ,'D' ] ) df
A B C D 0 2 1 9 8 1 2 3 9 1 2 2 5 9 2 3 4 1 8 6 4 1 9 8 9 5 5 4 6 5 6 7 2 1 8 7 1 4 1 8 8 8 1 2 9 9 2 4 5 4
横向柱状图
Stacked柱状图 1 df.plot(kind='bar' ,stacked=True )
区域填充
指定index画图 1 2 3 a = df.iloc[5 ] a.plot()
按index画图(方法1) 1 2 3 4 for i in df.index: df.iloc[i].plot(label=str (i)) plt.legend()
按index画图(方法2)
按column画图
直方图 1 2 3 4 5 6 7 8 import numpy as npimport pandas as pdfrom pandas import Series, DataFrameimport matplotlib.pyplot as plts = Series(np.random.randn(1000 )) plt.hist(s, rwidth=0.9 )
array([ 1., 9., 42., 129., 243., 257., 198., 93., 24., 4.]), array([-3.75513064, -3.04405274, -2.33297485, -1.62189695, -0.91081905, -0.19974115, 0.51133675, 1.22241465, 1.93349255, 2.64457045, 3.35564835])
1 plt.hist(s, rwidth=0.9 , bins=20 , color='r' )
array([ 1., 0., 1., 8., 15., 27., 46., 83., 103., 140., 143., 114., 120., 78., 61., 32., 15., 9., 3., 1.]), array([-3.75513064, -3.39959169, -3.04405274, -2.68851379, -2.33297485, -1.9774359 , -1.62189695, -1.266358 , -0.91081905, -0.5552801 , -0.19974115, 0.1557978 , 0.51133675, 0.8668757 , 1.22241465, 1.5779536 , 1.93349255, 2.2890315 , 2.64457045, 3.0001094 , 3.35564835])
密度图
Seaborn
散点图 方法1:matplotlib 1 2 3 4 5 6 7 import numpy as npimport pandas as pdimport matplotlib.pyplot as pltimport seaborn as snsiris = pd.read_csv('iris.csv' ) iris.head()
SepalLength SepalWidth PetalLength PetalWidth Name 0 5.1 3.5 1.4 0.2 Iris-setosa 1 4.9 3.0 1.4 0.2 Iris-setosa 2 4.7 3.2 1.3 0.2 Iris-setosa 3 4.6 3.1 1.5 0.2 Iris-setosa 4 5.0 3.6 1.4 0.2 Iris-setosa
array([‘Iris-setosa’, ‘Iris-versicolor’, ‘Iris-virginica’], dtype=object)
1 2 3 color_map = dict (zip (iris.Name.unique(),['blue' ,'green' ,'red' ])) color_map
{‘Iris-setosa’: ‘blue’, ‘Iris-versicolor’: ‘green’, ‘Iris-virginica’: ‘red’}
1 2 3 4 5 6 7 8 9 for species, group in iris.groupby('Name' ): plt.scatter(group['PetalLength' ], group['SepalLength' ], color=color_map[species], alpha=0.3 , edgecolor=None , label=species) plt.legend(frameon=True , title='Name' ) plt.xlabel('petalLength' ) plt.ylabel('sepalLength' )
方法2:Seaborn 1 sns.lmplot('PetalLength' ,'SepalLength' , iris, hue='Name' , fit_reg=False )
直方图(matplotlib) 1 2 s1 = Series(np.random.randn(1000 )) plt.hist(s1)
(array([ 5., 36., 96., 196., 278., 227., 111., 37., 9., 5.]), array([-3.17567133, -2.48638745, -1.79710358, -1.1078197 , -0.41853582, 0.27074806, 0.96003194, 1.64931581, 2.33859969, 3.02788357, 3.71716745]), <a list of 10 Patch objects>)
密度图(matplotlib)
直方图(Seaborn) 1 sns.distplot(s1, hist=True , kde=False , rug=True )
密度图(Seaborn) 1 sns.kdeplot(s1, shade=True , color='r' )
轴须图(Seaborn)
热力图(Seaborn) 1 2 3 4 5 6 7 8 9 import numpy as npimport pandas as pdfrom pandas import Series, DataFrameimport matplotlib.pyplot as pltimport seaborn as snsdf = sns.load_dataset('flights' ) df.head()
year month passengers 0 1949 January 112 1 1949 February 118 2 1949 March 132 3 1949 April 129 4 1949 May 121
(144, 3)
生成透视表 1 2 df = df.pivot(index='month' , columns='year' , values='passengers' ) df
year 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 month January 112 115 145 171 196 204 242 284 315 340 360 417 February 118 126 150 180 196 188 233 277 301 318 342 391 March 132 141 178 193 236 235 267 317 356 362 406 419 April 129 135 163 181 235 227 269 313 348 348 396 461 May 121 125 172 183 229 234 270 318 355 363 420 472 June 135 149 178 218 243 264 315 374 422 435 472 535 July 148 170 199 230 264 302 364 413 465 491 548 622 August 148 170 199 242 272 293 347 405 467 505 559 606 September 136 158 184 209 237 259 312 355 404 404 463 508 October 119 133 162 191 211 229 274 306 347 359 407 461 November 104 114 146 172 180 203 237 271 305 310 362 390 December 118 140 166 194 201 229 278 306 336 337 405 432
线形图(Matplotlib)
热力图(Seaborn) 1 sns.heatmap(df, annot=True , fmt='d' )
柱状图(Matplotlib)
柱状图(Seaborn) Int64Index([1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960], dtype=’int64’, name=’year’)
array([1520, 1676, 2042, 2364, 2700, 2867, 3408, 3939, 4421, 4572, 5140, 5714])
1 sns.barplot(x=s.index, y=s.values)
# Seaborn: configuring figure aesthetics — demo data and plotting helper.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Two offset sine curves shared by all of the style demos below.
x = np.linspace(0, 14, 100)
y1 = np.sin(x)
y2 = np.sin(x + 2) * 1.25


def sinplot():
    """Plot the two demo sine curves on the current matplotlib axes.

    Reads the module-level x, y1, y2 arrays; draws via pyplot so each
    call after a sns.set_style(...) shows the newly configured style.
    """
    plt.plot(x, y1)
    plt.plot(x, y2)
设置Seaborn显示风格 1 2 3 4 5 import seaborn as snsstyle = ['darkgrid' , 'dark' , 'white' , 'whitegrid' , 'ticks' ] sns.set_style(style[0 ]) sinplot()
1 2 sns.set_style(style[3 ]) sinplot()
{‘axes.facecolor’: ‘white’, ‘axes.edgecolor’: ‘.8’, ‘axes.grid’: True, ‘axes.axisbelow’: True, ‘axes.labelcolor’: ‘.15’, ‘figure.facecolor’: ‘white’, ‘grid.color’: ‘.8’, ‘grid.linestyle’: ‘-‘, ‘text.color’: ‘.15’, ‘xtick.color’: ‘.15’, ‘ytick.color’: ‘.15’, ‘xtick.direction’: ‘out’, ‘ytick.direction’: ‘out’, ‘lines.solid_capstyle’: ‘round’, ‘patch.edgecolor’: ‘w’, ‘image.cmap’: ‘rocket’, ‘font.family’: [‘sans-serif’], ‘font.sans-serif’: [‘Arial’, ‘DejaVu Sans’, ‘Liberation Sans’, ‘Bitstream Vera Sans’, ‘sans-serif’], ‘patch.force_edgecolor’: True, ‘xtick.bottom’: False, ‘xtick.top’: False, ‘ytick.left’: False, ‘ytick.right’: False, ‘axes.spines.left’: True, ‘axes.spines.bottom’: True, ‘axes.spines.right’: True, ‘axes.spines.top’: True}
自定义配置 1 2 sns.set_style(style[0 ],{'grid.color' : 'green' }) sinplot()
清空自定义设置
plotting_context() & set_context() 1 2 3 4 context = ['paper' , 'notebook' , 'talk' , 'poster' ] sns.set_context(context[1 ], rc={'grid.linewidth' : 3.0 }) sinplot()
1 2 sns.set_context(context[3 ]) sinplot()
显示context详细设置 {‘font.size’: 24.0, ‘axes.labelsize’: 24.0, ‘axes.titlesize’: 24.0, ‘xtick.labelsize’: 22.0, ‘ytick.labelsize’: 22.0, ‘legend.fontsize’: 22.0, ‘axes.linewidth’: 2.5, ‘grid.linewidth’: 2.0, ‘lines.linewidth’: 3.0, ‘lines.markersize’: 12.0, ‘patch.linewidth’: 2.0, ‘xtick.major.width’: 2.5, ‘ytick.major.width’: 2.5, ‘xtick.minor.width’: 2.0, ‘ytick.minor.width’: 2.0, ‘xtick.major.size’: 12.0, ‘ytick.major.size’: 12.0, ‘xtick.minor.size’: 8.0, ‘ytick.minor.size’: 8.0}
股票分析

import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_datareader as pdr
from datetime import datetime
alibaba = pdr.get_data_yahoo('BABA')
alibaba.head()
High Low Open Close Volume Adj Close Date 2014-09-19 99.699997 89.949997 92.699997 93.889999 271879400 93.889999 2014-09-22 92.949997 89.500000 92.699997 89.889999 66657800 89.889999 2014-09-23 90.480003 86.620003 88.940002 87.169998 39009800 87.169998 2014-09-24 90.570000 87.220001 88.470001 90.570000 32088000 90.570000 2014-09-25 91.500000 88.500000 91.089996 88.919998 28598000 88.919998
(1211, 6)
High Low Open Close Volume Adj Close Date 2019-07-08 170.279999 168.350006 170.089996 168.449997 16531000 168.449997 2019-07-09 168.979996 166.500000 167.085007 168.800003 15782800 168.800003 2019-07-10 171.369995 166.860001 171.240005 166.929993 16675600 166.929993 2019-07-11 168.423996 165.000000 167.559998 166.550003 14796400 166.550003 2019-07-12 170.199997 167.649994 168.000000 169.070007 18093300 169.070007
High Low Open Close Volume Adj Close count 1211.000000 1211.000000 1211.000000 1211.000000 1.211000e+03 1211.000000 mean 126.715532 123.620233 125.280824 125.175351 1.760189e+07 125.175351 std 44.698800 43.686183 44.263113 44.208817 1.260813e+07 44.208817 min 58.650002 57.200001 57.299999 57.389999 3.775300e+06 57.389999 25% 85.129997 83.375000 84.594997 84.494999 1.107115e+07 84.494999 50% 108.820000 106.510002 107.809998 107.830002 1.489790e+07 107.830002 75% 173.044998 169.508499 171.739998 171.590004 2.063065e+07 171.590004 max 211.699997 207.509995 209.949997 210.860001 2.718794e+08 210.860001
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 1211 entries, 2014-09-19 to 2019-07-12 Data columns (total 6 columns): High 1211 non-null float64 Low 1211 non-null float64 Open 1211 non-null float64 Close 1211 non-null float64 Volume 1211 non-null int64 Adj Close 1211 non-null float64 dtypes: float64(5), int64(1) memory usage: 66.2 KB
start = datetime(2015, 9, 20)
alibaba = pdr.get_data_yahoo('BABA', start=start)
amazon = pdr.get_data_yahoo('AMZN', start=start)
alibaba.to_csv('/Users/kwang/jupyter-projects/BABA.csv')
amazon.to_csv('/Users/kwang/jupyter-projects/AMZN.csv')
alibaba.head()
High Low Open Close Volume Adj Close Date 2015-09-21 66.400002 62.959999 65.379997 63.900002 22355100 63.900002 2015-09-22 63.270000 61.580002 62.939999 61.900002 14897900 61.900002 2015-09-23 62.299999 59.680000 61.959999 60.000000 22684600 60.000000 2015-09-24 60.340000 58.209999 59.419998 59.919998 20645700 59.919998 2015-09-25 60.840000 58.919998 60.630001 59.240002 17009100 59.240002
alibaba['Adj Close'].plot(legend=True)
alibaba['Volume'].plot(legend=True)
alibaba['Adj Close'].plot()
amazon['Adj Close'].plot()
alibaba['high-low'] = alibaba['High'] - alibaba['Low']
alibaba.head()
High Low Open Close Volume Adj Close high-low Date 2015-09-21 66.400002 62.959999 65.379997 63.900002 22355100 63.900002 3.440002 2015-09-22 63.270000 61.580002 62.939999 61.900002 14897900 61.900002 1.689999 2015-09-23 62.299999 59.680000 61.959999 60.000000 22684600 60.000000 2.619999 2015-09-24 60.340000 58.209999 59.419998 59.919998 20645700 59.919998 2.130001 2015-09-25 60.840000 58.919998 60.630001 59.240002 17009100 59.240002 1.920002
alibaba['high-low'].plot()
alibaba['daily-return'] = alibaba['Adj Close'].pct_change()
alibaba['daily-return'].plot(figsize=(20, 4), linestyle='--', marker='o')
alibaba['daily-return'].plot(kind='hist')
sns.distplot(alibaba['daily-return'].dropna(), bins=100, color='purple')
start = datetime(2015, 1, 1)
company = ['AAPL', 'GOOG', 'MSFT', 'AMZN', 'FB']
top_tech_df = pdr.get_data_yahoo(company, start=start)['Adj Close']
top_tech_df.to_csv('/Users/kwang/jupyter-projects/top5.csv')
top_tech_df.head()
Symbols AAPL AMZN FB GOOG MSFT Date 2014-12-31 102.110046 310.350006 78.019997 524.958740 41.982117 2015-01-02 101.138702 308.519989 78.449997 523.373108 42.262299 2015-01-05 98.289474 302.190002 77.190002 512.463013 41.873657 2015-01-06 98.298729 295.290009 76.150002 500.585632 41.259060 2015-01-07 99.677094 298.420013 76.150002 499.727997 41.783279
top_tech_dr = top_tech_df.pct_change()
top_tech_dr.head()
Symbols AAPL AMZN FB GOOG MSFT Date 2014-12-31 NaN NaN NaN NaN NaN 2015-01-02 -0.009513 -0.005897 0.005511 -0.003020 0.006674 2015-01-05 -0.028171 -0.020517 -0.016061 -0.020846 -0.009196 2015-01-06 0.000094 -0.022833 -0.013473 -0.023177 -0.014677 2015-01-07 0.014022 0.010600 0.000000 -0.001713 0.012706
top_tech_df[['AAPL', 'FB', 'MSFT']].plot()
散点图

sns.jointplot('AMZN', 'GOOG', top_tech_dr, kind='scatter')
sns.jointplot('MSFT', 'FB', top_tech_dr, kind='scatter')
sns.pairplot(top_tech_dr.dropna())
分位数

top_tech_dr['AAPL'].quantile(0.05)
-0.02510024760167844
即：有95%的信心认为AAPL的单日收益（daily-return）不会低于约-2.5%（单日跌幅超过2.5%的概率仅约5%）。再看MSFT：

top_tech_dr['MSFT'].quantile(0.05)
-0.021409700990842318
vips = pdr.get_data_yahoo('VIPS', start=start)['Adj Close']
vips.plot()
vips.pct_change().quantile(0.2)
-0.02329943700527643