- 取得連結
- X
- 以電子郵件傳送
- 其他應用程式
程式語言:Python
Package:pandas
官方網站
官方文件
功能:數據處理分析
class pandas.DataFrame(data=None, index=None, columns=None, dtype=None, copy=False)
class pandas.Panel(data=None, items=None, major_axis=None, minor_axis=None, copy=False, dtype=None)
參考
Pandas Cheat Sheet: Data Wrangling in Python
Package:pandas
官方網站
官方文件
功能:數據處理分析
import numpy as np import pandas as pd import matplotlib.pyplot as plt df = pd.DataFrame({ 'A' : 1., 'B' : pd.Timestamp('20130102'), 'C' : pd.Series(np.linspace(1, 4, num=4),index=list(range(4)),dtype='float32'), 'D' : np.array([3] * 4,dtype='int32'), 'E' : pd.Categorical(["test","train","test","train"]), 'F' : 'foo' }) print(df) # A B C D E F # 0 1.0 2013-01-02 1.0 3 test foo # 1 1.0 2013-01-02 2.0 3 train foo # 2 1.0 2013-01-02 3.0 3 test foo # 3 1.0 2013-01-02 4.0 3 train foo print(df.dtypes) # A float64 # B datetime64[ns] # C float32 # D int32 # E category # F object # dtype: object df[['A', 'C']].plot() plt.legend(loc='best') plt.show()
資料架構
class pandas.Series(data=None, index=None, dtype=None, name=None, copy=False, fastpath=False)- pandas.Series
- 一維資料
- data
- 輸入的資料
- array-like
- dict
- scalar value (like 10)
- index
- 加入對應的 row name
- dtype
- 資料類型
- name
- 資料名字
- 轉換為 DataFrame 時,會成為 column name
- copy
- 是否重新 copy
- 若無 copy,請小心資料的更改
- fastpath
- 內部參數
範例
numpy 建立
dict 或 純值 建立
import numpy as np import pandas as pd data = np.linspace(1, 3, num=3) print(data) # [ 1. 2. 3.] s1 = pd.Series(data, index=['a', 'b', 'c']) print(s1) # a 1.0 # b 2.0 # c 3.0 # dtype: float64 s2 = pd.Series(data, index=['a', 'b', 'c'], copy=True, name='s2') print(s2) # a 1.0 # b 2.0 # c 3.0 # Name: s2, dtype: float64 # 可看到 column name 為 's2' print(pd.DataFrame(s2)) # s2 # a 1.0 # b 2.0 # c 3.0 print(s1[0]) # 1 print(s1['a']) # 1 data[0] = 10 # 可看到只有 s2 無改變,因是重新 copy print(data) # [ 10. 2. 3.] print(s1) # a 10.0 # b 2.0 # c 3.0 # dtype: float64 print(s2) # a 1.0 # b 2.0 # c 3.0 # dtype: float64
dict 或 純值 建立
import pandas as pd s3 = pd.Series({'a':2., 'b':5}) print(s3) # a 2.0 # b 5.0 # dtype: float64 s4 = pd.Series(10, index=['a', 'b', 'c', 'd', 'e']) print(s4) # a 10 # b 10 # c 10 # d 10 # e 10 # dtype: int64
class pandas.DataFrame(data=None, index=None, columns=None, dtype=None, copy=False)
- pandas.DataFrame
- 二維資料
- data
- 輸入的資料
- array-like
- Dict of 1D ndarrays, lists, dicts, or Series
- 單個 Series
- 另一個 DataFrame
- index
- 輸入資料擁有 row name,例:dict of Series, dicts
為指定讀取的 row name - 輸入資料無 row name,例:list、structured arrays
為加入對應的 row name - columns
- 輸入資料擁有 column name,例:dict of Series, dicts、structured arrays
為指定讀取的 column name - 輸入資料無 column name,例:list
為加入對應的 column name - dtype
- 資料類型
- copy
- 是否重新 copy
- 若無 copy,請小心資料的更改
範例
Series
Dict
Structured arrays
Multi-indexed frame
import numpy as np import pandas as pd # Series d = {'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']), 'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])} df = pd.DataFrame(d) print(df) # one two # a 1.0 1.0 # b 2.0 2.0 # c 3.0 3.0 # d NaN 4.0 # 指定 index print(pd.DataFrame(d, index=['d', 'b', 'a'])) # one two # d NaN 4.0 # b 2.0 2.0 # a 1.0 1.0 # 指定 column name print(pd.DataFrame(d, index=['d', 'b', 'a'], columns=['two', 'one', 'three'])) # two three # d 4.0 NaN # b 2.0 NaN # a 1.0 NaN
Dict
import numpy as np import pandas as pd # dict d = {'one' : [1., 2., 3., 4.], 'two' : [4., 3., 2., 1.]} print(pd.DataFrame(d)) # one two # 0 1.0 4.0 # 1 2.0 3.0 # 2 3.0 2.0 # 3 4.0 1.0 # 加入 index print(pd.DataFrame(d, index=['a', 'b', 'c', 'd'])) # one two # a 1.0 4.0 # b 2.0 3.0 # c 3.0 2.0 # d 4.0 1.0
Structured arrays
import numpy as np import pandas as pd # structured arrays # 建立四個空白資料,並擁有三種資料 'A' 'B' 'C',其對應資料類型為 integer 4 bytes, float 4 bytes, string 10 bytes data = np.zeros((4,), dtype=[('A', 'i4'),('B', 'f4'),('C', 'S10')]) data[:] = [(1,2.,'Hello'), (2,3.,"World"), (6,8.,"!!"), (9,11.,"??")] print(data) # [(1, 2., b'Hello') (2, 3., b'World') (6, 8., b'!!') (9, 11., b'??')] # 印出 A 的資料 print(data['A']) # [1 2 6 9] print(pd.DataFrame(data)) # A B C # 0 1 2.0 b'Hello' # 1 2 3.0 b'World' # 2 6 8.0 b'!!' # 3 9 11.0 b'??' print(pd.DataFrame(data, index=['first', 'second', 'three', 'four'])) # A B C # first 1 2.0 b'Hello' # second 2 3.0 b'World' # three 6 8.0 b'!!' # four 9 11.0 b'??' print(pd.DataFrame(data, columns=['C', 'A', 'B'])) # C A B # 0 b'Hello' 1 2.0 # 1 b'World' 2 3.0 # 2 b'!!' 6 8.0 # 3 b'??' 9 11.0
Multi-indexed frame
import numpy as np import pandas as pd # multi-indexed frame df = pd.DataFrame({('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2}, ('a', 'a'): {('A', 'C'): 3, ('A', 'B'): 4}, ('a', 'c'): {('A', 'B'): 5, ('A', 'C'): 6}, ('b', 'a'): {('A', 'C'): 7, ('A', 'B'): 8}, ('b', 'b'): {('A', 'D'): 9, ('A', 'B'): 10}}) print(df) # a b # a b c a b # A B 4.0 1.0 5.0 8.0 10.0 # C 3.0 2.0 6.0 7.0 NaN # D NaN NaN NaN NaN 9.0 print(df['b']['a']['A']['C']) # 7.0
class pandas.Panel(data=None, items=None, major_axis=None, minor_axis=None, copy=False, dtype=None)
- pandas.Panel
- 三維資料,但不常使用
- data
- 輸入的資料
- items
- 資料大項的名字
- major_axis
- 加入 row name
- minor_axis
- 加入 column name
- copy
- 是否重新 copy
- 若無 copy,請小心資料的更改
- dtype
- 資料類型
範例
numpy 建立
DataFrame 建立
import numpy as np import pandas as pd # 3D ndarray wp = pd.Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'], major_axis=pd.date_range('1/1/2000', periods=5), minor_axis=['A', 'B', 'C', 'D']) print(wp) # <class 'pandas.core.panel.Panel'> # Dimensions: 2 (items) x 5 (major_axis) x 4 (minor_axis) # Items axis: Item1 to Item2 # Major_axis axis: 2000-01-01 00:00:00 to 2000-01-05 00:00:00 # Minor_axis axis: A to D
DataFrame 建立
import numpy as np import pandas as pd # Series d = {'item1' : pd.DataFrame({'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']), 'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}), 'item2' : pd.DataFrame({'thr' : pd.Series([1., 2., 3.], index=['e', 'f', 'g'])})} wp = pd.Panel(d) print(wp) # <class 'pandas.core.panel.Panel'> # Dimensions: 2 (items) x 7 (major_axis) x 3 (minor_axis) # Items axis: item1 to item2 # Major_axis axis: a to g # Minor_axis axis: one to two # filter_observations 預設為 True,表示不顯示含有 NaN 的欄位 # 所以轉換後為 Empty DataFrame print(wp.to_frame()) # Empty DataFrame # Columns: [item1, item2] # Index: [] print(wp.to_frame(filter_observations=False)) # item1 item2 # major minor # a one 1.0 NaN # thr NaN NaN # two 1.0 NaN # b one 2.0 NaN # thr NaN NaN # two 2.0 NaN # c one 3.0 NaN # thr NaN NaN # two 3.0 NaN # d one NaN NaN # thr NaN NaN # two 4.0 NaN # e one NaN NaN # thr NaN 1.0 # two NaN NaN # f one NaN NaN # thr NaN 2.0 # two NaN NaN # g one NaN NaN # thr NaN 3.0 # two NaN NaN
基本操作
得值
原始資料
Getting
Selection by Label
Selection by Position
Selection by Mix
Boolean Indexing
isin
values
ravel()
import numpy as np import pandas as pd dates = pd.date_range('20130101', periods=6) df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD')) print(df) # A B C D # 2013-01-01 0.254520 -0.263404 -2.210508 0.807513 # 2013-01-02 0.024263 -0.014928 -0.601224 0.561930 # 2013-01-03 -0.628894 -0.419761 1.051119 -0.593262 # 2013-01-04 -1.116615 -0.027055 0.537114 -0.955006 # 2013-01-05 0.637412 -0.188929 -0.354975 1.693871 # 2013-01-06 1.014020 0.812423 0.420242 -0.633087
Getting
# get by [],只能一個 index print(df['A']) # 等同 df.A # 2013-01-01 0.254520 # 2013-01-02 0.024263 # 2013-01-03 -0.628894 # 2013-01-04 -1.116615 # 2013-01-05 0.637412 # 2013-01-06 1.014020 # Freq: D, Name: A, dtype: float64 # Freq 表示間隔頻率為一天 # get by [[]],可以多個 columns print(df[['A', 'B']]) # A B # 2013-01-01 0.254520 -0.263404 # 2013-01-02 0.024263 -0.014928 # 2013-01-03 -0.628894 -0.419761 # 2013-01-04 -1.116615 -0.027055 # 2013-01-05 0.637412 -0.188929 # 2013-01-06 1.014020 0.812423 print(df[0:10]) # A B C D # 2013-01-01 0.254520 -0.263404 -2.210508 0.807513 # 2013-01-02 0.024263 -0.014928 -0.601224 0.561930 # 2013-01-03 -0.628894 -0.419761 1.051119 -0.593262 # 2013-01-04 -1.116615 -0.027055 0.537114 -0.955006 # 2013-01-05 0.637412 -0.188929 -0.354975 1.693871 # 2013-01-06 1.014020 0.812423 0.420242 -0.633087 print(df['20130102':'20130104']) # A B C D # 2013-01-02 0.024263 -0.014928 -0.601224 0.561930 # 2013-01-03 -0.628894 -0.419761 1.051119 -0.593262 # 2013-01-04 -1.116615 -0.027055 0.537114 -0.955006
Selection by Label
print(df.loc['20130102':'20130104', 'A']) # 2013-01-02 0.024263 # 2013-01-03 -0.628894 # 2013-01-04 -1.116615 # Freq: D, Name: A, dtype: float64
Selection by Position
# get by index # index 需正確,可用 df.reset_index(drop=True) print(df.iloc[0:10, 0]) # 2013-01-01 0.254520 # 2013-01-02 0.024263 # 2013-01-03 -0.628894 # 2013-01-04 -1.116615 # 2013-01-05 0.637412 # 2013-01-06 1.014020 # Freq: D, Name: A, dtype: float64
Selection by Mix
# get by index, label print(df.ix[0:10, 'A']) # 2013-01-01 0.254520 # 2013-01-02 0.024263 # 2013-01-03 -0.628894 # 2013-01-04 -1.116615 # 2013-01-05 0.637412 # 2013-01-06 1.014020 # Freq: D, Name: A, dtype: float64
Boolean Indexing
print(df[df.A > 0]) # A B C D # 2013-01-01 0.254520 -0.263404 -2.210508 0.807513 # 2013-01-02 0.024263 -0.014928 -0.601224 0.561930 # 2013-01-05 0.637412 -0.188929 -0.354975 1.693871 # 2013-01-06 1.014020 0.812423 0.420242 -0.633087
isin
df2 = df.copy() df2['E'] = ['one', 'one','two','three','four','three'] print(df2) # A B C D E # 2013-01-01 0.254520 -0.263404 -2.210508 0.807513 one # 2013-01-02 0.024263 -0.014928 -0.601224 0.561930 one # 2013-01-03 -0.628894 -0.419761 1.051119 -0.593262 two # 2013-01-04 -1.116615 -0.027055 0.537114 -0.955006 three # 2013-01-05 0.637412 -0.188929 -0.354975 1.693871 four # 2013-01-06 1.014020 0.812423 0.420242 -0.633087 three print(df2[df2['E'].isin(['two','four'])]) # A B C D E # 2013-01-03 -0.628894 -0.419761 1.051119 -0.593262 two # 2013-01-05 0.637412 -0.188929 -0.354975 1.693871 four
values
# 轉換為 numpy 格式 print(df.values) # array([[ 0.254520, -0.263404, -2.210508, 0.807513], # [ 0.024263, -0.014928, -0.601224, 0.561930], # [-0.628894, -0.419761, 1.051119, -0.593262], # [-1.116615, -0.027055, 0.537114, -0.955006], # [ 0.637412, -0.188929, -0.354975, 1.693871], # [ 1.014020, 0.812423, 0.420242, -0.633087]])
ravel()
# 轉換為 numpy 一維格式 print(df['A'].ravel()) # array([0.254520, 0.024263, -0.628894, -1.116615, 0.637412, 1.014020])
賦值
利用 series, label , index, numpy
import numpy as np import pandas as pd df = pd.DataFrame() date = pd.date_range('20130102', periods=6) s1 = pd.Series([1,2,3,4,5,6], index=date) # by series df['F'] = s1 # by label df.at[date[2],'A'] = 1000 # by index df.iat[0,1] = 0 # by np df.loc[:,'D'] = np.array([5] * len(df)) print(df) # F A D # 2013-01-02 1 0.0 5 # 2013-01-03 2 NaN 5 # 2013-01-04 3 1000.0 5 # 2013-01-05 4 NaN 5 # 2013-01-06 5 NaN 5 # 2013-01-07 6 NaN 5 # by 任意得值的方法 df.loc['20130102':'20130103', 'A'] = 1 # index 需正確,可用 df.reset_index(drop=True) df.iloc[3:5, 0] = 10 df.ix[5, 'A'] = 0 print(df) # F A D # 2013-01-02 1 1.0 5 # 2013-01-03 2 1.0 5 # 2013-01-04 3 1000.0 5 # 2013-01-05 4 10.0 5 # 2013-01-06 5 10.0 5 # 2013-01-07 6 0.0 5 # 只有符合條件才會賦值 df[df > 10] = -df print(df) # F A D # 2013-01-02 1 1.0 5 # 2013-01-03 2 1.0 5 # 2013-01-04 3 -1000.0 5 # 2013-01-05 4 10.0 5 # 2013-01-06 5 10.0 5 # 2013-01-07 6 0.0 5
合併表格
Merge, join, and concatenate
import pandas as pd df1 = pd.DataFrame({'X1':['a', 'b', 'c'], 'X2':[1, 2, 3],}) # X1 X2 # 0 a 1 # 1 b 2 # 2 c 3 df2 = pd.DataFrame({'X1':['a', 'b', 'd'], 'X3':[4, None, 6],}) # X1 X3 # 0 a 4.0 # 1 b NaN # 2 d 6.0 df1.merge(df2, how='left', on='X1') # X1 X2 X3 # 0 a 1 4.0 # 1 b 2 NaN # 2 c 3 NaN df1.merge(df2, how='right', on='X1') # X1 X2 X3 # 0 a 1.0 4.0 # 1 b 2.0 NaN # 2 d NaN 6.0 df1.merge(df2, how='outer', on='X1') # X1 X2 X3 # 0 a 1.0 4.0 # 1 b 2.0 NaN # 2 c 3.0 NaN # 3 d NaN 6.0 # join 必需有相同的 index 才行 df1.set_index('X1').join(df2.set_index('X1'), how='right') # X2 X3 # X1 # a 1.0 4.0 # b 2.0 NaN # d NaN 6.0 # 方向為 index df1.append(df2) # X1 X2 X3 # 0 a 1.0 NaN # 1 b 2.0 NaN # 2 c 3.0 NaN # 0 a NaN 4.0 # 1 b NaN NaN # 2 d NaN 6.0 # 方向為 index pd.concat([df1, df2]) # X1 X2 X3 # 0 a 1.0 NaN # 1 b 2.0 NaN # 2 c 3.0 NaN # 0 a NaN 4.0 # 1 b NaN NaN # 2 d NaN 6.0 # 方向為 columns pd.concat([df1, df2], axis='columns') # X1 X2 X1 X3 # 0 a 1 a 4.0 # 1 b 2 b NaN # 2 c 3 d 6.0
調整表格
import pandas as pd df = pd.DataFrame({'foo': ['one','one','one','two','two','two'], 'bar': ['A', 'B', 'C', 'A', 'B', 'C'], 'baz': [1, 2, 3, 4, 5, 6]}) # foo bar baz # 0 one A 1 # 1 one B 2 # 2 one C 3 # 3 two A 4 # 4 two B 5 # 5 two C 6 # 重新調整表格,指定排列方式 df.pivot(index='foo', columns='bar', values='baz') # A B C # one 1 2 3 # two 4 5 6 # pivot 的進階版,指定排列方式,可設定值的處理,預設為平均 df.pivot_table(index='foo', columns='bar', values='baz') # A B C # one 1 2 3 # two 4 5 6
import pandas as pd import numpy as np df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"], "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"], "C": ["small", "large", "large", "small", "small", "large", "small", "small", "large"], "D": [1, 2, 2, 3, 3, 4, 5, 6, 7]}) # A B C D # 0 foo one small 1 # 1 foo one large 2 # 2 foo one large 2 # 3 foo two small 3 # 4 foo two small 3 # 5 bar one large 4 # 6 bar one small 5 # 7 bar two small 6 # 8 bar two large 7 df.pivot_table(index=['A', 'B'], columns=['C'], values='D', aggfunc=np.sum) # C large small # A B # bar one 4.0 5.0 # two 7.0 6.0 # foo one 4.0 1.0 # two NaN 6.0
import pandas as pd df = pd.DataFrame({'a': {'one':1.0, 'two':2.0}, 'b': {'one':3.0, 'two':4.0}}) # a b # one 1.0 3.0 # two 2.0 4.0 # 把 columns 移到 index,可能會得到 Series df.stack() # one a 1.0 # b 3.0 # two a 2.0 # b 4.0 # 把 index 移到 columns,可能會得到 Series df.stack().unstack() # a b # one 1.0 3.0 # two 2.0 4.0 # 重新排列 index 並把 index 移至 columns,仍是 DataFrame df.reset_index() # index a b # 0 one 1.0 3.0 # 1 two 2.0 4.0
import pandas as pd df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'}, 'B': {0: 1, 1: 3, 2: 5}, 'C': {0: 2, 1: 4, 2: 6}}) # A B C # 0 a 1 2 # 1 b 3 4 # 2 c 5 6 # 新增對應 B C 的行與其對應的值 df.melt(id_vars=['A'], value_vars=['B', 'C']) # A variable value # 0 a B 1 # 1 b B 3 # 2 c B 5 # 3 a C 2 # 4 b C 4 # 5 c C 6
統計運算
原始資料
數學運算
統計計算
import numpy as np import pandas as pd dates = pd.date_range('20130101', periods=6) df = pd.DataFrame(np.ones((6, 4))*2, index=dates, columns=list('ABCD'), dtype='float64') print(df) # A B C D # 2013-01-01 2.0 2.0 2.0 2.0 # 2013-01-02 2.0 2.0 2.0 2.0 # 2013-01-03 2.0 2.0 2.0 2.0 # 2013-01-04 2.0 2.0 2.0 2.0 # 2013-01-05 2.0 2.0 2.0 2.0 # 2013-01-06 2.0 2.0 2.0 2.0
數學運算
# 數學運算 print(df+10) # A B C D # 2013-01-01 12.0 12.0 12.0 12.0 # 2013-01-02 12.0 12.0 12.0 12.0 # 2013-01-03 12.0 12.0 12.0 12.0 # 2013-01-04 12.0 12.0 12.0 12.0 # 2013-01-05 12.0 12.0 12.0 12.0 # 2013-01-06 12.0 12.0 12.0 12.0 print(df-10) # A B C D # 2013-01-01 -8.0 -8.0 -8.0 -8.0 # 2013-01-02 -8.0 -8.0 -8.0 -8.0 # 2013-01-03 -8.0 -8.0 -8.0 -8.0 # 2013-01-04 -8.0 -8.0 -8.0 -8.0 # 2013-01-05 -8.0 -8.0 -8.0 -8.0 # 2013-01-06 -8.0 -8.0 -8.0 -8.0 print(df/10) # A B C D # 2013-01-01 0.2 0.2 0.2 0.2 # 2013-01-02 0.2 0.2 0.2 0.2 # 2013-01-03 0.2 0.2 0.2 0.2 # 2013-01-04 0.2 0.2 0.2 0.2 # 2013-01-05 0.2 0.2 0.2 0.2 # 2013-01-06 0.2 0.2 0.2 0.2 print(df*10) # A B C D # 2013-01-01 20.0 20.0 20.0 20.0 # 2013-01-02 20.0 20.0 20.0 20.0 # 2013-01-03 20.0 20.0 20.0 20.0 # 2013-01-04 20.0 20.0 20.0 20.0 # 2013-01-05 20.0 20.0 20.0 20.0 # 2013-01-06 20.0 20.0 20.0 20.0 print(df**2) # A B C D # 2013-01-01 4.0 4.0 4.0 4.0 # 2013-01-02 4.0 4.0 4.0 4.0 # 2013-01-03 4.0 4.0 4.0 4.0 # 2013-01-04 4.0 4.0 4.0 4.0 # 2013-01-05 4.0 4.0 4.0 4.0 # 2013-01-06 4.0 4.0 4.0 4.0
統計計算
# 統計 print(df.describe()) # A B C D # count 6.0 6.0 6.0 6.0 # mean 2.0 2.0 2.0 2.0 # std 0.0 0.0 0.0 0.0 # min 2.0 2.0 2.0 2.0 # 25% 2.0 2.0 2.0 2.0 # 50% 2.0 2.0 2.0 2.0 # 75% 2.0 2.0 2.0 2.0 # max 2.0 2.0 2.0 2.0 # mean 值 print(df.mean()) # A 2.0 # B 2.0 # C 2.0 # D 2.0 # dtype: float64 # 累積計算 print(df.cumsum()) # A B C D # 2013-01-01 2.0 2.0 2.0 2.0 # 2013-01-02 4.0 4.0 4.0 4.0 # 2013-01-03 6.0 6.0 6.0 6.0 # 2013-01-04 8.0 8.0 8.0 8.0 # 2013-01-05 10.0 10.0 10.0 10.0 # 2013-01-06 12.0 12.0 12.0 12.0
缺值
缺值處理,丟棄或給予其他值
import numpy as np import pandas as pd df = pd.DataFrame() date = pd.date_range('20130102', periods=6) s1 = pd.Series([1,2,3,4,5,6], index=date) # by series df['F'] = s1 # by label df.at[date[2],'A'] = 1000 # by index df.iat[0,1] = 0 # by np df.loc[:,'D'] = np.array([5] * len(df)) print(df) # F A D # 2013-01-02 1 0.0 5 # 2013-01-03 2 NaN 5 # 2013-01-04 3 1000.0 5 # 2013-01-05 4 NaN 5 # 2013-01-06 5 NaN 5 # 2013-01-07 6 NaN 5 # DataFrame.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False) # 預設 axis=0 => "index" # how='any',只要有 NaN 就丟掉 # how='all',全部為 NaN 才丟掉 print(df.dropna(how='any')) # F A D # 2013-01-02 1 0.0 5 # 2013-01-04 3 1000.0 5 # 將 NaN 填入值 print(df.fillna(value='Test')) # F A D # 2013-01-02 1 0 5 # 2013-01-03 2 Test 5 # 2013-01-04 3 1000 5 # 2013-01-05 4 Test 5 # 2013-01-06 5 Test 5 # 2013-01-07 6 Test 5
Grouping
分類處理
import numpy as np import pandas as pd df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], 'C' : np.linspace(1, 8, num=8), 'D' : np.linspace(-1, -8, num=8)}) print(df) # A B C D # 0 foo one 1.0 -1.0 # 1 bar one 2.0 -2.0 # 2 foo two 3.0 -3.0 # 3 bar three 4.0 -4.0 # 4 foo two 5.0 -5.0 # 5 bar two 6.0 -6.0 # 6 foo one 7.0 -7.0 print(df.groupby('A').sum()) # C D # A # bar 12.0 -12.0 # foo 24.0 -24.0 print(df.groupby(['A','B']).sum()) # C D # A B # bar one 2.0 -2.0 # three 4.0 -4.0 # two 6.0 -6.0 # foo one 8.0 -8.0 # three 8.0 -8.0 # two 8.0 -8.0
讀寫檔
- 通常回傳的是 DataFrame
- 列出比較常用,詳細可參考 IO Tools
- sep 可用正規表示法
但若超過一個字且不為 '\s+' 會強迫使用 python parsing engine - pandas.read_pickle(path)
- pandas.read_clipboard(**kwargs)
- pandas.read_table(filepath_or_buffer[, sep, ...])
- pandas.read_csv(filepath_or_buffer[, sep, ...])
- pandas.read_excel(io[, sheetname, header, ...])
- pandas.read_json([path_or_buf, orient, typ, dtype, ...])
- pandas.read_html(io[, match, flavor, header, ...])
- pandas.read_sql(sql, con[, index_col, ...])
import pandas as pd # 預設 sep=',' pd.read_csv('test.csv') # a b c # 0 1 2 3 # 1 4 5 6 # 預設 sep='\t' # 自定 column names pd.read_table('test.txt', names=['aa', 'bb', 'cc']) # aa bb cc # 0 a b c # 1 1 2 3 # 2 4 5 6 # 需要 xlrd package # 資料無標題 pd.read_excel('test.xlsx', header=None) # 0 1 2 3 4 # 0 1 2 3 4 5 # 1 6 7 8 9 10 # 中文路徑會有問題,可用 open 取代 # sep 可用正規表示法,超過一個字且不為 '\s+' 不指定 engine 會有 warning with open('新文字文件.txt', 'r') as f: pd.read_table(f, sep='[a-z]', engine='python') # a b c # 0 1 2 3 # 1 4 5 6
Pandas Cheat Sheet: Data Wrangling in Python
留言
張貼留言