- 取得連結
- X
- 以電子郵件傳送
- 其他應用程式
程式語言:Python
Package:pandas
官方網站
官方文件
功能:數據處理分析
class pandas.DataFrame(data=None, index=None, columns=None, dtype=None, copy=False)
class pandas.Panel(data=None, items=None, major_axis=None, minor_axis=None, copy=False, dtype=None)
參考
Pandas Cheat Sheet: Data Wrangling in Python
Package:pandas
官方網站
官方文件
功能:數據處理分析
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame({ 'A' : 1.,
'B' : pd.Timestamp('20130102'),
'C' : pd.Series(np.linspace(1, 4, num=4),index=list(range(4)),dtype='float32'),
'D' : np.array([3] * 4,dtype='int32'),
'E' : pd.Categorical(["test","train","test","train"]),
'F' : 'foo' })
print(df)
# A B C D E F
# 0 1.0 2013-01-02 1.0 3 test foo
# 1 1.0 2013-01-02 2.0 3 train foo
# 2 1.0 2013-01-02 3.0 3 test foo
# 3 1.0 2013-01-02 4.0 3 train foo
print(df.dtypes)
# A float64
# B datetime64[ns]
# C float32
# D int32
# E category
# F object
# dtype: object
df[['A', 'C']].plot()
plt.legend(loc='best')
plt.show()
資料架構
class pandas.Series(data=None, index=None, dtype=None, name=None, copy=False, fastpath=False)- pandas.Series
- 一維資料
- data
- 輸入的資料
- array-like
- dict
- scalar value (like 10)
- index
- 加入對應的 row name
- dtype
- 資料類型
- name
- 資料名字
- 轉換為 DataFrame 時,會成為 column name
- copy
- 是否重新 copy
- 若無 copy,請小心資料的更改
- fastpath
- 內部參數
範例
numpy 建立
dict 或 純值 建立
import numpy as np import pandas as pd data = np.linspace(1, 3, num=3) print(data) # [ 1. 2. 3.] s1 = pd.Series(data, index=['a', 'b', 'c']) print(s1) # a 1.0 # b 2.0 # c 3.0 # dtype: float64 s2 = pd.Series(data, index=['a', 'b', 'c'], copy=True, name='s2') print(s2) # a 1.0 # b 2.0 # c 3.0 # Name: s2, dtype: float64 # 可看到 column name 為 's2' print(pd.DataFrame(s2)) # s2 # a 1.0 # b 2.0 # c 3.0 print(s1[0]) # 1 print(s1['a']) # 1 data[0] = 10 # 可看到只有 s2 無改變,因是重新 copy print(data) # [ 10. 2. 3.] print(s1) # a 10.0 # b 2.0 # c 3.0 # dtype: float64 print(s2) # a 1.0 # b 2.0 # c 3.0 # dtype: float64
dict 或 純值 建立
import pandas as pd
s3 = pd.Series({'a':2., 'b':5})
print(s3)
# a 2.0
# b 5.0
# dtype: float64
s4 = pd.Series(10, index=['a', 'b', 'c', 'd', 'e'])
print(s4)
# a 10
# b 10
# c 10
# d 10
# e 10
# dtype: int64
class pandas.DataFrame(data=None, index=None, columns=None, dtype=None, copy=False)
- pandas.DataFrame
- 二維資料
- data
- 輸入的資料
- array-like
- Dict of 1D ndarrays, lists, dicts, or Series
- 單個 Series
- 另一個 DataFrame
- index
- 輸入資料擁有 row name,例:dict of Series, dicts
為指定讀取的 row name - 輸入資料無 row name,例:list、structured arrays
為加入對應的 row name - columns
- 輸入資料擁有 column name,例:dict of Series, dicts、structured arrays
為指定讀取的 column name - 輸入資料無 column name,例:list
為加入對應的 column name - dtype
- 資料類型
- copy
- 是否重新 copy
- 若無 copy,請小心資料的更改
範例
Series
Dict
Structured arrays
Multi-indexed frame
import numpy as np
import pandas as pd
# Series
d = {'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}
df = pd.DataFrame(d)
print(df)
# one two
# a 1.0 1.0
# b 2.0 2.0
# c 3.0 3.0
# d NaN 4.0
# 指定 index
print(pd.DataFrame(d, index=['d', 'b', 'a']))
# one two
# d NaN 4.0
# b 2.0 2.0
# a 1.0 1.0
# 指定 column name
print(pd.DataFrame(d, index=['d', 'b', 'a'], columns=['two', 'one', 'three']))
# two three
# d 4.0 NaN
# b 2.0 NaN
# a 1.0 NaN
Dict
import numpy as np
import pandas as pd
# dict
d = {'one' : [1., 2., 3., 4.],
'two' : [4., 3., 2., 1.]}
print(pd.DataFrame(d))
# one two
# 0 1.0 4.0
# 1 2.0 3.0
# 2 3.0 2.0
# 3 4.0 1.0
# 加入 index
print(pd.DataFrame(d, index=['a', 'b', 'c', 'd']))
# one two
# a 1.0 4.0
# b 2.0 3.0
# c 3.0 2.0
# d 4.0 1.0
Structured arrays
import numpy as np
import pandas as pd
# structured arrays
# 建立四個空白資料,並擁有三種資料 'A' 'B' 'C',其對應資料類型為 integer 4 bytes, float 4 bytes, string 10 bytes
data = np.zeros((4,), dtype=[('A', 'i4'),('B', 'f4'),('C', 'S10')])
data[:] = [(1,2.,'Hello'), (2,3.,"World"), (6,8.,"!!"), (9,11.,"??")]
print(data)
# [(1, 2., b'Hello') (2, 3., b'World') (6, 8., b'!!') (9, 11., b'??')]
# 印出 A 的資料
print(data['A'])
# [1 2 6 9]
print(pd.DataFrame(data))
# A B C
# 0 1 2.0 b'Hello'
# 1 2 3.0 b'World'
# 2 6 8.0 b'!!'
# 3 9 11.0 b'??'
print(pd.DataFrame(data, index=['first', 'second', 'three', 'four']))
# A B C
# first 1 2.0 b'Hello'
# second 2 3.0 b'World'
# three 6 8.0 b'!!'
# four 9 11.0 b'??'
print(pd.DataFrame(data, columns=['C', 'A', 'B']))
# C A B
# 0 b'Hello' 1 2.0
# 1 b'World' 2 3.0
# 2 b'!!' 6 8.0
# 3 b'??' 9 11.0
Multi-indexed frame
import numpy as np
import pandas as pd
# multi-indexed frame
df = pd.DataFrame({('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2},
('a', 'a'): {('A', 'C'): 3, ('A', 'B'): 4},
('a', 'c'): {('A', 'B'): 5, ('A', 'C'): 6},
('b', 'a'): {('A', 'C'): 7, ('A', 'B'): 8},
('b', 'b'): {('A', 'D'): 9, ('A', 'B'): 10}})
print(df)
# a b
# a b c a b
# A B 4.0 1.0 5.0 8.0 10.0
# C 3.0 2.0 6.0 7.0 NaN
# D NaN NaN NaN NaN 9.0
print(df['b']['a']['A']['C'])
# 7.0
class pandas.Panel(data=None, items=None, major_axis=None, minor_axis=None, copy=False, dtype=None)
- pandas.Panel
- 三維資料,但不常使用
- data
- 輸入的資料
- items
- 資料大項的名字
- major_axis
- 加入 row name
- minor_axis
- 加入 column name
- copy
- 是否重新 copy
- 若無 copy,請小心資料的更改
- dtype
- 資料類型
範例
numpy 建立
DataFrame 建立
import numpy as np
import pandas as pd
# 3D ndarray
wp = pd.Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'],
major_axis=pd.date_range('1/1/2000', periods=5),
minor_axis=['A', 'B', 'C', 'D'])
print(wp)
# <class 'pandas.core.panel.Panel'>
# Dimensions: 2 (items) x 5 (major_axis) x 4 (minor_axis)
# Items axis: Item1 to Item2
# Major_axis axis: 2000-01-01 00:00:00 to 2000-01-05 00:00:00
# Minor_axis axis: A to D
DataFrame 建立
import numpy as np
import pandas as pd
# Series
d = {'item1' : pd.DataFrame({'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}),
'item2' : pd.DataFrame({'thr' : pd.Series([1., 2., 3.], index=['e', 'f', 'g'])})}
wp = pd.Panel(d)
print(wp)
# <class 'pandas.core.panel.Panel'>
# Dimensions: 2 (items) x 7 (major_axis) x 3 (minor_axis)
# Items axis: item1 to item2
# Major_axis axis: a to g
# Minor_axis axis: one to two
# filter_observations 預設為 True,表示不顯示含有 NaN 的欄位
# 所以轉換後為 Empty DataFrame
print(wp.to_frame())
# Empty DataFrame
# Columns: [item1, item2]
# Index: []
print(wp.to_frame(filter_observations=False))
# item1 item2
# major minor
# a one 1.0 NaN
# thr NaN NaN
# two 1.0 NaN
# b one 2.0 NaN
# thr NaN NaN
# two 2.0 NaN
# c one 3.0 NaN
# thr NaN NaN
# two 3.0 NaN
# d one NaN NaN
# thr NaN NaN
# two 4.0 NaN
# e one NaN NaN
# thr NaN 1.0
# two NaN NaN
# f one NaN NaN
# thr NaN 2.0
# two NaN NaN
# g one NaN NaN
# thr NaN 3.0
# two NaN NaN
基本操作
得值
原始資料
Getting
Selection by Label
Selection by Position
Selection by Mix
Boolean Indexing
isin
values
ravel()
import numpy as np
import pandas as pd
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
print(df)
# A B C D
# 2013-01-01 0.254520 -0.263404 -2.210508 0.807513
# 2013-01-02 0.024263 -0.014928 -0.601224 0.561930
# 2013-01-03 -0.628894 -0.419761 1.051119 -0.593262
# 2013-01-04 -1.116615 -0.027055 0.537114 -0.955006
# 2013-01-05 0.637412 -0.188929 -0.354975 1.693871
# 2013-01-06 1.014020 0.812423 0.420242 -0.633087
Getting
# get by [],只能一個 index print(df['A']) # 等同 df.A # 2013-01-01 0.254520 # 2013-01-02 0.024263 # 2013-01-03 -0.628894 # 2013-01-04 -1.116615 # 2013-01-05 0.637412 # 2013-01-06 1.014020 # Freq: D, Name: A, dtype: float64 # Freq 表示間隔頻率為一天 # get by [[]],可以多個 columns print(df[['A', 'B']]) # A B # 2013-01-01 0.254520 -0.263404 # 2013-01-02 0.024263 -0.014928 # 2013-01-03 -0.628894 -0.419761 # 2013-01-04 -1.116615 -0.027055 # 2013-01-05 0.637412 -0.188929 # 2013-01-06 1.014020 0.812423 print(df[0:10]) # A B C D # 2013-01-01 0.254520 -0.263404 -2.210508 0.807513 # 2013-01-02 0.024263 -0.014928 -0.601224 0.561930 # 2013-01-03 -0.628894 -0.419761 1.051119 -0.593262 # 2013-01-04 -1.116615 -0.027055 0.537114 -0.955006 # 2013-01-05 0.637412 -0.188929 -0.354975 1.693871 # 2013-01-06 1.014020 0.812423 0.420242 -0.633087 print(df['20130102':'20130104']) # A B C D # 2013-01-02 0.024263 -0.014928 -0.601224 0.561930 # 2013-01-03 -0.628894 -0.419761 1.051119 -0.593262 # 2013-01-04 -1.116615 -0.027055 0.537114 -0.955006
Selection by Label
print(df.loc['20130102':'20130104', 'A']) # 2013-01-02 0.024263 # 2013-01-03 -0.628894 # 2013-01-04 -1.116615 # Freq: D, Name: A, dtype: float64
Selection by Position
# get by index # index 需正確,可用 df.reset_index(drop=True) print(df.iloc[0:10, 0]) # 2013-01-01 0.254520 # 2013-01-02 0.024263 # 2013-01-03 -0.628894 # 2013-01-04 -1.116615 # 2013-01-05 0.637412 # 2013-01-06 1.014020 # Freq: D, Name: A, dtype: float64
Selection by Mix
# get by index, label print(df.ix[0:10, 'A']) # 2013-01-01 0.254520 # 2013-01-02 0.024263 # 2013-01-03 -0.628894 # 2013-01-04 -1.116615 # 2013-01-05 0.637412 # 2013-01-06 1.014020 # Freq: D, Name: A, dtype: float64
Boolean Indexing
print(df[df.A > 0]) # A B C D # 2013-01-01 0.254520 -0.263404 -2.210508 0.807513 # 2013-01-02 0.024263 -0.014928 -0.601224 0.561930 # 2013-01-05 0.637412 -0.188929 -0.354975 1.693871 # 2013-01-06 1.014020 0.812423 0.420242 -0.633087
isin
df2 = df.copy() df2['E'] = ['one', 'one','two','three','four','three'] print(df2) # A B C D E # 2013-01-01 0.254520 -0.263404 -2.210508 0.807513 one # 2013-01-02 0.024263 -0.014928 -0.601224 0.561930 one # 2013-01-03 -0.628894 -0.419761 1.051119 -0.593262 two # 2013-01-04 -1.116615 -0.027055 0.537114 -0.955006 three # 2013-01-05 0.637412 -0.188929 -0.354975 1.693871 four # 2013-01-06 1.014020 0.812423 0.420242 -0.633087 three print(df2[df2['E'].isin(['two','four'])]) # A B C D E # 2013-01-03 -0.628894 -0.419761 1.051119 -0.593262 two # 2013-01-05 0.637412 -0.188929 -0.354975 1.693871 four
values
# 轉換為 numpy 格式 print(df.values) # array([[ 0.254520, -0.263404, -2.210508, 0.807513], # [ 0.024263, -0.014928, -0.601224, 0.561930], # [-0.628894, -0.419761, 1.051119, -0.593262], # [-1.116615, -0.027055, 0.537114, -0.955006], # [ 0.637412, -0.188929, -0.354975, 1.693871], # [ 1.014020, 0.812423, 0.420242, -0.633087]])
ravel()
# 轉換為 numpy 一維格式 print(df['A'].ravel()) # array([0.254520, 0.024263, -0.628894, -1.116615, 0.637412, 1.014020])
賦值
利用 series, label , index, numpy
import numpy as np
import pandas as pd
df = pd.DataFrame()
date = pd.date_range('20130102', periods=6)
s1 = pd.Series([1,2,3,4,5,6], index=date)
# by series
df['F'] = s1
# by label
df.at[date[2],'A'] = 1000
# by index
df.iat[0,1] = 0
# by np
df.loc[:,'D'] = np.array([5] * len(df))
print(df)
# F A D
# 2013-01-02 1 0.0 5
# 2013-01-03 2 NaN 5
# 2013-01-04 3 1000.0 5
# 2013-01-05 4 NaN 5
# 2013-01-06 5 NaN 5
# 2013-01-07 6 NaN 5
# by 任意得值的方法
df.loc['20130102':'20130103', 'A'] = 1
# index 需正確,可用 df.reset_index(drop=True)
df.iloc[3:5, 0] = 10
df.ix[5, 'A'] = 0
print(df)
# F A D
# 2013-01-02 1 1.0 5
# 2013-01-03 2 1.0 5
# 2013-01-04 3 1000.0 5
# 2013-01-05 4 10.0 5
# 2013-01-06 5 10.0 5
# 2013-01-07 6 0.0 5
# 只有符合條件才會賦值
df[df > 10] = -df
print(df)
# F A D
# 2013-01-02 1 1.0 5
# 2013-01-03 2 1.0 5
# 2013-01-04 3 -1000.0 5
# 2013-01-05 4 10.0 5
# 2013-01-06 5 10.0 5
# 2013-01-07 6 0.0 5
合併表格
Merge, join, and concatenate
import pandas as pd
df1 = pd.DataFrame({'X1':['a', 'b', 'c'],
'X2':[1, 2, 3],})
# X1 X2
# 0 a 1
# 1 b 2
# 2 c 3
df2 = pd.DataFrame({'X1':['a', 'b', 'd'],
'X3':[4, None, 6],})
# X1 X3
# 0 a 4.0
# 1 b NaN
# 2 d 6.0
df1.merge(df2, how='left', on='X1')
# X1 X2 X3
# 0 a 1 4.0
# 1 b 2 NaN
# 2 c 3 NaN
df1.merge(df2, how='right', on='X1')
# X1 X2 X3
# 0 a 1.0 4.0
# 1 b 2.0 NaN
# 2 d NaN 6.0
df1.merge(df2, how='outer', on='X1')
# X1 X2 X3
# 0 a 1.0 4.0
# 1 b 2.0 NaN
# 2 c 3.0 NaN
# 3 d NaN 6.0
# join 必需有相同的 index 才行
df1.set_index('X1').join(df2.set_index('X1'), how='right')
# X2 X3
# X1
# a 1.0 4.0
# b 2.0 NaN
# d NaN 6.0
# 方向為 index
df1.append(df2)
# X1 X2 X3
# 0 a 1.0 NaN
# 1 b 2.0 NaN
# 2 c 3.0 NaN
# 0 a NaN 4.0
# 1 b NaN NaN
# 2 d NaN 6.0
# 方向為 index
pd.concat([df1, df2])
# X1 X2 X3
# 0 a 1.0 NaN
# 1 b 2.0 NaN
# 2 c 3.0 NaN
# 0 a NaN 4.0
# 1 b NaN NaN
# 2 d NaN 6.0
# 方向為 columns
pd.concat([df1, df2], axis='columns')
# X1 X2 X1 X3
# 0 a 1 a 4.0
# 1 b 2 b NaN
# 2 c 3 d 6.0
調整表格
import pandas as pd
df = pd.DataFrame({'foo': ['one','one','one','two','two','two'],
'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
'baz': [1, 2, 3, 4, 5, 6]})
# foo bar baz
# 0 one A 1
# 1 one B 2
# 2 one C 3
# 3 two A 4
# 4 two B 5
# 5 two C 6
# 重新調整表格,指定排列方式
df.pivot(index='foo', columns='bar', values='baz')
# A B C
# one 1 2 3
# two 4 5 6
# pivot 的進階版,指定排列方式,可設定值的處理,預設為平均
df.pivot_table(index='foo', columns='bar', values='baz')
# A B C
# one 1 2 3
# two 4 5 6
import pandas as pd
import numpy as np
df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
"bar", "bar", "bar", "bar"],
"B": ["one", "one", "one", "two", "two",
"one", "one", "two", "two"],
"C": ["small", "large", "large", "small",
"small", "large", "small", "small",
"large"],
"D": [1, 2, 2, 3, 3, 4, 5, 6, 7]})
# A B C D
# 0 foo one small 1
# 1 foo one large 2
# 2 foo one large 2
# 3 foo two small 3
# 4 foo two small 3
# 5 bar one large 4
# 6 bar one small 5
# 7 bar two small 6
# 8 bar two large 7
df.pivot_table(index=['A', 'B'], columns=['C'], values='D', aggfunc=np.sum)
# C large small
# A B
# bar one 4.0 5.0
# two 7.0 6.0
# foo one 4.0 1.0
# two NaN 6.0
import pandas as pd
df = pd.DataFrame({'a': {'one':1.0, 'two':2.0},
'b': {'one':3.0, 'two':4.0}})
# a b
# one 1.0 3.0
# two 2.0 4.0
# 把 columns 移到 index,可能會得到 Series
df.stack()
# one a 1.0
# b 3.0
# two a 2.0
# b 4.0
# 把 index 移到 columns,可能會得到 Series
df.stack().unstack()
# a b
# one 1.0 3.0
# two 2.0 4.0
# 重新排列 index 並把 index 移至 columns,仍是 DataFrame
df.reset_index()
# index a b
# 0 one 1.0 3.0
# 1 two 2.0 4.0
import pandas as pd
df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'},
'B': {0: 1, 1: 3, 2: 5},
'C': {0: 2, 1: 4, 2: 6}})
# A B C
# 0 a 1 2
# 1 b 3 4
# 2 c 5 6
# 新增對應 B C 的行與其對應的值
df.melt(id_vars=['A'], value_vars=['B', 'C'])
# A variable value
# 0 a B 1
# 1 b B 3
# 2 c B 5
# 3 a C 2
# 4 b C 4
# 5 c C 6
統計運算
原始資料
數學運算
統計計算
import numpy as np
import pandas as pd
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.ones((6, 4))*2, index=dates, columns=list('ABCD'), dtype='float64')
print(df)
# A B C D
# 2013-01-01 2.0 2.0 2.0 2.0
# 2013-01-02 2.0 2.0 2.0 2.0
# 2013-01-03 2.0 2.0 2.0 2.0
# 2013-01-04 2.0 2.0 2.0 2.0
# 2013-01-05 2.0 2.0 2.0 2.0
# 2013-01-06 2.0 2.0 2.0 2.0
數學運算
# 數學運算 print(df+10) # A B C D # 2013-01-01 12.0 12.0 12.0 12.0 # 2013-01-02 12.0 12.0 12.0 12.0 # 2013-01-03 12.0 12.0 12.0 12.0 # 2013-01-04 12.0 12.0 12.0 12.0 # 2013-01-05 12.0 12.0 12.0 12.0 # 2013-01-06 12.0 12.0 12.0 12.0 print(df-10) # A B C D # 2013-01-01 -8.0 -8.0 -8.0 -8.0 # 2013-01-02 -8.0 -8.0 -8.0 -8.0 # 2013-01-03 -8.0 -8.0 -8.0 -8.0 # 2013-01-04 -8.0 -8.0 -8.0 -8.0 # 2013-01-05 -8.0 -8.0 -8.0 -8.0 # 2013-01-06 -8.0 -8.0 -8.0 -8.0 print(df/10) # A B C D # 2013-01-01 0.2 0.2 0.2 0.2 # 2013-01-02 0.2 0.2 0.2 0.2 # 2013-01-03 0.2 0.2 0.2 0.2 # 2013-01-04 0.2 0.2 0.2 0.2 # 2013-01-05 0.2 0.2 0.2 0.2 # 2013-01-06 0.2 0.2 0.2 0.2 print(df*10) # A B C D # 2013-01-01 20.0 20.0 20.0 20.0 # 2013-01-02 20.0 20.0 20.0 20.0 # 2013-01-03 20.0 20.0 20.0 20.0 # 2013-01-04 20.0 20.0 20.0 20.0 # 2013-01-05 20.0 20.0 20.0 20.0 # 2013-01-06 20.0 20.0 20.0 20.0 print(df**2) # A B C D # 2013-01-01 4.0 4.0 4.0 4.0 # 2013-01-02 4.0 4.0 4.0 4.0 # 2013-01-03 4.0 4.0 4.0 4.0 # 2013-01-04 4.0 4.0 4.0 4.0 # 2013-01-05 4.0 4.0 4.0 4.0 # 2013-01-06 4.0 4.0 4.0 4.0
統計計算
# 統計 print(df.describe()) # A B C D # count 6.0 6.0 6.0 6.0 # mean 2.0 2.0 2.0 2.0 # std 0.0 0.0 0.0 0.0 # min 2.0 2.0 2.0 2.0 # 25% 2.0 2.0 2.0 2.0 # 50% 2.0 2.0 2.0 2.0 # 75% 2.0 2.0 2.0 2.0 # max 2.0 2.0 2.0 2.0 # mean 值 print(df.mean()) # A 2.0 # B 2.0 # C 2.0 # D 2.0 # dtype: float64 # 累積計算 print(df.cumsum()) # A B C D # 2013-01-01 2.0 2.0 2.0 2.0 # 2013-01-02 4.0 4.0 4.0 4.0 # 2013-01-03 6.0 6.0 6.0 6.0 # 2013-01-04 8.0 8.0 8.0 8.0 # 2013-01-05 10.0 10.0 10.0 10.0 # 2013-01-06 12.0 12.0 12.0 12.0
缺值
缺值處理,丟棄或給予其他值
import numpy as np
import pandas as pd
df = pd.DataFrame()
date = pd.date_range('20130102', periods=6)
s1 = pd.Series([1,2,3,4,5,6], index=date)
# by series
df['F'] = s1
# by label
df.at[date[2],'A'] = 1000
# by index
df.iat[0,1] = 0
# by np
df.loc[:,'D'] = np.array([5] * len(df))
print(df)
# F A D
# 2013-01-02 1 0.0 5
# 2013-01-03 2 NaN 5
# 2013-01-04 3 1000.0 5
# 2013-01-05 4 NaN 5
# 2013-01-06 5 NaN 5
# 2013-01-07 6 NaN 5
# DataFrame.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)
# 預設 axis=0 => "index"
# how='any',只要有 NaN 就丟掉
# how='all',全部為 NaN 才丟掉
print(df.dropna(how='any'))
# F A D
# 2013-01-02 1 0.0 5
# 2013-01-04 3 1000.0 5
# 將 NaN 填入值
print(df.fillna(value='Test'))
# F A D
# 2013-01-02 1 0 5
# 2013-01-03 2 Test 5
# 2013-01-04 3 1000 5
# 2013-01-05 4 Test 5
# 2013-01-06 5 Test 5
# 2013-01-07 6 Test 5
Grouping
分類處理
import numpy as np
import pandas as pd
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
'foo', 'bar', 'foo', 'foo'],
'B' : ['one', 'one', 'two', 'three',
'two', 'two', 'one', 'three'],
'C' : np.linspace(1, 8, num=8),
'D' : np.linspace(-1, -8, num=8)})
print(df)
# A B C D
# 0 foo one 1.0 -1.0
# 1 bar one 2.0 -2.0
# 2 foo two 3.0 -3.0
# 3 bar three 4.0 -4.0
# 4 foo two 5.0 -5.0
# 5 bar two 6.0 -6.0
# 6 foo one 7.0 -7.0
print(df.groupby('A').sum())
# C D
# A
# bar 12.0 -12.0
# foo 24.0 -24.0
print(df.groupby(['A','B']).sum())
# C D
# A B
# bar one 2.0 -2.0
# three 4.0 -4.0
# two 6.0 -6.0
# foo one 8.0 -8.0
# three 8.0 -8.0
# two 8.0 -8.0
讀寫檔
- 通常回傳的是 DataFrame
- 列出比較常用,詳細可參考 IO Tools
- sep 可用正規表示法
但若超過一個字且不為 '\s+' 會強迫使用 python parsing engine - pandas.read_pickle(path)
- pandas.read_clipboard(**kwargs)
- pandas.read_table(filepath_or_buffer[, sep, ...])
- pandas.read_csv(filepath_or_buffer[, sep, ...])
- pandas.read_excel(io[, sheetname, header, ...])
- pandas.read_json([path_or_buf, orient, typ, dtype, ...])
- pandas.read_html(io[, match, flavor, header, ...])
- pandas.read_sql(sql, con[, index_col, ...])
import pandas as pd
# 預設 sep=','
pd.read_csv('test.csv')
# a b c
# 0 1 2 3
# 1 4 5 6
# 預設 sep='\t'
# 自定 column names
pd.read_table('test.txt', names=['aa', 'bb', 'cc'])
# aa bb cc
# 0 a b c
# 1 1 2 3
# 2 4 5 6
# 需要 xlrd package
# 資料無標題
pd.read_excel('test.xlsx', header=None)
# 0 1 2 3 4
# 0 1 2 3 4 5
# 1 6 7 8 9 10
# 中文路徑會有問題,可用 open 取代
# sep 可用正規表示法,超過一個字且不為 '\s+' 不指定 engine 會有 warning
with open('新文字文件.txt', 'r') as f:
pd.read_table(f, sep='[a-z]', engine='python')
# a b c
# 0 1 2 3
# 1 4 5 6
Pandas Cheat Sheet: Data Wrangling in Python

留言
張貼留言