[ML] Data Analysis

Language: Python
Packages
  • numpy
  • pandas
  • matplotlib
  • seaborn
  • sklearn
Data: Kaggle Titanic

Overview: some basic operations for data analysis

Inspecting the Data

import pandas as pd

train_data = pd.read_csv("train.csv")
# first ten rows
train_data.head(10)
# last ten rows
train_data.tail(10)
# drop specific columns (returns a new DataFrame; the original is unchanged)
train_data.drop(['PassengerId', 'Survived'], axis='columns')
Data Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
import pandas as pd

# DataFrame info; useful for spotting missing entries
train_data.info()
Data Statistics
count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64
import pandas as pd

# summary statistics: median, mean, and other common values
train_data['Age'].describe()
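describe() covers numeric columns; for a categorical column such as Embarked, value_counts() plays the same role (a small sketch, not in the original post):

# count occurrences of each category
train_data['Embarked'].value_counts()
# include NaN in the counts as well
train_data['Embarked'].value_counts(dropna=False)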
Count and List NA Values
Age         177
Cabin       687
Embarked      2
dtype: int64
# count NA values per column and list the columns that have any
naCount = train_data.isnull().sum()
naCount[naCount > 0]
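A small variant not in the original: the same counts expressed as a percentage of all rows, which is often easier to judge.

# fraction of missing values per column, shown as a percentage
naPercent = train_data.isnull().mean() * 100
naPercent[naPercent > 0]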
Skewness
PassengerId    0.000000
Survived       0.478523
Pclass        -0.630548
Age            0.389108
SibSp          3.695352
Parch          2.749117
Fare           4.787317
dtype: float64
# skewness: https://zh.wikipedia.org/wiki/偏度
# skewness = 0 means perfect symmetry
# negative (left) skew: longer tail on the left, mass concentrated on the right
# positive (right) skew: longer tail on the right, mass concentrated on the left
train_data.skew()

import numpy as np
# for skewness > 0.75, log(1+x) can make the distribution more symmetric
x_origin = train_data["Fare"]
x = np.log1p(x_origin)
# np.expm1 inverts the transform and recovers the original values
x_origin = np.expm1(x)
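As a quick sanity check (a sketch added here, not in the original), compare Fare's skewness before and after the log1p transform:

import numpy as np

# strongly right-skewed before (about 4.79, per the table above), much nearer 0 after
print(train_data["Fare"].skew())
print(np.log1p(train_data["Fare"]).skew())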
Kurtosis
PassengerId    -1.200000
Survived       -1.775005
Pclass         -1.280015
Age             0.178274
SibSp          17.880420
Parch           9.778125
Fare           33.398141
dtype: float64
# kurtosis
# In the classical definition a normal distribution has kurtosis 3:
# above 3 (leptokurtic) means a sharper, more concentrated peak than normal;
# below 3 (platykurtic) means a flatter distribution.
# Note that pandas' kurtosis() reports excess kurtosis (kurtosis - 3),
# so a normal distribution gives 0 in the output above.
train_data.kurtosis()
Correlation
import seaborn as sns
import matplotlib.pyplot as plt

# compute pairwise correlations (numeric columns only;
# newer pandas requires train_data.corr(numeric_only=True))
corrmat = train_data.corr()
# draw a heatmap
sns.heatmap(corrmat, square=True)
# show the figure
plt.show()
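To read the heatmap numerically, the same matrix can be sorted against the target column (a small sketch, not in the original):

# correlation of every numeric feature with Survived, strongest positive first
corr_with_target = corrmat['Survived'].drop('Survived')
print(corr_with_target.sort_values(ascending=False))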

Data Visualization

Discrete Data
import matplotlib.pyplot as plt
import seaborn as sns

# plot histograms (drop NaN first; matplotlib cannot bin missing categories)
data = train_data['Embarked'].dropna()
plt.subplot(1, 2, 1)
plt.hist(data)
plt.ylim([0, 700])
plt.subplot(1, 2, 2)
# select on the full frame with .loc so the mask and the data share an index
plt.hist(train_data.loc[train_data.Survived == 1, 'Embarked'].dropna(), color='b', label="Survived")
plt.hist(train_data.loc[train_data.Survived == 0, 'Embarked'].dropna(), color='g', label="non-Survived", alpha=0.5)
plt.ylim([0, 700])
plt.legend()

# note: seaborn drops NaN values automatically
# count plot per category
sns.factorplot('Embarked', hue='Survived', data=train_data, kind="count")
# show the central value and its confidence interval
sns.factorplot('Embarked', 'Survived', data=train_data, kind="point")
# show the median and interquartile range
sns.factorplot('Embarked', 'Survived', data=train_data, kind="violin")
plt.show()
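factorplot was renamed catplot in seaborn 0.9, with column names passed as keyword arguments; on a current seaborn the same three plots would be written as follows (a sketch assuming seaborn >= 0.9):

import seaborn as sns
import matplotlib.pyplot as plt

sns.catplot(x='Embarked', hue='Survived', data=train_data, kind='count')
sns.catplot(x='Embarked', y='Survived', data=train_data, kind='point')
sns.catplot(x='Embarked', y='Survived', data=train_data, kind='violin')
plt.show()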
Continuous Data
import matplotlib.pyplot as plt
import seaborn as sns

# plot histograms
plt.subplot(1, 2, 1)
# drop NA values
data = train_data['Age'].dropna(how='any')
plt.hist(data, bins=range(0, 100, 10), rwidth=0.8)
plt.ylim([0, 250])
plt.subplot(1, 2, 2)
# select on the full frame with .loc so the mask and the data share an index
plt.hist(train_data.loc[train_data.Survived == 1, 'Age'].dropna(), bins=range(0, 100, 10), color='b', label="Survived", rwidth=0.8)
plt.hist(train_data.loc[train_data.Survived == 0, 'Age'].dropna(), bins=range(0, 100, 10), color='g', label="non-Survived", rwidth=0.8, alpha=0.5)
plt.ylim([0, 250])
plt.legend()

# note: seaborn drops NaN values automatically
# joint distribution of Age and Survived as a hexbin plot
sns.jointplot('Age', 'Survived', data=train_data, kind='hex')
# show the central value and its confidence interval
sns.factorplot('Survived', 'Age', data=train_data, kind="point")
# show the median and interquartile range
sns.factorplot('Survived', 'Age', data=train_data, kind="violin")

plt.show()

Handling Missing Values

import pandas as pd

train_data = pd.read_csv("train.csv")

# fill missing values
train_data = train_data.fillna({'Age': train_data.Age.mean(),
                                'Fare': train_data.Fare.mean(),
                                'Cabin': 'N/A',
                                'Embarked': 'N/A'})

# drop missing values
data = train_data['Age'].dropna(how='any')
# unified missing-value handling
import numpy as np

def fillNA(XX):
    X = XX.copy(deep=True)
    for colName in X.columns:
        if sum(X[colName].isnull()) > 0:
            if X.dtypes[colName] in [np.int64, np.float64]:
                # numeric columns: fill with the column mean
                # print("{} is filled by {}".format(colName, X[colName].mean()))
                X[colName] = X[colName].fillna(X[colName].mean())
            elif X.dtypes[colName] in [object]:
                # object columns: fill with a placeholder string
                # print("{} is filled by NA".format(colName))
                fillVal = "NA"  # or use the most frequent value: X[colName].value_counts().index[0]
                X[colName] = X[colName].fillna(fillVal)
            else:
                print("{} is not handled".format(colName))
    return X
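A usage sketch (the call itself is not in the original):

train_filled = fillNA(train_data)
# every column should be complete afterwards
assert train_filled.isnull().sum().sum() == 0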

Normalization

import pandas as pd

train_data = pd.read_csv("train.csv")

# two alternative scalings; pick one (applying both in sequence compounds them)
# z-score standardization
train_data['Age'] = (train_data['Age'] - train_data['Age'].mean()) / train_data['Age'].std()
# mean normalization: scale by the value range
train_data['Age'] = (train_data['Age'] - train_data['Age'].mean()) / (train_data['Age'].max() - train_data['Age'].min())
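The same scalings are available in sklearn; a minimal sketch using StandardScaler and MinMaxScaler (not in the original post):

from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pandas as pd

train_data = pd.read_csv("train.csv")
# fill missing values first, and keep the 2-D shape the scalers expect
age = train_data[['Age']].fillna(train_data['Age'].mean())

# z-score standardization: zero mean, unit variance
train_data['Age_std'] = StandardScaler().fit_transform(age).ravel()
# min-max scaling: values mapped into [0, 1]
train_data['Age_minmax'] = MinMaxScaler().fit_transform(age).ravel()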

Label Encoding

from sklearn import preprocessing
import pandas as pd

x = ['A', 'B', 'C', 'A']
lb = preprocessing.LabelBinarizer()
lb.fit(x)
print(lb.classes_)
# ['A' 'B' 'C']
x_proc = lb.transform(x)
print(x_proc)
# [[1 0 0]
#  [0 1 0]
#  [0 0 1]
#  [1 0 0]]

le = preprocessing.LabelEncoder()
le.fit(x)
print(le.classes_)
# ['A' 'B' 'C']
x_proc = le.transform(x)
print(x_proc)
# [0 1 2 0]

x_proc = pd.get_dummies(x)
print(x_proc)
#    A  B  C
# 0  1  0  0
# 1  0  1  0
# 2  0  0  1
# 3  1  0  0
# unified data transformation
from sklearn import preprocessing
import numpy as np

# encode labels as 0, 1, 2, ...
def transSymbolEncoder(x):
    lb = preprocessing.LabelEncoder()
    lb.fit(x)
    return lb.transform(x)

# one-hot encode as 001, 010, 100, ...
# pd.get_dummies(XX) can be used instead
def transSymbolBinary(XX, colName):
    X = XX
    x = X[colName].fillna("NA")
    lb = preprocessing.LabelBinarizer()
    lb.fit(x)
    value = lb.transform(x)
    # add one 0/1 column per class, named <column>_<class>
    for i in range(len(value[0])):
        addName = "{}_{}".format(colName, lb.classes_[i])
        X[addName] = value[:, i]

    return X.drop([colName], axis='columns')

def transAll(XX):
    X = XX.copy(deep=True)
    for colName in X.columns:
        if X.dtypes[colName] in [np.int64, np.float64]:
            # numeric columns are left as-is
            # X[colName] = (X[colName] - X[colName].mean()) / X[colName].std()
            pass
        elif X.dtypes[colName] in [object]:
            # object columns are one-hot encoded
            X = transSymbolBinary(X, colName)
        else:
            print("{} is not handled".format(colName))
    return X
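A usage sketch chaining the helpers above (the call is not in the original; Name and Ticket are dropped first because one-hot encoding them would create one column per unique value):

import pandas as pd

train_data = pd.read_csv("train.csv")
X = train_data.drop(['PassengerId', 'Survived', 'Name', 'Ticket'], axis='columns')
X = transAll(fillNA(X))
# only numeric and 0/1 indicator columns remain
print(X.dtypes)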

Feature Importance

# list the important features
import numpy as np
import matplotlib.pyplot as plt

def listImportantIndex(indices, importances, threshold):
    '''
    indices : feature names
    importances : feature importance values
    threshold : only features whose |importance| exceeds this are listed
    '''
    # take absolute values, so linear-model weights are ranked by magnitude
    importances_abs = abs(importances)
    # argsort gives ascending order; reversing it sorts from large to small
    indices_sorted = np.argsort(importances_abs)[::-1]
    # number of features to print
    n = sum(importances_abs > threshold)
    # print the ranked feature importances
    print("Feature ranking:")
    for f in range(n):
        print("%d. feature %s (%f)" % (f + 1, indices[indices_sorted[f]], importances[indices_sorted[f]]))
    # horizontal bar chart of the top-n importances
    plt.barh(range(n), importances[indices_sorted[:n]][::-1], align="center")
    plt.yticks(range(n), indices[indices_sorted[:n]][::-1])
    plt.title("threshold={}".format(threshold))
    plt.xlabel("importance")
    plt.ylabel("features")
    plt.show()
    return indices[indices_sorted[:n]]
# merge important feature indexes
import pandas as pd

def combinIndex(*args):
    result = pd.Index([])
    for i in args:
        result = result.union(i)
    return result
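A usage sketch with hypothetical inputs (the original does not show the call): idx_rf and idx_lm stand for the indexes returned by listImportantIndex in the two model sections below.

important_features = combinIndex(idx_rf, idx_lm)
print(important_features)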
Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
import pandas as pd

train_data = pd.read_csv("train.csv")

# encode a column as integer labels
# (LabelEncoder keeps each feature a single column, matching the ranking below)
def transSymbol(x):
    lb = preprocessing.LabelEncoder()
    lb.fit(x)
    return lb.transform(x)

# fill missing values
train_data = train_data.fillna({'Age': train_data.Age.mean(),
                                'Fare': train_data.Fare.mean(),
                                'Cabin': 'N/A',
                                'Embarked': 'N/A'})

# drop unneeded columns
X = train_data.drop(['PassengerId', 'Survived', 'Name', 'Ticket'], axis='columns')
# encode Sex
X['Sex'] = transSymbol(X['Sex'])
# encode Cabin
X['Cabin'] = transSymbol(X['Cabin'])
# encode Embarked
X['Embarked'] = transSymbol(X['Embarked'])
y = train_data['Survived']
clf = RandomForestClassifier(n_estimators=200, oob_score=True)
clf.fit(X, y)

# feature importances
listImportantIndex(indices=X.columns, importances=clf.feature_importances_, threshold=0.01)
# Feature ranking:
# 1. feature Fare (0.279122)
# 2. feature Age (0.265518)
# 3. feature Sex (0.261311)
# 4. feature Pclass (0.082490)
# 5. feature SibSp (0.049061)
# 6. feature Parch (0.040910)
# 7. feature Embarked (0.020969)
Linear Model
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
import pandas as pd

train_data = pd.read_csv("ipython/train.csv")

# encode a column as integer labels
# (LabelEncoder keeps each feature a single column, matching the ranking below)
def transSymbol(x):
    lb = preprocessing.LabelEncoder()
    lb.fit(x)
    return lb.transform(x)

# fill missing values
train_data = train_data.fillna({'Age': train_data.Age.mean(),
                                'Fare': train_data.Fare.mean(),
                                'Cabin': 'N/A',
                                'Embarked': 'N/A'})

# drop unneeded columns
X = train_data.drop(['PassengerId', 'Survived', 'Name', 'Ticket'], axis='columns')
# encode Sex
X['Sex'] = transSymbol(X['Sex'])
# encode Cabin
X['Cabin'] = transSymbol(X['Cabin'])
# encode Embarked
X['Embarked'] = transSymbol(X['Embarked'])
y = train_data['Survived']
clf = LinearRegression()
clf.fit(X, y)

# feature importances from the coefficients
listImportantIndex(indices=X.columns, importances=clf.coef_, threshold=0.01)
# Feature ranking:
# 1. feature Sex (-0.509290)
# 2. feature Cabin (-0.469352)
# 3. feature Pclass (-0.167720)
# 4. feature Embarked (0.061832)
# 5. feature SibSp (-0.041574)
# 6. feature Parch (-0.018721)

Algorithm Evaluation and Tuning

# model evaluation
from sklearn.model_selection import cross_val_score
import seaborn as sns

def evaluate_model(clf, X, y):
    # 10-fold cross-validation
    scores = cross_val_score(clf, X, y, cv=10, scoring="neg_mean_squared_error")
    # mean
    m = scores.mean()
    # standard deviation
    sd = scores.std()
    # accuracy on the training data
    clf.fit(X, y)
    score = clf.score(X, y)
    # boxplot shows the median and spread of the fold scores
    ax = sns.boxplot(scores)
    ax.set_title("val_mean:{:.5f}, val_std:{:.5f}\ntrain_score:{:.2f}\n\nclf:{}".format(m, sd, score, clf))
    return clf
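A usage sketch reusing the X and y prepared in the Random Forest section above (the call is not in the original):

from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

clf = evaluate_model(RandomForestClassifier(n_estimators=200), X, y)
plt.show()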
from sklearn.model_selection import GridSearchCV

# parameter selection
def param_selection(clf, X, y, nfolds, **param_grid):
    # example grid for a random forest:
    # max_features = ['sqrt', 'auto', 'log2']
    # max_depth = range(1, 30, 5)
    # min_samples_split = range(2, 10, 2)
    # min_samples_leaf = range(1, 10, 2)
    # param_grid = {'max_features': max_features,
    #               'max_depth': max_depth,
    #               'min_samples_split': min_samples_split,
    #               'min_samples_leaf': min_samples_leaf}
    grid_search = GridSearchCV(clf, param_grid=param_grid, cv=nfolds)
    grid_search.fit(X, y)

    return grid_search.best_estimator_, grid_search.best_score_, grid_search.best_params_
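A usage sketch (not in the original); thanks to **param_grid, the grid values are passed as plain keyword arguments:

from sklearn.ensemble import RandomForestClassifier

best_clf, best_score, best_params = param_selection(
    RandomForestClassifier(n_estimators=200), X, y, nfolds=5,
    max_depth=range(1, 30, 5),
    min_samples_split=range(2, 10, 2))
print(best_score, best_params)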

