程式語言:Python
Package
簡介:資料分析的一些基本操作
seaborn 官網
Pearson 相關
Working with missing data
Package
- numpy
- pandas
- matplotlib
- seaborn
- sklearn
簡介:資料分析的一些基本操作
觀察資料
import pandas as pd

# Load the training set from the working directory.
train_data = pd.read_csv("train.csv")

# First ten rows
train_data.head(n=10)

# Last ten rows
train_data.tail(n=10)

# Remove specific columns. No `inplace` is requested, so the result is a new
# frame and `train_data` itself keeps both columns.
train_data.drop(columns=['PassengerId', 'Survived'])
資料資訊
<class 'pandas.core.frame.DataFrame'> RangeIndex: 891 entries, 0 to 890 Data columns (total 12 columns): PassengerId 891 non-null int64 Survived 891 non-null int64 Pclass 891 non-null int64 Name 891 non-null object Sex 891 non-null object Age 714 non-null float64 SibSp 891 non-null int64 Parch 891 non-null int64 Ticket 891 non-null object Fare 891 non-null float64 Cabin 204 non-null object Embarked 889 non-null object dtypes: float64(2), int64(5), object(5) memory usage: 83.6+ KB
資料統計
count 714.000000 mean 29.699118 std 14.526497 min 0.420000 25% 20.125000 50% 28.000000 75% 38.000000 max 80.000000 Name: Age, dtype: float64
計算 NA 個數並列出
Age 177 Cabin 687 Embarked 2 dtype: int64
偏度 (skewness)
PassengerId 0.000000 Survived 0.478523 Pclass -0.630548 Age 0.389108 SibSp 3.695352 Parch 2.749117 Fare 4.787317 dtype: float64
峰度(Kurtosis)
PassengerId -1.200000 Survived -1.775005 Pclass -1.280015 Age 0.178274 SibSp 17.880420 Parch 9.778125 Fare 33.398141 dtype: float64
資料視覺化
缺項處理
import pandas as pd

train_data = pd.read_csv("train.csv")

# Fill missing values: the column mean for 'Age' and 'Fare', a placeholder
# string for 'Cabin' and 'Embarked'.
fill_values = {
    'Age': train_data['Age'].mean(),
    'Fare': train_data['Fare'].mean(),
    'Cabin': 'N/A',
    'Embarked': 'N/A',
}
train_data = train_data.fillna(value=fill_values)

# Drop missing values (alternative to filling them)
age_column = train_data['Age']
data = age_column.dropna(how='any')
# Fill the missing values of every column in one pass
def fillNA(XX):
    """Return a copy of ``XX`` with missing values filled column by column.

    * Floating-point columns are filled with the column mean.
    * Object / string columns are filled with the literal string ``"NA"``.
    * Columns of any other dtype that contain missing values are left
      untouched and reported on stdout.

    Parameters
    ----------
    XX : pandas.DataFrame
        Input frame. It is never modified; the work happens on a deep copy.

    Returns
    -------
    pandas.DataFrame
        The filled copy.
    """
    # Local import: this snippet does not import numpy/pandas itself, and the
    # dtype predicates cover every float width instead of only float64.
    from pandas.api.types import (
        is_float_dtype,
        is_object_dtype,
        is_string_dtype,
    )

    X = XX.copy(deep=True)
    for colName in X.columns:
        column = X[colName]
        if not column.isnull().any():
            continue

        dtype = column.dtype
        if is_float_dtype(dtype):
            # NumPy integer columns cannot hold NaN, so they never reach this
            # point; nullable-integer columns fall through to the report
            # below rather than being filled with a fractional mean.
            X[colName] = column.fillna(column.mean())
        elif is_object_dtype(dtype) or is_string_dtype(dtype):
            # Alternative fill value: column.value_counts().index[0]
            # (the most frequent value).
            X[colName] = column.fillna("NA")
        else:
            print("{} is not define".format(colName))
    return X
正規化處理
import pandas as pd

train_data = pd.read_csv("train.csv")

# Standardization (z-score): centre on the mean, scale by the standard deviation.
age = train_data['Age']
train_data['Age'] = (age - age.mean()) / age.std()

# Mean normalization: centre on the mean, scale by the value range.
# NOTE: this operates on the column as already rewritten by the step above.
age = train_data['Age']
age_range = age.max() - age.min()
train_data['Age'] = (age - age.mean()) / age_range
標籤編碼
import pandas as pd
from sklearn import preprocessing

x = ['A', 'B', 'C', 'A']

# LabelBinarizer: one indicator column per class.
lb = preprocessing.LabelBinarizer()
lb.fit(x)
print(lb.classes_)
# ['A' 'B' 'C']
x_proc = lb.transform(x)
print(x_proc)
# [[1 0 0]
#  [0 1 0]
#  [0 0 1]
#  [1 0 0]]

# LabelEncoder: one integer code per class.
le = preprocessing.LabelEncoder()
le.fit(x)
print(le.classes_)
# ['A' 'B' 'C']
x_proc = le.transform(x)
print(x_proc)
# [0 1 2 0]

# pandas equivalent of the binarizer. dtype=int keeps the 0/1 output shown
# below; without it pandas >= 2.0 prints True/False columns.
x_proc = pd.get_dummies(x, dtype=int)
print(x_proc)
#    A  B  C
# 0  1  0  0
# 1  0  1  0
# 2  0  0  1
# 3  1  0  0
# Unified data transformation
import pandas as pd
from pandas.api.types import (
    is_bool_dtype,
    is_numeric_dtype,
    is_object_dtype,
    is_string_dtype,
)


# Encode as 0 1 2 ...
def transSymbolEncoder(x):
    """Map each distinct label in ``x`` to an integer code.

    Codes are assigned by scikit-learn's ``LabelEncoder``.

    Parameters
    ----------
    x : array-like of labels

    Returns
    -------
    The encoded labels, as returned by ``LabelEncoder.transform``.
    """
    # Imported here because this is the only function in the snippet that
    # needs scikit-learn; the pandas-only helpers below work without it.
    from sklearn import preprocessing

    return preprocessing.LabelEncoder().fit_transform(x)


# Encode as 001 010 100 ...
def transSymbolBinary(XX, colName):
    """Replace column ``colName`` with one indicator column per class.

    Missing values are first replaced by the string ``"NA"``, which then
    becomes a class of its own. Each new column is named
    ``"<colName>_<class>"`` and holds 1 where the row belongs to that class
    and 0 elsewhere. The new columns are appended after the existing ones.

    Every class gets its own column, including when there are only one or
    two classes, so a column name always matches the class it flags.

    Parameters
    ----------
    XX : pandas.DataFrame
        Input frame. It is not modified.
    colName : hashable
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``colName`` and with the indicator columns added.
    """
    labels = XX[colName].fillna("NA")
    indicators = pd.get_dummies(
        labels, prefix=str(colName), prefix_sep="_", dtype=int
    )
    remaining = XX.drop(columns=[colName])
    return pd.concat([remaining, indicators], axis="columns")


def transAll(XX):
    """One-hot encode every object/string column of ``XX``.

    Numeric columns are passed through unchanged. Columns of any other dtype
    are left untouched and reported on stdout.

    Parameters
    ----------
    XX : pandas.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The transformed copy.
    """
    X = XX.copy(deep=True)
    # The loop walks the original column index; the indicator columns added
    # along the way are therefore not visited again.
    for colName in X.columns:
        dtype = X.dtypes[colName]
        if is_numeric_dtype(dtype) and not is_bool_dtype(dtype):
            # Numeric columns are kept as they are. Standardising them here
            # would be: (X[colName] - X[colName].mean()) / X[colName].std()
            continue
        if is_object_dtype(dtype) or is_string_dtype(dtype):
            X = transSymbolBinary(X, colName)
        else:
            print("{} is not define".format(colName))
    return X
重要性 feature
# List the important features
import numpy as np
import matplotlib.pyplot as plt


def listImportantIndex(indices, importances, threshold):
    '''
    Print, plot and return the features whose importance magnitude exceeds
    a threshold.

    indices     : feature names; must support indexing with an integer array
    importances : importance of each feature, aligned with `indices`
    threshold   : a feature is selected only when |importance| > threshold
    '''
    # Work on magnitudes so that signed linear-model weights rank correctly.
    magnitudes = abs(importances)

    # argsort is ascending; reversing it yields positions from largest to
    # smallest magnitude.
    order = np.argsort(magnitudes)[::-1]

    # Number of features that pass the threshold.
    n_selected = sum(magnitudes > threshold)
    top = order[:n_selected]

    # Print the ranking (the signed importance is shown, not the magnitude).
    print("Feature ranking:")
    for rank in range(n_selected):
        pos = order[rank]
        print("%d. feature %s (%f)" % (rank + 1, indices[pos], importances[pos]))

    # Horizontal bar chart; the selection is reversed so the most important
    # feature is drawn at the top.
    bar_positions = range(n_selected)
    plt.barh(bar_positions, importances[top][::-1], align="center")
    plt.yticks(bar_positions, indices[top][::-1])
    plt.title("threshold={}".format(threshold))
    plt.xlabel("importance")
    plt.ylabel("features")
    plt.show()

    return indices[top]
# Merge important features
import pandas as pd


def combinIndex(*args):
    """Return the union of all the given indexes.

    Parameters
    ----------
    *args : pandas.Index or array-like
        Any number of feature-name collections accepted by
        ``pandas.Index.union``.

    Returns
    -------
    pandas.Index
        Every label that appears in at least one argument, without
        duplicates. An empty index when called with no arguments.
    """
    result = pd.Index([])
    # Iterate over the positional arguments themselves; the previous version
    # looped over an undefined name and raised NameError on every call.
    for index in args:
        result = result.union(index)
    return result
演算法處理
# Model evaluation
from sklearn.model_selection import cross_val_score
import seaborn as sns


def evaluate_model(clf, X, y, cv=10, scoring="neg_mean_squared_error"):
    """Cross-validate ``clf``, refit it on all the data and plot the scores.

    Parameters
    ----------
    clf : estimator
        Object passed to ``cross_val_score``; it must also provide ``fit``
        and ``score``.
    X, y
        Training data and targets, forwarded unchanged to scikit-learn.
    cv : int, default 10
        Number of cross-validation folds.
    scoring : str, default "neg_mean_squared_error"
        Scoring name forwarded to ``cross_val_score``.

    Returns
    -------
    estimator
        The same ``clf`` object, fitted on the full ``X`` and ``y``.
    """
    # k-fold cross-validation (k = cv)
    scores = cross_val_score(clf, X, y, cv=cv, scoring=scoring)

    # Mean of the fold scores
    m = scores.mean()

    # Standard deviation of the fold scores
    sd = scores.std()

    # Score on the training data. This is the estimator's own `score`
    # method, which may use a different metric than `scoring`.
    clf.fit(X, y)
    score = clf.score(X, y)

    # Box plot of the fold scores. `x=` is passed by keyword because the
    # meaning of the first positional argument differs between seaborn
    # versions.
    ax = sns.boxplot(x=scores)
    ax.set_title("val_mean:{:.5f}, val_std:{:.5f}\ntrain_score:{:.2f}\n\nclf:{}".format(m, sd, score, clf))

    return clf
from sklearn.model_selection import GridSearchCV


# Parameter selection
def param_selection(clf, X, y, nfolds, **param_grid):
    """Grid-search the parameters of ``clf`` with cross-validation.

    Parameters
    ----------
    clf : estimator
        Estimator handed to ``GridSearchCV``.
    X, y
        Training data and targets, forwarded unchanged to ``fit``.
    nfolds
        Value passed as ``cv`` to ``GridSearchCV``.
    **param_grid
        One keyword per parameter name, each giving the candidate values.

    Returns
    -------
    tuple
        ``(best_estimator_, best_score_, best_params_)`` of the search.
    """
    # Example grid from the original notes:
    #   max_features      = ['sqrt', 'auto', 'log2']
    #   max_depth         = range(1, 30, 5)
    #   min_samples_split = range(2, 10, 2)
    #   min_samples_leaf  = range(1, 10, 2)
    #   param_grid = {'max_features': max_features,
    #                 'max_depth': max_depth,
    #                 'min_samples_split': min_samples_split,
    #                 'min_samples_leaf': min_samples_leaf}
    search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=nfolds)
    search.fit(X, y)
    best = (search.best_estimator_, search.best_score_, search.best_params_)
    return best
參考
Python Graph Gallery
seaborn 官網
Pearson 相關
Working with missing data
留言
張貼留言