Programming language: Python
Package
Overview: some basic operations for data analysis
Package
- numpy
- pandas
- matplotlib
- seaborn
- sklearn
Overview: some basic operations for data analysis
Inspecting the data
- import pandas as pd
- train_data = pd.read_csv("train.csv")
- # first ten rows
- train_data.head(10)
- # last ten rows
- train_data.tail(10)
- # drop specific columns (drop returns a new DataFrame; train_data itself is unchanged)
- train_data.drop(['PassengerId', 'Survived'], axis='columns')
Data information
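The summary below is the output of DataFrame.info():
- train_data.info()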
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
Data statistics
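The statistics below are the output of describe() on the Age column:
- train_data['Age'].describe()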
count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64
Count and list the NAs
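The original code for this step is not shown; a common way to produce the counts below is (a sketch):
- na_count = train_data.isnull().sum()
- na_count[na_count > 0]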
Age         177
Cabin       687
Embarked      2
dtype: int64
Skewness
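The values below are the skewness of each numeric column:
- # newer pandas versions may require skew(numeric_only=True)
- train_data.skew()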
PassengerId    0.000000
Survived       0.478523
Pclass        -0.630548
Age            0.389108
SibSp          3.695352
Parch          2.749117
Fare           4.787317
dtype: float64
Kurtosis
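Likewise, the kurtosis of each numeric column:
- # newer pandas versions may require kurt(numeric_only=True)
- train_data.kurt()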
PassengerId    -1.200000
Survived       -1.775005
Pclass         -1.280015
Age             0.178274
SibSp          17.880420
Parch           9.778125
Fare           33.398141
dtype: float64
Data visualization
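No plots are shown in this section. A minimal sketch of plots one might draw at this step, assuming the train_data loaded above (the specific plot choices are illustrative, not from the original post):
- import numpy as np
- import matplotlib.pyplot as plt
- import seaborn as sns
- # heatmap of Pearson correlations between the numeric columns
- numeric_cols = train_data.select_dtypes(include=[np.number])
- sns.heatmap(numeric_cols.corr(), annot=True, cmap='coolwarm')
- plt.show()
- # survival counts grouped by sex
- sns.countplot(x='Sex', hue='Survived', data=train_data)
- plt.show()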
Handling missing values
- import pandas as pd
- train_data = pd.read_csv("train.csv")
- # fill missing values
- train_data = train_data.fillna({'Age': train_data.Age.mean(),
-                                 'Fare': train_data.Fare.mean(),
-                                 'Cabin': 'N/A',
-                                 'Embarked': 'N/A'})
- # drop missing values
- data = train_data['Age'].dropna()
- # fill all missing values in one pass
- import numpy as np
- def fillNA(XX):
-     X = XX.copy(deep=True)
-     for colName in X.columns:
-         if X[colName].isnull().sum() > 0:
-             if X.dtypes[colName] in [np.int64, np.float64]:
-                 # numeric columns: fill with the column mean
-                 X[colName] = X[colName].fillna(X[colName].mean())
-             elif X.dtypes[colName] in [object]:
-                 # object columns: fill with the string "NA"
-                 # (or use the most frequent value: X[colName].value_counts().index[0])
-                 X[colName] = X[colName].fillna("NA")
-             else:
-                 print("{} is not handled".format(colName))
-     return X
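A quick usage sketch for fillNA, assuming the same train.csv as above:
- train_data = pd.read_csv("train.csv")
- train_data = fillNA(train_data)
- # no missing values remain
- print(train_data.isnull().sum().sum())   # 0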
Normalization
- import pandas as pd
- train_data = pd.read_csv("train.csv")
- # z-score standardization
- train_data['Age'] = (train_data['Age'] - train_data['Age'].mean()) / train_data['Age'].std()
- # mean normalization, an alternative to the line above (apply only one of the two)
- # train_data['Age'] = (train_data['Age'] - train_data['Age'].mean()) / (train_data['Age'].max() - train_data['Age'].min())
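Comparable scalings are also available in sklearn; a minimal sketch (the column choice and the fillna step are illustrative):
- from sklearn.preprocessing import StandardScaler, MinMaxScaler
- raw = pd.read_csv("train.csv")
- age = raw[['Age']].fillna(raw['Age'].mean())
- # z-score standardization (StandardScaler uses the population standard deviation)
- age_std = StandardScaler().fit_transform(age)
- # scale to the [0, 1] range
- age_minmax = MinMaxScaler().fit_transform(age)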
Label encoding
- from sklearn import preprocessing
- x = ['A', 'B', 'C', 'A']
- # one-hot (binary) encoding
- lb = preprocessing.LabelBinarizer()
- lb.fit(x)
- print(lb.classes_)
- # ['A' 'B' 'C']
- x_proc = lb.transform(x)
- print(x_proc)
- # [[1 0 0]
- # [0 1 0]
- # [0 0 1]
- # [1 0 0]]
- # integer label encoding
- le = preprocessing.LabelEncoder()
- le.fit(x)
- print(le.classes_)
- # ['A' 'B' 'C']
- x_proc = le.transform(x)
- print(x_proc)
- # [0 1 2 0]
- # one-hot encode directly with pandas
- import pandas as pd
- x_proc = pd.get_dummies(x)
- print(x_proc)
- # A B C
- # 0 1 0 0
- # 1 0 1 0
- # 2 0 0 1
- # 3 1 0 0
- # transform the whole dataset at once
- import numpy as np
- from sklearn import preprocessing
- # encode labels as integers 0, 1, 2, ...
- def transSymbolEncoder(x):
-     le = preprocessing.LabelEncoder()
-     le.fit(x)
-     return le.transform(x)
- # encode labels as one-hot vectors (001, 010, 100, ...)
- # pd.get_dummies(XX) can be used instead
- def transSymbolBinary(XX, colName):
-     X = XX
-     x = X[colName].fillna("NA")
-     lb = preprocessing.LabelBinarizer()
-     lb.fit(x)
-     value = lb.transform(x)
-     for i in range(len(value[0])):
-         addName = "{}_{}".format(colName, lb.classes_[i])
-         X[addName] = value[:, i]
-     return X.drop([colName], axis='columns')
- def transAll(XX):
-     X = XX.copy(deep=True)
-     for colName in X.columns:
-         if X.dtypes[colName] in [np.int64, np.float64]:
-             # numeric columns are left unchanged
-             # (uncomment to standardize: X[colName] = (X[colName] - X[colName].mean()) / X[colName].std())
-             pass
-         elif X.dtypes[colName] in [object]:
-             # object columns are one-hot encoded
-             X = transSymbolBinary(X, colName)
-         else:
-             print("{} is not handled".format(colName))
-     return X
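A usage sketch combining the helpers above, assuming the Titanic train.csv used earlier (dropping the identifier and free-text columns first is an illustrative choice, not part of the original post):
- import pandas as pd
- train_data = pd.read_csv("train.csv")
- # drop identifier/free-text columns before one-hot encoding
- X = train_data.drop(['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin'], axis='columns')
- X = fillNA(X)
- X = transAll(X)
- y = train_data['Survived']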
Feature importance
- # list the important features
- import numpy as np
- import matplotlib.pyplot as plt
- def listImportantIndex(indices, importances, threshold):
-     '''
-     indices     : feature names
-     importances : feature importances
-     threshold   : only features above this value are selected
-     '''
-     # take absolute values so that linear-model weights are ranked by magnitude
-     importances_abs = abs(importances)
-     # argsort gives ascending order; reverse it to get descending order
-     indices_sorted = np.argsort(importances_abs)[::-1]
-     # number of features to report
-     n = sum(importances_abs > threshold)
-     # print the ranked feature importances
-     print("Feature ranking:")
-     for f in range(n):
-         print("%d. feature %s (%f)" % (f + 1, indices[indices_sorted[f]], importances[indices_sorted[f]]))
-     # horizontal bar chart
-     plt.barh(range(n), importances[indices_sorted[:n]][::-1], align="center")
-     plt.yticks(range(n), indices[indices_sorted[:n]][::-1])
-     plt.title("threshold={}".format(threshold))
-     plt.xlabel("importance")
-     plt.ylabel("features")
-     plt.show()
-     return indices[indices_sorted[:n]]
- # merge several sets of important features
- import pandas as pd
- def combinIndex(*args):
-     result = pd.Index([])
-     for i in args:
-         result = result.union(i)
-     return result
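A usage sketch with a tree-based model; the classifier, its settings, and the X/y built in the preprocessing sketch above are assumptions:
- from sklearn.ensemble import RandomForestClassifier
- clf = RandomForestClassifier(n_estimators=100, random_state=0)
- clf.fit(X, y)
- # keep only the features whose importance exceeds 0.05
- important = listImportantIndex(X.columns, clf.feature_importances_, threshold=0.05)
- X_selected = X[important]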
Working with algorithms
- # model evaluation
- from sklearn.model_selection import cross_val_score
- import seaborn as sns
- def evaluate_model(clf, X, y):
-     # 10-fold cross validation
-     scores = cross_val_score(clf, X, y, cv=10, scoring="neg_mean_squared_error")
-     # mean
-     m = scores.mean()
-     # standard deviation
-     sd = scores.std()
-     # score on the training data
-     clf.fit(X, y)
-     score = clf.score(X, y)
-     # boxplot of the cross-validation scores (shows the median)
-     ax = sns.boxplot(scores)
-     ax.set_title("val_mean:{:.5f}, val_std:{:.5f}\ntrain_score:{:.2f}\n\nclf:{}".format(m, sd, score, clf))
-     return clf
- from sklearn.model_selection import GridSearchCV
- # hyperparameter selection
- def param_selection(clf, X, y, nfolds, **param_grid):
-     # example grid for a tree-based model:
-     # max_features = ['sqrt', 'auto', 'log2']
-     # max_depth = range(1, 30, 5)
-     # min_samples_split = range(2, 10, 2)
-     # min_samples_leaf = range(1, 10, 2)
-     # param_grid = {'max_features': max_features,
-     #               'max_depth': max_depth,
-     #               'min_samples_split': min_samples_split,
-     #               'min_samples_leaf': min_samples_leaf}
-     grid_search = GridSearchCV(clf, param_grid=param_grid, cv=nfolds)
-     grid_search.fit(X, y)
-     return grid_search.best_estimator_, grid_search.best_score_, grid_search.best_params_
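A usage sketch tying the two helpers together; the classifier and the grid values are illustrative assumptions:
- from sklearn.ensemble import RandomForestClassifier
- clf = RandomForestClassifier(n_estimators=100, random_state=0)
- best_clf, best_score, best_params = param_selection(clf, X, y, nfolds=5,
-                                                     max_depth=range(1, 30, 5),
-                                                     min_samples_split=range(2, 10, 2))
- evaluate_model(best_clf, X, y)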
References
Python Graph Gallery
seaborn official site
Pearson correlation
Working with missing data