程式語言:Python
Package
簡介:資料分析的一些基本操作
seaborn 官網
Pearson 相關
Working with missing data
Package
- numpy
- pandas
- matplotlib
- seaborn
- sklearn
簡介:資料分析的一些基本操作
觀察資料
import pandas as pd
# Load the training set; "train.csv" is resolved relative to the current working directory.
train_data = pd.read_csv("train.csv")
# First ten rows (the value is only displayed when run in a notebook/REPL; it is not stored here).
train_data.head(10)
# Last ten rows
train_data.tail(10)
# Drop specific columns. NOTE: the result is not assigned and inplace is not set,
# so train_data itself still contains both columns after this line.
train_data.drop(['PassengerId', 'Survived'], axis='columns')
資料資訊
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
資料統計
count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64
計算 NA 個數並列出
Age         177
Cabin       687
Embarked      2
dtype: int64
偏度 (skewness)
PassengerId    0.000000
Survived       0.478523
Pclass        -0.630548
Age            0.389108
SibSp          3.695352
Parch          2.749117
Fare           4.787317
dtype: float64
峰度(Kurtosis)
PassengerId    -1.200000
Survived       -1.775005
Pclass         -1.280015
Age             0.178274
SibSp          17.880420
Parch           9.778125
Fare           33.398141
dtype: float64
資料視覺化
缺項處理
import pandas as pd
train_data = pd.read_csv("train.csv")

# Fill missing values: numeric columns get their own column mean,
# text columns get the placeholder string 'N/A'.
train_data = train_data.fillna(
    value=dict(
        Age=train_data['Age'].mean(),
        Fare=train_data['Fare'].mean(),
        Cabin='N/A',
        Embarked='N/A',
    )
)

# Drop missing entries from the Age column (the alternative to filling them).
# Age was already filled above, so at this point nothing is left to drop.
data = train_data['Age'].dropna(how='any')
# 缺項統一處理
def fillNA(XX):
    """Return a copy of ``XX`` with missing values filled column by column.

    Float columns are filled with the column mean, and object/string columns
    are filled with the literal string ``"NA"``. A column that contains
    missing values but is of any other dtype is left untouched and reported
    on stdout. The input frame is never modified.

    Args:
        XX: pandas DataFrame to clean.

    Returns:
        A new DataFrame with the same columns as ``XX``.
    """
    # Local import keeps the function self-contained regardless of which
    # aliases the surrounding module has already imported.
    from pandas.api import types as pdtypes

    X = XX.copy(deep=True)
    for colName in X.columns:
        column = X[colName]
        if not column.isnull().any():
            continue
        dtype = column.dtype
        if pdtypes.is_float_dtype(dtype):
            # NumPy integer columns cannot hold NaN, so a numeric column with
            # missing values is a float column of some width (not only
            # float64, which is all the previous exact-dtype test accepted).
            X[colName] = column.fillna(column.mean())
        elif pdtypes.is_object_dtype(dtype) or pdtypes.is_string_dtype(dtype):
            X[colName] = column.fillna("NA")
        else:
            print("{} is not define".format(colName))
    return X
正規化處理
import pandas as pd
train_data = pd.read_csv("train.csv")
# Standardization (z-score): subtract the column mean, divide by the standard deviation.
train_data['Age'] = (train_data['Age'] - train_data['Age'].mean()) / train_data['Age'].std()
# Mean normalization: subtract the column mean, divide by the range (max - min).
# NOTE(review): this runs on the column already standardized by the previous line;
# the two formulas look like alternatives, so presumably only one should be kept - confirm.
train_data['Age'] = (train_data['Age'] - train_data['Age'].mean()) / (train_data['Age'].max() - train_data['Age'].min())
標籤編碼
import pandas as pd
from sklearn import preprocessing

x = ['A', 'B', 'C', 'A']

# One indicator column per class (one-hot) with LabelBinarizer.
lb = preprocessing.LabelBinarizer()
lb.fit(x)
print(lb.classes_)
# ['A' 'B' 'C']
x_proc = lb.transform(x)
print(x_proc)
# [[1 0 0]
#  [0 1 0]
#  [0 0 1]
#  [1 0 0]]

# One integer code per class with LabelEncoder.
le = preprocessing.LabelEncoder()
le.fit(x)
print(le.classes_)
# ['A' 'B' 'C']
x_proc = le.transform(x)
print(x_proc)
# [0 1 2 0]

# The pandas equivalent of the one-hot encoding. dtype=int keeps the 0/1
# output shown below; without it recent pandas versions print True/False.
x_proc = pd.get_dummies(x, dtype=int)
print(x_proc)
#    A  B  C
# 0  1  0  0
# 1  0  1  0
# 2  0  0  1
# 3  1  0  0
# 資料統一轉換
from sklearn import preprocessing
# Encode each distinct label as an integer code.
def transSymbolEncoder(x):
    """Fit a LabelEncoder on ``x`` and return ``x`` transformed by it."""
    encoder = preprocessing.LabelEncoder().fit(x)
    return encoder.transform(x)
# Encode as indicator columns (001 010 100 ...).
# pd.get_dummies(XX) can be used instead.
def transSymbolBinary(XX, colName):
    """Replace column ``colName`` with indicator columns, one per class.

    Missing values in the column are treated as their own class ``"NA"``.
    New columns are named ``"<colName>_<class>"``. The input frame is left
    untouched; a new frame without ``colName`` is returned.

    Args:
        XX: pandas DataFrame containing ``colName``.
        colName: name of the categorical column to encode.

    Returns:
        A new DataFrame with the indicator columns appended and ``colName``
        dropped.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    X = XX.copy()
    labels = X[colName].fillna("NA")
    lb = preprocessing.LabelBinarizer()
    lb.fit(labels)
    value = lb.transform(labels)
    classes = lb.classes_
    if value.shape[1] == 1 and len(classes) == 2:
        # With exactly two classes LabelBinarizer emits a single column that
        # is 1 for classes_[1], so the column must carry that class's name.
        names = [classes[1]]
    else:
        names = classes[:value.shape[1]]
    for i, className in enumerate(names):
        X["{}_{}".format(colName, className)] = value[:, i]
    return X.drop([colName], axis='columns')
def transAll(XX):
    """Return a copy of ``XX`` with every text column turned into indicators.

    Numeric columns (any integer/float width, and booleans) are kept as they
    are. Object/string columns are replaced through ``transSymbolBinary``.
    Columns of any other dtype are left unchanged and reported on stdout.

    Args:
        XX: pandas DataFrame to transform; it is not modified.

    Returns:
        A new DataFrame.
    """
    # Local import keeps the function self-contained regardless of which
    # aliases the surrounding module has already imported.
    from pandas.api import types as pdtypes

    X = XX.copy(deep=True)
    # X.columns is evaluated once, so columns added during encoding are not
    # visited again.
    for colName in X.columns:
        dtype = X.dtypes[colName]
        if pdtypes.is_numeric_dtype(dtype):
            # Numeric data is passed through untouched.
            continue
        if pdtypes.is_object_dtype(dtype) or pdtypes.is_string_dtype(dtype):
            X = transSymbolBinary(X, colName)
        else:
            print("{} is not define".format(colName))
    return X
重要性 feature
# 列出重要的 feature
import numpy as np
import matplotlib.pyplot as plt
def listImportantIndex(indices, importances, threshold):
    """Print, plot and return the features whose importance exceeds a threshold.

    Args:
        indices: feature names; must support indexing with an integer array
            (for example a pandas Index or a NumPy array).
        importances: one importance value per feature. Any array-like is
            accepted; values may be negative (e.g. linear-model weights).
        threshold: a feature is selected when ``abs(importance) > threshold``.

    Returns:
        ``indices`` restricted to the selected features, ordered from the
        largest absolute importance to the smallest.
    """
    # Convert once so that all later lookups are positional, even when a list
    # or a label-indexed pandas Series is passed in.
    importances = np.asarray(importances)
    # Use absolute values so that weights of a linear model rank by magnitude.
    magnitude = np.abs(importances)
    # argsort is ascending; reverse it to rank from largest to smallest.
    order = np.argsort(magnitude)[::-1]
    # Number of features above the threshold.
    n = int(np.count_nonzero(magnitude > threshold))
    selected = order[:n]

    # Print the ranking with the signed importance values.
    print("Feature ranking:")
    for rank, position in enumerate(selected, start=1):
        print("%d. feature %s (%f)" % (rank, indices[position], importances[position]))

    # Horizontal bar chart; reversed so the most important feature is on top.
    plt.barh(range(n), importances[selected][::-1], align="center")
    plt.yticks(range(n), indices[selected][::-1])
    plt.title("threshold={}".format(threshold))
    plt.xlabel("importance")
    plt.ylabel("features")
    plt.show()
    return indices[selected]
# 合併重要 features
import pandas as pd
def combinIndex(*args):
    """Return the union of all pandas Index objects passed as arguments.

    Args:
        *args: any number of pandas Index objects (or array-likes accepted
            by ``pd.Index.union``).

    Returns:
        A pandas Index holding every label that occurs in at least one
        argument; an empty Index when called without arguments.
    """
    result = pd.Index([])
    # Iterate the variadic arguments themselves (the previous body looped
    # over an undefined name and raised NameError on every call).
    for index in args:
        result = result.union(index)
    return result
演算法處理
# 演算法評估
from sklearn.model_selection import cross_val_score
import seaborn as sns
def evaluate_model(clf, X, y, cv=10, scoring="neg_mean_squared_error"):
    """Cross-validate ``clf``, refit it on all data and plot the fold scores.

    Args:
        clf: estimator exposing ``fit`` and ``score``.
        X: feature matrix.
        y: target values.
        cv: cross-validation setting forwarded to ``cross_val_score``
            (default 10 folds).
        scoring: scoring name forwarded to ``cross_val_score``.

    Returns:
        ``clf`` after it has been fitted on the full ``X`` and ``y``.
    """
    # K-fold cross validation (10 folds by default).
    scores = cross_val_score(clf, X, y, cv=cv, scoring=scoring)
    # Mean of the fold scores
    m = scores.mean()
    # Standard deviation of the fold scores
    sd = scores.std()
    # Score on the training data, using the estimator's own score method.
    # This is not the same metric as `scoring` unless the two happen to match.
    clf.fit(X, y)
    score = clf.score(X, y)
    # Box plot showing the distribution of the fold scores.
    ax = sns.boxplot(scores)
    ax.set_title("val_mean:{:.5f}, val_std:{:.5f}\ntrain_score:{:.2f}\n\nclf:{}".format(m, sd, score, clf))
    return clf
from sklearn.model_selection import GridSearchCV
# 挑選參數
def param_selection(clf, X, y, nfolds, **param_grid):
    """Grid-search hyper-parameters for ``clf`` with ``nfolds``-fold CV.

    Every extra keyword argument is one entry of the parameter grid: the
    keyword is the parameter name and its value the candidates to try.

    Returns:
        A tuple ``(best_estimator, best_score, best_params)``.
    """
    # Example grid, passed as keyword arguments:
    #   max_features      = ['sqrt', 'auto', 'log2']
    #   max_depth         = range(1, 30, 5)
    #   min_samples_split = range(2, 10, 2)
    #   min_samples_leaf  = range(1, 10, 2)
    searcher = GridSearchCV(clf, param_grid=param_grid, cv=nfolds)
    searcher.fit(X, y)
    return (
        searcher.best_estimator_,
        searcher.best_score_,
        searcher.best_params_,
    )
參考
Python Graph Gallery
seaborn 官網
Pearson 相關
Working with missing data











留言
張貼留言