[ML] Data Analysis

Language: Python
Packages
  • numpy
  • pandas
  • matplotlib
  • seaborn
  • sklearn
Data: Kaggle Titanic

Overview: basic operations for data analysis

Observing the Data

import pandas as pd

train_data = pd.read_csv("train.csv")
# First ten rows
train_data.head(10)
# Last ten rows
train_data.tail(10)
# Drop specific columns (returns a new DataFrame; the original is unchanged)
train_data.drop(['PassengerId', 'Survived'], axis='columns')
Data Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
import pandas as pd

# Show data info; useful for spotting missing values
train_data.info()
Data Statistics
count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64
import pandas as pd

# Summary statistics: count, mean, std, quartiles, and other common values
train_data['Age'].describe()
Counting and Listing Missing Values (NA)
Age         177
Cabin       687
Embarked      2
dtype: int64
# Count NA values per column and list the non-zero ones
naCount = train_data.isnull().sum()
naCount[naCount > 0]
Skewness
PassengerId    0.000000
Survived       0.478523
Pclass        -0.630548
Age            0.389108
SibSp          3.695352
Parch          2.749117
Fare           4.787317
dtype: float64
# Skewness: https://zh.wikipedia.org/wiki/偏度
# skewness = 0 means a perfectly symmetric distribution
# Negative (left) skew: the left tail is longer; the bulk of the distribution sits on the right.
# Positive (right) skew: the right tail is longer; the bulk of the distribution sits on the left.
train_data.skew()

import numpy as np
# For skewness > 0.75, log(1+x) can make the distribution more symmetric
x_origin = train_data["Fare"]
x = np.log1p(x_origin)
# expm1 is the inverse transform, recovering the original values
x_origin = np.expm1(x)
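A quick check makes the effect concrete (reusing the train_data loaded above):

# Skewness of Fare before and after the log1p transform
print(train_data["Fare"].skew())            # ~4.79, strongly right-skewed
print(np.log1p(train_data["Fare"]).skew())  # much closer to 0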
Kurtosis
PassengerId    -1.200000
Survived       -1.775005
Pclass         -1.280015
Age             0.178274
SibSp          17.880420
Parch           9.778125
Fare           33.398141
dtype: float64
# Kurtosis
# In the classical definition, a normal distribution has kurtosis 3.
# pandas' kurtosis() returns *excess* kurtosis (kurtosis - 3), so a normal distribution scores 0.
# Excess kurtosis > 0 (leptokurtic): more peaked and heavier-tailed than the normal distribution.
# Excess kurtosis < 0 (platykurtic): flatter than the normal distribution.
train_data.kurtosis()
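scipy exposes both conventions through the fisher flag of scipy.stats.kurtosis; a minimal sanity check (assuming scipy is available):

from scipy import stats
import numpy as np

x = np.random.normal(size=100000)
print(stats.kurtosis(x, fisher=True))   # excess kurtosis, close to 0 for normal data
print(stats.kurtosis(x, fisher=False))  # classical kurtosis, close to 3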
Correlation
import seaborn as sns
import matplotlib.pyplot as plt

# Compute pairwise correlations (Pearson by default)
# (on newer pandas, pass numeric_only=True to skip non-numeric columns)
corrmat = train_data.corr()
# Plot a heatmap
sns.heatmap(corrmat, square=True)
# Show the figure
plt.show()
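To rank features against the target numerically, sort the Survived column of the matrix (reusing corrmat from above):

# Absolute correlation of each numeric feature with the target, strongest first
print(corrmat['Survived'].drop('Survived').abs().sort_values(ascending=False))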

Data Visualization

Discrete Data
import matplotlib.pyplot as plt
import seaborn as sns

# Plot histograms (drop NaN first; matplotlib cannot bin missing categories)
data = train_data['Embarked'].dropna()
plt.subplot(1, 2, 1)
plt.hist(data)
plt.ylim([0, 700])
plt.subplot(1, 2, 2)
plt.hist(train_data.loc[train_data.Survived == 1, 'Embarked'].dropna(), color='b', label="Survived")
plt.hist(train_data.loc[train_data.Survived == 0, 'Embarked'].dropna(), color='g', label="Not survived", alpha=0.5)
plt.ylim([0, 700])
plt.legend()

# Note: seaborn drops NaN values automatically
# Count plot: bars of category counts
# (factorplot was renamed catplot in seaborn 0.9)
sns.catplot(x='Embarked', hue='Survived', data=train_data, kind="count")
# Point plot: shows the mean and its confidence interval
sns.catplot(x='Embarked', y='Survived', data=train_data, kind="point")
# Violin plot: shows the median and interquartile range
sns.catplot(x='Embarked', y='Survived', data=train_data, kind="violin")
plt.show()
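The same counts are available in table form, which is often easier to quote than a plot:

import pandas as pd

# Survival counts per port of embarkation
print(pd.crosstab(train_data['Embarked'], train_data['Survived']))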
Continuous Data
import matplotlib.pyplot as plt
import seaborn as sns

# Plot histograms
plt.subplot(1, 2, 1)
# Drop missing values
data = train_data['Age'].dropna(how='any')
plt.hist(data, bins=range(0, 100, 10), rwidth=0.8)
plt.ylim([0, 250])
plt.subplot(1, 2, 2)
plt.hist(train_data.loc[train_data.Survived == 1, 'Age'].dropna(), bins=range(0, 100, 10), color='b', label="Survived", rwidth=0.8)
plt.hist(train_data.loc[train_data.Survived == 0, 'Age'].dropna(), bins=range(0, 100, 10), color='g', label="Not survived", rwidth=0.8, alpha=0.5)
plt.ylim([0, 250])
plt.legend()

# Note: seaborn drops NaN values automatically
# Hexbin joint plot of the two variables
sns.jointplot(x='Age', y='Survived', data=train_data, kind='hex')
# Point plot: shows the mean and its confidence interval
sns.catplot(x='Survived', y='Age', data=train_data, kind="point")
# Violin plot: shows the median and interquartile range
sns.catplot(x='Survived', y='Age', data=train_data, kind="violin")

plt.show()
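A numeric companion to the plots above: bin Age with pd.cut and compute the survival rate per bin (a small sketch reusing train_data):

import pandas as pd

# Mean survival rate per 10-year age bin (rows with missing Age are excluded)
age_bins = pd.cut(train_data['Age'], bins=range(0, 100, 10))
print(train_data.groupby(age_bins)['Survived'].mean())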

Handling Missing Values

import pandas as pd

train_data = pd.read_csv("train.csv")

# Fill missing values
train_data = train_data.fillna({'Age': train_data.Age.mean(),
                                'Fare': train_data.Fare.mean(),
                                'Cabin': 'N/A',
                                'Embarked': 'N/A'})

# Drop missing values instead
data = train_data['Age'].dropna(how='any')
# Unified missing-value handling
import numpy as np

def fillNA(XX):
    X = XX.copy(deep=True)
    for colName in X.columns:
        if sum(X[colName].isnull()) > 0:
            if X.dtypes[colName] in [np.int64, np.float64]:
                # Numeric columns: fill with the column mean
                X[colName] = X[colName].fillna(X[colName].mean())
            elif X.dtypes[colName] in [object]:
                # Object columns: fill with a sentinel string
                # (the mode, X[colName].value_counts().index[0], is an alternative)
                X[colName] = X[colName].fillna("NA")
            else:
                print("{} is not handled".format(colName))
    return X
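Usage is a one-liner; a quick assertion confirms nothing is left unfilled:

train_filled = fillNA(train_data)
assert train_filled.isnull().sum().sum() == 0  # no missing values remain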

Normalization

import pandas as pd

train_data = pd.read_csv("train.csv")

# Z-score standardization: zero mean, unit variance
train_data['Age'] = (train_data['Age'] - train_data['Age'].mean()) / train_data['Age'].std()
# Mean normalization: scales by the range instead
# (the two are alternatives; applying both in sequence rescales the already-standardized column)
train_data['Age'] = (train_data['Age'] - train_data['Age'].mean()) / (train_data['Age'].max() - train_data['Age'].min())
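The same transforms exist in sklearn; a scaler fitted on the training data can be reused on the test set, keeping both on the same scale (a minimal sketch):

from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pandas as pd

train_data = pd.read_csv("train.csv")
age = train_data[['Age']].fillna(train_data['Age'].mean())

scaler = StandardScaler()                # or MinMaxScaler() to scale into [0, 1]
age_scaled = scaler.fit_transform(age)   # later: scaler.transform(...) on the test data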

Label Encoding

from sklearn import preprocessing
import pandas as pd

x = ['A', 'B', 'C', 'A']
lb = preprocessing.LabelBinarizer()
lb.fit(x)
print(lb.classes_)
# ['A' 'B' 'C']
x_proc = lb.transform(x)
print(x_proc)
# [[1 0 0]
#  [0 1 0]
#  [0 0 1]
#  [1 0 0]]

le = preprocessing.LabelEncoder()
le.fit(x)
print(le.classes_)
# ['A' 'B' 'C']
x_proc = le.transform(x)
print(x_proc)
# [0 1 2 0]

x_proc = pd.get_dummies(x)
print(x_proc)
#    A  B  C
# 0  1  0  0
# 1  0  1  0
# 2  0  0  1
# 3  1  0  0
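get_dummies also works on whole DataFrames, and dummy_na turns missing values into their own indicator column (reusing the train_data loaded earlier):

# One-hot encode several columns at once
x_proc = pd.get_dummies(train_data[['Sex', 'Embarked']], dummy_na=True)
print(x_proc.head())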
# Unified encoding for a whole DataFrame
from sklearn import preprocessing
import numpy as np

# Encode labels as integers 0, 1, 2, ...
def transSymbolEncoder(x):
    lb = preprocessing.LabelEncoder()
    lb.fit(x)
    return lb.transform(x)

# One-hot encode as 001, 010, 100, ...
# (pd.get_dummies(XX) is a simpler alternative)
def transSymbolBinary(XX, colName):
    X = XX.copy(deep=True)  # avoid mutating the caller's DataFrame
    x = X[colName].fillna("NA")
    lb = preprocessing.LabelBinarizer()
    lb.fit(x)
    value = lb.transform(x)
    for i in range(len(value[0])):
        addName = "{}_{}".format(colName, lb.classes_[i])
        X[addName] = value[:, i]

    return X.drop([colName], axis='columns')

def transAll(XX):
    X = XX.copy(deep=True)
    for colName in X.columns:
        if X.dtypes[colName] in [np.int64, np.float64]:
            # print("{} is not transformed".format(colName))
            # X[colName] = (X[colName] - X[colName].mean()) / X[colName].std()
            pass
        elif X.dtypes[colName] in [object]:
            # print("{} is transformed".format(colName))
            X = transSymbolBinary(X, colName)
        else:
            print("{} is not define".format(colName))
    
    return X
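A usage sketch: Name and Ticket are dropped first, since one-hot encoding near-unique strings would explode the column count:

X = transAll(train_data.drop(['Name', 'Ticket'], axis='columns'))
print(X.shape)  # every object column replaced by a set of 0/1 columns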

Feature Importance

# List the important features
import numpy as np
import matplotlib.pyplot as plt

def listImportantIndex(indices, importances, threshold):
    '''
    indices : feature names
    importances : feature importance values
    threshold : only features whose absolute importance exceeds this are listed
    '''
    # Take absolute values so that (possibly negative) linear-model weights rank correctly
    importances_abs = abs(importances)
    # argsort sorts ascending and returns the indices; reverse for descending order
    indices_sorted = np.argsort(importances_abs)[::-1]
    # Number of features to report
    n = sum(importances_abs > threshold)

    # Print the sorted feature importances
    print("Feature ranking:")
    for f in range(n):
        print("%d. feature %s (%f)" % (f + 1, indices[indices_sorted[f]], importances[indices_sorted[f]]))
    
    # Plot a horizontal bar chart (barh, not a histogram)
    plt.barh(range(n), importances[indices_sorted[:n]][::-1], align="center")
    plt.yticks(range(n), indices[indices_sorted[:n]][::-1])
    plt.title("threshold={}".format(threshold))
    plt.xlabel("importance")
    plt.ylabel("features")
    plt.show()
    
    return indices[indices_sorted[:n]]
# Merge important features selected by different models
import pandas as pd

def combinIndex(*indexList):
    result = pd.Index([])
    for i in indexList:
        result = result.union(i)
    
    return result
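For example, merging the features selected by two different models (union removes duplicates):

idx = combinIndex(pd.Index(['Age', 'Fare']), pd.Index(['Fare', 'Sex']))
print(idx)  # Index(['Age', 'Fare', 'Sex'], dtype='object')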
Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
import pandas as pd

train_data = pd.read_csv("train.csv")

# Encode labels as integers
# (LabelBinarizer would return one column per class, which cannot be assigned
# back to a single DataFrame column for multi-class features such as Cabin)
def transSymbol(x):
    le = preprocessing.LabelEncoder()
    le.fit(x)
    return le.transform(x)

# Fill missing values
train_data = train_data.fillna({'Age': train_data.Age.mean(),
                                'Fare': train_data.Fare.mean(),
                                'Cabin': 'N/A',
                                'Embarked': 'N/A'})

# Drop columns that are not needed
X = train_data.drop(['PassengerId', 'Survived', 'Name', 'Ticket'], axis='columns')
# Encode Sex
X['Sex'] = transSymbol(X['Sex'])
# Encode Cabin
X['Cabin'] = transSymbol(X['Cabin'])
# Encode Embarked
X['Embarked'] = transSymbol(X['Embarked'])
    
y = train_data['Survived']
clf = RandomForestClassifier(n_estimators=200, oob_score=True)
clf.fit(X, y)

# Feature importances
listImportantIndex(indices=X.columns, importances=clf.feature_importances_, threshold=0.01)
# Feature ranking:
# 1. feature Fare (0.279122)
# 2. feature Age (0.265518)
# 3. feature Sex (0.261311)
# 4. feature Pclass (0.082490)
# 5. feature SibSp (0.049061)
# 6. feature Parch (0.040910)
# 7. feature Embarked (0.020969)
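The snippet enables oob_score but never reads it; the out-of-bag estimate is a quick generalization check that costs nothing extra:

print(clf.oob_score_)  # accuracy estimated on the out-of-bag samples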
Linear Model
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
import pandas as pd

train_data = pd.read_csv("ipython/train.csv")

# Encode labels as integers (same helper as in the Random Forest section)
def transSymbol(x):
    le = preprocessing.LabelEncoder()
    le.fit(x)
    return le.transform(x)

# Fill missing values
train_data = train_data.fillna({'Age': train_data.Age.mean(),
                                'Fare': train_data.Fare.mean(),
                                'Cabin': 'N/A',
                                'Embarked': 'N/A'})

# Drop columns that are not needed
X = train_data.drop(['PassengerId', 'Survived', 'Name', 'Ticket'], axis='columns')
# Encode Sex
X['Sex'] = transSymbol(X['Sex'])
# Encode Cabin
X['Cabin'] = transSymbol(X['Cabin'])
# Encode Embarked
X['Embarked'] = transSymbol(X['Embarked'])
    
y = train_data['Survived']
clf = LinearRegression()
clf.fit(X, y)

# Feature importances (linear coefficients; the sign indicates direction)
listImportantIndex(indices=X.columns, importances=clf.coef_, threshold=0.01)
# Feature ranking:
# 1. feature Sex (-0.509290)
# 2. feature Cabin (-0.469352)
# 3. feature Pclass (-0.167720)
# 4. feature Embarked (0.061832)
# 5. feature SibSp (-0.041574)
# 6. feature Parch (-0.018721)
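Since Survived is a binary target, a classifier is arguably the better fit here; a minimal variant using LogisticRegression, whose coefficients can be ranked the same way:

from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(max_iter=1000)
clf.fit(X, y)
listImportantIndex(indices=X.columns, importances=clf.coef_[0], threshold=0.01)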

Model Evaluation and Tuning

# Model evaluation
from sklearn.model_selection import cross_val_score
import seaborn as sns

def evaluate_model(clf, X, y):
    # 10-fold cross-validation
    scores = cross_val_score(clf, X, y, cv=10, scoring="neg_mean_squared_error")
    # Mean
    m = scores.mean()
    # Standard deviation
    sd = scores.std()

    # Accuracy on the training data
    clf.fit(X, y)
    score = clf.score(X, y)

    # Box plot of the fold scores: shows the median and spread
    ax = sns.boxplot(x=scores)
    ax.set_title("val_mean:{:.5f}, val_std:{:.5f}\ntrain_score:{:.2f}\n\nclf:{}".format(m, sd, score, clf))
    
    return clf
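Usage, reusing the encoded X and y from the Random Forest section:

from sklearn.ensemble import RandomForestClassifier

evaluate_model(RandomForestClassifier(n_estimators=200), X, y)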
from sklearn.model_selection import GridSearchCV

# Parameter selection via grid search
def param_selection(clf, X, y, nfolds, **param_grid):
    # Example grid for a random forest:
    # max_features = ['sqrt', 'auto', 'log2']
    # max_depth = range(1, 30, 5)
    # min_samples_split = range(2, 10, 2)
    # min_samples_leaf = range(1, 10, 2)
    # param_grid = {'max_features': max_features,
    #               'max_depth': max_depth,
    #               'min_samples_split': min_samples_split,
    #               'min_samples_leaf': min_samples_leaf}
    grid_search = GridSearchCV(clf, param_grid=param_grid, cv=nfolds)
    grid_search.fit(X, y)

    return grid_search.best_estimator_, grid_search.best_score_, grid_search.best_params_
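A usage sketch (the keyword arguments are collected into param_grid):

from sklearn.ensemble import RandomForestClassifier

best_clf, best_score, best_params = param_selection(
    RandomForestClassifier(n_estimators=100), X, y, nfolds=5,
    max_depth=[4, 8, 16], min_samples_leaf=[1, 3, 5])
print(best_score, best_params)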
