ML: Basic Learning
Course: Machine Learning Foundations (機器學習基石)
Overview: Lecture 10, Logistic Regression
$$
\mathbf{x}=(x_0,x_1,x_2,\cdots ,x_d)\\
s=\sum_{i=0}^{d}w_ix_i\\
\theta (s)=\frac{e^s}{1+e^s}=\frac{1}{1+e^{-s}}\\
h(\mathbf{x})=\frac{1}{1+e^{-\mathbf{w^Tx}}}
$$
Compare with the linear signal
$$
s= \mathbf{w^Tx}
$$
Definition of \(E_{in}(\mathbf{w})\)
cross-entropy error
$$
err(\mathbf{w,x_n},y_n)=ln (1+e^{-y_n\mathbf{w^Tx_n}})\\
E_{in}(\mathbf{w})= \frac{1}{N}\sum_{n=1}^{N}err(\mathbf{w,x_n},y_n)
$$
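As a quick sanity check, this error is easy to compute directly. A minimal sketch, assuming a made-up toy dataset and weight vector (not from the course):

```python
import math

# Pointwise cross-entropy error: err(w, x, y) = ln(1 + e^(-y * w.x))
def err(w, x, y):
    s = sum(wi * xi for wi, xi in zip(w, x))
    return math.log(1 + math.exp(-y * s))

# E_in(w): average of err over the N examples
def E_in(w, data):
    return sum(err(w, x, y) for x, y in data) / len(data)

# Made-up data; each x starts with the constant feature x0 = 1
data = [((1, 0.8, -0.3), +1), ((1, -0.5, 0.4), -1)]
print(E_in((0.0, 1.0, -1.0), data))
```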
$$
target\ function\ f(\mathbf{x})=P(+1|\mathbf{x})\Leftrightarrow P(y|\mathbf{x})=
\left\{\begin{matrix}
f(\mathbf{x}) & for\ y=+1\\
1-f(\mathbf{x}) & for\ y=-1
\end{matrix}\right.
$$
Since \(f\) is the true target, the probability that \(f\) generates the dataset \(D\) is large, ideally close to 1.
So the \(g\) that attains the largest likelihood among all \(h\) should be close to \(f\).
$$
\begin{align*}
Assume&\ likelihood(h)\approx\ probability\ using\ f \approx large\\
If\ D&=\left \{ (\mathbf{x_1},\mathrm{o}),(\mathbf{x_2},\mathrm{x}) \cdots \right \}\\
P(\mathrm{o})=P(\mathbf{x_1})P(\mathrm{o}|\mathbf{x_1}) &= P(\mathbf{x_1})f(\mathbf{x_1})\approx P(\mathbf{x_1})h(\mathbf{x_1})\\
P(\mathrm{x})=P(\mathbf{x_2})P(\mathrm{x}|\mathbf{x_2}) &= P(\mathbf{x_2})(1-f(\mathbf{x_2}))\approx P(\mathbf{x_2})(1-h(\mathbf{x_2}))\\
\vdots&=\vdots
\\
g=\underset{h}{argmax}&\ likelihood(h)\\
h(\mathbf{x})=\theta (\mathbf{w^Tx})&\Rightarrow 1-h(\mathbf{x})=h(\mathbf{-x})\\
likelihood(h)&=P(\mathbf{x_1})h(\mathbf{x_1}) \times P(\mathbf{x_2})(1-h(\mathbf{x_2})) \times \cdots P(\mathbf{x_n})(1-h(\mathbf{x_n}))\\
&=P(\mathbf{x_1})h(\mathbf{x_1}) \times P(\mathbf{x_2})(h(\mathbf{-x_2})) \times \cdots P(\mathbf{x_n})(h(\mathbf{-x_n}))\\
\because P(\mathbf{x_1}),P(\mathbf{x_2}),\cdots ,P(\mathbf{x_n})&\ are\ the\ same\ for\ every\ h\\
\therefore likelihood(h) &\propto \prod_{n=1}^{N}h(y_n\mathbf{x_n})\\
\\
h\ is\ the\ logistic\ function\\
\underset{h}{max}\ likelihood(logistic\ h)&\propto \underset{h}{max}\ \prod_{n=1}^{N}h(y_n\mathbf{x_n})\\
\underset{\mathbf{w}}{max}\ likelihood(\mathbf{w})&\propto\underset{\mathbf{w}}{max}\ \prod_{n=1}^{N}\theta (y_n\mathbf{w^Tx_n})\\
&\propto\underset{\mathbf{w}}{max}\ ln\prod_{n=1}^{N}\theta (y_n\mathbf{w^Tx_n})\\
&\propto\underset{\mathbf{w}}{max}\ \sum_{n=1}^{N}ln\theta (y_n\mathbf{w^Tx_n})\\
&\propto\underset{\mathbf{w}}{min}\ \sum_{n=1}^{N}-ln\theta (y_n\mathbf{w^Tx_n})\\
&\propto\underset{\mathbf{w}}{min}\ \frac{1}{N}\sum_{n=1}^{N}-ln\theta (y_n\mathbf{w^Tx_n})\\
\\
\theta(s) = \frac{1}{1+e^{-s}}&: \underset{w}{min}\ \frac{1}{N}\sum_{n=1}^{N}-ln (\frac{1}{1+e^{-y_n\mathbf{w^Tx_n}}})\\
&=\underset{w}{min}\ \frac{1}{N}\sum_{n=1}^{N}ln (1+e^{-y_n\mathbf{w^Tx_n}})\\
&=\underset{w}{min}\ \frac{1}{N}\sum_{n=1}^{N}err(\mathbf{w,x_n},y_n)\\
&=\underset{w}{min}\ E_{in}(\mathbf{w})
\end{align*}
$$
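This chain of steps can be checked numerically: \(E_{in}(\mathbf{w})\) is exactly \(-\frac{1}{N}\ln\prod_{n}\theta (y_n\mathbf{w^Tx_n})\), so a larger likelihood always means a smaller \(E_{in}\). A minimal sketch, assuming made-up data and weights:

```python
import math
from functools import reduce

def theta(s):
    return 1 / (1 + math.exp(-s))

def score(w, x):
    return sum(wi * xi for wi, xi in zip(w, x))

# Likelihood (the P(x_n) factors are dropped, as in the derivation)
def likelihood(w, data):
    return reduce(lambda a, b: a * b, (theta(y * score(w, x)) for x, y in data), 1.0)

# E_in via the cross-entropy error
def E_in(w, data):
    return sum(math.log(1 + math.exp(-y * score(w, x))) for x, y in data) / len(data)

# Made-up data and weights
data = [((1, 0.8, -0.3), +1), ((1, -0.5, 0.4), -1), ((1, 0.6, 0.1), +1)]
w = (0.0, 1.0, -1.0)
# E_in(w) matches -(1/N) * ln(likelihood(w))
print(E_in(w, data), -math.log(likelihood(w, data)) / len(data))
```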
So why is this called the cross-entropy error?
First, cross-entropy is defined as
$$
H(p,q)=-\sum _xp(x)\cdot \mathrm{log}q(x)
$$
Here \(p(x)\) is the target value, which is either 1 or 0,
while \(q(x)\) is the predicted probability; the logistic function is plugged in as the probability model (another sigmoid function could be used instead).
Cross-entropy then measures the difference between the two.
$$
\begin{align*}
E_{in}&=-\sum _xp(x)\cdot \mathrm{log}q(x)\\
&=-\sum _x\left [ p(x)\cdot \mathrm{log}q(x) + (1-p(x))\cdot \mathrm{log}(1-q(x)) \right ]\\
\because q(x)=\frac{1}{1+e^{-x}} &= -\sum _x\left [ p(x)\cdot \mathrm{log}\frac{1}{1+e^{-x}} + (1-p(x))\cdot \mathrm{log}(1-\frac{1}{1+e^{-x}}) \right ]\\
&= -\sum _x\left [ -p(x)\cdot \mathrm{log}(1+e^{-x}) + (1-p(x))\cdot \mathrm{log}(\frac{e^{-x}}{1+e^{-x}}) \right ]\\
&= -\sum _x\left [ -p(x)\cdot \mathrm{log}(1+e^{-x}) + (1-p(x))\cdot (-x-\mathrm{log}(1+e^{-x})) \right ]\\
&= \sum _x\left [ p(x)\cdot \mathrm{log}(1+e^{-x}) - (1-p(x))\cdot (-x-\mathrm{log}(1+e^{-x})) \right ]\\
&= \sum _x\left [ p(x)\cdot \mathrm{log}(1+e^{-x}) + x + \mathrm{log}(1+e^{-x}) - p(x)x-p(x)\cdot\mathrm{log}(1+e^{-x}) \right ]\\
&= \sum _x\left [ x + \mathrm{log}(1+e^{-x}) - p(x)x \right ]\\
\end{align*}
$$
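The last line can be verified numerically: for a target \(p(x)\in\{0,1\}\) and a score \(x\), the binary cross-entropy with \(q(x)=\frac{1}{1+e^{-x}}\) reduces to \(x+\mathrm{log}(1+e^{-x})-p(x)x\), the same quantity that, e.g., tf.nn.sigmoid_cross_entropy_with_logits computes (there in a numerically stable variant). A minimal sketch with made-up values:

```python
import math

def sigmoid(x):
    return 1 / (1 + math.exp(-x))

# Binary cross-entropy: -[p*log(q) + (1-p)*log(1-q)] with q = sigmoid(x)
def cross_entropy(p, x):
    q = sigmoid(x)
    return -(p * math.log(q) + (1 - p) * math.log(1 - q))

# Simplified form from the derivation: x + log(1 + e^(-x)) - p*x
def simplified(p, x):
    return x + math.log(1 + math.exp(-x)) - p * x

# The two forms agree (up to floating-point rounding)
for p in (0, 1):
    for x in (-2.0, 0.5, 3.0):
        print(p, x, cross_entropy(p, x), simplified(p, x))
```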
In the earlier derivation, if instead of substituting \(1-h(\mathbf{x})=h(\mathbf{-x})\) we keep that term and express it with \(f(\mathbf{x})\):
$$
\begin{align*}
Assume&\ likelihood(h)\approx\ probability\ using\ f \approx large\\
If\ D&=\left \{ (\mathbf{x_1},\mathrm{o}),(\mathbf{x_2},\mathrm{x}) \cdots \right \}\\
P(\mathrm{o})=P(\mathbf{x_1})P(\mathrm{o}|\mathbf{x_1}) &= P(\mathbf{x_1})f(\mathbf{x_1})\approx P(\mathbf{x_1})h(\mathbf{x_1})\\
P(\mathrm{x})=P(\mathbf{x_2})P(\mathrm{x}|\mathbf{x_2}) &= P(\mathbf{x_2})(1-f(\mathbf{x_2}))\approx P(\mathbf{x_2})(1-h(\mathbf{x_2}))\\
\vdots&=\vdots
\\
g=\underset{h}{argmax}&\ likelihood(h)\\
likelihood(h)&=P(\mathbf{x_1})h(\mathbf{x_1}) \times P(\mathbf{x_2})(1-h(\mathbf{x_2})) \times \cdots P(\mathbf{x_n})(1-h(\mathbf{x_n}))\\
\because P(\mathbf{x_1}),P(\mathbf{x_2}),\cdots ,P(\mathbf{x_n})&\ are\ the\ same\ for\ every\ h\\
\therefore likelihood(h) &\propto h(\mathbf{x_1}) \times (1-h(\mathbf{x_2})) \times \cdots (1-h(\mathbf{x_n}))\\
&=\prod_{n=1}^{N}(h(\mathbf{x_n}))^{f(\mathbf{x_n})}(1-h(\mathbf{x_n}))^{1-f(\mathbf{x_n})}\\
\therefore likelihood(h) &\propto \mathrm{log}\prod_{n=1}^{N}(h(\mathbf{x_n}))^{f(\mathbf{x_n})}(1-h(\mathbf{x_n}))^{1-f(\mathbf{x_n})}\\
&=\sum_{n=1}^{N}{f(\mathbf{x_n})}\cdot \mathrm{log}(h(\mathbf{x_n}))+({1-f(\mathbf{x_n})})\cdot \mathrm{log}(1-h(\mathbf{x_n}))\\
\end{align*}
$$
This is the same expression as the second line of the cross-entropy derivation above (up to sign: maximizing this log-likelihood is minimizing that cross-entropy).
The value of \(y\) determines \(f(\mathbf{x})\):
$$
\left\{\begin{matrix}
f(\mathbf{x})=1 & for\ y=+1\\
f(\mathbf{x})=0 & for\ y=-1
\end{matrix}\right.
$$
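Plugging these target values into the binary cross-entropy ties the two derivations together: for \(y=+1\) (so \(f(\mathbf{x})=1\)) and \(y=-1\) (so \(f(\mathbf{x})=0\)), the per-example cross-entropy collapses to the error \(ln(1+e^{-y\mathbf{w^Tx}})\) defined at the top. A minimal numeric check, assuming made-up scores \(s=\mathbf{w^Tx}\):

```python
import math

def sigmoid(s):
    return 1 / (1 + math.exp(-s))

# Per-example binary cross-entropy, with the target p taken from the label y
def cross_entropy(y, s):
    p = 1 if y == +1 else 0
    return -(p * math.log(sigmoid(s)) + (1 - p) * math.log(1 - sigmoid(s)))

# The pointwise logistic-regression error: ln(1 + e^(-y*s))
def err(y, s):
    return math.log(1 + math.exp(-y * s))

# The two agree for both labels (up to floating-point rounding)
for y in (+1, -1):
    for s in (-1.5, 0.3, 2.0):
        print(y, s, cross_entropy(y, s), err(y, s))
```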
Minimizing \(E_{in}(\mathbf{w})\)
Because \(E_{in}(\mathbf{w})\) is continuous, differentiable, twice-differentiable, and convex, we can use the gradient and look for \(\triangledown E_{in}(\mathbf{w})=0\)
$$
\triangledown E_{in}(\mathbf{w})=\frac{1}{N}\sum_{n=1}^{N}\theta (-y_n\mathbf{w^Tx_n})(-y_n\mathbf{x_n})=0
$$
Using the chain rule:
$$
\begin{align*}
E_{in}(\mathbf{w})&=\frac{1}{N}\sum_{n=1}^{N}ln(1+e^{-y_n\mathbf{w^Tx_n}})\\
\frac{\partial E_{in}(\mathbf{w})}{\partial w_i} &= \frac{1}{N}\sum_{n=1}^{N}\frac{\partial }{\partial w_i}ln(1+e^{-y_n\mathbf{w^Tx_n}})\\
&= \frac{1}{N}\sum_{n=1}^{N}\frac{1}{1+e^{-y_n\mathbf{w^Tx_n}}}\times e^{-y_n\mathbf{w^Tx_n}}\times -y_nx_{n,i}\\
&= \frac{1}{N}\sum_{n=1}^{N}\theta (-y_n\mathbf{w^Tx_n})\times -y_nx_{n,i}\\
\triangledown E_{in}(\mathbf{w})&=\frac{1}{N}\sum_{n=1}^{N}\theta (-y_n\mathbf{w^Tx_n})( -y_n\mathbf{x_n})
\end{align*}
$$
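The gradient formula can be verified against a numerical finite-difference approximation. A minimal sketch, assuming a made-up toy dataset and weight vector:

```python
import math

def theta(s):
    return 1 / (1 + math.exp(-s))

def score(w, x):
    return sum(wi * xi for wi, xi in zip(w, x))

def E_in(w, data):
    return sum(math.log(1 + math.exp(-y * score(w, x))) for x, y in data) / len(data)

# Analytic gradient: (1/N) * sum of theta(-y * w.x) * (-y * x)
def gradient(w, data):
    g = [0.0] * len(w)
    for x, y in data:
        c = theta(-y * score(w, x)) * (-y)
        g = [gi + c * xi for gi, xi in zip(g, x)]
    return [gi / len(data) for gi in g]

# Made-up data and weights
data = [((1, 0.8, -0.3), +1), ((1, -0.5, 0.4), -1), ((1, 0.6, 0.1), +1)]
w = [0.1, -0.2, 0.3]
eps = 1e-6
analytic = gradient(w, data)
for i in range(len(w)):
    w_plus, w_minus = list(w), list(w)
    w_plus[i] += eps
    w_minus[i] -= eps
    numeric = (E_in(w_plus, data) - E_in(w_minus, data)) / (2 * eps)
    print(i, analytic[i], numeric)  # the two columns should match closely
```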
Update rule
$$
\mathbf{w_{t+1}}\leftarrow \mathbf{w_t}+\eta \mathbf{v}
$$
Taking PLA as an example
$$
\mathbf{w_{t+1}}\leftarrow \mathbf{w_t}+ \underset{\eta }{\underbrace{1}} \cdot \underset{\mathbf{v}}{\underbrace{([sign(\mathbf{w_t^Tx_n})\neq y_n]\cdot y_n\mathbf{x_n})}}
$$
How to minimize it
$$
\mathbf{w_{t+1}}\leftarrow \mathbf{w_t}+\eta \mathbf{v}\\
\mathbf{w_{t+1}} \leftarrow \mathbf{w_t}-\eta\frac{\triangledown E_{in}(\mathbf{w_t})}{\left \| \triangledown E_{in}(\mathbf{w_t}) \right \|}
$$
$$
\underset{\left \| \mathbf{v} \right \|=1}{min}\ E_{in}(\mathbf{w_t}+\eta \mathbf{v})\ for\ \eta >0
$$
Looking at a small step with \(\eta\) small enough, the first-order Taylor expansion (or simply the definition of the derivative) gives
$$
f{}'(x)=\underset{dx \to 0}{lim}\frac{f(x+dx)-f(x)}{dx}\\
E_{in}(\mathbf{w_t}+\eta \mathbf{v}) \approx E_{in}(\mathbf{w_t})+\eta \mathbf{v^T}\triangledown E_{in}(\mathbf{w_t})
$$
So how do we minimize the error?
That is, make \(\mathbf{v^T}\triangledown E_{in}(\mathbf{w_t})\) as negative as possible.
Therefore
$$
\mathbf{v}=-\frac{\triangledown E_{in}(\mathbf{w_t})}{\left \| \triangledown E_{in}(\mathbf{w_t}) \right \|}
$$
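That this is the best unit direction can be seen by comparing the directional derivatives \(\mathbf{v^T}\triangledown E_{in}(\mathbf{w_t})\): by Cauchy-Schwarz, the normalized negative gradient gives the most negative value. A minimal sketch, assuming made-up data, weights, and a few random unit directions:

```python
import math
import random

def theta(s):
    return 1 / (1 + math.exp(-s))

def score(w, x):
    return sum(wi * xi for wi, xi in zip(w, x))

# Gradient of E_in at w
def gradient(w, data):
    g = [0.0] * len(w)
    for x, y in data:
        c = theta(-y * score(w, x)) * (-y)
        g = [gi + c * xi for gi, xi in zip(g, x)]
    return [gi / len(data) for gi in g]

def unit(v):
    n = math.sqrt(sum(vi * vi for vi in v))
    return [vi / n for vi in v]

# Made-up data and weights
data = [((1, 0.8, -0.3), +1), ((1, -0.5, 0.4), -1), ((1, 0.6, 0.1), +1)]
w = [0.1, -0.2, 0.3]
grad = gradient(w, data)
steepest = unit([-gi for gi in grad])
directions = [steepest] + [unit([random.uniform(-1, 1) for _ in w]) for _ in range(5)]
# v.grad is most negative for the steepest-descent direction (listed first)
for v in directions:
    print(sum(vi * gi for vi, gi in zip(v, grad)))
```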
How should \(\eta\) be chosen?
\(\eta\) should shrink as \(\left \| \triangledown E_{in}(\mathbf{w_t}) \right \|\) shrinks, i.e. scale with the gradient norm.
Let \(\color{Red} {\eta}= \color{Purple} {\eta}\left \| \triangledown E_{in}(\mathbf{w_t}) \right \|\)
$$
\begin{align*}
\mathbf{w_{t+1}} &\leftarrow \mathbf{w_t}-\color{Red} {\eta}\frac{\triangledown E_{in}(\mathbf{w_t})}{\left \| \triangledown E_{in}(\mathbf{w_t}) \right \|}\\
&\leftarrow \mathbf{w_t}-\color{Purple}{ \eta}\triangledown E_{in}(\mathbf{w_t})
\end{align*}
$$
\(\color{Purple}{\eta}\) is the fixed learning rate
Algorithm
- Initialize \(\mathbf{w_0}\)
- For \(t = 0, 1, \cdots \)
- Compute the gradient
$$
\triangledown E_{in}(\mathbf{w_t})=\frac{1}{N}\sum_{n=1}^{N}\theta (-y_n\mathbf{w^T_tx_n})(-y_n\mathbf{x_n})
$$
- Update \(\mathbf{w}\)
$$
\mathbf{w_{t+1}}\leftarrow \mathbf{w_t}-\color{Purple}{ \eta}\triangledown E_{in}(\mathbf{w_t})
$$
- Repeat until \(\|\triangledown E_{in}(\mathbf{w_t})\|=0\) (or approximately 0), or until enough iterations have run
- Return the final \(\mathbf{w}\) as \(g\)
Python source code
```python
import matplotlib.pyplot as plt
import numpy as np
import operator
import math

# A dataset found online; it is linearly separable
rawData = [
    ((-0.4, 0.3), -1),
    ((-0.3, -0.1), -1),
    ((-0.2, 0.4), -1),
    ((-0.1, 0.1), -1),
    ((0.9, -0.5), 1),
    ((0.7, -0.9), 1),
    ((0.8, 0.2), 1),
    ((0.4, -0.6), 1),
    ((0.2, 0.6), -1),
    ((-0.5, -0.5), -1),
    ((0.7, 0.3), 1),
    ((0.9, -0.6), 1),
    ((-0.1, 0.2), -1),
    ((0.3, -0.6), 1),
    ((0.5, 0.1), -1),
]

# Add the constant feature x0 = 1
dataset = [((1,) + x, y) for x, y in rawData]

# Inner product
def dot(*v):
    return sum(map(operator.mul, *v))

# Vector length (L2 norm)
def vlen(v):
    square = map(operator.mul, v, v)
    return sum(square) ** (1 / 2)

# Logistic function theta(s) = 1 / (1 + e^(-s))
def thita(v):
    return 1 / (1 + math.exp(-v))

# Gradient direction: sum over the data of theta(-y w.x) * (-y x)
# (the 1/N factor in the formula only rescales the gradient; here it is
# absorbed into the learning rate and the stopping threshold)
def gradient(w, dataset):
    setsV = []
    for x, y in dataset:
        setsV.append(map(operator.mul, [thita(-y * dot(w, x)) * (-y)] * len(x), x))
    total = [0] * len(x)
    for v in setsV:
        total = map(operator.add, total, v)
    return list(total)

# Update w: w <- w - n * g
def update(w, n, g):
    u = map(operator.mul, [n] * len(g), g)
    w = map(operator.sub, w, u)
    return list(w)

# Logistic regression via gradient descent
def LogisticRegression(dataset):
    # Initialize w
    w = [0] * 3
    t = 0
    n = 0.1
    max_number = 100
    while True:
        g = gradient(w, dataset)
        print("g {:.5f} => {}: {}".format(vlen(g), t, tuple(w)))
        w = update(w, n, g)
        t += 1
        if vlen(g) < 0.2 or t > max_number:
            break
    return w

# Main program
def main():
    # Run logistic regression
    w = LogisticRegression(dataset)
    # Plot the data and the decision boundary
    fig = plt.figure()
    # numrows=1, numcols=1, fignum=1
    ax1 = fig.add_subplot(111)
    xx = list(filter(lambda d: d[1] == -1, dataset))
    ax1.scatter([x[0][1] for x in xx], [x[0][2] for x in xx],
                s=100, c='b', marker="x", label='-1')
    oo = list(filter(lambda d: d[1] == 1, dataset))
    ax1.scatter([x[0][1] for x in oo], [x[0][2] for x in oo],
                s=100, c='r', marker="o", label='1')
    l = np.linspace(-2, 2)
    # w0 + w1*x + w2*y = 0
    # y = -w0/w2 - (w1/w2)*x
    if w[2]:
        a, b = -w[1] / w[2], -w[0] / w[2]
        ax1.plot(l, a * l + b, 'b-')
    else:
        ax1.plot([-w[0] / w[1]] * len(l), l, 'b-')
    plt.legend(loc='upper left', scatterpoints=1)
    plt.show()

if __name__ == '__main__':
    main()
```
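Since numpy is already imported, the same training loop can also be written in vectorized form. A minimal alternative sketch, not part of the listing above: it reuses `rawData` from that listing, keeps the \(\frac{1}{N}\) averaging from the formula, and therefore uses its own step count and stopping threshold.

```python
import numpy as np

# Build X with a leading column of 1s (x0) and the label vector y in {-1, +1}
X = np.array([(1,) + x for x, _ in rawData], dtype=float)
y = np.array([label for _, label in rawData], dtype=float)

def theta(s):
    return 1 / (1 + np.exp(-s))

w = np.zeros(X.shape[1])
eta = 0.1
for t in range(2000):
    # gradient of E_in: (1/N) * sum of theta(-y_n w.x_n) * (-y_n x_n)
    grad = (theta(-y * (X @ w)) * -y) @ X / len(y)
    if np.linalg.norm(grad) < 1e-3:
        break
    w = w - eta * grad
print(w)
```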
References
如何通俗地解释泰勒公式? (an intuitive explanation of the Taylor formula)
損失函數 (loss functions)
tf.nn.sigmoid_cross_entropy_with_logits
cross entropy的直觀理解 (an intuitive view of cross entropy)
Cross entropy