## softmax模型

$$\theta^T x.$$

$$W^T x + b.$$

$$P(y=j) = \frac{\phi_j}{\phi_1+\cdots+\phi_k}.$$

$$1\{P\} = \left\{\begin{array}{ll} 1 & \text{ if P is true} \\ 0 & \text{ if P is false} \end{array}\right.$$

$$\begin{split} P(y) &= \frac{\prod_{j=1}^k \phi_j^{1\{y=j\}}}{\sum_{j=1}^k \phi_j} \\ &= \exp\left(\sum_{j=1}^k 1\{y=j\} \log \phi_j - \log\left(\sum_{j=1}^k \phi_j\right)\right) \\ &= b(y)\exp\left(\eta^T \cdot T(y) - a(\eta)\right) \end{split},$$

$$T(y) = \left[\begin{array}{c}1\{y=1\} \\ \vdots \\ 1\{y=k\}\end{array}\right], \text{ } \eta = \left[\begin{array}{c}\log(\phi_1) \\ \vdots \\ \log(\phi_k)\end{array}\right].$$

$$\log(\phi_i) = W_i^T \cdot x + b_i,$$

$$\phi_i = \exp(W_i^T \cdot x + b_i),$$

$$\begin{split} h_{W, b}(x) &= E[T(y)|x; W, b] = \sum_{j=1}^k T(j)P(j) \\ &= \left[\begin{array}{c}\frac{\phi_1}{\sum_{j=1}^k \phi_j} \\ \vdots \\ \frac{\phi_k}{\sum_{j=1}^k \phi_j}\end{array}\right] = \left[\begin{array}{c}\frac{\exp(W_1^T \cdot x + b_1)}{\sum_{j=1}^k \exp(W_j^T \cdot x + b_j)} \\ \vdots \\ \frac{\exp(W_k^T \cdot x + b_k)}{\sum_{j=1}^k \exp(W_j^T \cdot x + b_j)}\end{array}\right] \end{split}. \label{model1}$$

$$W = \left[\begin{array}{c}W_1^T \\ \vdots \\ W_k^T\end{array}\right], \text{ } b = \left[\begin{array}{c}b_1 \\ \vdots \\ b_k\end{array}\right],$$

$$\sigma\left(\begin{array}{c}z_1 \\ \vdots \\ z_k\end{array}\right) = \left[\begin{array}{c} \frac{e^{z_1}}{\sum_{j=1}^k e^{z_j}} \\ \vdots \\ \frac{e^{z_k}}{\sum_{j=1}^k e^{z_j}}\end{array}\right],$$

$$h_{W, b}(x) = \sigma(Wx+b). \label{model}$$

$$\begin{split} P(y^{(i)}; x^{(i)}, W, b) &= \exp\left(\sum_{j=1}^k 1\{y^{(i)}=j\} \log (\exp(W_j^T x^{(i)} + b_j)) - \log\left(\sum_{j=1}^k \exp(W_j^T x^{(i)} + b_j)\right)\right) \\ &= \exp\left(\sum_{j=1}^k 1\{y^{(i)}=j\} \log \frac{\exp(W_j^T x^{(i)} + b_j)}{\sum_{j=1}^k \exp(W_j^T x^{(i)} + b_j)}\right) \\ &= \exp\left(\langle T(y^{(i)}), \log h_{W, b}(x^{(i)})\rangle\right). \end{split}$$

$$\begin{split} l(W, b) &= \frac{1}{m}\sum_{i=1}^m \log P(y^{(i)}; x^{(i)}, W, b) \\ &= \frac{1}{m}\sum_{i=1}^m \langle T(y^{(i)}), \log h_{W, b}(x^{(i)})\rangle \end{split} \label{loglike}$$

## 模型实现

import sys

# Report which Python interpreter this notebook kernel is running on.
version = '.'.join(str(part) for part in sys.version_info[:3])
py = 'Python ' + version
print('Jupyter notebook with kernel: {}'.format(py))

import gzip
import time
from urllib.request import urlopen
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

Jupyter notebook with kernel: Python 3.6.3


def get_samples(images, labels):
    """Parse decompressed MNIST IDX image/label byte strings.

    Parameters
    ----------
    images : bytes
        Decompressed IDX3 image file: a 16-byte header (magic number,
        sample count, rows, cols -- each a big-endian uint32) followed
        by the raw pixel bytes of all images.
    labels : bytes
        Decompressed IDX1 label file: an 8-byte header (magic number,
        sample count) followed by one byte per label.

    Returns
    -------
    tuple(list, list)
        ``x`` -- one flat numpy array of ``rows*cols`` pixel values per
        image; ``y`` -- the matching one-hot int8 vector of length 10.
    """
    img_num = int.from_bytes(images[4:8], byteorder='big')
    lbl_num = int.from_bytes(labels[4:8], byteorder='big')
    # The two files must describe the same number of samples.
    assert img_num == lbl_num
    print('There are {} samples.'.format(img_num))
    row = int.from_bytes(images[8:12], byteorder='big')
    col = int.from_bytes(images[12:16], byteorder='big')
    img_size = row * col
    x, y = [], []
    for i in range(img_num):
        # Pixel data starts after the 16-byte header, labels after 8 bytes.
        img_offset = 16 + img_size * i
        lbl = labels[8 + i]
        # frombuffer avoids the per-image Python list round-trip of
        # np.array(list(...)); cast to int so downstream math is unchanged.
        img = np.frombuffer(images, dtype=np.uint8,
                            count=img_size, offset=img_offset).astype(int)
        x.append(img)
        # One-hot encode the digit; use a distinct comprehension variable
        # (the original reused `i`, shadowing the loop index).
        y.append(np.array([d == lbl for d in range(10)], dtype=np.int8))
    return x, y


get_samples的输入是解压后的图像文件和对应标记文件的字节串，输出为(x, y)：x是一个列表，每个元素是表示一幅图像的28x28=784维向量；y是与x中元素一一对应的数字标记（one-hot向量）。下面是获取全部训练集并展示其中前25个数据点的代码。

# URLs of the gzip-compressed MNIST training set (images + labels).
images_url = 'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz'
labels_url = 'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz'
# Download and decompress both files; `images`/`labels` were referenced
# below but never defined in the original cell.
images = gzip.decompress(urlopen(images_url).read())
labels = gzip.decompress(urlopen(labels_url).read())
train_x, train_y = get_samples(images, labels)

There are 60000 samples.

r = c = 5
fig, ax = plt.subplots(r, c)
# Show the first r*c training digits, titled with their labels.
for k in range(r * c):
    i, j = divmod(k, c)
    ax[i, j].imshow(train_x[k].reshape((28, 28)), cmap='gray_r')
    ax[i, j].set_title(str(train_y[k].argmax()))
    ax[i, j].axis('off')
plt.show()


# Softmax regression model y = softmax(Wx + b), eq. (model) above.
x = tf.placeholder(tf.float32, [None, 784])   # flattened 28x28 images
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))
y = tf.nn.softmax(tf.matmul(x, W)+b)
y_ = tf.placeholder(tf.float32, [None, 10])   # one-hot ground-truth labels
# Cross-entropy cost: the negative of the log-likelihood (loglike).
cost = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
# NOTE(review): `train_step` was used below but never defined in the
# original cell; a plain gradient-descent minimizer of `cost` restores it.
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cost)

sess = tf.InteractiveSession()
tf.global_variables_initializer().run()

begin = time.time()
# Full-batch gradient descent: every step feeds all training samples.
for i in range(200):
    sess.run(train_step, feed_dict={x: train_x, y_: train_y})
end = time.time()
print('Runtime: {} seconds'.format(end-begin))

Runtime: 107.31829977035522 seconds


# URLs of the gzip-compressed MNIST test set (images + labels).
images_url = 'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz'
labels_url = 'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz'
# Download and decompress both files; `images`/`labels` were referenced
# below but never defined in the original cell.
images = gzip.decompress(urlopen(images_url).read())
labels = gzip.decompress(urlopen(labels_url).read())
test_x, test_y = get_samples(images, labels)

There are 10000 samples.

# Run the trained model over the whole test set.
pred_y = sess.run(y, feed_dict={x: test_x})
# result[1] counts correct predictions, result[0] wrong ones
# (a boolean index is 0 or 1).
result = [0, 0]
for truth, guess in zip(test_y, pred_y):
    result[truth.argmax() == guess.argmax()] += 1
error_rate = result[0] * 100 / sum(result)
print('Error rate is: {}%'.format(error_rate))

Error rate is: 8.9%


r = c = 5
fig, ax = plt.subplots(r, c)
# Show the first r*c test images; the title is the prediction plus a
# check mark (correct) or a cross (wrong).
for k in range(r * c):
    i, j = divmod(k, c)
    lbl = str(test_y[k].argmax())
    pred = str(pred_y[k].argmax())
    ind = '✔' if lbl == pred else '✖'
    ax[i, j].imshow(test_x[k].reshape((28, 28)), 'gray_r')
    ax[i, j].set_title('{}{}'.format(pred, ind))
    ax[i, j].axis('off')
plt.show()


# Rebuild the same softmax model from scratch so it can be retrained
# with mini-batch gradient descent instead of full-batch updates.
sess.close()
x = tf.placeholder(tf.float32, [None, 784])
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))
y = tf.nn.softmax(tf.matmul(x, W)+b)
y_ = tf.placeholder(tf.float32, [None, 10])
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
# NOTE(review): `train_step` was used in the next cell but never defined
# in the original; minimize the cross-entropy with gradient descent.
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)

sess = tf.InteractiveSession()
tf.global_variables_initializer().run()

begin = time.time()
# Mini-batch training: 1000 steps with batch size 1000, cycling through
# the 60 batches that cover all 60000 training samples.
for i in range(1000):
    k = i % 60
    batch_begin = k * 1000
    batch_end = (k + 1) * 1000
    batch_x = train_x[batch_begin:batch_end]
    batch_y = train_y[batch_begin:batch_end]
    sess.run(train_step, feed_dict={x: batch_x, y_: batch_y})
end = time.time()
# Fixed the misspelled message of the original ('Rumtime').
print('Runtime: {} seconds'.format(end-begin))

# Evaluate on the test set: count wrong (index 0) vs correct (index 1).
pred_y = sess.run(y, feed_dict={x: test_x})
result = [0, 0]
for truth, guess in zip(test_y, pred_y):
    result[truth.argmax() == guess.argmax()] += 1
error_rate = result[0] * 100 / sum(result)
print('Error rate is: {}%'.format(error_rate))

Runtime: 8.727920055389404 seconds
Error rate is: 7.77%


r = c = 5
fig, ax = plt.subplots(r, c)
# Scan forward through the test set and plot the first r*c
# misclassified samples; each title shows the (wrong) prediction.
k = 0
for i in range(r):
    for j in range(c):
        # Skip over correctly classified samples.
        while test_y[k].argmax() == pred_y[k].argmax():
            k += 1
        lbl = str(test_y[k].argmax())
        pred = str(pred_y[k].argmax())
        ind = '✔' if lbl == pred else '✖'
        ax[i, j].imshow(test_x[k].reshape((28, 28)), 'gray_r')
        ax[i, j].set_title('{}{}'.format(pred, ind))
        ax[i, j].axis('off')
        k += 1