MacBook Air, Jupyter Notebook, Python 2.7

References
- link_1
- link_2

I tried to implement softmax regression by following the links above. The accuracy is only about 10% (accurate: 0.104384133612), and the cost function very quickly climbs to a maximum instead of decreasing (I don't think this is just a local optimum...). Where am I going wrong?
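
For reference, those numbers look like chance level to me: with 10 classes, random guessing gives about 10% accuracy, and a uniform prediction has a cross-entropy of ln(10) ≈ 2.30 per sample. A quick illustration of the baseline I would expect before any learning (the sample count below is made up):

import numpy as np

n_class = 10
n_samples = 1000  # arbitrary, only for this illustration
print "chance-level accuracy:", 1.0/n_class                       # 0.1
print "cross-entropy of a uniform prediction:", np.log(n_class)   # ~2.30
print "rough total cost before learning:", n_samples*np.log(n_class)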

This may be a very basic question, but I would appreciate any help.

import numpy as np

def softmax(z):
    e = np.exp(z - np.max(z))  # subtract the max for numerical stability
    #return e/np.sum(e)
    return np.maximum(1e-5, e/np.sum(e))
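
# Quick sanity check of the helper on an arbitrary test vector (values made up):
# the outputs should be non-negative and sum to roughly 1; note that the
# np.maximum(1e-5, ...) clamp above can push the sum slightly over 1.
print softmax(np.array([1.0, 2.0, 3.0]))
print np.sum(softmax(np.array([1.0, 2.0, 3.0])))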

n_class = 10

# Weight
W = np.random.randn(X_train.shape[1], n_class)

# Z = XW (+ b)
def transfunc(X, W):
    return np.dot(X, W)

Z = transfunc(X_train, W)

def initialize_label(X):
    Z = transfunc(X, W)
    return np.argmax(np.apply_along_axis(softmax, 1, Z), axis = 1)

# Assign each sample the label predicted by the randomly initialized model
labels = initialize_label(X_train); labels.shape


def loss_function(X, theta):
    num = X.shape[0]
    J = 0
    z = np.dot(X, theta)
    for i in xrange(num):
        for k in xrange(n_class):
            if labels[i] == k:
                J = J - np.log(softmax(z[i])[k])
            else:
                J = J - 0
    return J

def gradient_k(X, theta, k):
    num = X.shape[0]
    z = np.dot(X, theta)
    grad = np.zeros_like(theta[:,k])
    for i in xrange(num):
        if labels[i] == k:
            grad = grad - X[i]*(1 - softmax(z[i])[k])
        else:
            grad = grad + X[i]*softmax(z[i])[k]
    return grad
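
# (Debugging sketch, not part of the training loop) finite-difference check of
# gradient_k on a small slice of the data; eps and the slice size are arbitrary
# choices, and the two gradients should roughly agree.
def numerical_gradient_k(X, theta, k, eps=1e-5):
    grad = np.zeros_like(theta[:, k])
    for j in xrange(theta.shape[0]):
        t_plus = theta.copy();  t_plus[j, k] += eps
        t_minus = theta.copy(); t_minus[j, k] -= eps
        grad[j] = (loss_function(X, t_plus) - loss_function(X, t_minus))/(2*eps)
    return grad

# example: np.allclose(gradient_k(X_train[:20], W, 0), numerical_gradient_k(X_train[:20], W, 0), atol=1e-4)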

loss_cost = [0]

def update(X, theta, eta = 0.0001, max_iter = 100):
    num = X.shape[0]
    for i in xrange(max_iter):
        tmp_theta = theta
        for k in xrange(n_class):
            tmp_theta[:,k] = tmp_theta[:,k] + eta*gradient_k(X, theta, k)
        theta = tmp_theta
        loss_cost.append(loss_function(X, theta))
        if abs(loss_function(X, theta)-loss_cost[i]) < 1e-5:
            print "convege, {}:iter".format(i)
            break
    return theta

The above is the script I wrote.