Advertisement

CS231_A1:Two-layer Neural Net

阅读量:

Codes All from:

http://www.cnblogs.com/daihengchen/p/5754383.html

https://github.com/autoliuweijie/DeepLearning/blob/master/cs231n/HomeWorks/assignment1/two_layer_net.ipynb

继续学习ing。

A Neural Network

**
**

首先定义了一个函数rel_error来计算relative error相对误差。

复制代码
 # A bit of setup

    
  
    
 import numpy as np
    
 import matplotlib.pyplot as plt
    
  
    
 from cs231n.classifiers.neural_net import TwoLayerNet
    
  
    
 %matplotlib inline
    
 plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
    
 plt.rcParams['image.interpolation'] = 'nearest'
    
 plt.rcParams['image.cmap'] = 'gray'
    
  
    
 # for auto-reloading external modules
    
 # see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
    
 %load_ext autoreload
    
 %autoreload 2
    
  
    
 def rel_error(x, y):
     """Return the largest element-wise relative error between x and y.

     The denominator |x| + |y| is floored at 1e-8 so that positions where both
     inputs are zero do not divide by zero.
     """
     abs_diff = np.abs(x - y)
     scale = np.maximum(1e-8, np.abs(x) + np.abs(y))
     return np.max(abs_diff / scale)

定义了一个两层的网络(输入层+神经元隐含层+输出层)

分别初始化模型和toy data(这个应该怎么翻译??玩具数据集??反正应该是个很小的用来做测试的数据集吧),

toy data一共有5个数据,分别属于3种不同的类别。

复制代码
 # Create a small net and some toy data to check your implementations.

    
 # Note that we set the random seed for repeatable experiments.
    
  
    
 input_size = 4
    
 hidden_size = 10
    
 num_classes = 3
    
 num_inputs = 5
    
  
    
 def init_toy_model():
     """Build the toy network with a fixed seed so its random weights are repeatable."""
     # Seed first: TwoLayerNet draws its initial weights from np.random.
     np.random.seed(0)
     toy_net = TwoLayerNet(input_size, hidden_size, num_classes, std=1e-1)
     return toy_net
    
  
    
 def init_toy_data():
     """Create the repeatable toy dataset: random inputs plus fixed integer labels.

     Returns a tuple (X, y): X has shape (num_inputs, input_size) and y holds
     five hard-coded class labels.
     """
     np.random.seed(1)
     features = np.random.randn(num_inputs, input_size) * 10
     labels = np.array([0, 1, 2, 2, 1])
     return features, labels
    
  
    
 net = init_toy_model()
    
 X, y = init_toy_data()

补全neural_net.py中的函数。

复制代码
 import numpy as np

    
 import matplotlib.pyplot as plt
    
  
    
  
    
 class TwoLayerNet(object):
     """
     A two-layer fully-connected neural network. The net has an input dimension
     of D, a hidden layer dimension of H, and performs classification over C
     classes. We train the network with a softmax loss function and L2
     regularization on the weight matrices. The network uses a ReLU nonlinearity
     after the first fully connected layer.

     In other words, the network has the following architecture:

     input - fully connected layer - ReLU - fully connected layer - softmax

     The outputs of the second fully-connected layer are the scores for each
     class.
     """

     def __init__(self, input_size, hidden_size, output_size, std=1e-4):
         """
         Initialize the model. Weights are initialized to small random values
         and biases are initialized to zero. Weights and biases are stored in
         the variable self.params, which is a dictionary with the following keys:

         W1: First layer weights; has shape (D, H)
         b1: First layer biases; has shape (H,)
         W2: Second layer weights; has shape (H, C)
         b2: Second layer biases; has shape (C,)

         Inputs:
         - input_size: The dimension D of the input data.
         - hidden_size: The number of neurons H in the hidden layer.
         - output_size: The number of classes C.
         - std: Scale factor applied to the standard-normal weight samples.
         """
         # W1 is drawn before W2; keep this order so a fixed np.random seed
         # reproduces the same weights.
         self.params = {}
         self.params['W1'] = std * np.random.randn(input_size, hidden_size)
         self.params['b1'] = np.zeros(hidden_size)
         self.params['W2'] = std * np.random.randn(hidden_size, output_size)
         self.params['b2'] = np.zeros(output_size)

     def loss(self, X, y=None, reg=0.0):
         """
         Compute the loss and gradients for a two layer fully connected neural
         network.

         Inputs:
         - X: Input data of shape (N, D). Each X[i] is a training sample.
         - y: Vector of training labels. y[i] is the label for X[i], and each
           y[i] is an integer in the range 0 <= y[i] < C. This parameter is
           optional; if it is not passed then we only return scores, and if it
           is passed then we instead return the loss and gradients.
         - reg: Regularization strength.

         Returns:
         If y is None, return a matrix scores of shape (N, C) where
         scores[i, c] is the score for class c on input X[i].

         If y is not None, instead return a tuple of:
         - loss: Loss (data loss and regularization loss) for this batch of
           training samples.
         - grads: Dictionary mapping parameter names to gradients of those
           parameters with respect to the loss function; has the same keys as
           self.params, and each gradient has the same shape as its parameter.
         """
         # Unpack variables from the params dictionary
         W1, b1 = self.params['W1'], self.params['b1']
         W2, b2 = self.params['W2'], self.params['b2']
         N, _ = X.shape

         # Forward pass: affine -> ReLU -> affine.
         h1 = np.maximum(0, np.dot(X, W1) + b1)
         scores = np.dot(h1, W2) + b2

         # If the targets are not given then jump out, we're done
         if y is None:
             return scores

         # Softmax probabilities. Subtracting each row's maximum leaves the
         # result unchanged but keeps np.exp from overflowing.
         scores_max = np.max(scores, axis=1, keepdims=True)                # (N, 1)
         exp_scores = np.exp(scores - scores_max)                          # (N, C)
         probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)    # (N, C)

         # Data loss: mean negative log-probability of the correct class.
         correct_logprobs = -np.log(probs[range(N), y])                    # (N,)
         data_loss = np.sum(correct_logprobs) / N
         # L2 penalty on both weight matrices, scaled by 0.5.
         reg_loss = 0.5 * reg * np.sum(W1 * W1) + 0.5 * reg * np.sum(W2 * W2)
         loss = data_loss + reg_loss

         # Backward pass: compute gradients
         grads = {}

         # Gradient of the data loss w.r.t. the scores: (probs - one_hot) / N.
         # probs is not needed again, so it is modified in place.
         dscores = probs
         dscores[range(N), y] -= 1
         dscores /= N

         # Second layer. Summing over the batch axis without keepdims gives
         # db2 the same (C,) shape as b2.
         dW2 = np.dot(h1.T, dscores)
         db2 = np.sum(dscores, axis=0)

         # First layer: back through the ReLU, which blocks the gradient
         # wherever the hidden activation was not positive.
         dh1 = np.dot(dscores, W2.T)
         dh1[h1 <= 0] = 0
         dW1 = np.dot(X.T, dh1)
         db1 = np.sum(dh1, axis=0)   # shape (H,), matching b1

         # Regularization gradient (derivative of 0.5 * reg * sum(W * W)).
         dW2 += reg * W2
         dW1 += reg * W1

         grads['W1'] = dW1
         grads['b1'] = db1
         grads['W2'] = dW2
         grads['b2'] = db2

         return loss, grads

当y=None时,看一下计算的得分是否正确。

Your scores:
[[-0.81233741 -1.27654624 -0.70335995]
[-0.17129677 -1.18803311 -0.47310444]
[-0.51590475 -1.01354314 -0.8504215 ]
[-0.15419291 -0.48629638 -0.52901952]
[-0.00618733 -0.12435261 -0.15226949]]

correct scores:
[[-0.81233741 -1.27654624 -0.70335995]
[-0.17129677 -1.18803311 -0.47310444]
[-0.51590475 -1.01354314 -0.8504215 ]
[-0.15419291 -0.48629638 -0.52901952]
[-0.00618733 -0.12435261 -0.15226949]]

Difference between your scores and correct scores:
3.68027204961e-08

OK。

将正确数据y带入,reg=0.1测试一下。

复制代码
 # Compare the regularized loss on the toy data with the reference value.
 correct_loss = 1.30378789133
 loss, _ = net.loss(X, y, reg=0.1)

 # should be very small, we get < 1e-12
 loss_gap = np.sum(np.abs(loss - correct_loss))
 print('Difference between your loss and correct loss:')
 print(loss_gap)

结果:

Difference between your loss and correct loss:
1.79856129989e-13

这个结果我觉得ok哈哈哈。

下面是梯度检查环节~

复制代码
 from cs231n.gradient_check import eval_numerical_gradient

    
  
    
 # Use numeric gradient checking to check your implementation of the backward pass.
    
 # If your implementation is correct, the difference between the numeric and
    
 # analytic gradients should be less than 1e-8 for each of W1, W2, b1, and b2.
    
  
    
 # Analytic gradients from the backward pass.
 loss, grads = net.loss(X, y, reg=0.1)

 # these should all be less than 1e-8 or so
 for param_name in grads:
     # The lambda ignores its argument and simply re-evaluates the loss.
     # NOTE(review): this presumably relies on eval_numerical_gradient
     # perturbing net.params[param_name] in place -- confirm against
     # cs231n.gradient_check.
     f = lambda W: net.loss(X, y, reg=0.1)[0]
     param_grad_num = eval_numerical_gradient(f, net.params[param_name], verbose=False)
     print('%s max relative error: %e' % (param_name, rel_error(param_grad_num, grads[param_name])))

结果:

W1 max relative error: 3.561318e-09
b1 max relative error: 1.555470e-09
W2 max relative error: 3.440708e-09
b2 max relative error: 3.865091e-11

也ok~~

下面开始训练这个网络辣。

复制代码
   def train(self, X, y, X_val, y_val,
             learning_rate=1e-3, learning_rate_decay=0.95,
             reg=1e-5, num_iters=100,
             batch_size=200, verbose=False):
       """
       Train this neural network using stochastic gradient descent.

       Inputs:
       - X: A numpy array of shape (N, D) giving training data.
       - y: A numpy array of shape (N,) giving training labels; y[i] = c means
         that X[i] has label c, where 0 <= c < C.
       - X_val: A numpy array of shape (N_val, D) giving validation data.
       - y_val: A numpy array of shape (N_val,) giving validation labels.
       - learning_rate: Scalar giving learning rate for optimization.
       - learning_rate_decay: Scalar giving factor used to decay the learning
         rate after each epoch.
       - reg: Scalar giving regularization strength.
       - num_iters: Number of steps to take when optimizing.
       - batch_size: Number of training examples to use per step.
       - verbose: boolean; if true print progress during optimization.

       Returns:
       A dictionary with the keys 'loss_history' (one loss per iteration) and
       'train_acc_history' / 'val_acc_history' (one accuracy per epoch check).
       """
       num_train = X.shape[0]
       # Floor division keeps the epoch length an integer; true division
       # would make the `it % iterations_per_epoch` test below fire at the
       # wrong iterations whenever batch_size does not divide num_train.
       iterations_per_epoch = max(num_train // batch_size, 1)

       # Use SGD to optimize the parameters in self.params
       loss_history = []
       train_acc_history = []
       val_acc_history = []

       for it in range(num_iters):
           # Draw a random minibatch (np.random.choice samples with
           # replacement by default).
           sample_index = np.random.choice(num_train, batch_size)
           X_batch = X[sample_index, :]
           y_batch = y[sample_index]

           # Compute loss and gradients using the current minibatch
           loss, grads = self.loss(X_batch, y=y_batch, reg=reg)
           loss_history.append(loss)

           # Vanilla SGD step, applied in place. Each gradient is reshaped to
           # its parameter's shape so that bias gradients arriving as (1, H)
           # or as (H,) are both accepted.
           for name, param in self.params.items():
               param -= learning_rate * grads[name].reshape(param.shape)

           if verbose and it % 100 == 0:
               print ('iteration %d / %d: loss %f' % (it, num_iters, loss))

           # Every epoch, check train and val accuracy and decay learning rate.
           if it % iterations_per_epoch == 0:
               # Training accuracy is measured on the current minibatch only.
               train_acc = (self.predict(X_batch) == y_batch).mean()
               val_acc = (self.predict(X_val) == y_val).mean()
               train_acc_history.append(train_acc)
               val_acc_history.append(val_acc)

               # Decay learning rate
               learning_rate *= learning_rate_decay

       return {
           'loss_history': loss_history,
           'train_acc_history': train_acc_history,
           'val_acc_history': val_acc_history,
       }

迭代100次运算看一下损失的下降情况:

复制代码

好了,模型检查没有问题,下面正式载入CIFAR-10的数据开始训练辣。

Train data shape: (49000, 3072)
Train labels shape: (49000,)
Validation data shape: (1000, 3072)
Validation labels shape: (1000,)
Test data shape: (1000, 3072)
Test labels shape: (1000,)

补全神经网络的预测函数。

复制代码
   def predict(self, X):
       """
       Use the trained weights of this two-layer network to predict labels for
       data points. For each data point we predict scores for each of the C
       classes, and assign each data point to the class with the highest score.

       Inputs:
       - X: A numpy array of shape (N, D) giving N D-dimensional data points to
         classify.

       Returns:
       - y_pred: A numpy array of shape (N,) giving predicted labels for each of
         the elements of X. For all i, y_pred[i] = c means that X[i] is
         predicted to have class c, where 0 <= c < C.
       """
       W1, b1 = self.params['W1'], self.params['b1']
       W2, b2 = self.params['W2'], self.params['b2']

       # Same forward pass as the training loss: each bias is added to the
       # affine output, not to the weight matrix before the product.
       h1 = np.maximum(0, np.dot(X, W1) + b1)
       scores = np.dot(h1, W2) + b2

       # The predicted class is the highest-scoring column in each row.
       y_pred = np.argmax(scores, axis=1)
       return y_pred

进行1000次迭代训练看看验证集上的正确率如何~

iteration 0 / 1000: loss 2.302970
iteration 100 / 1000: loss 2.302474
iteration 200 / 1000: loss 2.297076
iteration 300 / 1000: loss 2.257328
iteration 400 / 1000: loss 2.230484
iteration 500 / 1000: loss 2.150620
iteration 600 / 1000: loss 2.080736
iteration 700 / 1000: loss 2.054914
iteration 800 / 1000: loss 1.979290
iteration 900 / 1000: loss 2.039101
Validation accuracy: 0.293

权重可视化。这个形状看起来像个小车车。

接下来看一下不同参数下的训练结果:

复制代码
 best_net = None # store the best model into this

 #################################################################################
 # TODO: Tune hyperparameters using the validation set. Store your best trained  #
 # model in best_net.                                                            #
 #                                                                               #
 # To help debug your network, it may help to use visualizations similar to the  #
 # ones we used above; these visualizations will have significant qualitative    #
 # differences from the ones we saw above for the poorly tuned network.          #
 #                                                                               #
 # Tweaking hyperparameters by hand can be fun, but you might find it useful to  #
 # write code to sweep through possible combinations of hyperparameters          #
 # automatically like we did on the previous exercises.                          #
 #################################################################################
 # Grid of candidate values; every combination is trained once.
 hidden_size = [10,50]
 learning_rate = [1e-3,1.4e-4,1e-4]
 reg_list = [0.5,0.75,1.25]
 params = [[hid,lr,reg] for hid in hidden_size \
      for lr in learning_rate for reg in reg_list]

 # NOTE(review): input_size and num_classes are not defined in this snippet.
 # The values visible earlier in the file (4 and 3) belong to the toy data, so
 # they are presumably reassigned for CIFAR-10 (3072 features per the printed
 # data shapes) in a cell that is not shown -- confirm before running.
 best_acc = 0
 for param in params:
     # param is [hidden size, learning rate, regularization strength].
     net = TwoLayerNet(input_size,param[0],num_classes)
     stats = net.train(X_train,y_train,X_val,y_val,
                  num_iters=1000,batch_size=200,
                  learning_rate=param[1],learning_rate_decay=0.95,
                  reg=param[2],verbose=False)
     val_acc = (net.predict(X_val)==y_val).mean()
     print('hidden_size: %s lr: %s reg: %s -> Acc: %s;' \
       %(param[0],param[1],param[2],val_acc))
     # Keep the model with the highest validation accuracy seen so far.
     if val_acc > best_acc:
         best_acc = val_acc
         best_net = net
 print('best acc: %s' % best_acc)
    
 #################################################################################
    
 #                               END OF YOUR CODE                                #
    
 #################################################################################

结果:

hidden_size: 10 lr: 0.001 reg: 0.5 -> Acc: 0.318;
hidden_size: 10 lr: 0.001 reg: 0.75 -> Acc: 0.287;
hidden_size: 10 lr: 0.001 reg: 1.25 -> Acc: 0.31;
hidden_size: 10 lr: 0.00014 reg: 0.5 -> Acc: 0.319;
hidden_size: 10 lr: 0.00014 reg: 0.75 -> Acc: 0.288;
hidden_size: 10 lr: 0.00014 reg: 1.25 -> Acc: 0.284;
hidden_size: 10 lr: 0.0001 reg: 0.5 -> Acc: 0.254;
hidden_size: 10 lr: 0.0001 reg: 0.75 -> Acc: 0.261;
hidden_size: 10 lr: 0.0001 reg: 1.25 -> Acc: 0.245;
hidden_size: 50 lr: 0.001 reg: 0.5 -> Acc: 0.285;
hidden_size: 50 lr: 0.001 reg: 0.75 -> Acc: 0.259;
hidden_size: 50 lr: 0.001 reg: 1.25 -> Acc: 0.27;
hidden_size: 50 lr: 0.00014 reg: 0.5 -> Acc: 0.313;
hidden_size: 50 lr: 0.00014 reg: 0.75 -> Acc: 0.313;
hidden_size: 50 lr: 0.00014 reg: 1.25 -> Acc: 0.326;
hidden_size: 50 lr: 0.0001 reg: 0.5 -> Acc: 0.283;
hidden_size: 50 lr: 0.0001 reg: 0.75 -> Acc: 0.279;
hidden_size: 50 lr: 0.0001 reg: 1.25 -> Acc: 0.285;
best acc: 0.326

可视化结果:

Test accuracy: 0.333

这个准确率有点。。。。。参数选的还是不太好。。。。。

不小电脑都嗡嗡的叫唤了,今天就到这吧。。。。

Done~~

全部评论 (0)

还没有任何评论哟~