diff --git a/Playground.ipynb b/Playground.ipynb
index fd84dd2..018b0ca 100644
--- a/Playground.ipynb
+++ b/Playground.ipynb
@@ -45,7 +45,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 43,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -176,122 +176,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 138,
+   "execution_count": 79,
    "metadata": {},
    "outputs": [],
    "source": [
-    "conv_net = Net()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 139,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "Variable containing:\n",
-       "(0 ,.,.) = \n",
-       " -0.0385  0.1033  0.0002  0.1075 -0.1455\n",
-       "  0.0613  0.0409 -0.0703 -0.0915 -0.0821\n",
-       "  0.0756  0.1122 -0.1820 -0.1431  0.0584\n",
-       "  0.1220  0.1551 -0.0539  0.1442  0.1441\n",
-       "  0.0291 -0.1069  0.1430 -0.0193  0.1652\n",
-       "[torch.FloatTensor of size 1x5x5]"
-      ]
-     },
-     "execution_count": 139,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "W = list(conv_net.parameters())[0].clone()\n",
-    "W[1]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 144,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "dtype = torch.FloatTensor"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 150,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "reg_loss = Variable(torch.zeros(1), requires_grad=True).type(dtype)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 140,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "max_norm(conv_net, 0.01, eps=1e-8)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 141,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "Variable containing:\n",
-       "(0 ,.,.) = \n",
-       " -0.0385  0.1033  0.0002  0.1075 -0.1455\n",
-       "  0.0613  0.0409 -0.0703 -0.0915 -0.0821\n",
-       "  0.0756  0.1122 -0.1820 -0.1431  0.0584\n",
-       "  0.1220  0.1551 -0.0539  0.1442  0.1441\n",
-       "  0.0291 -0.1069  0.1430 -0.0193  0.1652\n",
-       "[torch.FloatTensor of size 1x5x5]"
-      ]
-     },
-     "execution_count": 141,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "W = list(conv_net.parameters())[0].clone()\n",
-    "W[1]"
+    "model = Net()"
    ]
   },
   {
@@ -299,95 +188,15 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "W = torch.randn(100, 100)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [],
    "source": [
-    "norm = W.norm(2)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "my = torch.sum(torch.pow(W, 2))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "10089.580074277259"
-      ]
-     },
-     "execution_count": 9,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "my"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "10089.580071259983"
-      ]
-     },
-     "execution_count": 11,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "norm ** 2"
+    "orth_loss = Variable(torch.zeros(1), requires_grad=True)\n",
+    "for name, param in model.named_parameters():\n",
+    "    if 'bias' not in name:\n",
+    "        W_reshaped = param.view(param.shape[0], -1)\n",
+    "        sym = torch.mm(W_reshaped, torch.t(W_reshaped))\n",
+    "        sym -= Variable(torch.eye(W_reshaped.shape[0]))\n",
+    "        orth_loss = orth_loss + sym.abs().sum()"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
= \n", - " -0.0385 0.1033 0.0002 0.1075 -0.1455\n", - " 0.0613 0.0409 -0.0703 -0.0915 -0.0821\n", - " 0.0756 0.1122 -0.1820 -0.1431 0.0584\n", - " 0.1220 0.1551 -0.0539 0.1442 0.1441\n", - " 0.0291 -0.1069 0.1430 -0.0193 0.1652\n", - "[torch.FloatTensor of size 1x5x5]" - ] - }, - "execution_count": 141, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "W = list(conv_net.parameters())[0].clone()\n", - "W[1]" + "model = Net()" ] }, { @@ -299,95 +188,15 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "W = torch.randn(100, 100)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], "source": [ - "norm = W.norm(2)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "my = torch.sum(torch.pow(W, 2))" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "10089.580074277259" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "my" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "10089.580071259983" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "norm ** 2" + "orth_loss = Variable(torch.FloatTensor(1), requires_grad=True)\n", + "for name, param in model.named_parameters():\n", + " if 'bias' not in name:\n", + " W_reshaped = W.view(W.shape[0], -1)\n", + " sym = torch.mm(W_reshaped, torch.t(W_reshaped))\n", + " sym -= Variable(torch.eye(W_reshaped.shape[0]))\n", + " orth_loss = orth_loss + sym.sum()" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/README.md b/README.md index b8927f1..0d68887 100644 --- a/README.md +++ b/README.md @@ -1,58 +1,60 @@ -`[...] = in progress` - ## Weight Initialization -* [Xavier initialization](http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf) (general-purpose activation) -* [He et. al initialization](https://www.google.com.lb/search?q=kaiming+he+init&oq=kaiming+he+init&aqs=chrome..69i57j0l5.3422j0j4&sourceid=chrome&ie=UTF-8) (ReLU activation) -* [Orthogonal initialization](https://arxiv.org/pdf/1312.6120v3.pdf) -* [SELU initialization](https://arxiv.org/pdf/1706.02515.pdf) (SELU activation) - #### Xavier Initialization [...] +```python +for m in model.modules(): + if isinstance(m, (nn.Conv2d, nn.Linear)): + nn.init.xavier_normal(m.weight) +``` + +- [arXiv](http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf) + #### He et. al Initialization -[...] +This is a similar initialization to Xavier tailored specifically for ReLU activations. Note that `fan_in` refers to the number of inputs to the layer. 
 
-# orthogonal init
-for m in model:
-    if isinstance(m, (nn.Conv2d, nn.Linear)):
-        nn.init.orthogonal(m.weight)
+#### SELU Initialization
 
-# SELU init
-for m in model:
+```python
+for m in model.modules():
     if isinstance(m, nn.Conv2d):
-        n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+        n = m.kernel_size[0] * m.kernel_size[1] * m.in_channels
         nn.init.normal(m.weight, 0, sqrt(1. / n))
     elif isinstance(m, nn.Linear):
-        n = m.out_features
+        n = m.in_features
         nn.init.normal(m.weight, 0, sqrt(1. / n))
 ```
 
-For `BatchNorm` we initialize the weights to 1 and the biases to 0.
+- [arXiv](https://arxiv.org/abs/1706.02515)
+
+#### Batch Norm Initialization
 
 ```python
 for m in model:
@@ -63,82 +65,87 @@
 ## Weight Regularization
 
-* L2 Regularization: add L2 norm weight penalty to loss function.
-* L1 Regularization: add L1 norm weight penalty to loss function.
-* Orthogonal Regularization: apply a weight penalty of `|W*W.T - I|` to loss function.
-* Max Norm Constraint: clamp weight norm to less than a constant `W.norm(2) < c`.
-
 #### L2 Regularization
 
 Heavily penalizes peaky weight vectors and encourages diffuse weight vectors. Has the appealing property of encouraging the network to use all of its inputs a little rather that some of its inputs a lot.
 
+```python
+reg = 1e-6
+l2_loss = Variable(torch.zeros(1), requires_grad=True)
+for name, param in model.named_parameters():
+    if 'bias' not in name:
+        l2_loss = l2_loss + (0.5 * reg * torch.sum(torch.pow(param, 2)))
+```
+
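+The penalty is simply added to the task loss before the backward pass, so it is differentiated along with everything else. A minimal sketch (here `criterion`, `output`, `target`, and `optimizer` are placeholders); the same pattern applies to the L1 and orthogonal penalties below:
+
+```python
+loss = criterion(output, target) + l2_loss  # task loss + weight penalty
+optimizer.zero_grad()
+loss.backward()
+optimizer.step()
+```
+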
 #### L1 Regularization
 
 Encourages sparsity, meaning we encourage the network to select the most useful inputs/features rather than use all.
 
+```python
+reg = 1e-6
+l1_loss = Variable(torch.zeros(1), requires_grad=True)
+for name, param in model.named_parameters():
+    if 'bias' not in name:
+        l1_loss = l1_loss + (reg * torch.sum(torch.abs(param)))
+```
+
 #### Orthogonal Regularization
 
-Improves gradient flow by keeping the matrix norm close to 1. This is because orthogonal matrices represent an isometry of R^n, i.e. they preserve lengths and angles. They rotate vectors, but cannot scale or shear them.
+Improves gradient flow by keeping each weight matrix close to orthogonal, i.e. `W.mm(W.t())` close to the identity, so that multiplying by it preserves lengths and angles rather than scaling or shearing.
+
+```python
+reg = 1e-6
+orth_loss = Variable(torch.zeros(1), requires_grad=True)
+for name, param in model.named_parameters():
+    if 'bias' not in name:
+        param_flat = param.view(param.shape[0], -1)
+        sym = torch.mm(param_flat, torch.t(param_flat))
+        sym -= Variable(torch.eye(param_flat.shape[0]))
+        orth_loss = orth_loss + (reg * sym.abs().sum())  # penalize |W W^T - I|
+```
+
+- [arXiv](https://arxiv.org/abs/1609.07093)
 
 #### Max Norm Constraint
 
 If a hidden unit's weight vector's L2 norm `L` ever gets bigger than a certain max value `c`, multiply the weight vector by `c/L`. Enforce it immediately after each weight vector update or after every `X` gradient update.
 
-This constraint is another form of regularization. While L2 penalizes high weights using the loss function, "max norm" acts directly on the weights. L2 exerts a constant pressure to move the weights near zero which could throw a"way useful information when the loss function doesn't provide incentive for the weights to remain far from zero. On the other hand, "max norm" never drives the weights to near zero. As long as the norm is less than the constraint value, the constraint has no effect.
-
-- [Google+ Discussion](https://plus.google.com/+IanGoodfellow/posts/QUaCJfvDpni)
+This constraint is another form of regularization. While L2 penalizes high weights using the loss function, "max norm" acts directly on the weights. L2 exerts a constant pressure to move the weights near zero which could throw away useful information when the loss function doesn't provide incentive for the weights to remain far from zero. On the other hand, "max norm" never drives the weights to near zero. As long as the norm is less than the constraint value, the constraint has no effect.
 
 ```python
-# l2 reg
-l2_loss = Variable(torch.FloatTensor(1), requires_grad=True)
-for W in model.parameters():
-    l2_loss = l2_loss + (0.5 * W.norm(2) ** 2)
-
-# l1 reg
-l1_loss = Variable(torch.FloatTensor(1), requires_grad=True)
-for W in model.parameters():
-    l1_loss = l1_loss + W.norm(1)
-
-# orthogonal reg
-orth_loss = Variable(torch.FloatTensor(1), requires_grad=True)
-for W in model.parameters():
-    W_reshaped = W.view(W.shape[0], -1)
-    sym = torch.mm(W_reshaped, torch.t(W_reshaped))
-    sym -= Variable(torch.eye(W_reshaped.shape[0]))
-    orth_loss = orth_loss + sym.sum()
-
-# max norm constraint
 def max_norm(model, max_val=3, eps=1e-8):
     for name, param in model.named_parameters():
         if 'bias' not in name:
-            # l2 norm per row (batch)
            norm = param.norm(2, dim=0, keepdim=True)
            desired = torch.clamp(norm, 0, max_val)
            param = param * (desired / (eps + norm))
 ```
 
+- [Google+ Discussion](https://plus.google.com/+IanGoodfellow/posts/QUaCJfvDpni)
+
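+Note that, as written, `max_norm` only rebinds the local name `param`, so the model weights are not actually modified; an in-place variant has to operate on `param.data`. Below is a usage sketch, assuming the same imports as the snippets above (the `loader`, `criterion`, and `optimizer` names are placeholders), that re-applies the constraint after every update:
+
+```python
+def max_norm_(model, max_val=3, eps=1e-8):
+    for name, param in model.named_parameters():
+        if 'bias' not in name:
+            norm = param.data.norm(2, dim=0, keepdim=True)
+            desired = torch.clamp(norm, 0, max_val)
+            param.data.mul_(desired / (eps + norm))  # rescale the weights in place
+
+for data, target in loader:
+    data, target = Variable(data), Variable(target)
+    optimizer.zero_grad()
+    loss = criterion(model(data), target)
+    loss.backward()
+    optimizer.step()
+    max_norm_(model)  # enforce the constraint right after the weight update
+```
+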
 ## Batch Normalization
 
 [...]
 
-## Optimization Misc.
+## Dropout
 
 [...]
 
+## Optimization Misc.
+
 - Learning Rate
 - Batch Size
-- Effect on Generalization
-
-References
+- Optimizer
+- Generalization
 
 - [Cyclical Learning Rates for Training Neural Networks](https://arxiv.org/abs/1506.01186)
 - [SGDR: Stochastic Gradient Descent with Warm Restarts](https://arxiv.org/abs/1608.03983)
 - [On Large-Batch Training for Deep Learning: Generalization Gap and Sharp Minima](https://arxiv.org/abs/1609.04836)
 - [Don't Decay the Learning Rate, Increase the Batch Size](https://arxiv.org/abs/1711.00489)
-- https://www.reddit.com/r/MachineLearning/comments/77dn96/r_171006451_understanding_generalization_and/dol2u23/
+- [Reddit Discussion](https://www.reddit.com/r/MachineLearning/comments/77dn96/r_171006451_understanding_generalization_and/dol2u23/)
 
 ## Correct Validation Strategies
 
 [...]
 
-- https://www.reddit.com/r/MachineLearning/comments/78789r/d_is_my_validation_method_good/
\ No newline at end of file
+- [Reddit Discussion](https://www.reddit.com/r/MachineLearning/comments/78789r/d_is_my_validation_method_good/)
\ No newline at end of file