目录

1. 编程实现图6-1,并观察特征

2. 观察梯度方向

3. 编写代码实现算法,并可视化轨迹

4. 分析上图,说明原理(选做)

5. 总结SGD、Momentum、AdaGrad、Adam的优缺点(选做)

6. Adam这么好,SGD是不是就用不到了?(选做)

7. 增加RMSprop、Nesterov算法。(选做)

8. 基于MNIST数据集的更新方法的比较(选做)

参考:深度学习入门:基于Python的理论与实现 (ituring.com.cn)

1. 编程实现图6-1,并观察特征

 

 

 参考代码:

import numpy as np

from matplotlib import pyplot as plt

from mpl_toolkits.mplot3d import Axes3D

# https://blog.csdn.net/weixin_39228381/article/details/108511882

def func(x, y):

return x * x / 20 + y * y

def paint_loss_func():

x = np.linspace(-50, 50, 100) # x的绘制范围是-50到50,从改区间均匀取100个数

y = np.linspace(-50, 50, 100) # y的绘制范围是-50到50,从改区间均匀取100个数

X, Y = np.meshgrid(x, y)

Z = func(X, Y)

fig = plt.figure() # figsize=(10, 10))

ax = Axes3D(fig)

plt.xlabel('x')

plt.ylabel('y')

ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap='rainbow')

plt.show()

paint_loss_func()

 2. 观察梯度方向

3. 编写代码实现算法,并可视化轨迹

SGD、Momentum、Adagrad、Adam

参考代码:

# coding: utf-8

import numpy as np

import matplotlib.pyplot as plt

from collections import OrderedDict

class SGD:

"""随机梯度下降法(Stochastic Gradient Descent)"""

def __init__(self, lr=0.01):

self.lr = lr

def update(self, params, grads):

for key in params.keys():

params[key] -= self.lr * grads[key]

class Momentum:

"""Momentum SGD"""

def __init__(self, lr=0.01, momentum=0.9):

self.lr = lr

self.momentum = momentum

self.v = None

def update(self, params, grads):

if self.v is None:

self.v = {}

for key, val in params.items():

self.v[key] = np.zeros_like(val)

for key in params.keys():

self.v[key] = self.momentum * self.v[key] - self.lr * grads[key]

params[key] += self.v[key]

class Nesterov:

"""Nesterov's Accelerated Gradient (http://arxiv.org/abs/1212.0901)"""

def __init__(self, lr=0.01, momentum=0.9):

self.lr = lr

self.momentum = momentum

self.v = None

def update(self, params, grads):

if self.v is None:

self.v = {}

for key, val in params.items():

self.v[key] = np.zeros_like(val)

for key in params.keys():

self.v[key] *= self.momentum

self.v[key] -= self.lr * grads[key]

params[key] += self.momentum * self.momentum * self.v[key]

params[key] -= (1 + self.momentum) * self.lr * grads[key]

class AdaGrad:

"""AdaGrad"""

def __init__(self, lr=0.01):

self.lr = lr

self.h = None

def update(self, params, grads):

if self.h is None:

self.h = {}

for key, val in params.items():

self.h[key] = np.zeros_like(val)

for key in params.keys():

self.h[key] += grads[key] * grads[key]

params[key] -= self.lr * grads[key] / (np.sqrt(self.h[key]) + 1e-7)

class RMSprop:

"""RMSprop"""

def __init__(self, lr=0.01, decay_rate=0.99):

self.lr = lr

self.decay_rate = decay_rate

self.h = None

def update(self, params, grads):

if self.h is None:

self.h = {}

for key, val in params.items():

self.h[key] = np.zeros_like(val)

for key in params.keys():

self.h[key] *= self.decay_rate

self.h[key] += (1 - self.decay_rate) * grads[key] * grads[key]

params[key] -= self.lr * grads[key] / (np.sqrt(self.h[key]) + 1e-7)

class Adam:

"""Adam (http://arxiv.org/abs/1412.6980v8)"""

def __init__(self, lr=0.001, beta1=0.9, beta2=0.999):

self.lr = lr

self.beta1 = beta1

self.beta2 = beta2

self.iter = 0

self.m = None

self.v = None

def update(self, params, grads):

if self.m is None:

self.m, self.v = {}, {}

for key, val in params.items():

self.m[key] = np.zeros_like(val)

self.v[key] = np.zeros_like(val)

self.iter += 1

lr_t = self.lr * np.sqrt(1.0 - self.beta2 ** self.iter) / (1.0 - self.beta1 ** self.iter)

for key in params.keys():

self.m[key] += (1 - self.beta1) * (grads[key] - self.m[key])

self.v[key] += (1 - self.beta2) * (grads[key] ** 2 - self.v[key])

params[key] -= lr_t * self.m[key] / (np.sqrt(self.v[key]) + 1e-7)

def f(x, y):

return x ** 2 / 20.0 + y ** 2

def df(x, y):

return x / 10.0, 2.0 * y

init_pos = (-7.0, 2.0)

params = {}

params['x'], params['y'] = init_pos[0], init_pos[1]

grads = {}

grads['x'], grads['y'] = 0, 0

optimizers = OrderedDict()

optimizers["SGD"] = SGD(lr=0.95)

optimizers["Momentum"] = Momentum(lr=0.1)

optimizers["AdaGrad"] = AdaGrad(lr=1.5)

optimizers["Adam"] = Adam(lr=0.3)

idx = 1

for key in optimizers:

optimizer = optimizers[key]

x_history = []

y_history = []

params['x'], params['y'] = init_pos[0], init_pos[1]

for i in range(30):

x_history.append(params['x'])

y_history.append(params['y'])

grads['x'], grads['y'] = df(params['x'], params['y'])

optimizer.update(params, grads)

x = np.arange(-10, 10, 0.01)

y = np.arange(-5, 5, 0.01)

X, Y = np.meshgrid(x, y)

Z = f(X, Y)

# for simple contour line

mask = Z > 7

Z[mask] = 0

# plot

plt.subplot(2, 2, idx)

idx += 1

plt.plot(x_history, y_history, 'o-', color="red")

plt.contour(X, Y, Z) # 绘制等高线

plt.ylim(-10, 10)

plt.xlim(-10, 10)

plt.plot(0, 0, '+')

plt.title(key)

plt.xlabel("x")

plt.ylabel("y")

plt.subplots_adjust(wspace=0, hspace=0) # 调整子图间距

plt.show()

4. 分析上图,说明原理(选做)

为什么SGD会走“之字形”?其它算法为什么会比较平滑?Momentum、AdaGrad对SGD的改进体现在哪里?速度?方向?在图上有哪些体现?仅从轨迹来看,Adam似乎不如AdaGrad效果好,是这样么?四种方法分别用了多长时间?是否符合预期?调整学习率、动量等超参数,轨迹有哪些变化?

5. 总结SGD、Momentum、AdaGrad、Adam的优缺点(选做)

6. Adam这么好,SGD是不是就用不到了?(选做)

7. 增加RMSprop、Nesterov算法。(选做)

对比Momentum与Nesterov、AdaGrad与RMSprop。

8. 基于MNIST数据集的更新方法的比较(选做)

在原图基础上,增加RMSprop、Nesterov算法。

编程实现,并谈谈自己的看法。

 优化算法代码可参考前面的内容。

 MNIST数据集的更新方法的比较:

# coding: utf-8

import os

import sys

sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定

import matplotlib.pyplot as plt

from dataset.mnist import load_mnist

from common.util import smooth_curve

from common.multi_layer_net import MultiLayerNet

from common.optimizer import *

# 0:读入MNIST数据==========

(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True)

train_size = x_train.shape[0]

batch_size = 128

max_iterations = 2000

# 1:进行实验的设置==========

optimizers = {}

optimizers['SGD'] = SGD()

optimizers['Momentum'] = Momentum()

optimizers['AdaGrad'] = AdaGrad()

optimizers['Adam'] = Adam()

#optimizers['RMSprop'] = RMSprop()

networks = {}

train_loss = {}

for key in optimizers.keys():

networks[key] = MultiLayerNet(

input_size=784, hidden_size_list=[100, 100, 100, 100],

output_size=10)

train_loss[key] = []

# 2:开始训练==========

for i in range(max_iterations):

batch_mask = np.random.choice(train_size, batch_size)

x_batch = x_train[batch_mask]

t_batch = t_train[batch_mask]

for key in optimizers.keys():

grads = networks[key].gradient(x_batch, t_batch)

optimizers[key].update(networks[key].params, grads)

loss = networks[key].loss(x_batch, t_batch)

train_loss[key].append(loss)

if i % 100 == 0:

print( "===========" + "iteration:" + str(i) + "===========")

for key in optimizers.keys():

loss = networks[key].loss(x_batch, t_batch)

print(key + ":" + str(loss))

# 3.绘制图形==========

markers = {"SGD": "o", "Momentum": "x", "AdaGrad": "s", "Adam": "D"}

x = np.arange(max_iterations)

for key in optimizers.keys():

plt.plot(x, smooth_curve(train_loss[key]), marker=markers[key], markevery=100, label=key)

plt.xlabel("iterations")

plt.ylabel("loss")

plt.ylim(0, 1)

plt.legend()

plt.show()

好文链接

评论可见,请评论后查看内容,谢谢!!!
 您阅读本篇文章共花了: