Implementing Optimization Algorithms and Visualizing Them in 3D

1. 3D Visualization of the Functions

Plot the 3D surfaces of f(x, y) = x^2 + y^2 + y^3 + xy and f(x, y) = x^2/20 + y^2, respectively.

3D plot of f(x, y) = x^2 + y^2 + y^3 + xy

Experiment code:

import numpy as np

import matplotlib.pyplot as plt

from mpl_toolkits.mplot3d import Axes3D

# 创建数据点

x = np.linspace(-10, 10, 100)

y = np.linspace(-10, 10, 100)

X, Y = np.meshgrid(x, y)

# 计算函数值

Z = X**2 + Y**2 + Y**3 + X*Y

# 创建3D图形对象

fig = plt.figure()

ax = fig.add_subplot(111, projection='3d')

# 绘制第一个函数的3D图

ax.plot_surface(X, Y, Z, cmap='viridis')

ax.set_xlabel('X')

ax.set_ylabel('Y')

ax.set_zlabel('Z')

ax.set_title('$x[0]^2 + x[1]^2 + x[1]^3 + x[0] \\times x[1]$')

# 显示图形

plt.show()

3D plot of f(x, y) = x^2/20 + y^2

Experiment code:

import numpy as np

import matplotlib.pyplot as plt

from mpl_toolkits.mplot3d import Axes3D

# 创建数据点

x = np.linspace(-10, 10, 100)

y = np.linspace(-10, 10, 100)

X, Y = np.meshgrid(x, y)

# 计算函数值

Z = X**2/20 + Y**2

# 创建3D图形对象

fig = plt.figure()

ax = fig.add_subplot(111, projection='3d')

# 绘制第二个函数的3D图

ax.plot_surface(X, Y, Z, cmap='viridis')

ax.set_xlabel('X')

ax.set_ylabel('Y')

ax.set_zlabel('Z')

ax.set_title('$\\frac{x^2}{20} + y^2$')

# 显示图形

plt.show()

2. Adding Optimization Algorithms and Plotting the Trajectories

Plot the 3D optimization trajectories on f(x, y) = x^2 + y^2 + y^3 + xy and on f(x, y) = x^2/20 + y^2, respectively.

Then, using the 3D animations, explain in your own words the strengths and weaknesses of each algorithm from several angles, such as the shape of its trajectory and its speed.

Experiment code:

import torch

import numpy as np

import copy

from matplotlib import pyplot as plt

from matplotlib import animation

from itertools import zip_longest

class Op(object):

def __init__(self):

pass

def __call__(self, inputs):

return self.forward(inputs)

# 输入:张量inputs

# 输出:张量outputs

def forward(self, inputs):

# return outputs

raise NotImplementedError

# 输入:最终输出对outputs的梯度outputs_grads

# 输出:最终输出对inputs的梯度inputs_grads

def backward(self, outputs_grads):

# return inputs_grads

raise NotImplementedError

class Optimizer(object): # 优化器基类

def __init__(self, init_lr, model):

"""

优化器类初始化

"""

# 初始化学习率,用于参数更新的计算

self.init_lr = init_lr

# 指定优化器需要优化的模型

self.model = model

def step(self):

"""

定义每次迭代如何更新参数

"""

pass

class SimpleBatchGD(Optimizer):

def __init__(self, init_lr, model):

super(SimpleBatchGD, self).__init__(init_lr=init_lr, model=model)

def step(self):

# 参数更新

if isinstance(self.model.params, dict):

for key in self.model.params.keys():

self.model.params[key] = self.model.params[key] - self.init_lr * self.model.grads[key]

class Adagrad(Optimizer):

def __init__(self, init_lr, model, epsilon):

"""

Adagrad 优化器初始化

输入:

- init_lr: 初始学习率 - model:模型,model.params存储模型参数值 - epsilon:保持数值稳定性而设置的非常小的常数

"""

super(Adagrad, self).__init__(init_lr=init_lr, model=model)

self.G = {}

for key in self.model.params.keys():

self.G[key] = 0

self.epsilon = epsilon

def adagrad(self, x, gradient_x, G, init_lr):

"""

adagrad算法更新参数,G为参数梯度平方的累计值。

"""

G += gradient_x ** 2

x -= init_lr / torch.sqrt(G + self.epsilon) * gradient_x

return x, G

def step(self):

"""

参数更新

"""

for key in self.model.params.keys():

self.model.params[key], self.G[key] = self.adagrad(self.model.params[key],

self.model.grads[key],

self.G[key],

self.init_lr)

class RMSprop(Optimizer):

def __init__(self, init_lr, model, beta, epsilon):

"""

RMSprop优化器初始化

输入:

- init_lr:初始学习率

- model:模型,model.params存储模型参数值

- beta:衰减率

- epsilon:保持数值稳定性而设置的常数

"""

super(RMSprop, self).__init__(init_lr=init_lr, model=model)

self.G = {}

for key in self.model.params.keys():

self.G[key] = 0

self.beta = beta

self.epsilon = epsilon

def rmsprop(self, x, gradient_x, G, init_lr):

"""

rmsprop算法更新参数,G为迭代梯度平方的加权移动平均

"""

G = self.beta * G + (1 - self.beta) * gradient_x ** 2

x -= init_lr / torch.sqrt(G + self.epsilon) * gradient_x

return x, G

def step(self):

"""参数更新"""

for key in self.model.params.keys():

self.model.params[key], self.G[key] = self.rmsprop(self.model.params[key],

self.model.grads[key],

self.G[key],

self.init_lr)

class Momentum(Optimizer):

def __init__(self, init_lr, model, rho):

"""

Momentum优化器初始化

输入:

- init_lr:初始学习率

- model:模型,model.params存储模型参数值

- rho:动量因子

"""

super(Momentum, self).__init__(init_lr=init_lr, model=model)

self.delta_x = {}

for key in self.model.params.keys():

self.delta_x[key] = 0

self.rho = rho

def momentum(self, x, gradient_x, delta_x, init_lr):

"""

momentum算法更新参数,delta_x为梯度的加权移动平均

"""

delta_x = self.rho * delta_x - init_lr * gradient_x

x += delta_x

return x, delta_x

def step(self):

"""参数更新"""

for key in self.model.params.keys():

self.model.params[key], self.delta_x[key] = self.momentum(self.model.params[key],

self.model.grads[key],

self.delta_x[key],

self.init_lr)

class Adam(Optimizer):

def __init__(self, init_lr, model, beta1, beta2, epsilon):

"""

Adam优化器初始化

输入:

- init_lr:初始学习率

- model:模型,model.params存储模型参数值

- beta1, beta2:移动平均的衰减率

- epsilon:保持数值稳定性而设置的常数

"""

super(Adam, self).__init__(init_lr=init_lr, model=model)

self.beta1 = beta1

self.beta2 = beta2

self.epsilon = epsilon

self.M, self.G = {}, {}

for key in self.model.params.keys():

self.M[key] = 0

self.G[key] = 0

self.t = 1

def adam(self, x, gradient_x, G, M, t, init_lr):

"""

adam算法更新参数

输入:

- x:参数

- G:梯度平方的加权移动平均

- M:梯度的加权移动平均

- t:迭代次数

- init_lr:初始学习率

"""

M = self.beta1 * M + (1 - self.beta1) * gradient_x

G = self.beta2 * G + (1 - self.beta2) * gradient_x ** 2

M_hat = M / (1 - self.beta1 ** t)

G_hat = G / (1 - self.beta2 ** t)

t += 1

x -= init_lr / torch.sqrt(G_hat + self.epsilon) * M_hat

return x, G, M, t

def step(self):

"""参数更新"""

for key in self.model.params.keys():

self.model.params[key], self.G[key], self.M[key], self.t = self.adam(self.model.params[key],

self.model.grads[key],

self.G[key],

self.M[key],

self.t,

self.init_lr)

class OptimizedFunction3D(Op):

def __init__(self):

super(OptimizedFunction3D, self).__init__()

self.params = {'x': 0}

self.grads = {'x': 0}

def forward(self, x):

self.params['x'] = x

return x[0] ** 2 + x[1] ** 2 + x[1] ** 3 + x[0] * x[1]

def backward(self):

x = self.params['x']

gradient1 = 2 * x[0] + x[1]

gradient2 = 2 * x[1] + 3 * x[1] ** 2 + x[0]

grad1 = torch.Tensor([gradient1])

grad2 = torch.Tensor([gradient2])

self.grads['x'] = torch.cat([grad1, grad2])

class Visualization3D(animation.FuncAnimation):

""" 绘制动态图像,可视化参数更新轨迹 """

def __init__(self, *xy_values, z_values, labels=[], colors=[], fig, ax, interval=600, blit=True, **kwargs):

"""

初始化3d可视化类

输入:

xy_values:三维中x,y维度的值

z_values:三维中z维度的值

labels:每个参数更新轨迹的标签

colors:每个轨迹的颜色

interval:帧之间的延迟(以毫秒为单位)

blit:是否优化绘图

"""

self.fig = fig

self.ax = ax

self.xy_values = xy_values

self.z_values = z_values

frames = max(xy_value.shape[0] for xy_value in xy_values)

self.lines = [ax.plot([], [], [], label=label, color=color, lw=2)[0]

for _, label, color in zip_longest(xy_values, labels, colors)]

super(Visualization3D, self).__init__(fig, self.animate, init_func=self.init_animation, frames=frames,

interval=interval, blit=blit, **kwargs)

def init_animation(self):

# 数值初始化

for line in self.lines:

line.set_data([], [])

# line.set_3d_properties(np.asarray([])) # 源程序中有这一行,加上会报错。 Edit by David 2022.12.4

return self.lines

def animate(self, i):

# 将x,y,z三个数据传入,绘制三维图像

for line, xy_value, z_value in zip(self.lines, self.xy_values, self.z_values):

line.set_data(xy_value[:i, 0], xy_value[:i, 1])

line.set_3d_properties(z_value[:i])

return self.lines

def train_f(model, optimizer, x_init, epoch):

x = x_init

all_x = []

losses = []

for i in range(epoch):

all_x.append(copy.deepcopy(x.numpy())) # 浅拷贝 改为 深拷贝, 否则List的原值会被改变。 Edit by David 2022.12.4.

loss = model(x)

losses.append(loss)

model.backward()

optimizer.step()

x = model.params['x']

return torch.Tensor(np.array(all_x)), losses

# 构建5个模型,分别配备不同的优化器

model1 = OptimizedFunction3D()

opt_gd = SimpleBatchGD(init_lr=0.01, model=model1)

model2 = OptimizedFunction3D()

opt_adagrad = Adagrad(init_lr=0.5, model=model2, epsilon=1e-7)

model3 = OptimizedFunction3D()

opt_rmsprop = RMSprop(init_lr=0.1, model=model3, beta=0.9, epsilon=1e-7)

model4 = OptimizedFunction3D()

opt_momentum = Momentum(init_lr=0.01, model=model4, rho=0.9)

model5 = OptimizedFunction3D()

opt_adam = Adam(init_lr=0.1, model=model5, beta1=0.9, beta2=0.99, epsilon=1e-7)

models = [model1, model2, model3, model4, model5]

opts = [opt_gd, opt_adagrad, opt_rmsprop, opt_momentum, opt_adam]

x_all_opts = []

z_all_opts = []

# 使用不同优化器训练

for model, opt in zip(models, opts):

x_init = torch.FloatTensor([2, 3])

x_one_opt, z_one_opt = train_f(model, opt, x_init, 150) # epoch

# 保存参数值

x_all_opts.append(x_one_opt.numpy())

z_all_opts.append(np.squeeze(z_one_opt))

# 使用numpy.meshgrid生成x1,x2矩阵,矩阵的每一行为[-3, 3],以0.1为间隔的数值

x1 = np.arange(-3, 3, 0.1)

x2 = np.arange(-3, 3, 0.1)

x1, x2 = np.meshgrid(x1, x2)

init_x = torch.Tensor(np.array([x1, x2]))

model = OptimizedFunction3D()

# 绘制 f_3d函数 的 三维图像

fig = plt.figure()

ax = plt.axes(projection='3d')

X = init_x[0].numpy()

Y = init_x[1].numpy()

Z = model(init_x).numpy() # 改为 model(init_x).numpy() David 2022.12.4

ax.plot_surface(X, Y, Z, cmap='rainbow')

ax.set_xlabel('x1')

ax.set_ylabel('x2')

ax.set_zlabel('f(x1,x2)')

labels = ['SGD', 'AdaGrad', 'RMSprop', 'Momentum', 'Adam']

colors = ['#f6373c', '#f6f237', '#45f637', '#37f0f6', '#000000']

animator = Visualization3D(*x_all_opts, z_values=z_all_opts, labels=labels, colors=colors, fig=fig, ax=ax)

ax.legend(loc='upper left')

plt.show()

Screenshot of the experiment results:

Experiment code:

import torch

import numpy as np

import copy

from matplotlib import pyplot as plt

from matplotlib import animation

from itertools import zip_longest

from matplotlib import cm

class Op(object):

def __init__(self):

pass

def __call__(self, inputs):

return self.forward(inputs)

# 输入:张量inputs

# 输出:张量outputs

def forward(self, inputs):

# return outputs

raise NotImplementedError

# 输入:最终输出对outputs的梯度outputs_grads

# 输出:最终输出对inputs的梯度inputs_grads

def backward(self, outputs_grads):

# return inputs_grads

raise NotImplementedError

class Optimizer(object): # 优化器基类

def __init__(self, init_lr, model):

"""

优化器类初始化

"""

# 初始化学习率,用于参数更新的计算

self.init_lr = init_lr

# 指定优化器需要优化的模型

self.model = model

def step(self):

"""

定义每次迭代如何更新参数

"""

pass

class SimpleBatchGD(Optimizer):

def __init__(self, init_lr, model):

super(SimpleBatchGD, self).__init__(init_lr=init_lr, model=model)

def step(self):

# 参数更新

if isinstance(self.model.params, dict):

for key in self.model.params.keys():

self.model.params[key] = self.model.params[key] - self.init_lr * self.model.grads[key]

class Adagrad(Optimizer):

def __init__(self, init_lr, model, epsilon):

"""

Adagrad 优化器初始化

输入:

- init_lr: 初始学习率 - model:模型,model.params存储模型参数值 - epsilon:保持数值稳定性而设置的非常小的常数

"""

super(Adagrad, self).__init__(init_lr=init_lr, model=model)

self.G = {}

for key in self.model.params.keys():

self.G[key] = 0

self.epsilon = epsilon

def adagrad(self, x, gradient_x, G, init_lr):

"""

adagrad算法更新参数,G为参数梯度平方的累计值。

"""

G += gradient_x ** 2

x -= init_lr / torch.sqrt(G + self.epsilon) * gradient_x

return x, G

def step(self):

"""

参数更新

"""

for key in self.model.params.keys():

self.model.params[key], self.G[key] = self.adagrad(self.model.params[key],

self.model.grads[key],

self.G[key],

self.init_lr)

class RMSprop(Optimizer):

def __init__(self, init_lr, model, beta, epsilon):

"""

RMSprop优化器初始化

输入:

- init_lr:初始学习率

- model:模型,model.params存储模型参数值

- beta:衰减率

- epsilon:保持数值稳定性而设置的常数

"""

super(RMSprop, self).__init__(init_lr=init_lr, model=model)

self.G = {}

for key in self.model.params.keys():

self.G[key] = 0

self.beta = beta

self.epsilon = epsilon

def rmsprop(self, x, gradient_x, G, init_lr):

"""

rmsprop算法更新参数,G为迭代梯度平方的加权移动平均

"""

G = self.beta * G + (1 - self.beta) * gradient_x ** 2

x -= init_lr / torch.sqrt(G + self.epsilon) * gradient_x

return x, G

def step(self):

"""参数更新"""

for key in self.model.params.keys():

self.model.params[key], self.G[key] = self.rmsprop(self.model.params[key],

self.model.grads[key],

self.G[key],

self.init_lr)

class Momentum(Optimizer):

def __init__(self, init_lr, model, rho):

"""

Momentum优化器初始化

输入:

- init_lr:初始学习率

- model:模型,model.params存储模型参数值

- rho:动量因子

"""

super(Momentum, self).__init__(init_lr=init_lr, model=model)

self.delta_x = {}

for key in self.model.params.keys():

self.delta_x[key] = 0

self.rho = rho

def momentum(self, x, gradient_x, delta_x, init_lr):

"""

momentum算法更新参数,delta_x为梯度的加权移动平均

"""

delta_x = self.rho * delta_x - init_lr * gradient_x

x += delta_x

return x, delta_x

def step(self):

"""参数更新"""

for key in self.model.params.keys():

self.model.params[key], self.delta_x[key] = self.momentum(self.model.params[key],

self.model.grads[key],

self.delta_x[key],

self.init_lr)

class Adam(Optimizer):

def __init__(self, init_lr, model, beta1, beta2, epsilon):

"""

Adam优化器初始化

输入:

- init_lr:初始学习率

- model:模型,model.params存储模型参数值

- beta1, beta2:移动平均的衰减率

- epsilon:保持数值稳定性而设置的常数

"""

super(Adam, self).__init__(init_lr=init_lr, model=model)

self.beta1 = beta1

self.beta2 = beta2

self.epsilon = epsilon

self.M, self.G = {}, {}

for key in self.model.params.keys():

self.M[key] = 0

self.G[key] = 0

self.t = 1

def adam(self, x, gradient_x, G, M, t, init_lr):

"""

adam算法更新参数

输入:

- x:参数

- G:梯度平方的加权移动平均

- M:梯度的加权移动平均

- t:迭代次数

- init_lr:初始学习率

"""

M = self.beta1 * M + (1 - self.beta1) * gradient_x

G = self.beta2 * G + (1 - self.beta2) * gradient_x ** 2

M_hat = M / (1 - self.beta1 ** t)

G_hat = G / (1 - self.beta2 ** t)

t += 1

x -= init_lr / torch.sqrt(G_hat + self.epsilon) * M_hat

return x, G, M, t

def step(self):

"""参数更新"""

for key in self.model.params.keys():

self.model.params[key], self.G[key], self.M[key], self.t = self.adam(self.model.params[key],

self.model.grads[key],

self.G[key],

self.M[key],

self.t,

self.init_lr)

class OptimizedFunction3D(Op):

def __init__(self):

super(OptimizedFunction3D, self).__init__()

self.params = {'x': 0}

self.grads = {'x': 0}

def forward(self, x):

self.params['x'] = x

return x[0] * x[0] / 20 + x[1] * x[1] / 1 # x[0] ** 2 + x[1] ** 2 + x[1] ** 3 + x[0] * x[1]

def backward(self):

x = self.params['x']

gradient1 = 2 * x[0] / 20

gradient2 = 2 * x[1] / 1

grad1 = torch.Tensor([gradient1])

grad2 = torch.Tensor([gradient2])

self.grads['x'] = torch.cat([grad1, grad2])

class Visualization3D(animation.FuncAnimation):

""" 绘制动态图像,可视化参数更新轨迹 """

def __init__(self, *xy_values, z_values, labels=[], colors=[], fig, ax, interval=100, blit=True, **kwargs):

"""

初始化3d可视化类

输入:

xy_values:三维中x,y维度的值

z_values:三维中z维度的值

labels:每个参数更新轨迹的标签

colors:每个轨迹的颜色

interval:帧之间的延迟(以毫秒为单位)

blit:是否优化绘图

"""

self.fig = fig

self.ax = ax

self.xy_values = xy_values

self.z_values = z_values

frames = max(xy_value.shape[0] for xy_value in xy_values)

self.lines = [ax.plot([], [], [], label=label, color=color, lw=2)[0]

for _, label, color in zip_longest(xy_values, labels, colors)]

self.points = [ax.plot([], [], [], color=color, markeredgewidth=1, markeredgecolor='black', marker='o')[0]

for _, color in zip_longest(xy_values, colors)]

# print(self.lines)

super(Visualization3D, self).__init__(fig, self.animate, init_func=self.init_animation, frames=frames,

interval=interval, blit=blit, **kwargs)

def init_animation(self):

# 数值初始化

for line in self.lines:

line.set_data_3d([], [], [])

for point in self.points:

point.set_data_3d([], [], [])

return self.points + self.lines

def animate(self, i):

# 将x,y,z三个数据传入,绘制三维图像

for line, xy_value, z_value in zip(self.lines, self.xy_values, self.z_values):

line.set_data_3d(xy_value[:i, 0], xy_value[:i, 1], z_value[:i])

for point, xy_value, z_value in zip(self.points, self.xy_values, self.z_values):

point.set_data_3d(xy_value[i, 0], xy_value[i, 1], z_value[i])

return self.points + self.lines

def train_f(model, optimizer, x_init, epoch):

x = x_init

all_x = []

losses = []

for i in range(epoch):

all_x.append(copy.deepcopy(x.numpy())) # 浅拷贝 改为 深拷贝, 否则List的原值会被改变。 Edit by David 2022.12.4.

loss = model(x)

losses.append(loss)

model.backward()

optimizer.step()

x = model.params['x']

return torch.Tensor(np.array(all_x)), losses

# 构建5个模型,分别配备不同的优化器

model1 = OptimizedFunction3D()

opt_gd = SimpleBatchGD(init_lr=0.95, model=model1)

model2 = OptimizedFunction3D()

opt_adagrad = Adagrad(init_lr=1.5, model=model2, epsilon=1e-7)

model3 = OptimizedFunction3D()

opt_rmsprop = RMSprop(init_lr=0.05, model=model3, beta=0.9, epsilon=1e-7)

model4 = OptimizedFunction3D()

opt_momentum = Momentum(init_lr=0.1, model=model4, rho=0.9)

model5 = OptimizedFunction3D()

opt_adam = Adam(init_lr=0.3, model=model5, beta1=0.9, beta2=0.99, epsilon=1e-7)

models = [model1, model2, model3, model4, model5]

opts = [opt_gd, opt_adagrad, opt_rmsprop, opt_momentum, opt_adam]

x_all_opts = []

z_all_opts = []

# 使用不同优化器训练

for model, opt in zip(models, opts):

x_init = torch.FloatTensor([-7, 2])

x_one_opt, z_one_opt = train_f(model, opt, x_init, 100) # epoch

# 保存参数值

x_all_opts.append(x_one_opt.numpy())

z_all_opts.append(np.squeeze(z_one_opt))

# Use numpy.meshgrid to build the x1, x2 grid (x1 over [-10, 10) with step 0.01, x2 over [-5, 5) with step 0.01)

x1 = np.arange(-10, 10, 0.01)

x2 = np.arange(-5, 5, 0.01)

x1, x2 = np.meshgrid(x1, x2)

init_x = torch.Tensor(np.array([x1, x2]))

model = OptimizedFunction3D()

# 绘制 f_3d函数 的 三维图像

fig = plt.figure()

ax = plt.axes(projection='3d')

X = init_x[0].numpy()

Y = init_x[1].numpy()

Z = model(init_x).numpy() # 改为 model(init_x).numpy() David 2022.12.4

surf = ax.plot_surface(X, Y, Z, edgecolor='grey', cmap=cm.coolwarm)

# fig.colorbar(surf, shrink=0.5, aspect=1)

# ax.set_zlim(-3, 2)

ax.set_xlabel('x1')

ax.set_ylabel('x2')

ax.set_zlabel('f(x1,x2)')

labels = ['SGD', 'AdaGrad', 'RMSprop', 'Momentum', 'Adam']

colors = ['#8B0000', '#0000FF', '#000000', '#008B00', '#FF0000']

animator = Visualization3D(*x_all_opts, z_values=z_all_opts, labels=labels, colors=colors, fig=fig, ax=ax)

ax.legend(loc='upper right')

plt.show()

# animator.save('teaser' + '.gif', writer='imagemagick',fps=10) # 效果不好,估计被挡住了…… 有待进一步提高 Edit by David 2022.12.4

# save不好用,不费劲了,安装个软件做gif https://pc.qq.com/detail/13/detail_23913.html

 

 

3. Reproducing the Classic CS231n Animations

Animations that may help your intuitions about the learning process dynamics. 

Left: Contours of a loss surface and time evolution of different optimization algorithms. Notice the "overshooting" behavior of momentum-based methods, which make the optimization look like a ball rolling down the hill. 

Right: A visualization of a saddle point in the optimization landscape, where the curvature along different dimensions has different signs (one dimension curves up and another down). Notice that SGD has a very hard time breaking symmetry and gets stuck on the top. Conversely, algorithms such as RMSprop will see very low gradients in the saddle direction. Due to the denominator term in the RMSprop update, this will increase the effective learning rate along this direction, helping RMSprop proceed. 

Trajectory-only version:

import torch

import numpy as np

import copy

from matplotlib import pyplot as plt

from matplotlib import animation

from itertools import zip_longest

from matplotlib import cm

class Op(object):

def __init__(self):

pass

def __call__(self, inputs):

return self.forward(inputs)

# 输入:张量inputs

# 输出:张量outputs

def forward(self, inputs):

# return outputs

raise NotImplementedError

# 输入:最终输出对outputs的梯度outputs_grads

# 输出:最终输出对inputs的梯度inputs_grads

def backward(self, outputs_grads):

# return inputs_grads

raise NotImplementedError

class Optimizer(object): # 优化器基类

def __init__(self, init_lr, model):

"""

优化器类初始化

"""

# 初始化学习率,用于参数更新的计算

self.init_lr = init_lr

# 指定优化器需要优化的模型

self.model = model

def step(self):

"""

定义每次迭代如何更新参数

"""

pass

class SimpleBatchGD(Optimizer):

def __init__(self, init_lr, model):

super(SimpleBatchGD, self).__init__(init_lr=init_lr, model=model)

def step(self):

# 参数更新

if isinstance(self.model.params, dict):

for key in self.model.params.keys():

self.model.params[key] = self.model.params[key] - self.init_lr * self.model.grads[key]

class Adagrad(Optimizer):

def __init__(self, init_lr, model, epsilon):

"""

Adagrad 优化器初始化

输入:

- init_lr: 初始学习率 - model:模型,model.params存储模型参数值 - epsilon:保持数值稳定性而设置的非常小的常数

"""

super(Adagrad, self).__init__(init_lr=init_lr, model=model)

self.G = {}

for key in self.model.params.keys():

self.G[key] = 0

self.epsilon = epsilon

def adagrad(self, x, gradient_x, G, init_lr):

"""

adagrad算法更新参数,G为参数梯度平方的累计值。

"""

G += gradient_x ** 2

x -= init_lr / torch.sqrt(G + self.epsilon) * gradient_x

return x, G

def step(self):

"""

参数更新

"""

for key in self.model.params.keys():

self.model.params[key], self.G[key] = self.adagrad(self.model.params[key],

self.model.grads[key],

self.G[key],

self.init_lr)

class RMSprop(Optimizer):

def __init__(self, init_lr, model, beta, epsilon):

"""

RMSprop优化器初始化

输入:

- init_lr:初始学习率

- model:模型,model.params存储模型参数值

- beta:衰减率

- epsilon:保持数值稳定性而设置的常数

"""

super(RMSprop, self).__init__(init_lr=init_lr, model=model)

self.G = {}

for key in self.model.params.keys():

self.G[key] = 0

self.beta = beta

self.epsilon = epsilon

def rmsprop(self, x, gradient_x, G, init_lr):

"""

rmsprop算法更新参数,G为迭代梯度平方的加权移动平均

"""

G = self.beta * G + (1 - self.beta) * gradient_x ** 2

x -= init_lr / torch.sqrt(G + self.epsilon) * gradient_x

return x, G

def step(self):

"""参数更新"""

for key in self.model.params.keys():

self.model.params[key], self.G[key] = self.rmsprop(self.model.params[key],

self.model.grads[key],

self.G[key],

self.init_lr)

class Momentum(Optimizer):

def __init__(self, init_lr, model, rho):

"""

Momentum优化器初始化

输入:

- init_lr:初始学习率

- model:模型,model.params存储模型参数值

- rho:动量因子

"""

super(Momentum, self).__init__(init_lr=init_lr, model=model)

self.delta_x = {}

for key in self.model.params.keys():

self.delta_x[key] = 0

self.rho = rho

def momentum(self, x, gradient_x, delta_x, init_lr):

"""

momentum算法更新参数,delta_x为梯度的加权移动平均

"""

delta_x = self.rho * delta_x - init_lr * gradient_x

x += delta_x

return x, delta_x

def step(self):

"""参数更新"""

for key in self.model.params.keys():

self.model.params[key], self.delta_x[key] = self.momentum(self.model.params[key],

self.model.grads[key],

self.delta_x[key],

self.init_lr)

class Adam(Optimizer):

def __init__(self, init_lr, model, beta1, beta2, epsilon):

"""

Adam优化器初始化

输入:

- init_lr:初始学习率

- model:模型,model.params存储模型参数值

- beta1, beta2:移动平均的衰减率

- epsilon:保持数值稳定性而设置的常数

"""

super(Adam, self).__init__(init_lr=init_lr, model=model)

self.beta1 = beta1

self.beta2 = beta2

self.epsilon = epsilon

self.M, self.G = {}, {}

for key in self.model.params.keys():

self.M[key] = 0

self.G[key] = 0

self.t = 1

def adam(self, x, gradient_x, G, M, t, init_lr):

"""

adam算法更新参数

输入:

- x:参数

- G:梯度平方的加权移动平均

- M:梯度的加权移动平均

- t:迭代次数

- init_lr:初始学习率

"""

M = self.beta1 * M + (1 - self.beta1) * gradient_x

G = self.beta2 * G + (1 - self.beta2) * gradient_x ** 2

M_hat = M / (1 - self.beta1 ** t)

G_hat = G / (1 - self.beta2 ** t)

t += 1

x -= init_lr / torch.sqrt(G_hat + self.epsilon) * M_hat

return x, G, M, t

def step(self):

"""参数更新"""

for key in self.model.params.keys():

self.model.params[key], self.G[key], self.M[key], self.t = self.adam(self.model.params[key],

self.model.grads[key],

self.G[key],

self.M[key],

self.t,

self.init_lr)

class OptimizedFunction3D(Op):

def __init__(self):

super(OptimizedFunction3D, self).__init__()

self.params = {'x': 0}

self.grads = {'x': 0}

def forward(self, x):

self.params['x'] = x

return - x[0] * x[0] / 2 + x[1] * x[1] / 1 # x[0] ** 2 + x[1] ** 2 + x[1] ** 3 + x[0] * x[1]

def backward(self):

x = self.params['x']

gradient1 = - 2 * x[0] / 2

gradient2 = 2 * x[1] / 1

grad1 = torch.Tensor([gradient1])

grad2 = torch.Tensor([gradient2])

self.grads['x'] = torch.cat([grad1, grad2])

class Visualization3D(animation.FuncAnimation):

""" 绘制动态图像,可视化参数更新轨迹 """

def __init__(self, *xy_values, z_values, labels=[], colors=[], fig, ax, interval=100, blit=True, **kwargs):

"""

初始化3d可视化类

输入:

xy_values:三维中x,y维度的值

z_values:三维中z维度的值

labels:每个参数更新轨迹的标签

colors:每个轨迹的颜色

interval:帧之间的延迟(以毫秒为单位)

blit:是否优化绘图

"""

self.fig = fig

self.ax = ax

self.xy_values = xy_values

self.z_values = z_values

frames = max(xy_value.shape[0] for xy_value in xy_values)

# , marker = 'o'

self.lines = [ax.plot([], [], [], label=label, color=color, lw=2)[0]

for _, label, color in zip_longest(xy_values, labels, colors)]

print(self.lines)

super(Visualization3D, self).__init__(fig, self.animate, init_func=self.init_animation, frames=frames,

interval=interval, blit=blit, **kwargs)

def init_animation(self):

# 数值初始化

for line in self.lines:

line.set_data([], [])

# line.set_3d_properties(np.asarray([])) # 源程序中有这一行,加上会报错。 Edit by David 2022.12.4

return self.lines

def animate(self, i):

# 将x,y,z三个数据传入,绘制三维图像

for line, xy_value, z_value in zip(self.lines, self.xy_values, self.z_values):

line.set_data(xy_value[:i, 0], xy_value[:i, 1])

line.set_3d_properties(z_value[:i])

return self.lines

def train_f(model, optimizer, x_init, epoch):

x = x_init

all_x = []

losses = []

for i in range(epoch):

all_x.append(copy.deepcopy(x.numpy())) # 浅拷贝 改为 深拷贝, 否则List的原值会被改变。 Edit by David 2022.12.4.

loss = model(x)

losses.append(loss)

model.backward()

optimizer.step()

x = model.params['x']

return torch.Tensor(np.array(all_x)), losses

# 构建5个模型,分别配备不同的优化器

model1 = OptimizedFunction3D()

opt_gd = SimpleBatchGD(init_lr=0.05, model=model1)

model2 = OptimizedFunction3D()

opt_adagrad = Adagrad(init_lr=0.05, model=model2, epsilon=1e-7)

model3 = OptimizedFunction3D()

opt_rmsprop = RMSprop(init_lr=0.05, model=model3, beta=0.9, epsilon=1e-7)

model4 = OptimizedFunction3D()

opt_momentum = Momentum(init_lr=0.05, model=model4, rho=0.9)

model5 = OptimizedFunction3D()

opt_adam = Adam(init_lr=0.05, model=model5, beta1=0.9, beta2=0.99, epsilon=1e-7)

models = [model5, model2, model3, model4, model1]

opts = [opt_adam, opt_adagrad, opt_rmsprop, opt_momentum, opt_gd]

x_all_opts = []

z_all_opts = []

# 使用不同优化器训练

for model, opt in zip(models, opts):

x_init = torch.FloatTensor([0.00001, 0.5])

x_one_opt, z_one_opt = train_f(model, opt, x_init, 100) # epoch

# 保存参数值

x_all_opts.append(x_one_opt.numpy())

z_all_opts.append(np.squeeze(z_one_opt))

# Use numpy.meshgrid to build the x1, x2 grid (x1 over [-1, 2) with step 0.01, x2 over [-1, 1) with step 0.05)

x1 = np.arange(-1, 2, 0.01)

x2 = np.arange(-1, 1, 0.05)

x1, x2 = np.meshgrid(x1, x2)

init_x = torch.Tensor(np.array([x1, x2]))

model = OptimizedFunction3D()

# 绘制 f_3d函数 的 三维图像

fig = plt.figure()

ax = plt.axes(projection='3d')

X = init_x[0].numpy()

Y = init_x[1].numpy()

Z = model(init_x).numpy() # 改为 model(init_x).numpy() David 2022.12.4

surf = ax.plot_surface(X, Y, Z, edgecolor='grey', cmap=cm.coolwarm)

# fig.colorbar(surf, shrink=0.5, aspect=1)

ax.set_zlim(-3, 2)

ax.set_xlabel('x1')

ax.set_ylabel('x2')

ax.set_zlabel('f(x1,x2)')

labels = ['Adam', 'AdaGrad', 'RMSprop', 'Momentum', 'SGD']

colors = ['#8B0000', '#0000FF', '#000000', '#008B00', '#FF0000']

animator = Visualization3D(*x_all_opts, z_values=z_all_opts, labels=labels, colors=colors, fig=fig, ax=ax)

ax.legend(loc='upper right')

plt.show()

# animator.save('animation.gif') # 效果不好,估计被挡住了…… 有待进一步提高 Edit by David 2022.12.4

Ball-marker version:

import torch

import numpy as np

import copy

from matplotlib import pyplot as plt

from matplotlib import animation

from itertools import zip_longest

from matplotlib import cm

class Op(object):

def __init__(self):

pass

def __call__(self, inputs):

return self.forward(inputs)

# 输入:张量inputs

# 输出:张量outputs

def forward(self, inputs):

# return outputs

raise NotImplementedError

# 输入:最终输出对outputs的梯度outputs_grads

# 输出:最终输出对inputs的梯度inputs_grads

def backward(self, outputs_grads):

# return inputs_grads

raise NotImplementedError

class Optimizer(object): # 优化器基类

def __init__(self, init_lr, model):

"""

优化器类初始化

"""

# 初始化学习率,用于参数更新的计算

self.init_lr = init_lr

# 指定优化器需要优化的模型

self.model = model

def step(self):

"""

定义每次迭代如何更新参数

"""

pass

class SimpleBatchGD(Optimizer):

def __init__(self, init_lr, model):

super(SimpleBatchGD, self).__init__(init_lr=init_lr, model=model)

def step(self):

# 参数更新

if isinstance(self.model.params, dict):

for key in self.model.params.keys():

self.model.params[key] = self.model.params[key] - self.init_lr * self.model.grads[key]

class Adagrad(Optimizer):

def __init__(self, init_lr, model, epsilon):

"""

Adagrad 优化器初始化

输入:

- init_lr: 初始学习率 - model:模型,model.params存储模型参数值 - epsilon:保持数值稳定性而设置的非常小的常数

"""

super(Adagrad, self).__init__(init_lr=init_lr, model=model)

self.G = {}

for key in self.model.params.keys():

self.G[key] = 0

self.epsilon = epsilon

def adagrad(self, x, gradient_x, G, init_lr):

"""

adagrad算法更新参数,G为参数梯度平方的累计值。

"""

G += gradient_x ** 2

x -= init_lr / torch.sqrt(G + self.epsilon) * gradient_x

return x, G

def step(self):

"""

参数更新

"""

for key in self.model.params.keys():

self.model.params[key], self.G[key] = self.adagrad(self.model.params[key],

self.model.grads[key],

self.G[key],

self.init_lr)

class RMSprop(Optimizer):

def __init__(self, init_lr, model, beta, epsilon):

"""

RMSprop优化器初始化

输入:

- init_lr:初始学习率

- model:模型,model.params存储模型参数值

- beta:衰减率

- epsilon:保持数值稳定性而设置的常数

"""

super(RMSprop, self).__init__(init_lr=init_lr, model=model)

self.G = {}

for key in self.model.params.keys():

self.G[key] = 0

self.beta = beta

self.epsilon = epsilon

def rmsprop(self, x, gradient_x, G, init_lr):

"""

rmsprop算法更新参数,G为迭代梯度平方的加权移动平均

"""

G = self.beta * G + (1 - self.beta) * gradient_x ** 2

x -= init_lr / torch.sqrt(G + self.epsilon) * gradient_x

return x, G

def step(self):

"""参数更新"""

for key in self.model.params.keys():

self.model.params[key], self.G[key] = self.rmsprop(self.model.params[key],

self.model.grads[key],

self.G[key],

self.init_lr)

class Momentum(Optimizer):

def __init__(self, init_lr, model, rho):

"""

Momentum优化器初始化

输入:

- init_lr:初始学习率

- model:模型,model.params存储模型参数值

- rho:动量因子

"""

super(Momentum, self).__init__(init_lr=init_lr, model=model)

self.delta_x = {}

for key in self.model.params.keys():

self.delta_x[key] = 0

self.rho = rho

def momentum(self, x, gradient_x, delta_x, init_lr):

"""

momentum算法更新参数,delta_x为梯度的加权移动平均

"""

delta_x = self.rho * delta_x - init_lr * gradient_x

x += delta_x

return x, delta_x

def step(self):

"""参数更新"""

for key in self.model.params.keys():

self.model.params[key], self.delta_x[key] = self.momentum(self.model.params[key],

self.model.grads[key],

self.delta_x[key],

self.init_lr)

class Adam(Optimizer):

def __init__(self, init_lr, model, beta1, beta2, epsilon):

"""

Adam优化器初始化

输入:

- init_lr:初始学习率

- model:模型,model.params存储模型参数值

- beta1, beta2:移动平均的衰减率

- epsilon:保持数值稳定性而设置的常数

"""

super(Adam, self).__init__(init_lr=init_lr, model=model)

self.beta1 = beta1

self.beta2 = beta2

self.epsilon = epsilon

self.M, self.G = {}, {}

for key in self.model.params.keys():

self.M[key] = 0

self.G[key] = 0

self.t = 1

def adam(self, x, gradient_x, G, M, t, init_lr):

"""

adam算法更新参数

输入:

- x:参数

- G:梯度平方的加权移动平均

- M:梯度的加权移动平均

- t:迭代次数

- init_lr:初始学习率

"""

M = self.beta1 * M + (1 - self.beta1) * gradient_x

G = self.beta2 * G + (1 - self.beta2) * gradient_x ** 2

M_hat = M / (1 - self.beta1 ** t)

G_hat = G / (1 - self.beta2 ** t)

t += 1

x -= init_lr / torch.sqrt(G_hat + self.epsilon) * M_hat

return x, G, M, t

def step(self):

"""参数更新"""

for key in self.model.params.keys():

self.model.params[key], self.G[key], self.M[key], self.t = self.adam(self.model.params[key],

self.model.grads[key],

self.G[key],

self.M[key],

self.t,

self.init_lr)

class OptimizedFunction3D(Op):

def __init__(self):

super(OptimizedFunction3D, self).__init__()

self.params = {'x': 0}

self.grads = {'x': 0}

def forward(self, x):

self.params['x'] = x

return - x[0] * x[0] / 2 + x[1] * x[1] / 1 # x[0] ** 2 + x[1] ** 2 + x[1] ** 3 + x[0] * x[1]

def backward(self):

x = self.params['x']

gradient1 = - 2 * x[0] / 2

gradient2 = 2 * x[1] / 1

grad1 = torch.Tensor([gradient1])

grad2 = torch.Tensor([gradient2])

self.grads['x'] = torch.cat([grad1, grad2])

class Visualization3D(animation.FuncAnimation):

""" 绘制动态图像,可视化参数更新轨迹 """

def __init__(self, *xy_values, z_values, labels=[], colors=[], fig, ax, interval=100, blit=True, **kwargs):

"""

初始化3d可视化类

输入:

xy_values:三维中x,y维度的值

z_values:三维中z维度的值

labels:每个参数更新轨迹的标签

colors:每个轨迹的颜色

interval:帧之间的延迟(以毫秒为单位)

blit:是否优化绘图

"""

self.fig = fig

self.ax = ax

self.xy_values = xy_values

self.z_values = z_values

frames = max(xy_value.shape[0] for xy_value in xy_values)

self.lines = [ax.plot([], [], [], label=label, color=color, lw=2)[0]

for _, label, color in zip_longest(xy_values, labels, colors)]

self.points = [ax.plot([], [], [], color=color, lw=2, marker='o')[0]

for _, color in zip_longest(xy_values, colors)]

# print(self.lines)

super(Visualization3D, self).__init__(fig, self.animate, init_func=self.init_animation, frames=frames,

interval=interval, blit=blit, **kwargs)

def init_animation(self):

# 数值初始化

for line in self.lines:

line.set_data_3d([], [], [])

for point in self.points:

point.set_data_3d([], [], [])

return self.points

def animate(self, i):

# 将x,y,z三个数据传入,绘制三维图像

for line, xy_value, z_value in zip(self.lines, self.xy_values, self.z_values):

line.set_data_3d(xy_value[:i, 0], xy_value[:i, 1], z_value[:i])

for point, xy_value, z_value in zip(self.points, self.xy_values, self.z_values):

point.set_data_3d(xy_value[i, 0], xy_value[i, 1], z_value[i])

return self.points

def train_f(model, optimizer, x_init, epoch):

x = x_init

all_x = []

losses = []

for i in range(epoch):

all_x.append(copy.deepcopy(x.numpy())) # 浅拷贝 改为 深拷贝, 否则List的原值会被改变。 Edit by David 2022.12.4.

loss = model(x)

losses.append(loss)

model.backward()

optimizer.step()

x = model.params['x']

return torch.Tensor(np.array(all_x)), losses

# 构建5个模型,分别配备不同的优化器

model1 = OptimizedFunction3D()

opt_gd = SimpleBatchGD(init_lr=0.05, model=model1)

model2 = OptimizedFunction3D()

opt_adagrad = Adagrad(init_lr=0.05, model=model2, epsilon=1e-7)

model3 = OptimizedFunction3D()

opt_rmsprop = RMSprop(init_lr=0.05, model=model3, beta=0.9, epsilon=1e-7)

model4 = OptimizedFunction3D()

opt_momentum = Momentum(init_lr=0.05, model=model4, rho=0.9)

model5 = OptimizedFunction3D()

opt_adam = Adam(init_lr=0.05, model=model5, beta1=0.9, beta2=0.99, epsilon=1e-7)

models = [model5, model2, model3, model4, model1]

opts = [opt_adam, opt_adagrad, opt_rmsprop, opt_momentum, opt_gd]

x_all_opts = []

z_all_opts = []

# 使用不同优化器训练

for model, opt in zip(models, opts):

x_init = torch.FloatTensor([0.00001, 0.5])

x_one_opt, z_one_opt = train_f(model, opt, x_init, 100) # epoch

# 保存参数值

x_all_opts.append(x_one_opt.numpy())

z_all_opts.append(np.squeeze(z_one_opt))

# Use numpy.meshgrid to build the x1, x2 grid (x1 over [-1, 2) with step 0.01, x2 over [-1, 1) with step 0.05)

x1 = np.arange(-1, 2, 0.01)

x2 = np.arange(-1, 1, 0.05)

x1, x2 = np.meshgrid(x1, x2)

init_x = torch.Tensor(np.array([x1, x2]))

model = OptimizedFunction3D()

# 绘制 f_3d函数 的 三维图像

fig = plt.figure()

ax = plt.axes(projection='3d')

X = init_x[0].numpy()

Y = init_x[1].numpy()

Z = model(init_x).numpy() # 改为 model(init_x).numpy() David 2022.12.4

surf = ax.plot_surface(X, Y, Z, edgecolor='grey', cmap=cm.coolwarm)

# fig.colorbar(surf, shrink=0.5, aspect=1)

ax.set_zlim(-3, 2)

ax.set_xlabel('x1')

ax.set_ylabel('x2')

ax.set_zlabel('f(x1,x2)')

labels = ['Adam', 'AdaGrad', 'RMSprop', 'Momentum', 'SGD']

colors = ['#8B0000', '#0000FF', '#000000', '#008B00', '#FF0000']

animator = Visualization3D(*x_all_opts, z_values=z_all_opts, labels=labels, colors=colors, fig=fig, ax=ax)

ax.legend(loc='upper right')

plt.show()

animator.save('animation.gif') # 效果不好,估计被挡住了…… 有待进一步提高 Edit by David 2022.12.4

Combined version:

import torch

import numpy as np

import copy

from matplotlib import pyplot as plt

from matplotlib import animation

from itertools import zip_longest

from matplotlib import cm

class Op(object):

def __init__(self):

pass

def __call__(self, inputs):

return self.forward(inputs)

# 输入:张量inputs

# 输出:张量outputs

def forward(self, inputs):

# return outputs

raise NotImplementedError

# 输入:最终输出对outputs的梯度outputs_grads

# 输出:最终输出对inputs的梯度inputs_grads

def backward(self, outputs_grads):

# return inputs_grads

raise NotImplementedError

class Optimizer(object): # 优化器基类

def __init__(self, init_lr, model):

"""

优化器类初始化

"""

# 初始化学习率,用于参数更新的计算

self.init_lr = init_lr

# 指定优化器需要优化的模型

self.model = model

def step(self):

"""

定义每次迭代如何更新参数

"""

pass

class SimpleBatchGD(Optimizer):

def __init__(self, init_lr, model):

super(SimpleBatchGD, self).__init__(init_lr=init_lr, model=model)

def step(self):

# 参数更新

if isinstance(self.model.params, dict):

for key in self.model.params.keys():

self.model.params[key] = self.model.params[key] - self.init_lr * self.model.grads[key]

class Adagrad(Optimizer):

def __init__(self, init_lr, model, epsilon):

"""

Adagrad 优化器初始化

输入:

- init_lr: 初始学习率 - model:模型,model.params存储模型参数值 - epsilon:保持数值稳定性而设置的非常小的常数

"""

super(Adagrad, self).__init__(init_lr=init_lr, model=model)

self.G = {}

for key in self.model.params.keys():

self.G[key] = 0

self.epsilon = epsilon

def adagrad(self, x, gradient_x, G, init_lr):

"""

adagrad算法更新参数,G为参数梯度平方的累计值。

"""

G += gradient_x ** 2

x -= init_lr / torch.sqrt(G + self.epsilon) * gradient_x

return x, G

def step(self):

"""

参数更新

"""

for key in self.model.params.keys():

self.model.params[key], self.G[key] = self.adagrad(self.model.params[key],

self.model.grads[key],

self.G[key],

self.init_lr)

class RMSprop(Optimizer):

def __init__(self, init_lr, model, beta, epsilon):

"""

RMSprop优化器初始化

输入:

- init_lr:初始学习率

- model:模型,model.params存储模型参数值

- beta:衰减率

- epsilon:保持数值稳定性而设置的常数

"""

super(RMSprop, self).__init__(init_lr=init_lr, model=model)

self.G = {}

for key in self.model.params.keys():

self.G[key] = 0

self.beta = beta

self.epsilon = epsilon

def rmsprop(self, x, gradient_x, G, init_lr):

"""

rmsprop算法更新参数,G为迭代梯度平方的加权移动平均

"""

G = self.beta * G + (1 - self.beta) * gradient_x ** 2

x -= init_lr / torch.sqrt(G + self.epsilon) * gradient_x

return x, G

def step(self):

"""参数更新"""

for key in self.model.params.keys():

self.model.params[key], self.G[key] = self.rmsprop(self.model.params[key],

self.model.grads[key],

self.G[key],

self.init_lr)

class Momentum(Optimizer):

def __init__(self, init_lr, model, rho):

"""

Momentum优化器初始化

输入:

- init_lr:初始学习率

- model:模型,model.params存储模型参数值

- rho:动量因子

"""

super(Momentum, self).__init__(init_lr=init_lr, model=model)

self.delta_x = {}

for key in self.model.params.keys():

self.delta_x[key] = 0

self.rho = rho

def momentum(self, x, gradient_x, delta_x, init_lr):

"""

momentum算法更新参数,delta_x为梯度的加权移动平均

"""

delta_x = self.rho * delta_x - init_lr * gradient_x

x += delta_x

return x, delta_x

def step(self):

"""参数更新"""

for key in self.model.params.keys():

self.model.params[key], self.delta_x[key] = self.momentum(self.model.params[key],

self.model.grads[key],

self.delta_x[key],

self.init_lr)

class Adam(Optimizer):

def __init__(self, init_lr, model, beta1, beta2, epsilon):

"""

Adam优化器初始化

输入:

- init_lr:初始学习率

- model:模型,model.params存储模型参数值

- beta1, beta2:移动平均的衰减率

- epsilon:保持数值稳定性而设置的常数

"""

super(Adam, self).__init__(init_lr=init_lr, model=model)

self.beta1 = beta1

self.beta2 = beta2

self.epsilon = epsilon

self.M, self.G = {}, {}

for key in self.model.params.keys():

self.M[key] = 0

self.G[key] = 0

self.t = 1

def adam(self, x, gradient_x, G, M, t, init_lr):

"""

adam算法更新参数

输入:

- x:参数

- G:梯度平方的加权移动平均

- M:梯度的加权移动平均

- t:迭代次数

- init_lr:初始学习率

"""

M = self.beta1 * M + (1 - self.beta1) * gradient_x

G = self.beta2 * G + (1 - self.beta2) * gradient_x ** 2

M_hat = M / (1 - self.beta1 ** t)

G_hat = G / (1 - self.beta2 ** t)

t += 1

x -= init_lr / torch.sqrt(G_hat + self.epsilon) * M_hat

return x, G, M, t

def step(self):

"""参数更新"""

for key in self.model.params.keys():

self.model.params[key], self.G[key], self.M[key], self.t = self.adam(self.model.params[key],

self.model.grads[key],

self.G[key],

self.M[key],

self.t,

self.init_lr)

class OptimizedFunction3D(Op):

def __init__(self):

super(OptimizedFunction3D, self).__init__()

self.params = {'x': 0}

self.grads = {'x': 0}

def forward(self, x):

self.params['x'] = x

return - x[0] * x[0] / 2 + x[1] * x[1] / 1 # x[0] ** 2 + x[1] ** 2 + x[1] ** 3 + x[0] * x[1]

def backward(self):

x = self.params['x']

gradient1 = - 2 * x[0] / 2

gradient2 = 2 * x[1] / 1

grad1 = torch.Tensor([gradient1])

grad2 = torch.Tensor([gradient2])

self.grads['x'] = torch.cat([grad1, grad2])

class Visualization3D(animation.FuncAnimation):

""" 绘制动态图像,可视化参数更新轨迹 """

def __init__(self, *xy_values, z_values, labels=[], colors=[], fig, ax, interval=100, blit=True, **kwargs):

"""

初始化3d可视化类

输入:

xy_values:三维中x,y维度的值

z_values:三维中z维度的值

labels:每个参数更新轨迹的标签

colors:每个轨迹的颜色

interval:帧之间的延迟(以毫秒为单位)

blit:是否优化绘图

"""

self.fig = fig

self.ax = ax

self.xy_values = xy_values

self.z_values = z_values

frames = max(xy_value.shape[0] for xy_value in xy_values)

self.lines = [ax.plot([], [], [], label=label, color=color, lw=2)[0]

for _, label, color in zip_longest(xy_values, labels, colors)]

self.points = [ax.plot([], [], [], color=color, markeredgewidth =1, markeredgecolor='black', marker='o')[0]

for _,color in zip_longest(xy_values, colors)]

# print(self.lines)

super(Visualization3D, self).__init__(fig, self.animate, init_func=self.init_animation, frames=frames,

interval=interval, blit=blit, **kwargs)

def init_animation(self):

# 数值初始化

for line in self.lines:

line.set_data_3d([], [], [])

for point in self.points:

point.set_data_3d([], [], [])

return self.points + self.lines

def animate(self, i):

# 将x,y,z三个数据传入,绘制三维图像

for line, xy_value, z_value in zip(self.lines, self.xy_values, self.z_values):

line.set_data_3d(xy_value[:i, 0], xy_value[:i, 1], z_value[:i])

for point, xy_value, z_value in zip(self.points, self.xy_values, self.z_values):

point.set_data_3d(xy_value[i, 0], xy_value[i, 1], z_value[i])

return self.points + self.lines

def train_f(model, optimizer, x_init, epoch):

x = x_init

all_x = []

losses = []

for i in range(epoch):

all_x.append(copy.deepcopy(x.numpy())) # 浅拷贝 改为 深拷贝, 否则List的原值会被改变。 Edit by David 2022.12.4.

loss = model(x)

losses.append(loss)

model.backward()

optimizer.step()

x = model.params['x']

return torch.Tensor(np.array(all_x)), losses

# 构建5个模型,分别配备不同的优化器

model1 = OptimizedFunction3D()

opt_gd = SimpleBatchGD(init_lr=0.05, model=model1)

model2 = OptimizedFunction3D()

opt_adagrad = Adagrad(init_lr=0.05, model=model2, epsilon=1e-7)

model3 = OptimizedFunction3D()

opt_rmsprop = RMSprop(init_lr=0.05, model=model3, beta=0.9, epsilon=1e-7)

model4 = OptimizedFunction3D()

opt_momentum = Momentum(init_lr=0.05, model=model4, rho=0.9)

model5 = OptimizedFunction3D()

opt_adam = Adam(init_lr=0.05, model=model5, beta1=0.9, beta2=0.99, epsilon=1e-7)

models = [model5, model2, model3, model4, model1]

opts = [opt_adam, opt_adagrad, opt_rmsprop, opt_momentum, opt_gd]

x_all_opts = []

z_all_opts = []

# 使用不同优化器训练

for model, opt in zip(models, opts):

x_init = torch.FloatTensor([0.00001, 0.5])

x_one_opt, z_one_opt = train_f(model, opt, x_init, 100) # epoch

# 保存参数值

x_all_opts.append(x_one_opt.numpy())

z_all_opts.append(np.squeeze(z_one_opt))

# Use numpy.meshgrid to build the x1, x2 grid (x1 over [-1, 2) with step 0.01, x2 over [-1, 1) with step 0.05)

x1 = np.arange(-1, 2, 0.01)

x2 = np.arange(-1, 1, 0.05)

x1, x2 = np.meshgrid(x1, x2)

init_x = torch.Tensor(np.array([x1, x2]))

model = OptimizedFunction3D()

# 绘制 f_3d函数 的 三维图像

fig = plt.figure()

ax = plt.axes(projection='3d')

X = init_x[0].numpy()

Y = init_x[1].numpy()

Z = model(init_x).numpy() # 改为 model(init_x).numpy() David 2022.12.4

surf = ax.plot_surface(X, Y, Z, edgecolor='grey', cmap=cm.coolwarm)

# fig.colorbar(surf, shrink=0.5, aspect=1)

ax.set_zlim(-3, 2)

ax.set_xlabel('x1')

ax.set_ylabel('x2')

ax.set_zlabel('f(x1,x2)')

labels = ['Adam', 'AdaGrad', 'RMSprop', 'Momentum', 'SGD']

colors = ['#8B0000', '#0000FF', '#000000', '#008B00', '#FF0000']

animator = Visualization3D(*x_all_opts, z_values=z_all_opts, labels=labels, colors=colors, fig=fig, ax=ax)

ax.legend(loc='upper right')

plt.show()

# animator.save('teaser' + '.gif', writer='imagemagick',fps=10) # 效果不好,估计被挡住了…… 有待进一步提高 Edit by David 2022.12.4

# save不好用,不费劲了,安装个软件做gif https://pc.qq.com/detail/13/detail_23913.html

These are the screenshots of the trajectory-only, ball-marker, and combined versions, respectively.

Using the 3D animations, here is an explanation, in my own words, of each algorithm's strengths and weaknesses from several angles such as trajectory shape and speed.

SGD (stochastic gradient descent)

Strengths: simple to implement and cheap per step.

Weaknesses: easily gets trapped in local minima or saddle points, and converges slowly.

Why the trajectory looks this way: every iteration steps along the negative gradient at the current point with the same fixed learning rate, so on an elongated valley the steep direction keeps overshooting back and forth while the flat direction advances slowly, which produces the zigzag trajectory.
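
A minimal numerical sketch (my own illustration, using the same quadratic f(x1, x2) = x1^2 / 20 + x2^2, the same starting point and the same learning rate 0.95 as the second script above) makes the zigzag visible in the raw numbers:

import numpy as np

# f(x1, x2) = x1**2 / 20 + x2**2, so the gradient is (x1 / 10, 2 * x2)
def grad(x):
    return np.array([x[0] / 10.0, 2.0 * x[1]])

x = np.array([-7.0, 2.0])   # same starting point as the script above
lr = 0.95
for i in range(5):
    x = x - lr * grad(x)
    print(i, x)

# x2 is multiplied by -0.9 each step (it overshoots and flips sign),
# while x1 only shrinks by a factor of 0.905: a fast oscillating axis
# plus a slow axis is exactly the zigzag seen in the animation.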

Momentum (the momentum method)

Strengths: the momentum term folds the previous update direction into the current one, which helps accelerate convergence.

Weaknesses: the accumulated velocity can overshoot the minimum and cause oscillation around it.

Why the trajectory looks this way: when successive gradients point in the same direction the velocity keeps growing, so the steps become large on long, flat stretches; where the gradient changes sign, part of the velocity is cancelled and the steps shrink.
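
A quick worked check of the acceleration (illustrative numbers only): with the update delta = rho * delta - lr * g and a constant gradient g, the velocity converges to -lr * g / (1 - rho), so with rho = 0.9 the steady-state step is roughly 10 times the plain gradient-descent step.

rho, lr, g = 0.9, 0.05, 1.0       # illustrative values
delta = 0.0
for t in range(50):
    delta = rho * delta - lr * g
print(delta)                       # ~ -0.5, i.e. -lr * g / (1 - rho)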

Nesterov Accelerated Gradient (NAG)

Strengths: an improvement over Momentum that estimates the gradient at the predicted next position rather than the current one, giving a more accurate step.

Weaknesses: on some problems it performs no better than plain Momentum.

Why the trajectory looks this way: NAG first looks ahead along the accumulated velocity, then evaluates the gradient at that lookahead position and corrects the step, which reduces overshooting compared with Momentum. A sketch of how it could be added to the framework above follows.
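
NAG is not implemented in the scripts above. A minimal sketch of how it could be slotted into the same Optimizer framework is shown below; the class name is my own, and it uses the common reformulation of NAG that only needs the gradient at the current parameters, so it plugs into the existing backward()/step() flow unchanged:

class Nesterov(Optimizer):
    def __init__(self, init_lr, model, rho):
        super(Nesterov, self).__init__(init_lr=init_lr, model=model)
        self.delta_x = {key: 0 for key in self.model.params.keys()}
        self.rho = rho

    def step(self):
        for key in self.model.params.keys():
            g = self.model.grads[key]
            # velocity update, identical to Momentum
            self.delta_x[key] = self.rho * self.delta_x[key] - self.init_lr * g
            # the parameter update adds an extra rho * delta "lookahead" term
            self.model.params[key] = self.model.params[key] + self.rho * self.delta_x[key] - self.init_lr * g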

AdaGrad (adaptive gradient)

Strengths: adjusts the learning rate automatically per dimension, and works well on sparse data.

Weaknesses: because the accumulated squared gradient only grows, the effective learning rate keeps shrinking, so the updates can stall before reaching the minimum (a small numerical comparison with RMSprop is sketched right after the RMSprop section below).

Why the trajectory looks this way: AdaGrad divides each step by the square root of the accumulated squared gradients, so dimensions that have repeatedly seen large gradients get smaller and smaller steps.

RMSprop

Strengths: fixes AdaGrad's ever-shrinking learning rate by replacing the plain accumulation with an exponentially weighted moving average controlled by the decay rate beta.

Weaknesses: on some problems it is still outperformed by other optimizers.

Why the trajectory looks this way: because old squared gradients are decayed away, the denominator levels off instead of growing forever, so the effective learning rate stabilizes and the trajectory keeps making progress late in training.
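
To make the AdaGrad / RMSprop difference concrete, a small sketch (constant gradient of 1.0 and illustrative hyperparameters, mirroring the adagrad() and rmsprop() methods above) compares the effective step size lr / sqrt(G):

import math

lr, beta, g = 0.1, 0.9, 1.0
G_ada, G_rms = 0.0, 0.0
for t in range(1, 101):
    G_ada += g ** 2                              # AdaGrad: unbounded accumulation
    G_rms = beta * G_rms + (1 - beta) * g ** 2   # RMSprop: moving average -> g ** 2
    if t in (1, 10, 100):
        print(t, lr / math.sqrt(G_ada), lr / math.sqrt(G_rms))

# AdaGrad's effective step falls like lr / sqrt(t) (0.1, 0.032, 0.01 here),
# while RMSprop's settles near lr / |g| (about 0.1).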

Adam (adaptive moment estimation)

Strengths: combines the momentum idea (first moment) with RMSprop-style scaling (second moment), and usually converges well.

Weaknesses: it still has several hyperparameters (learning rate, beta1, beta2, epsilon) that may need tuning.

Why the trajectory looks this way: Adam keeps bias-corrected estimates of both the mean gradient and the mean squared gradient and uses them to scale each step, so the step size adapts smoothly across different regions of the surface.
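
One detail worth spelling out is the bias correction in the adam() method above: since M and G start at zero, the raw moving averages underestimate the true moments early on, and dividing by (1 - beta ** t) compensates. A tiny check for the first step (gradient 1.0, beta1 = 0.9, beta2 = 0.99 as in the scripts):

beta1, beta2, g = 0.9, 0.99, 1.0
M = (1 - beta1) * g            # raw first moment after one step: 0.1
G = (1 - beta2) * g ** 2       # raw second moment after one step: 0.01
t = 1
M_hat = M / (1 - beta1 ** t)   # corrected estimate: 1.0
G_hat = G / (1 - beta2 ** t)   # corrected estimate: 1.0
print(M, G, M_hat, G_hat)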

In these optimization algorithms, "speed" means how quickly the parameters move per update, i.e. the effective step size and how it changes over time. Its magnitude and direction directly shape how the parameters are updated and how well the optimization works.

The main factors that influence this speed are:

Gradient: its magnitude determines how large the raw update is, and its direction determines where the parameters move.

Learning rate: sets the base step size. Too large a learning rate causes oscillation or divergence; too small a learning rate makes convergence very slow.

Momentum: folds previous update directions into the current step. A larger momentum factor accelerates progress on flat stretches but can also cause oscillation near the minimum.

Adaptive methods (AdaGrad, RMSprop, Adam): rescale the learning rate per dimension from the history of gradients, so dimensions with frequently large gradients get smaller steps and the step size adapts to each feature.

These factors jointly determine how fast and in which direction the parameters move, and each algorithm handles them differently, which is why the trajectories look so different: some algorithms get stuck near local minima or saddle points and converge slowly, while others use momentum or adaptive learning rates to reach the minimum noticeably faster.
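
As a sanity check on the handwritten optimizers, the same trajectories can be compared against PyTorch's built-in optimizers. The sketch below uses torch.optim.Adam on the saddle function from the last script (hyperparameters copied from that script; the loop structure is my own illustration, not part of the original experiment):

import torch

x = torch.tensor([0.00001, 0.5], requires_grad=True)   # same start point as above
optimizer = torch.optim.Adam([x], lr=0.05, betas=(0.9, 0.99), eps=1e-7)
for step in range(100):
    optimizer.zero_grad()
    loss = -x[0] * x[0] / 2 + x[1] * x[1]               # the saddle function used above
    loss.backward()
    optimizer.step()
print(x.detach())

# torch.optim.SGD(momentum=...), torch.optim.Adagrad and torch.optim.RMSprop
# (whose decay rate is called alpha) can be swapped in the same way.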
