Module Source.optmizers
import numpy as np
def gd_optm(model, X, Y, num_iterations=10000, print_cost=False, print_cost_each=100, cont=0, learning_rate=1, reg_term=0, batch_size=0, param_dic=None, drop=0):
"""The function applies the gradient descent optimizer to update the weight and bias parameters.
Parameters:
model (multilayer): instance of the multilayer class containing the model parameters to be updated.
X: the input feature matrix, with one training example per column.
Y: the labels.
num_iterations: number of epochs.
print_cost: optional flag to print the cost during training.
print_cost_each: used when print_cost is True to set how often (in iterations) the cost is printed.
cont: not used in this function.
learning_rate: learning rate used to update the parameters.
reg_term: lambda term added to the loss function to prevent overfitting. Set to zero if no regularization is needed.
batch_size: mini-batch size; 0 selects full-batch learning, 1 selects online learning.
param_dic: not used in this function.
drop: dropout parameter enabling the dropout technique.
Returns:
dictionary: parameters, a dictionary containing the updated weights and biases.
array: costs, a list of the costs recorded every print_cost_each iterations when print_cost is True.
"""
costs = []
if batch_size == 0:
for i in range(0, num_iterations):
Alast, cache = model.forward_propagation(X, drop)  # forward pass over the full training set
cost = model.compute_cost(Alast, Y)
if reg_term != 0:
for key in model.parameters:
cost += (reg_term/X.shape[1]) * np.sum(model.parameters[key]**2)
grads = model.backward_propagation(X, Y)
parameters = model.update_parameters(grads, learning_rate=learning_rate , reg_term=reg_term , m=X.shape[1])
if print_cost and i % print_cost_each == 0:
costs.append(cost)
print("Cost after iteration %i: %f" % (i, cost))
return parameters, costs
else:
for i in range(0, num_iterations):
for j in range(int(X.shape[1]/batch_size)):
Alast, cache = model.forward_propagation(X[:,j*batch_size:(j*batch_size)+batch_size], drop)  # forward pass over the current mini-batch
cost = model.compute_cost(Alast, Y[:,j*batch_size:(j*batch_size)+batch_size])
if reg_term != 0:
for key in model.parameters:
cost += (reg_term / X[:, j * batch_size:(j * batch_size) + batch_size].shape[1]) * np.sum(model.parameters[key] ** 2)
grads = model.backward_propagation(X[:,j*batch_size:(j*batch_size)+batch_size], Y[:,j*batch_size:(j*batch_size)+batch_size])
parameters = model.update_parameters(grads, learning_rate=learning_rate, reg_term=reg_term, m=X[:,j*batch_size:(j*batch_size)+batch_size].shape[1])
if print_cost and i % print_cost_each == 0:
costs.append(cost)
print("Cost after iteration %i: %f" % (i, cost))
return parameters, costs
def adagrad_optm(model, X, Y, num_iterations=10000, print_cost=False, print_cost_each=100, cont=0, learning_rate=1, reg_term=0, batch_size=0, param_dic=None, drop=0):
"""The function applies the Adagrad optimizer to update the weight and bias parameters.
Parameters:
model (multilayer): instance of the multilayer class containing the model parameters to be updated.
X: the input feature matrix, with one training example per column.
Y: the labels.
num_iterations: number of epochs.
print_cost: optional flag to print the cost during training.
print_cost_each: used when print_cost is True to set how often (in iterations) the cost is printed.
cont: not used in this function.
learning_rate: learning rate used to update the parameters.
reg_term: lambda term added to the loss function to prevent overfitting. Set to zero if no regularization is needed.
batch_size: mini-batch size; 0 selects full-batch learning, 1 selects online learning.
param_dic: not used in this function.
drop: dropout parameter enabling the dropout technique.
Returns:
dictionary: parameters, a dictionary containing the updated weights and biases.
array: costs, a list of the costs recorded every print_cost_each iterations when print_cost is True.
"""
costs = []
adagrads={}
if batch_size == 0:
for i in range(0, num_iterations):
Alast, cache = model.forward_propagation(X,drop)
cost = model.compute_cost(Alast, Y)
if reg_term != 0:
for key in model.parameters:
cost += (reg_term/X.shape[1]) * np.sum(model.parameters[key]**2)
grads = model.backward_propagation(X, Y)
if i == 0:
for key in grads:
adagrads[key] = np.square(grads[key])
else:
for key in grads:
adagrads[key] = adagrads[key] + np.square(grads[key])
parameters = model.update_parameters_adagrad(grads,adagrads, learning_rate=learning_rate , reg_term=reg_term , m=X.shape[1])
if print_cost and i % print_cost_each == 0:
costs.append(cost)
print("Cost after iteration %i: %f" % (i, cost))
return parameters, costs
else:
for i in range(0, num_iterations):
for j in range(int(X.shape[1]/batch_size)):
Alast, cache = model.forward_propagation(X[:,j*batch_size:(j*batch_size)+batch_size],drop)
cost = model.compute_cost(Alast, Y[:,j*batch_size:(j*batch_size)+batch_size])
if reg_term != 0:
for key in model.parameters:
cost += (reg_term / X[:, j * batch_size:(j * batch_size) + batch_size].shape[1]) * np.sum(model.parameters[key] ** 2)
grads = model.backward_propagation(X[:,j*batch_size:(j*batch_size)+batch_size], Y[:,j*batch_size:(j*batch_size)+batch_size])
if i == 0:
for key in grads:
adagrads[key] = np.square(grads[key])
else:
for key in grads:
adagrads[key] = adagrads[key] + np.square(grads[key])
parameters = model.update_parameters_adagrad(grads,adagrads, learning_rate=learning_rate , reg_term=reg_term,m=X[:,j*batch_size:(j*batch_size)+batch_size].shape[1])
if print_cost and i % print_cost_each == 0:
costs.append(cost)
print("Cost after iteration %i: %f" % (i, cost))
return parameters, costs
def RMS_optm(model, X, Y, num_iterations=10000, print_cost=False, print_cost_each=100, cont=0, learning_rate=1, reg_term=0, batch_size=0, param_dic=None, drop=0):
"""The function applies the RMS optimizer to update the weight and bias parameters.
Parameters:
model (multilayer): instance of the multilayer class containing the model parameters to be updated.
X: the input feature matrix, with one training example per column.
Y: the labels.
num_iterations: number of epochs.
print_cost: optional flag to print the cost during training.
print_cost_each: used when print_cost is True to set how often (in iterations) the cost is printed.
cont: not used in this function.
learning_rate: learning rate used to update the parameters.
reg_term: lambda term added to the loss function to prevent overfitting. Set to zero if no regularization is needed.
batch_size: mini-batch size; 0 selects full-batch learning, 1 selects online learning.
param_dic: the dictionary that contains the hyperparameters rho and eps.
drop: dropout parameter enabling the dropout technique.
Returns:
dictionary: parameters, a dictionary containing the updated weights and biases.
array: costs, a list of the costs recorded every print_cost_each iterations when print_cost is True.
"""
costs=[]
rho=param_dic["rho"]
eps=param_dic["eps"]
rmsgrads={}
if batch_size == 0:
for i in range(0, num_iterations):
Alast, cache = model.forward_propagation(X,drop)
cost = model.compute_cost(Alast, Y)
if reg_term != 0:
for key in model.parameters:
cost += (reg_term/X.shape[1]) * np.sum(model.parameters[key]**2)
grads = model.backward_propagation(X, Y)
if i == 0:
for key in grads:
rmsgrads[key] = (1-rho)*np.square(grads[key])
else:
for key in grads:
rmsgrads[key] = (rho)*rmsgrads[key] +(1-rho)* np.square(grads[key])
parameters = model.upadte_patameters_RMS(grads,rmsgrads, learning_rate=learning_rate , reg_term=reg_term , m=X.shape[1],eps=eps)
if print_cost and i % print_cost_each == 0:
costs.append(cost)
print("Cost after iteration %i: %f" % (i, cost))
return parameters, costs
else:
for i in range(0, num_iterations):
for j in range(int(X.shape[1]/batch_size)):
Alast, cache = model.forward_propagation(X[:,j*batch_size:(j*batch_size)+batch_size],drop)
cost = model.compute_cost(Alast, Y[:,j*batch_size:(j*batch_size)+batch_size])
if reg_term != 0:
for key in model.parameters:
cost += (reg_term / X[:, j * batch_size:(j * batch_size) + batch_size].shape[1]) * np.sum(model.parameters[key] ** 2)
grads = model.backward_propagation(X[:,j*batch_size:(j*batch_size)+batch_size], Y[:,j*batch_size:(j*batch_size)+batch_size])
if i == 0:
for key in grads:
rmsgrads[key] = (1 - rho) * np.square(grads[key])  # same initialization as the full-batch branch
else:
for key in grads:
rmsgrads[key] = (rho) * rmsgrads[key] + (1 - rho) * np.square(grads[key])
parameters = model.upadte_patameters_RMS(grads,rmsgrads, learning_rate=learning_rate , reg_term=reg_term,m=X[:,j*batch_size:(j*batch_size)+batch_size].shape[1],eps=eps)
if print_cost and i % print_cost_each == 0:
costs.append(cost)
print("Cost after iteration %i: %f" % (i, cost))
return parameters, costs
def adadelta_optm(model, X, Y, num_iterations=10000, print_cost=False, print_cost_each=100, cont=0, learning_rate=1, reg_term=0, batch_size=0, param_dic=None, drop=0):
"""The function applies the Adadelta optimizer to update the weight and bias parameters.
Parameters:
model (multilayer): instance of the multilayer class containing the model parameters to be updated.
X: the input feature matrix, with one training example per column.
Y: the labels.
num_iterations: number of epochs.
print_cost: optional flag to print the cost during training.
print_cost_each: used when print_cost is True to set how often (in iterations) the cost is printed.
cont: not used in this function.
learning_rate: learning rate used to update the parameters.
reg_term: lambda term added to the loss function to prevent overfitting. Set to zero if no regularization is needed.
batch_size: mini-batch size; 0 selects full-batch learning, 1 selects online learning.
param_dic: the dictionary that contains the hyperparameters rho and eps.
drop: dropout parameter enabling the dropout technique.
Returns:
dictionary: parameters, a dictionary containing the updated weights and biases.
array: costs, a list of the costs recorded every print_cost_each iterations when print_cost is True.
"""
costs = []
rho = param_dic["rho"]
eps = param_dic["eps"]
adadeltagrads={}
segma={}
delta={}
if batch_size == 0:
for i in range(0, num_iterations):
Alast, cache = model.forward_propagation(X,drop)
cost = model.compute_cost(Alast, Y)
if reg_term != 0:
for key in model.parameters:
cost += (reg_term/X.shape[1]) * np.sum(model.parameters[key]**2)
grads = model.backward_propagation(X, Y)
if i == 0:
for key in grads:
adadeltagrads[key] = np.square(grads[key])
segma[key]=(np.random.randn(grads[key].shape[0],grads[key].shape[1])+2)
delta[key]=np.sqrt(segma[key] / (adadeltagrads[key]) + eps) * grads[key]
else:
for key in grads:
adadeltagrads[key] = adadeltagrads[key] + np.square(grads[key])
segma[key]=(rho)*segma[key]+(1-rho)*np.square(delta[key])
delta[key]=np.sqrt(segma[key] / (adadeltagrads[key]) + eps) * grads[key]
parameters = model.upadte_patameters_adadelta(grads,delta, learning_rate=learning_rate , reg_term=reg_term , m=X.shape[1])
if print_cost and i % print_cost_each == 0:
costs.append(cost)
print("Cost after iteration %i: %f" % (i, cost))
return parameters, costs
else:
for i in range(0, num_iterations):
for j in range(int(X.shape[1]/batch_size)):
Alast, cache = model.forward_propagation(X[:,j*batch_size:(j*batch_size)+batch_size],drop)
cost = model.compute_cost(Alast, Y[:,j*batch_size:(j*batch_size)+batch_size])
if reg_term != 0:
for key in model.parameters:
cost += (reg_term / X[:, j * batch_size:(j * batch_size) + batch_size].shape[1]) * np.sum(model.parameters[key] ** 2)
grads = model.backward_propagation(X[:,j*batch_size:(j*batch_size)+batch_size], Y[:,j*batch_size:(j*batch_size)+batch_size])
if i == 0:
for key in grads:
adadeltagrads[key] = np.square(grads[key])
segma[key] = (np.random.randn(grads[key].shape[0], grads[key].shape[1]) + 100) * 0.00001
delta[key] = np.sqrt(segma[key] / (adadeltagrads[key]) + eps) * grads[key]
else:
for key in grads:
adadeltagrads[key] = adadeltagrads[key] + np.square(grads[key])
segma[key] = (rho) * segma[key] + (1 - rho) * np.square(delta[key])
delta[key] = np.sqrt(segma[key] / (adadeltagrads[key]) + eps) * grads[key]
parameters = model.upadte_patameters_adadelta(grads,delta, learning_rate=learning_rate , reg_term=reg_term,m=X[:,j*batch_size:(j*batch_size)+batch_size].shape[1])
if print_cost and i % print_cost_each == 0:
costs.append(cost)
print("Cost after iteration %i: %f" % (i, cost))
return parameters, costs
def adam_optm(model, X, Y, num_iterations=10000, print_cost=False, print_cost_each=100, cont=0, learning_rate=1, reg_term=0, batch_size=0, param_dic=None, drop=0):
"""The function applies the Adam optimizer to update the weight and bias parameters.
Parameters:
model (multilayer): instance of the multilayer class containing the model parameters to be updated.
X: the input feature matrix, with one training example per column.
Y: the labels.
num_iterations: number of epochs.
print_cost: optional flag to print the cost during training.
print_cost_each: used when print_cost is True to set how often (in iterations) the cost is printed.
cont: not used in this function.
learning_rate: learning rate used to update the parameters.
reg_term: lambda term added to the loss function to prevent overfitting. Set to zero if no regularization is needed.
batch_size: mini-batch size; 0 selects full-batch learning, 1 selects online learning.
param_dic: the dictionary that contains the hyperparameters rho, rhof and eps.
drop: dropout parameter enabling the dropout technique.
Returns:
dictionary: parameters, a dictionary containing the updated weights and biases.
array: costs, a list of the costs recorded every print_cost_each iterations when print_cost is True.
"""
costs = []
rho = param_dic["rho"]
eps = param_dic["eps"]
rhof = param_dic["rhof"]
adamgrads={}
Fgrads={}
if batch_size == 0:
for i in range(0, num_iterations):
Alast, cache = model.forward_propagation(X,drop)
cost = model.compute_cost(Alast, Y)
if reg_term != 0:
for key in model.parameters:
cost += (reg_term/X.shape[1]) * np.sum(model.parameters[key]**2)
grads = model.backward_propagation(X, Y)
if i == 0:
for key in grads:
adamgrads[key] = (1-rho)*np.square(grads[key])
Fgrads[key]=(1-rhof)*grads[key]
else:
for key in grads:
adamgrads[key] = (rho)*adamgrads[key] +(1-rho)* np.square(grads[key])
Fgrads[key] = rhof * Fgrads[key] + (1 - rhof) * grads[key]  # first moment decays with rhof, matching the i == 0 branch
alpha_t = learning_rate * np.sqrt((1 - rho**(i + 1)) / (1 - rhof**(i + 1)))  # bias-corrected step size for the current iteration
parameters = model.update_parameters_adam(grads,adamgrads,Fgrads, learning_rate=alpha_t , reg_term=reg_term , m=X.shape[1],eps=eps)
if print_cost and i % print_cost_each == 0:
costs.append(cost)
print("Cost after iteration %i: %f" % (i, cost))
return parameters, costs
else:
for i in range(0, num_iterations):
for j in range(int(X.shape[1]/batch_size)):
Alast, cache = model.forward_propagation(X[:,j*batch_size:(j*batch_size)+batch_size],drop)
cost = model.compute_cost(Alast, Y[:,j*batch_size:(j*batch_size)+batch_size])
if reg_term != 0:
for key in model.parameters:
cost += (reg_term / X[:, j * batch_size:(j * batch_size) + batch_size].shape[1]) * np.sum(model.parameters[key] ** 2)
grads = model.backward_propagation(X[:,j*batch_size:(j*batch_size)+batch_size], Y[:,j*batch_size:(j*batch_size)+batch_size])
if i == 0:
for key in grads:
adamgrads[key] = (1 - rho) * np.square(grads[key])
Fgrads[key] = (1 - rhof) * grads[key]
else:
for key in grads:
adamgrads[key] = (rho) * adamgrads[key] + (1 - rho) * np.square(grads[key])
Fgrads[key] = rhof * Fgrads[key] + (1 - rhof) * grads[key]  # first moment decays with rhof, matching the i == 0 branch
alpha_t = learning_rate * np.sqrt((1 - rho ** (i + 1)) / (1 - rhof ** (i + 1)))  # bias-corrected step size for the current iteration
parameters = model.update_parameters_adam(grads,adamgrads,Fgrads, learning_rate=alpha_t , reg_term=reg_term,m=X[:,j*batch_size:(j*batch_size)+batch_size].shape[1],eps=eps)
if print_cost and i % print_cost_each == 0:
costs.append(cost)
print("Cost after iteration %i: %f" % (i, cost))
return parameters, costs
def mom_optm(model, X, Y, num_iterations=10000, print_cost=False, print_cost_each=100, cont=0, learning_rate=1, reg_term=0, batch_size=0, param_dic=None, drop=0):
"""The function applies the momentum optimizer to update the weight and bias parameters.
Parameters:
model (multilayer): instance of the multilayer class containing the model parameters to be updated.
X: the input feature matrix, with one training example per column.
Y: the labels.
num_iterations: number of epochs.
print_cost: optional flag to print the cost during training.
print_cost_each: used when print_cost is True to set how often (in iterations) the cost is printed.
cont: not used in this function.
learning_rate: learning rate used to update the parameters.
reg_term: lambda term added to the loss function to prevent overfitting. Set to zero if no regularization is needed.
batch_size: mini-batch size; 0 selects full-batch learning, 1 selects online learning.
param_dic: the dictionary that contains the hyperparameter beta.
drop: dropout parameter enabling the dropout technique.
Returns:
dictionary: parameters, a dictionary containing the updated weights and biases.
array: costs, a list of the costs recorded every print_cost_each iterations when print_cost is True.
"""
costs = []
beta = param_dic['beta']
momen_grad = {}
if batch_size == 0:
for i in range(0, num_iterations):
Alast, cache = model.forward_propagation(X,drop)
cost = model.compute_cost(Alast, Y)
if reg_term != 0:
for key in model.parameters:
cost += (reg_term / X.shape[1]) * np.sum(model.parameters[key] ** 2)
grads = model.backward_propagation(X, Y)
if i == 0:
for key in grads:
momen_grad[key] = (1 - beta) * grads[key]
else:
for key in grads:
momen_grad[key] = beta * momen_grad[key] + (1 - beta) * grads[key]
parameters = model.update_parameters(momen_grad, learning_rate=learning_rate,reg_term=reg_term,m=X.shape[1])
if print_cost and i % print_cost_each == 0:
costs.append(cost)
print("Cost after iteration %i: %f" % (i, cost))
return parameters, costs
else:
for i in range(0, num_iterations):
for j in range(int(X.shape[1]/batch_size)):
Alast, cache = model.forward_propagation(X[:,j*batch_size:(j*batch_size)+batch_size],drop)
cost = model.compute_cost(Alast, Y[:,j*batch_size:(j*batch_size)+batch_size])
if reg_term != 0:
for key in model.parameters:
cost += (reg_term / X[:, j * batch_size:(j * batch_size) + batch_size].shape[1]) * np.sum(model.parameters[key] ** 2)
grads = model.backward_propagation(X[:,j*batch_size:(j*batch_size)+batch_size], Y[:,j*batch_size:(j*batch_size)+batch_size])
if i == 0:
for key in grads:
momen_grad[key] = (1 - beta) * grads[key]
else:
for key in grads:
momen_grad[key] = beta * momen_grad[key] + (1 - beta) * grads[key]
parameters = model.update_parameters(momen_grad, learning_rate=learning_rate,reg_term=reg_term,m=X[:,j*batch_size:(j*batch_size)+batch_size].shape[1])
if print_cost and i % print_cost_each == 0:
costs.append(cost)
print("Cost after iteration %i: %f" % (i, cost))
return parameters, costs
def gd_optm_steepst(model, X, Y, num_iterations=10000, print_cost=False, print_cost_each=100, cont=0, learning_rate=0.01, reg_term=0, batch_size=0, param_dic=None, drop=0):
"""The function applies the steepest gradient descent optimizer to update the weight and bias parameters.
Parameters:
model (multilayer): instance of the multilayer class containing the model parameters to be updated.
X: the input feature matrix, with one training example per column.
Y: the labels.
num_iterations: number of epochs.
print_cost: optional flag to print the cost during training.
print_cost_each: used when print_cost is True to set how often (in iterations) the cost is printed.
cont: not used in this function.
learning_rate: initial learning rate; it is re-estimated at every iteration.
reg_term: lambda term added to the loss function to prevent overfitting. Set to zero if no regularization is needed.
batch_size: mini-batch size; 0 selects full-batch learning, 1 selects online learning.
param_dic: not used in this function.
drop: dropout parameter enabling the dropout technique.
Returns:
dictionary: parameters, a dictionary containing the updated weights and biases.
array: costs, a list of the costs recorded every print_cost_each iterations when print_cost is True.
"""
costs = []
if batch_size == 0:
for i in range(0, num_iterations):
Alast, cache = model.forward_propagation(X, drop)
cost = model.compute_cost(Alast, Y)
if reg_term != 0:
for key in model.parameters:
cost += (reg_term / X.shape[1]) * np.sum(model.parameters[key] ** 2)
grads = model.backward_propagation(X, Y)
m = Alast.shape[1]
learning_rate = 100 * np.amin((- 1 / m) * (Y * np.log(np.abs(Alast - (learning_rate * model.cost_func_der(m, Alast, Y)))) + (1 - Y) * (np.log(np.abs(1 - (Alast - learning_rate * model.cost_func_der(m, Alast, Y)))))))
parameters = model.update_parameters(grads, learning_rate=learning_rate, reg_term=reg_term, m=X.shape[1])
if print_cost and i % print_cost_each == 0:
costs.append(cost)
print("Cost after iteration %i: %f" % (i, cost))
return parameters, costs
else:
for i in range(0, num_iterations):
for j in range(int(X.shape[1] / batch_size)):
Alast, cache = model.forward_propagation(X[:, j * batch_size:(j * batch_size) + batch_size], drop)
cost = model.compute_cost(Alast, Y[:, j * batch_size:(j * batch_size) + batch_size])
if reg_term != 0:
for key in model.parameters:
cost += (reg_term / X[:, j * batch_size:(j * batch_size) + batch_size].shape[1]) * np.sum(model.parameters[key] ** 2)
grads = model.backward_propagation(X[:, j * batch_size:(j * batch_size) + batch_size],Y[:, j * batch_size:(j * batch_size) + batch_size])
m = Alast.shape[1]
Yb = Y[:, j * batch_size:(j * batch_size) + batch_size]  # labels of the current mini-batch, matching the shape of Alast
learning_rate = 100 * np.amin((- 1 / m) * (Yb * np.log(np.abs(Alast - (learning_rate * model.cost_func_der(m, Alast, Yb)))) + (1 - Yb) * (np.log(np.abs(1 - (Alast - learning_rate * model.cost_func_der(m, Alast, Yb)))))))
parameters = model.update_parameters(grads, learning_rate=learning_rate, reg_term=reg_term, m=X[:, j * batch_size:(j * batch_size) + batch_size].shape[1])
if print_cost and i % print_cost_each == 0:
costs.append(cost)
print("Cost after iteration %i: %f" % (i, cost))
return parameters, costs
Functions
def RMS_optm(model, X, Y, num_iterations=10000, print_cost=False, print_cost_each=100, cont=0, learning_rate=1, reg_term=0, batch_size=0, param_dic=None, drop=0)
The function applies the RMS optimizer to update the weight and bias parameters.
Parameters
model (multilayer): instance of the multilayer class containing the model parameters to be updated.
X: the input feature matrix, with one training example per column.
Y: the labels.
num_iterations: number of epochs.
print_cost: optional flag to print the cost during training.
print_cost_each: used when print_cost is True to set how often (in iterations) the cost is printed.
cont: not used in this function.
learning_rate: learning rate used to update the parameters.
reg_term: lambda term added to the loss function to prevent overfitting. Set to zero if no regularization is needed.
batch_size: mini-batch size; 0 selects full-batch learning, 1 selects online learning.
param_dic: the dictionary that contains the hyperparameters rho and eps.
drop: dropout parameter enabling the dropout technique.
Returns
dictionary: parameters, a dictionary containing the updated weights and biases.
array: costs, a list of the costs recorded every print_cost_each iterations when print_cost is True.
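A minimal calling sketch for RMS_optm. The import path follows the module name above; the wrapper name and hyperparameter values are illustrative assumptions, and model must be an already-built multilayer instance from this package:
from Source.optmizers import RMS_optm

def train_with_rms(model, X, Y):
    # X: features x examples (examples are columns); Y: labels with matching columns.
    # param_dic must supply the keys "rho" and "eps" read by RMS_optm.
    return RMS_optm(model, X, Y,
                    num_iterations=1000,
                    learning_rate=0.01,
                    batch_size=0,                          # 0 = full-batch training
                    param_dic={"rho": 0.9, "eps": 1e-8},   # illustrative values
                    print_cost=True, print_cost_each=100)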
def adadelta_optm(model, X, Y, num_iterations=10000, print_cost=False, print_cost_each=100, cont=0, learning_rate=1, reg_term=0, batch_size=0, param_dic=None, drop=0)
The function applies the Adadelta optimizer to update the weight and bias parameters.
Parameters
model (multilayer): instance of the multilayer class containing the model parameters to be updated.
X: the input feature matrix, with one training example per column.
Y: the labels.
num_iterations: number of epochs.
print_cost: optional flag to print the cost during training.
print_cost_each: used when print_cost is True to set how often (in iterations) the cost is printed.
cont: not used in this function.
learning_rate: learning rate used to update the parameters.
reg_term: lambda term added to the loss function to prevent overfitting. Set to zero if no regularization is needed.
batch_size: mini-batch size; 0 selects full-batch learning, 1 selects online learning.
param_dic: the dictionary that contains the hyperparameters rho and eps.
drop: dropout parameter enabling the dropout technique.
Returns
dictionary: parameters, a dictionary containing the updated weights and biases.
array: costs, a list of the costs recorded every print_cost_each iterations when print_cost is True.
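A minimal mini-batch sketch for adadelta_optm (the wrapper name and values are illustrative assumptions; model is assumed to be a multilayer instance):
from Source.optmizers import adadelta_optm

def train_with_adadelta(model, X, Y):
    # Mini-batch training: batch_size examples (columns of X) are processed per update.
    return adadelta_optm(model, X, Y,
                         num_iterations=500,
                         learning_rate=1,
                         batch_size=64,
                         param_dic={"rho": 0.95, "eps": 1e-6},  # keys read by adadelta_optm
                         print_cost=True, print_cost_each=50)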
def adagrad_optm(model, X, Y, num_iterations=10000, print_cost=False, print_cost_each=100, cont=0, learning_rate=1, reg_term=0, batch_size=0, param_dic=None, drop=0)
The function applies the Adagrad optimizer to update the weight and bias parameters.
Parameters
model (multilayer): instance of the multilayer class containing the model parameters to be updated.
X: the input feature matrix, with one training example per column.
Y: the labels.
num_iterations: number of epochs.
print_cost: optional flag to print the cost during training.
print_cost_each: used when print_cost is True to set how often (in iterations) the cost is printed.
cont: not used in this function.
learning_rate: learning rate used to update the parameters.
reg_term: lambda term added to the loss function to prevent overfitting. Set to zero if no regularization is needed.
batch_size: mini-batch size; 0 selects full-batch learning, 1 selects online learning.
param_dic: not used in this function.
drop: dropout parameter enabling the dropout technique.
Returns
dictionary: parameters, a dictionary containing the updated weights and biases.
array: costs, a list of the costs recorded every print_cost_each iterations when print_cost is True.
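A minimal sketch for adagrad_optm; no param_dic is needed. The wrapper name and values are illustrative assumptions, and model is assumed to be a multilayer instance:
from Source.optmizers import adagrad_optm

def train_with_adagrad(model, X, Y):
    # The accumulated squared gradients shrink the effective step size over time,
    # so a comparatively large learning_rate is common.
    return adagrad_optm(model, X, Y,
                        num_iterations=2000,
                        learning_rate=0.5,
                        reg_term=0.01,      # L2 penalty; 0 disables regularization
                        batch_size=0,
                        print_cost=True, print_cost_each=200)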
def adam_optm(model, X, Y, num_iterations=10000, print_cost=False, print_cost_each=100, cont=0, learning_rate=1, reg_term=0, batch_size=0, param_dic=None, drop=0)
The function applies the Adam optimizer to update the weight and bias parameters.
Parameters
model (multilayer): instance of the multilayer class containing the model parameters to be updated.
X: the input feature matrix, with one training example per column.
Y: the labels.
num_iterations: number of epochs.
print_cost: optional flag to print the cost during training.
print_cost_each: used when print_cost is True to set how often (in iterations) the cost is printed.
cont: not used in this function.
learning_rate: learning rate used to update the parameters.
reg_term: lambda term added to the loss function to prevent overfitting. Set to zero if no regularization is needed.
batch_size: mini-batch size; 0 selects full-batch learning, 1 selects online learning.
param_dic: the dictionary that contains the hyperparameters rho, rhof and eps.
drop: dropout parameter enabling the dropout technique.
Returns
dictionary: parameters, a dictionary containing the updated weights and biases.
array: costs, a list of the costs recorded every print_cost_each iterations when print_cost is True.
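A minimal sketch for adam_optm. In this implementation rhof is the decay applied to the running mean of the gradients and rho the decay applied to the squared gradients; the values below are illustrative assumptions, and model is assumed to be a multilayer instance:
from Source.optmizers import adam_optm

def train_with_adam(model, X, Y):
    # param_dic must supply the keys "rho", "rhof" and "eps" read by adam_optm.
    return adam_optm(model, X, Y,
                     num_iterations=1000,
                     learning_rate=0.001,
                     batch_size=32,
                     param_dic={"rho": 0.999, "rhof": 0.9, "eps": 1e-8},  # illustrative values
                     print_cost=True, print_cost_each=100)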
def gd_optm(model, X, Y, num_iterations=10000, print_cost=False, print_cost_each=100, cont=0, learning_rate=1, reg_term=0, batch_size=0, param_dic=None, drop=0)
The function applies the gradient descent optimizer to update the weight and bias parameters.
Parameters
model (multilayer): instance of the multilayer class containing the model parameters to be updated.
X: the input feature matrix, with one training example per column.
Y: the labels.
num_iterations: number of epochs.
print_cost: optional flag to print the cost during training.
print_cost_each: used when print_cost is True to set how often (in iterations) the cost is printed.
cont: not used in this function.
learning_rate: learning rate used to update the parameters.
reg_term: lambda term added to the loss function to prevent overfitting. Set to zero if no regularization is needed.
batch_size: mini-batch size; 0 selects full-batch learning, 1 selects online learning.
param_dic: not used in this function.
drop: dropout parameter enabling the dropout technique.
Returns
dictionary: parameters, a dictionary containing the updated weights and biases.
array: costs, a list of the costs recorded every print_cost_each iterations when print_cost is True.
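A minimal sketch for gd_optm showing full-batch gradient descent with L2 regularization and dropout (the wrapper name and values are illustrative assumptions; model is assumed to be a multilayer instance):
from Source.optmizers import gd_optm

def train_with_gd(model, X, Y):
    # batch_size=0 runs plain full-batch gradient descent;
    # set batch_size=1 for online learning or >1 for mini-batches.
    return gd_optm(model, X, Y,
                   num_iterations=5000,
                   learning_rate=0.1,
                   reg_term=0.05,   # lambda of the L2 penalty
                   drop=0.2,        # dropout setting forwarded to model.forward_propagation (meaning defined by the multilayer class)
                   print_cost=True, print_cost_each=500)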
def gd_optm_steepst(model, X, Y, num_iterations=10000, print_cost=False, print_cost_each=100, cont=0, learning_rate=0.01, reg_term=0, batch_size=0, param_dic=None, drop=0)
The function applies the steepest gradient descent optimizer to update the weight and bias parameters.
Parameters
model (multilayer): instance of the multilayer class containing the model parameters to be updated.
X: the input feature matrix, with one training example per column.
Y: the labels.
num_iterations: number of epochs.
print_cost: optional flag to print the cost during training.
print_cost_each: used when print_cost is True to set how often (in iterations) the cost is printed.
cont: not used in this function.
learning_rate: initial learning rate; it is re-estimated at every iteration.
reg_term: lambda term added to the loss function to prevent overfitting. Set to zero if no regularization is needed.
batch_size: mini-batch size; 0 selects full-batch learning, 1 selects online learning.
param_dic: not used in this function.
drop: dropout parameter enabling the dropout technique.
Returns
dictionary: parameters, a dictionary containing the updated weights and biases.
array: costs, a list of the costs recorded every print_cost_each iterations when print_cost is True.
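A minimal sketch for gd_optm_steepst; learning_rate is only the starting value, since the function re-estimates the step size at every iteration (the wrapper name and values are illustrative assumptions; model is assumed to be a multilayer instance):
from Source.optmizers import gd_optm_steepst

def train_with_steepest(model, X, Y):
    return gd_optm_steepst(model, X, Y,
                           num_iterations=1000,
                           learning_rate=0.01,   # initial step size only; adapted each iteration
                           batch_size=0,
                           print_cost=True, print_cost_each=100)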
def mom_optm(model, X, Y, num_iterations=10000, print_cost=False, print_cost_each=100, cont=0, learning_rate=1, reg_term=0, batch_size=0, param_dic=None, drop=0)
The function applies the momentum optimizer to update the weight and bias parameters.
Parameters
model (multilayer): instance of the multilayer class containing the model parameters to be updated.
X: the input feature matrix, with one training example per column.
Y: the labels.
num_iterations: number of epochs.
print_cost: optional flag to print the cost during training.
print_cost_each: used when print_cost is True to set how often (in iterations) the cost is printed.
cont: not used in this function.
learning_rate: learning rate used to update the parameters.
reg_term: lambda term added to the loss function to prevent overfitting. Set to zero if no regularization is needed.
batch_size: mini-batch size; 0 selects full-batch learning, 1 selects online learning.
param_dic: the dictionary that contains the hyperparameter beta.
drop: dropout parameter enabling the dropout technique.
Returns
dictionary: parameters, a dictionary containing the updated weights and biases.
array: costs, a list of the costs recorded every print_cost_each iterations when print_cost is True.
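A minimal sketch for mom_optm; param_dic must carry the key "beta". The wrapper name and values are illustrative assumptions, and model is assumed to be a multilayer instance:
from Source.optmizers import mom_optm

def train_with_momentum(model, X, Y):
    return mom_optm(model, X, Y,
                    num_iterations=2000,
                    learning_rate=0.1,
                    batch_size=128,
                    param_dic={"beta": 0.9},   # momentum coefficient read by mom_optm
                    print_cost=True, print_cost_each=200)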