Source code for smash.core.net

from __future__ import annotations

from smash.solver._mw_forward import forward_b

from smash.core._constant import WB_INITIALIZER, NET_OPTIMIZER, LAYER_NAME

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from smash.core.model import Model
    from smash.solver._mwd_parameters import ParametersDT
    from smash.solver._mwd_states import StatesDT

import copy
import numpy as np
from terminaltables import AsciiTable
from tqdm import tqdm

__all__ = ["Net"]


class Net(object):
    """
    Artificial Neural Network initialization.

    Examples
    --------
    >>> net = smash.Net()
    >>> net
    The network does not contain layers or has not been compiled yet
    """

    def __init__(self):
        self.layers = []

        self.history = {"loss_train": [], "loss_valid": []}

        self._optimizer = None

        self._learning_rate = None

        self._compiled = False

    def __repr__(self):
        ret = []

        if self._compiled and self.layers:
            tab = [["Layer Type", "Input/Output Shape", "Num Parameters"]]

            tot_params = 0
            trainable_params = 0

            for layer in self.layers:
                layer_name = layer.layer_name()

                n_params = layer.n_params()

                ioshape = f"{layer.input_shape}/{layer.output_shape()}"

                tab.append([layer_name, str(ioshape), str(n_params)])

                tot_params += n_params

                if layer.trainable:
                    trainable_params += n_params

            table_instance = AsciiTable(tab)
            table_instance.inner_column_border = False
            table_instance.padding_left = 1
            table_instance.padding_right = 1

            ret.append(table_instance.table)
            ret.append(f"Total parameters: {tot_params}")
            ret.append(f"Trainable parameters: {trainable_params}")
            ret.append(f"Optimizer: ({self._optimizer}, lr={self._learning_rate})")

        else:
            ret.append(
                "The network does not contain layers or has not been compiled yet"
            )

        return "\n".join(ret)

    @property
    def layers(self):
        """
        List of Layer objects defining the graph of the network.

        The graph is set using the `smash.Net.add` method.

        Examples
        --------
        >>> net = smash.Net()
        >>> net.add(layer="dense", options={"input_shape": (6,), "neurons": 32})
        >>> net.add(layer="activation", options={"name": "sigmoid"})
        >>> net.add(layer="dropout", options={"drop_rate": .2})
        >>> net.compile()

        If you are using IPython, tab completion allows you to visualize all the
        attributes and methods of each Layer object:

        >>> layer_1 = net.layers[0]
        >>> layer_1.<TAB>
        layer_1.bias                layer_1.neurons
        layer_1.bias_initializer    layer_1.n_params(
        layer_1.input_shape         layer_1.output_shape(
        layer_1.kernel_initializer  layer_1.trainable
        layer_1.layer_input         layer_1.weight
        layer_1.layer_name(

        >>> layer_2 = net.layers[1]
        >>> layer_2.<TAB>
        layer_2.activation_name  layer_2.output_shape(
        layer_2.input_shape      layer_2.n_params(
        layer_2.layer_name(      layer_2.trainable

        >>> layer_3 = net.layers[-1]
        >>> layer_3.<TAB>
        layer_3.drop_rate    layer_3.n_params(
        layer_3.input_shape  layer_3.output_shape(
        layer_3.layer_name(  layer_3.trainable
        """

        return self._layers

    @layers.setter
    def layers(self, value):
        self._layers = value

    @property
    def history(self):
        """
        A dictionary saving the training and validation losses.

        The keys are

        - 'loss_train'
        - 'loss_valid'
        """

        return self._history

    @history.setter
    def history(self, value):
        self._history = value
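
    # Illustrative sketch (editor's addition, not part of smash): the summary
    # printed by ``__repr__`` can be reproduced by iterating over ``layers``.
    # Output assumes the three-layer graph built in the ``layers`` example above.
    #
    # >>> for layer in net.layers:
    # ...     print(layer.layer_name(), layer.input_shape, layer.output_shape(), layer.n_params())
    # Dense (6,) (32,) 224
    # Activation (Sigmoid) (32,) (32,) 0
    # Dropout (32,) (32,) 0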

    def add(self, layer: str, options: dict):
        """
        Add layers to the neural network.

        Parameters
        ----------
        layer : str
            Layer name. Should be one of

            - 'dense'
            - 'activation'
            - 'scale'
            - 'dropout'

        options : dict
            A dictionary to configure layers added to the network.

            .. hint::
                See options for each layer type:

                - 'dense' :ref:`(see here) <api_reference.add_dense>`
                - 'activation' :ref:`(see here) <api_reference.add_activation>`
                - 'scale' :ref:`(see here) <api_reference.add_scale>`
                - 'dropout' :ref:`(see here) <api_reference.add_dropout>`

        Examples
        --------
        Initialize the neural network

        >>> net = smash.Net()

        Define graph

        >>> # First Dense Layer
        >>> # input_shape is only required for the first layer
        >>> net.add(layer="dense", options={"input_shape": (8,), "neurons": 32})
        >>> # Activation function following the first dense layer
        >>> net.add(layer="activation", options={"name": "relu"})
        >>> # Second Dense Layer
        >>> net.add(layer="dense", options={"neurons": 16})
        >>> # Activation function following the second dense layer
        >>> net.add(layer="activation", options={"name": "relu"})
        >>> # Third Dense Layer
        >>> net.add(layer="dense", options={"neurons": 4})
        >>> # Last Activation function (output of the network)
        >>> net.add(layer="activation", options={"name": "sigmoid"})

        Compile and display a summary of the network

        >>> net.compile()
        >>> net
        +---------------------------------------------------------+
        | Layer Type            Input/Output Shape  Num Parameters|
        +---------------------------------------------------------+
        | Dense                 (8,)/(32,)          288           |
        | Activation (ReLU)     (32,)/(32,)         0             |
        | Dense                 (32,)/(16,)         528           |
        | Activation (ReLU)     (16,)/(16,)         0             |
        | Dense                 (16,)/(4,)          68            |
        | Activation (Sigmoid)  (4,)/(4,)           0             |
        +---------------------------------------------------------+
        Total parameters: 884
        Trainable parameters: 884
        Optimizer: (adam, lr=0.001)
        """

        layer = _standardize_layer(layer)

        lay = LAYERS[layer](**options)

        if not self.layers:
            # Check options if first layer
            if "input_shape" in options:
                if not isinstance(options["input_shape"], tuple):
                    raise ValueError(
                        f"input_shape option should be a tuple, not {type(options['input_shape'])}"
                    )

            else:
                raise TypeError(
                    "First layer missing required option argument: 'input_shape'"
                )

        else:
            # If this is not the first layer, set its input shape to the output
            # shape of the previously added layer
            lay._set_input_shape(shape=self.layers[-1].output_shape())

        # Add layer to the network
        self.layers.append(lay)
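
    # Illustrative note (editor's addition, not part of smash): the parameter
    # counts in the summary above follow from the Dense layer shapes, i.e.
    # weights (fin * fout) plus biases (fout). A quick check for the graph
    # built in the docstring example:
    #
    # >>> [8 * 32 + 32, 32 * 16 + 16, 16 * 4 + 4]
    # [288, 528, 68]
    # >>> sum([288, 528, 68])
    # 884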

    def compile(
        self,
        optimizer: str = "adam",
        options: dict | None = None,
        random_state: int | None = None,
    ):
        """
        Compile the network and set the optimizer.

        Parameters
        ----------
        optimizer : str, default 'adam'
            Name of optimizer. Should be one of

            - 'sgd'
            - 'adam'
            - 'adagrad'
            - 'rmsprop'

        options : dict or None, default None
            A dictionary of optimizer options.

            .. hint::
                See options for each optimizer:

                - 'sgd' :ref:`(see here) <api_reference.compile_sgd>`
                - 'adam' :ref:`(see here) <api_reference.compile_adam>`
                - 'adagrad' :ref:`(see here) <api_reference.compile_adagrad>`
                - 'rmsprop' :ref:`(see here) <api_reference.compile_rmsprop>`

        random_state : int or None, default None
            Random seed used to initialize weights.

            .. note::
                If not given, the weights are initialized with a random,
                non-reproducible seed.

        Examples
        --------
        >>> net = smash.Net()

        Define graph

        >>> net.add(layer="dense", options={"input_shape": (6,), "neurons": 16})
        >>> net.add(layer="activation", options={"name": "relu"})

        Compile the network

        >>> net.compile(optimizer='sgd', options={'learning_rate': 0.009, 'momentum': 0.001})
        >>> net
        +-------------------------------------------------------+
        | Layer Type         Input/Output Shape  Num Parameters |
        +-------------------------------------------------------+
        | Dense              (6,)/(16,)          112            |
        | Activation (ReLU)  (16,)/(16,)         0              |
        +-------------------------------------------------------+
        Total parameters: 112
        Trainable parameters: 112
        Optimizer: (sgd, lr=0.009)
        """

        if self.layers:
            if options is None:
                options = {}

            optimizer = _standardize_optimizer(optimizer)

            if random_state is not None:
                np.random.seed(random_state)

            opt = OPT_FUNC[optimizer](**options)

            for layer in self.layers:
                if hasattr(layer, "_initialize"):
                    layer._initialize(opt)

            self._compiled = True
            self._optimizer = optimizer
            self._learning_rate = opt.learning_rate

        else:
            raise ValueError("The network does not contain layers")
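
    # Illustrative sketch (editor's addition, not part of smash): passing
    # ``random_state`` seeds NumPy immediately before the weights are drawn,
    # so two networks with the same graph compiled with the same seed start
    # from identical weights. Assumes the two-layer graph built just above;
    # ``net_a`` is a hypothetical second network obtained by copying it.
    #
    # >>> net_a = net.copy()
    # >>> net.compile(optimizer="adam", random_state=11)
    # >>> net_a.compile(optimizer="adam", random_state=11)
    # >>> np.allclose(net.layers[0].weight, net_a.layers[0].weight)
    # True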

    def copy(self):
        """
        Make a deepcopy of the Net.

        Returns
        -------
        Net
            A copy of Net.
        """

        return copy.deepcopy(self)
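
    # Illustrative note (editor's addition, not part of smash): the returned
    # object is fully independent, so modifying the copy's weights leaves the
    # original untouched. Assumes a compiled network ``net``.
    #
    # >>> net_copy = net.copy()
    # >>> net_copy.layers[0].weight[...] = 0.0
    # >>> np.any(net.layers[0].weight != 0.0)
    # True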

    def set_trainable(self, trainable: list[bool]):
        """
        Enable or freeze the training of the weights and biases of the network's layers.

        Parameters
        ----------
        trainable : list of bool
            List of booleans with a length equal to the total number of the network's layers.

            .. note::
                Dropout, activation, and scaling functions are non-parametric layers,
                meaning they do not have any learnable weights or biases. Therefore,
                it is not necessary to set these layers as trainable since they do not
                involve any weight updates during training.
        """

        if len(trainable) == len(self.layers):
            for i, layer in enumerate(self.layers):
                layer.trainable = trainable[i]

        else:
            raise ValueError(
                f"Inconsistent length between trainable ({len(trainable)}) "
                f"and the number of layers ({len(self.layers)})"
            )
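
    # Illustrative sketch (editor's addition, not part of smash): freeze the
    # first Dense layer of a graph made of two Dense layers each followed by
    # an activation (four layers in total). The list must cover every layer,
    # including non-parametric ones.
    #
    # >>> net.set_trainable([False, True, True, True])
    # >>> [layer.trainable for layer in net.layers]
    # [False, True, True, True]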

    def _fit_d2p(
        self,
        x_train: np.ndarray,
        instance: Model,
        control_vector: np.ndarray,
        mask: np.ndarray,
        parameters_bgd: ParametersDT,
        states_bgd: StatesDT,
        epochs: int,
        early_stopping: bool,
        verbose: bool,
    ):
        # fit physiographic descriptors to Model parameters mapping
        if not self._compiled:
            raise ValueError("The network has not been compiled yet")

        loss_opt = 0  # only used for early stopping purposes

        # train model
        for epo in tqdm(range(epochs), desc="Training"):
            # Forward propagation
            y_pred = self._forward_pass(x_train)

            # Calculate the gradient of the loss function wrt y_pred
            loss_grad = _hcost_prime(
                y_pred, control_vector, mask, instance, parameters_bgd, states_bgd
            )

            # Compute loss
            loss = _hcost(instance)

            # Calculate the infinity norm of the projected gradient
            proj_g = _inf_norm(loss_grad)

            # early stopping
            if early_stopping:
                if loss_opt > loss or epo == 0:
                    loss_opt = loss

                    for layer in self.layers:
                        if hasattr(layer, "_initialize"):
                            layer._weight = np.copy(layer.weight)
                            layer._bias = np.copy(layer.bias)

            # Backpropagation
            self._backward_pass(loss_grad=loss_grad)

            if verbose:
                ret = []

                ret.append(f"{' ' * 4}At epoch")
                ret.append("{:3}".format(epo + 1))
                ret.append("J =" + "{:10.6f}".format(loss))
                ret.append("|proj g| =" + "{:10.6f}".format(proj_g))

                tqdm.write((" " * 4).join(ret))

            self.history["loss_train"].append(loss)

        if early_stopping:
            for layer in self.layers:
                if hasattr(layer, "_initialize"):
                    layer.weight = np.copy(layer._weight)
                    layer.bias = np.copy(layer._bias)

    def _forward_pass(self, x_train: np.ndarray, training: bool = True):
        layer_output = x_train

        for layer in self.layers:
            layer_output = layer._forward_pass(layer_output, training)

        return layer_output

    def _backward_pass(self, loss_grad: np.ndarray):
        for layer in reversed(self.layers):
            loss_grad = layer._backward_pass(loss_grad)

    def _predict(self, x_train: np.ndarray):
        preds = self._forward_pass(x_train, training=False)

        return preds
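

# --- Illustrative sketch (editor's addition, not part of smash) --------------
# ``_fit_d2p`` plugs the network into the hydrological adjoint model, but the
# forward/backward machinery above can be exercised on its own with a
# hand-written loss gradient. The sketch below trains a tiny graph on a toy
# regression target using a mean squared error gradient; the helper name and
# the toy data are hypothetical and exist only for illustration.
def _example_manual_training(epochs: int = 100, seed: int = 0):
    np.random.seed(seed)

    # Toy data: 50 samples, 6 descriptors, 4 targets in [0, 1]
    x = np.random.uniform(size=(50, 6))
    y_true = np.random.uniform(size=(50, 4))

    net = Net()
    net.add(layer="dense", options={"input_shape": (6,), "neurons": 16})
    net.add(layer="activation", options={"name": "relu"})
    net.add(layer="dense", options={"neurons": 4})
    net.add(layer="activation", options={"name": "sigmoid"})
    net.compile(optimizer="adam", random_state=seed)

    for _ in range(epochs):
        y_pred = net._forward_pass(x)

        # Gradient of the mean squared error with respect to y_pred
        loss_grad = 2 * (y_pred - y_true) / y_true.size

        net._backward_pass(loss_grad=loss_grad)

    # Final training error on the toy data
    return np.mean((net._predict(x) - y_true) ** 2)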


### LAYER ###


class Layer(object):
    def _set_input_shape(self, shape: tuple):
        self.input_shape = shape

    def layer_name(self):
        return self.__class__.__name__

    def n_params(self):
        return 0

    def _forward_pass(self, x: np.ndarray, training: bool):
        raise NotImplementedError()

    def _backward_pass(self, accum_grad: np.ndarray):
        raise NotImplementedError()

    def output_shape(self):
        raise NotImplementedError()


class Activation(Layer):
    """
    Activation layer that applies a specified activation function to the input.

    Options
    -------
    name : str
        The name of the activation function that will be used. Should be one of

        - 'relu' : Rectified Linear Unit
        - 'sigmoid' : Sigmoid
        - 'selu' : Scaled Exponential Linear Unit
        - 'elu' : Exponential Linear Unit
        - 'softmax' : Softmax
        - 'leaky_relu' : Leaky Rectified Linear Unit
        - 'tanh' : Hyperbolic Tangent
        - 'softplus' : Softplus
    """

    def __init__(self, name: str, **unknown_options):
        _check_unknown_options("Activation Layer", unknown_options)

        self.input_shape = None

        self.activation_name = name

        self._activation_func = ACTIVATION_FUNC[name.lower()]()

        self.trainable = True

    def layer_name(self):
        return "Activation (%s)" % (self._activation_func.__class__.__name__)

    def _forward_pass(self, x: np.ndarray, training: bool = True):
        self.layer_input = x

        return self._activation_func(x)

    def _backward_pass(self, accum_grad: np.ndarray):
        return accum_grad * self._activation_func.gradient(self.layer_input)

    def output_shape(self):
        return self.input_shape


class Scale(Layer):
    """
    Scale layer that applies the min-max scaling function to the outputs.

    Options
    -------
    bounds : list, tuple or array-like
        A sequence of ``(min, max)`` values that the outputs will be scaled to.
    """

    def __init__(self, bounds: list | tuple | np.ndarray, **unknown_options):
        _check_unknown_options("Scale Layer", unknown_options)

        self.input_shape = None

        self.scale_name = "minmaxscale"

        self._scale_func = MinMaxScale(np.array(bounds))

        self.trainable = True

    def layer_name(self):
        return "Scale (%s)" % (self._scale_func.__class__.__name__)

    def _forward_pass(self, x, training=True):
        self.layer_input = x

        return self._scale_func(x)

    def _backward_pass(self, accum_grad):
        return accum_grad * self._scale_func.gradient(self.layer_input)

    def output_shape(self):
        return self.input_shape


def _wb_initialization(layer: Layer, attr: str):
    fin = layer.input_shape[0]
    fout = layer.neurons

    if attr == "bias":
        initializer = layer.bias_initializer
        shape = (1, fout)

    else:
        initializer = layer.kernel_initializer
        shape = (fin, fout)

    split_inizer = initializer.split("_")

    if split_inizer[-1] == "uniform":
        if split_inizer[0] == "glorot":
            limit = np.sqrt(6 / (fin + fout))

        elif split_inizer[0] == "he":
            limit = np.sqrt(6 / fin)

        else:
            limit = 1 / np.sqrt(fin)

        setattr(layer, attr, np.random.uniform(-limit, limit, shape))

    elif split_inizer[-1] == "normal":
        if split_inizer[0] == "glorot":
            std = np.sqrt(2 / (fin + fout))

        elif split_inizer[0] == "he":
            std = np.sqrt(2 / fin)

        else:
            std = 0.01

        setattr(layer, attr, np.random.normal(0, std, shape))

    else:
        setattr(layer, attr, np.zeros(shape))


class Dense(Layer):
    """
    Fully-connected (dense) layer.

    Options
    -------
    neurons : int
        The number of neurons in the layer.

    input_shape : tuple or None, default None
        The expected input shape of the dense layer.
        It must be specified if this is the first layer in the network.

    kernel_initializer : str, default 'glorot_uniform'
        Weight initialization method.
        Should be one of

        - 'uniform'
        - 'glorot_uniform'
        - 'he_uniform'
        - 'normal'
        - 'glorot_normal'
        - 'he_normal'
        - 'zeros'

    bias_initializer : str, default 'zeros'
        Bias initialization method. Should be one of

        - 'uniform'
        - 'glorot_uniform'
        - 'he_uniform'
        - 'normal'
        - 'glorot_normal'
        - 'he_normal'
        - 'zeros'
    """

    def __init__(
        self,
        neurons: int,
        input_shape: tuple | None = None,
        kernel_initializer: str = "glorot_uniform",
        bias_initializer: str = "zeros",
        **unknown_options,
    ):
        _check_unknown_options("Dense Layer", unknown_options)

        self.layer_input = None

        self.input_shape = input_shape

        self.neurons = neurons

        self.trainable = True

        self.weight = None

        self.bias = None

        self.kernel_initializer = kernel_initializer.lower()

        if self.kernel_initializer not in WB_INITIALIZER:
            raise ValueError(
                f"Unknown kernel initializer: {self.kernel_initializer}. Choices {WB_INITIALIZER}"
            )

        self.bias_initializer = bias_initializer.lower()

        if self.bias_initializer not in WB_INITIALIZER:
            raise ValueError(
                f"Unknown bias initializer: {self.bias_initializer}. Choices {WB_INITIALIZER}"
            )

    def _initialize(self, optimizer):
        # Initialize weights and biases
        _wb_initialization(self, "weight")
        _wb_initialization(self, "bias")

        # Set optimizer
        self._weight_opt = copy.copy(optimizer)
        self._bias_opt = copy.copy(optimizer)

    def n_params(self):
        return np.prod(self.weight.shape) + np.prod(self.bias.shape)

    def _forward_pass(self, x: np.ndarray, training: bool = True):
        if training:
            self.layer_input = x

        return x.dot(self.weight) + self.bias

    def _backward_pass(self, accum_grad: np.ndarray):
        # Save weights used during forward pass
        weight = self.weight

        if self.trainable:
            # Calculate gradient w.r.t layer weights
            grad_w = self.layer_input.T.dot(accum_grad)
            grad_w0 = np.sum(accum_grad, axis=0, keepdims=True)

            # Update the layer weights
            self.weight = self._weight_opt.update(self.weight, grad_w)
            self.bias = self._bias_opt.update(self.bias, grad_w0)

        # Return accumulated gradient for next layer
        # Calculated based on the weights used during the forward pass
        accum_grad = accum_grad.dot(weight.T)

        return accum_grad

    def output_shape(self):
        return (self.neurons,)


class Dropout(Layer):
    """
    Dropout layer that randomly sets the output of the previous layer to zero
    with a specified probability.

    Options
    -------
    drop_rate : float
        The probability of setting a given output value to zero.
""" def __init__(self, drop_rate: float, **unknown_options): _check_unknown_options("Dropout Layer", unknown_options) self.drop_rate = drop_rate self._mask = None self.input_shape = None self.trainable = True def _forward_pass(self, x: np.ndarray, training: bool = True): c = 1 - self.drop_rate if training: self._mask = np.random.uniform(size=x.shape) > self.drop_rate c = self._mask return x * c def _backward_pass(self, accum_grad: np.ndarray): return accum_grad * self._mask def output_shape(self): return self.input_shape LAYERS = { "dense": Dense, "activation": Activation, "scale": Scale, "dropout": Dropout, } ### ACTIVATION FUNCTIONS ### class Sigmoid: def __call__(self, x): return 1 / (1 + np.exp(-x)) def gradient(self, x): return self.__call__(x) * (1 - self.__call__(x)) class Softmax: def __call__(self, x): e_x = np.exp(x - np.max(x, axis=-1, keepdims=True)) return e_x / np.sum(e_x, axis=-1, keepdims=True) def gradient(self, x): p = self.__call__(x) return p * (1 - p) class TanH: def __call__(self, x): return 2 / (1 + np.exp(-2 * x)) - 1 def gradient(self, x): return 1 - np.power(self.__call__(x), 2) class ReLU: def __call__(self, x): return np.where(x >= 0, x, 0) def gradient(self, x): return np.where(x >= 0, 1, 0) class LeakyReLU: def __init__(self, alpha=0.2): self.alpha = alpha def __call__(self, x): return np.where(x >= 0, x, self.alpha * x) def gradient(self, x): return np.where(x >= 0, 1, self.alpha) class ELU: def __init__(self, alpha=0.1): self.alpha = alpha def __call__(self, x): return np.where(x >= 0.0, x, self.alpha * (np.exp(x) - 1)) def gradient(self, x): return np.where(x >= 0.0, 1, self.__call__(x) + self.alpha) class SELU: def __init__(self): self.alpha = 1.6732632423543772848170429916717 self.scale = 1.0507009873554804934193349852946 def __call__(self, x): return self.scale * np.where(x >= 0.0, x, self.alpha * (np.exp(x) - 1)) def gradient(self, x): return self.scale * np.where(x >= 0.0, 1, self.alpha * np.exp(x)) class SoftPlus: def __call__(self, x): return np.log(1 + np.exp(x)) def gradient(self, x): return 1 / (1 + np.exp(-x)) ACTIVATION_FUNC = { "relu": ReLU, "sigmoid": Sigmoid, "selu": SELU, "elu": ELU, "softmax": Softmax, "leaky_relu": LeakyReLU, "tanh": TanH, "softplus": SoftPlus, } ### Scaling functions ### class MinMaxScale: def __init__(self, bounds: np.ndarray): self._bounds = bounds self.lower = np.array([b[0] for b in bounds]) self.upper = np.array([b[1] for b in bounds]) def __call__(self, x: np.ndarray): return self.lower + x * (self.upper - self.lower) def gradient(self, x: np.ndarray): return self.upper - self.lower ### OPTIMIZER ### class StochasticGradientDescent: """ Compile the neural network with Stochastic Gradient Descent (SGD) optimizer. Options ------- learning_rate : float, default 0.01 The learning rate used to update the weights during training. momentum : float, default 0 The momentum used to smooth the gradient updates. """ def __init__( self, learning_rate: float = 0.01, momentum: float = 0, **unknown_options ): _check_unknown_options("SGD optimizer", unknown_options) self.learning_rate = learning_rate self.momentum = momentum self.w_updt = None def update(self, w, grad_wrt_w): # If not initialized if self.w_updt is None: self.w_updt = np.zeros(np.shape(w)) # Use momentum if set self.w_updt = self.momentum * self.w_updt + (1 - self.momentum) * grad_wrt_w # Move against the gradient to minimize loss return w - self.learning_rate * self.w_updt class Adam: """ Compile the neural network with Adaptive Moment Estimation (Adam) optimizer. 

    Options
    -------
    learning_rate : float, default 0.001
        The learning rate used to update the weights during training.

    b1 : float, default 0.9
        Exponential decay rate for the first moment estimate.

    b2 : float, default 0.999
        Exponential decay rate for the second moment estimate.
    """

    def __init__(
        self,
        learning_rate: float = 0.001,
        b1: float = 0.9,
        b2: float = 0.999,
        **unknown_options,
    ):
        _check_unknown_options("Adam optimizer", unknown_options)

        self.learning_rate = learning_rate
        self.eps = 1e-8
        self.m = None
        self.v = None

        # Decay rates
        self.b1 = b1
        self.b2 = b2

    def update(self, w: np.ndarray, grad_wrt_w: np.ndarray):
        # If not initialized
        if self.m is None:
            self.m = np.zeros(np.shape(grad_wrt_w))
            self.v = np.zeros(np.shape(grad_wrt_w))

        self.m = self.b1 * self.m + (1 - self.b1) * grad_wrt_w
        self.v = self.b2 * self.v + (1 - self.b2) * np.power(grad_wrt_w, 2)

        m_hat = self.m / (1 - self.b1)
        v_hat = self.v / (1 - self.b2)

        self.w_updt = self.learning_rate * m_hat / (np.sqrt(v_hat) + self.eps)

        return w - self.w_updt


class Adagrad:
    """
    Compile the neural network with Adaptive Gradient (Adagrad) optimizer.

    Options
    -------
    learning_rate : float, default 0.01
        The learning rate used to update the weights during training.
    """

    def __init__(self, learning_rate: float = 0.01, **unknown_options):
        _check_unknown_options("Adagrad optimizer", unknown_options)

        self.learning_rate = learning_rate
        self.G = None  # Sum of squares of the gradients
        self.eps = 1e-8

    def update(self, w: np.ndarray, grad_wrt_w: np.ndarray):
        # If not initialized
        if self.G is None:
            self.G = np.zeros(np.shape(w))

        # Add the square of the gradient of the loss function at w
        self.G += np.power(grad_wrt_w, 2)

        # Adaptive gradient with higher learning rate for sparse data
        return w - self.learning_rate * grad_wrt_w / np.sqrt(self.G + self.eps)


class RMSprop:
    """
    Compile the neural network with Root Mean Square Propagation (RMSprop) optimizer.

    Options
    -------
    learning_rate : float, default 0.001
        The learning rate used to update the weights during training.

    rho : float, default 0.9
        The decay rate for the running average of the squared gradients.
""" def __init__( self, learning_rate: float = 0.001, rho: float = 0.9, **unknown_options ): _check_unknown_options("RMSprop optimizer", unknown_options) self.learning_rate = learning_rate self.Eg = None # Running average of the square gradients at w self.eps = 1e-8 self.rho = rho def update(self, w: np.ndarray, grad_wrt_w: np.ndarray): # If not initialized if self.Eg is None: self.Eg = np.zeros(np.shape(grad_wrt_w)) self.Eg = self.rho * self.Eg + (1 - self.rho) * np.power(grad_wrt_w, 2) # Divide the learning rate for a weight by a running average of the magnitudes of recent # gradients for that weight return w - self.learning_rate * grad_wrt_w / np.sqrt(self.Eg + self.eps) OPT_FUNC = { "sgd": StochasticGradientDescent, "adam": Adam, "adagrad": Adagrad, "rmsprop": RMSprop, } ### LOSS ### def _hcost(instance: Model): return instance.output.cost def _hcost_prime( y: np.ndarray, control_vector: np.ndarray, mask: np.ndarray, instance: Model, parameters_bgd: ParametersDT, states_bgd: StatesDT, ): # % Set parameters or states for i, name in enumerate(control_vector): if name in instance.setup._parameters_name: getattr(instance.parameters, name)[mask] = y[:, i] else: getattr(instance.states, name)[mask] = y[:, i] parameters_b = instance.parameters.copy() parameters_bgd_b = instance.parameters.copy() states_b = instance.states.copy() states_bgd_b = instance.states.copy() output_b = instance.output.copy() cost = np.float32(0) cost_b = np.float32(1) forward_b( instance.setup, instance.mesh, instance.input_data, instance.parameters, parameters_b, parameters_bgd, parameters_bgd_b, instance.states, states_b, states_bgd, states_bgd_b, instance.output, output_b, cost, cost_b, ) grad = np.transpose( [ getattr(parameters_b, name)[mask] if name in instance.setup._parameters_name else getattr(states_b, name)[mask] for name in control_vector ] ) return grad ### STANDARDIZE ### def _standardize_layer(layer: str): if isinstance(layer, str): layer = layer.lower() if layer in LAYER_NAME: return layer else: raise ValueError(f"Unknown layer type '{layer}'. Choices: {LAYER_NAME}") else: raise TypeError(f"layer argument must be str") def _standardize_optimizer(optimizer: str): if isinstance(optimizer, str): optimizer = optimizer.lower() if optimizer in NET_OPTIMIZER: return optimizer else: raise ValueError( f"Unknown optimizer '{optimizer}'. Choices: {NET_OPTIMIZER}" ) else: raise TypeError(f"optimizer argument must be str") ### OTHERS ### def _inf_norm(grad: np.ndarray): return np.amax(np.abs(grad)) def _check_unknown_options(type_check: str, unknown_options: dict): if unknown_options: msg = ", ".join(map(str, unknown_options.keys())) raise KeyError("Unknown %s options: '%s'" % (type_check, msg))