diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml
index 453925c3..9c793591 100644
--- a/.JuliaFormatter.toml
+++ b/.JuliaFormatter.toml
@@ -1 +1,2 @@
-style = "sciml"
\ No newline at end of file
+style = "sciml"
+format_markdown = true
\ No newline at end of file
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 9bfb3d10..969cc9bd 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -3,9 +3,13 @@ on:
pull_request:
branches:
- master
+ paths-ignore:
+ - 'docs/**'
push:
branches:
- master
+ paths-ignore:
+ - 'docs/**'
jobs:
test:
runs-on: ubuntu-latest
diff --git a/.gitignore b/.gitignore
index 1388e96a..f4587026 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@
.DS_Store
/Manifest.toml
/dev/
+docs/build
\ No newline at end of file
diff --git a/docs/Project.toml b/docs/Project.toml
index 207ffd7e..d4aa8703 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -15,7 +15,7 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
CUDA = "3, 4, 5"
CellularAutomata = "0.0.2"
DifferentialEquations = "7"
-Documenter = "0.27"
+Documenter = "1"
OrdinaryDiffEq = "6"
Plots = "1"
PredefinedDynamicalSystems = "1"
diff --git a/docs/make.jl b/docs/make.jl
index f20f467b..9fa9ec58 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -8,17 +8,10 @@ ENV["GKSwstype"] = "100"
include("pages.jl")
makedocs(modules = [ReservoirComputing],
- clean = true, doctest = false,
sitename = "ReservoirComputing.jl",
- strict = [
- :doctest,
- :linkcheck,
- :parse_error,
- :example_block,
- # Other available options are
- # :autodocs_block, :cross_references, :docs_block, :eval_block, :example_block, :footnote, :meta_block, :missing_docs, :setup_block
- ],
- format = Documenter.HTML(analytics = "UA-90474609-3",
+ clean = true, doctest = false, linkcheck = true,
+ warnonly = [:missing_docs],
+ format = Documenter.HTML(
assets = ["assets/favicon.ico"],
canonical = "https://docs.sciml.ai/ReservoirComputing/stable/"),
pages = pages)
diff --git a/docs/src/api/esn.md b/docs/src/api/esn.md
index 057bab2a..ff03dc8c 100644
--- a/docs/src/api/esn.md
+++ b/docs/src/api/esn.md
@@ -3,13 +3,13 @@
ESN
```
-In addition to all the components that can be explored in the documentation a couple need a separate introduction. The ```variation``` arguments can be
+In addition to all the components that can be explored in the documentation, a couple of them need a separate introduction. The ```variation``` arguments can be
```@docs
Default
Hybrid
```
-These arguments detail more deep variation of the underlying model and they need a separate call. For the moment the most complex is the ```Hybrid``` call, but this can and will change in the future.
+These arguments detail a deeper variation of the underlying model, and they need a separate call. For the moment, the most complex is the ```Hybrid``` call, but this can and will change in the future.
All ESN models can be trained using the following call:
```@docs
train
diff --git a/docs/src/api/esn_drivers.md b/docs/src/api/esn_drivers.md
index a71178c0..a11ec35b 100644
--- a/docs/src/api/esn_drivers.md
+++ b/docs/src/api/esn_drivers.md
@@ -4,7 +4,7 @@
MRNN
GRU
```
-The ```GRU``` driver also provides the user the choice of the possible variant:
+The ```GRU``` driver also provides the user with the choice of the possible variants:
```@docs
FullyGated
Minimal
diff --git a/docs/src/api/esn_layers.md b/docs/src/api/esn_layers.md
index 4afaec85..76be5268 100644
--- a/docs/src/api/esn_layers.md
+++ b/docs/src/api/esn_layers.md
@@ -9,7 +9,7 @@
MinimumLayer
NullLayer
```
-The sign in the ```MinimumLayer``` are chosen based on the following methods:
+The signs in the ```MinimumLayer``` are chosen based on the following methods:
```@docs
BernoulliSample
IrrationalSample
@@ -18,7 +18,7 @@ To derive the matrix one can call the following function:
```@docs
create_layer
```
-To create new input layers it suffice to define a new struct containing the needed parameters of the new input layer. This struct wiil need to be an ```AbstractLayer```, so the ```create_layer``` function can be dispatched over it. The workflow should follow this snippet:
+To create new input layers, it suffices to define a new struct containing the needed parameters of the new input layer. This struct will need to be an ```AbstractLayer```, so the ```create_layer``` function can be dispatched over it. The workflow should follow this snippet:
```julia
#creation of the new struct for the layer
struct MyNewLayer <: AbstractLayer
@@ -42,12 +42,12 @@ end
NullReservoir
```
-Like for the input layers, to actually build the matrix of the reservoir one can call the following function:
+As with the input layers, to actually build the reservoir matrix, one can call the following function:
```@docs
create_reservoir
```
-To create a new reservoir the procedure is imilar to the one for the input layers. First the definition of the new struct of type ```AbstractReservoir``` with the reservoir parameters is needed. Then the dispatch over the ```create_reservoir``` function makes the model actually build the reservoir matrix. An example of the workflow is given in the following snippet:
+To create a new reservoir, the procedure is similar to the one for the input layers. First, the definition of the new struct of type ```AbstractReservoir``` with the reservoir parameters is needed. Then the dispatch over the ```create_reservoir``` function makes the model actually build the reservoir matrix. An example of the workflow is given in the following snippet:
```julia
#creation of the new struct for the reservoir
struct MyNewReservoir <: AbstractReservoir
diff --git a/docs/src/api/reca.md b/docs/src/api/reca.md
index 747b2bf5..dcbc86df 100644
--- a/docs/src/api/reca.md
+++ b/docs/src/api/reca.md
@@ -8,4 +8,4 @@ The input encodings are the equivalent of the input matrices of the ESNs. These
RandomMapping
```
-The training and prediction follow the same workflow of the ESN. It is important to note that at the moment we were not able to find any paper using these models with a ```Generative``` approach for the prediction, so full support is given only to the ```Predictive``` method.
+The training and prediction follow the same workflow as the ESN. It is important to note that, at the moment, we have not been able to find any papers using these models with a ```Generative``` approach for the prediction, so full support is given only to the ```Predictive``` method.
diff --git a/docs/src/api/training.md b/docs/src/api/training.md
index ee10884b..b34f046b 100644
--- a/docs/src/api/training.md
+++ b/docs/src/api/training.md
@@ -7,7 +7,7 @@
```
## Gaussian Regression
-Currently (v0.9) unavailable.
+Currently unavailable (as of v0.9).
## Support Vector Regression
-Support vector Regression is possible using a direct call to [LIBSVM](https://github.com/JuliaML/LIBSVM.jl) regression methods. Instead of a wrapper please refer to the use of ```LIBSVM.AbstractSVR``` in the original library.
+Support Vector Regression is possible using a direct call to [LIBSVM](https://github.com/JuliaML/LIBSVM.jl) regression methods. Instead of a wrapper, please refer to the use of ```LIBSVM.AbstractSVR``` in the original library.
diff --git a/docs/src/esn_tutorials/change_layers.md b/docs/src/esn_tutorials/change_layers.md
index f3e27caa..0a659cd5 100644
--- a/docs/src/esn_tutorials/change_layers.md
+++ b/docs/src/esn_tutorials/change_layers.md
@@ -1,5 +1,5 @@
# Using Different Layers
-A great deal of efforts in the ESNs field are devoted to finding an ideal construction for the reservoir matrices. With a simple interface using ReservoirComputing.jl is possible to leverage the currently implemented matrix constructions methods for both the reservoir and the input layer. In this page it is showcased how it is possible to change both of these layers.
+A great deal of effort in the ESNs field is devoted to finding the ideal construction for the reservoir matrices. With a simple interface, ReservoirComputing.jl makes it possible to leverage the currently implemented matrix construction methods for both the reservoir and the input layer. This page showcases how it is possible to change both of these layers.
The `input_init` keyword argument provided with the `ESN` constructor allows for changing the input layer. The layers provided in ReservoirComputing.jl are the following:
- ```WeightedLayer(scaling)```
@@ -7,7 +7,7 @@ The `input_init` keyword argument provided with the `ESN` constructor allows for
- ```SparseLayer(scaling, sparsity)```
- ```MinimumLayer(weight, sampling)```
- ```InformedLayer(model_in_size; scaling=0.1, gamma=0.5)```
-In addition the user can define a custom layer following this workflow:
+In addition, the user can define a custom layer following this workflow:
```julia
#creation of the new struct for the layer
struct MyNewLayer <: AbstractLayer
@@ -39,10 +39,10 @@ function create_reservoir(reservoir::AbstractReservoir, res_size)
end
```
-## Example of minimally complex ESN
-Using [^1] and [^2] as references this section will provide an example on how to change both the input layer and the reservoir for ESNs. The full script for this example can be found [here](https://github.com/MartinuzziFrancesco/reservoir-computing-examples/blob/main/change_layers/layers.jl). This example was run on Julia v1.7.2.
+## Example of a minimally complex ESN
+Using [^1] and [^2] as references, this section will provide an example of how to change both the input layer and the reservoir for ESNs. The full script for this example can be found [here](https://github.com/MartinuzziFrancesco/reservoir-computing-examples/blob/main/change_layers/layers.jl). This example was run on Julia v1.7.2.
-The task for this example will be the one step ahead prediction of the Henon map. To obtain the data one can leverage the package [DynamicalSystems.jl](https://juliadynamics.github.io/DynamicalSystems.jl/dev/). The data is scaled to be between -1 and 1.
+The task for this example will be the one step ahead prediction of the Henon map. To obtain the data, one can leverage the package [DynamicalSystems.jl](https://juliadynamics.github.io/DynamicalSystems.jl/dev/). The data is scaled to be between -1 and 1.
```@example mesn
using PredefinedDynamicalSystems
train_len = 3000
@@ -79,7 +79,7 @@ for i=1:length(reservoirs)
println(msd(testing_target, output))
end
```
-As it is possible to see, changing layers in ESN models is straightforward. Be sure to check the API documentation for a full list of reservoir and layers.
+As it is possible to see, changing layers in ESN models is straightforward. Be sure to check the API documentation for a full list of reservoirs and layers.
## Bibliography
diff --git a/docs/src/esn_tutorials/deep_esn.md b/docs/src/esn_tutorials/deep_esn.md
index 1cd41a68..6619f722 100644
--- a/docs/src/esn_tutorials/deep_esn.md
+++ b/docs/src/esn_tutorials/deep_esn.md
@@ -1,11 +1,11 @@
# Deep Echo State Networks
-Deep Echo State Network architectures started to gain some traction recently. In this guide we illustrate how it is possible to use ReservoirComputing.jl to build a deep ESN.
+Deep Echo State Network architectures started to gain some traction recently. In this guide, we illustrate how it is possible to use ReservoirComputing.jl to build a deep ESN.
-The network implemented in this library is taken from [^1]. It works by stacking reservoirs on top of each other, feeding the output on one in the next. The states are obtained by merging all the inner states of the stacked reservoirs. For a more in depth explanation refer to the paper linked above. The full script for this example can be found [here](https://github.com/MartinuzziFrancesco/reservoir-computing-examples/blob/main/deep-esn/deepesn.jl). This example was run on Julia v1.7.2.
+The network implemented in this library is taken from [^1]. It works by stacking reservoirs on top of each other, feeding the output from one into the next. The states are obtained by merging all the inner states of the stacked reservoirs. For a more in-depth explanation, refer to the paper linked above. The full script for this example can be found [here](https://github.com/MartinuzziFrancesco/reservoir-computing-examples/blob/main/deep-esn/deepesn.jl). This example was run on Julia v1.7.2.
## Lorenz Example
-For this example we are going to reuse the Lorenz data used in the [Lorenz System Forecasting](@ref) example.
+For this example, we are going to reuse the Lorenz data used in the [Lorenz System Forecasting](@ref) example.
```@example deep_lorenz
using OrdinaryDiffEq
@@ -31,7 +31,7 @@ target_data = data[:, shift+1:shift+train_len]
test_data = data[:,shift+train_len+1:shift+train_len+predict_len]
```
-Again, it is *important* to notice that the data needs to be formatted in a matrix with the features as rows and time steps as columns like it is done in this example. This is needed even if the time series consists of single values.
+Again, it is *important* to notice that the data needs to be formatted in a matrix, with the features as rows and time steps as columns, as in this example. This is needed even if the time series consists of single values.
The construction of the ESN is also really similar. The only difference is that the reservoir can be fed as an array of reservoirs.
```@example deep_lorenz
@@ -50,11 +50,11 @@ esn = ESN(input_data;
states_type = StandardStates())
```
-As it is possible to see, different sizes can be chosen for the different reservoirs. The input layer and bias can also be given as vectors, but of course they have to be of the same size of the reservoirs vector. If they are not passed as a vector, the value passed is going to be used for all the layers in the deep ESN.
+As it is possible to see, different sizes can be chosen for the different reservoirs. The input layer and bias can also be given as vectors, but of course, they have to be of the same size as the reservoirs vector. If they are not passed as a vector, the value passed will be used for all the layers in the deep ESN.
-In addition to using the provided functions for the construction of the layers the user can also choose to build their own matrix, or array of matrices, and feed that into the `ESN` in the same way.
+In addition to using the provided functions for the construction of the layers, the user can also choose to build their own matrix, or array of matrices, and feed that into the `ESN` in the same way.
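+As a rough sketch, assuming the `reservoir`, `input_layer`, and `bias` keyword arguments of `ESN` (the sizes and layer choices below are purely illustrative):
+```julia
+#three stacked reservoirs, with one input layer and one bias per reservoir
+reservoirs = [RandSparseReservoir(99), RandSparseReservoir(100), RandSparseReservoir(200)]
+esn = ESN(input_data;
+    reservoir = reservoirs,
+    input_layer = fill(DenseLayer(), 3),
+    bias = fill(NullLayer(), 3))
+```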
-The training and prediction follows the usual framework:
+The training and prediction follow the usual framework:
```@example deep_lorenz
training_method = StandardRidge(0.0)
output_layer = train(esn, target_data, training_method)
@@ -83,7 +83,7 @@ plot(p1, p2, p3, plot_title = "Lorenz System Coordinates",
legendfontsize=12, titlefontsize=20)
```
-Note that there is a known bug at the moment with using `WeightedLayer` as the input layer with the deep ESN. We are in the process of investigating and solving it. The leak coefficient for the reservoirs has to always be the same with the current implementation. This is also something we are actively looking into expanding.
+Note that there is a known bug at the moment with using `WeightedLayer` as the input layer with the deep ESN. We are in the process of investigating and solving it. The leak coefficient for the reservoirs has to always be the same in the current implementation. This is also something we are actively looking into expanding.
## Documentation
[^1]: Gallicchio, Claudio, and Alessio Micheli. "_Deep echo state network (deepesn): A brief survey._" arXiv preprint arXiv:1712.04323 (2017).
diff --git a/docs/src/esn_tutorials/different_drivers.md b/docs/src/esn_tutorials/different_drivers.md
index f126b479..9b9fcd55 100644
--- a/docs/src/esn_tutorials/different_drivers.md
+++ b/docs/src/esn_tutorials/different_drivers.md
@@ -1,21 +1,21 @@
# Using Different Reservoir Drivers
-While the original implementation of the Echo State Network implemented the model using the equations of Recurrent Neural Networks to obtain non linearity in the reservoir, other variations have been proposed in recent years. More specifically the different drivers implemented in ReservoirComputing.jl are the multiple activation function RNN `MRNN()` and the Gated Recurrent Unit `GRU()`. To change them it suffice to give the chosen method to the `ESN` keyword argument `reservoir_driver`. In this section some example of their usage will be given, as well as a quick introduction to their equations.
+While the original implementation of the Echo State Network used the equations of Recurrent Neural Networks to obtain non-linearity in the reservoir, other variations have been proposed in recent years. More specifically, the different drivers implemented in ReservoirComputing.jl are the multiple activation function RNN `MRNN()` and the Gated Recurrent Unit `GRU()`. To change them, it suffices to give the chosen method to the `ESN` keyword argument `reservoir_driver`. In this section, some examples of their usage will be given, as well as a brief introduction to their equations.
## Multiple Activation Function RNN
-Based on the double activation function ESN (DAFESN) proposed in [^1], the Multiple Activation Function ESN expands the idea and allows a custom number of activation functions to be used in the reservoir dynamics. This can be thought as a linear combination of multiple activation functions with corresponding parameters.
+Based on the double activation function ESN (DAFESN) proposed in [^1], the Multiple Activation Function ESN expands the idea and allows a custom number of activation functions to be used in the reservoir dynamics. This can be thought of as a linear combination of multiple activation functions with corresponding parameters.
```math
\mathbf{x}(t+1) = (1-\alpha)\mathbf{x}(t) + \lambda_1 f_1(\mathbf{W}\mathbf{x}(t)+\mathbf{W}_{in}\mathbf{u}(t)) + \dots + \lambda_D f_D(\mathbf{W}\mathbf{x}(t)+\mathbf{W}_{in}\mathbf{u}(t))
```
-where ``D`` is the number of activation function and respective parameters chosen.
+where ``D`` is the number of activation functions and respective parameters chosen.
-The method to call to use the mutliple activation function ESN is `MRNN(activation_function, leaky_coefficient, scaling_factor)`. The arguments can be used as both `args` or `kwargs`. `activation_function` and `scaling_factor` have to be vectors (or tuples) containing the chosen activation functions and respective scaling factors (``f_1,...,f_D`` and ``\lambda_1,...,\lambda_D`` following the nomenclature introduced above). The leaky_coefficient represents ``\alpha`` and it is a single value.
+The method to call to use the multiple activation function ESN is `MRNN(activation_function, leaky_coefficient, scaling_factor)`. The arguments can be used as both `args` and `kwargs`. `activation_function` and `scaling_factor` have to be vectors (or tuples) containing the chosen activation functions and respective scaling factors (``f_1,...,f_D`` and ``\lambda_1,...,\lambda_D`` following the nomenclature introduced above). The `leaky_coefficient` represents ``\alpha`` and it is a single value.
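+As a minimal sketch, with purely illustrative activation functions and coefficients (`f2` is defined further below in this example):
+```julia
+#positional arguments
+mrnn = MRNN([tanh, f2], 0.85, [0.45, 0.3])
+#equivalent keyword argument form
+mrnn = MRNN(activation_function = [tanh, f2],
+    leaky_coefficient = 0.85,
+    scaling_factor = [0.45, 0.3])
+```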
-Starting the example, the data used is based on the following function based on the DAFESN paper [^1]. A full script of the example is available [here](https://github.com/MartinuzziFrancesco/reservoir-computing-examples/blob/main/change_drivers/mrnn/mrnn.jl). This example was run on Julia v1.7.2.
+Starting with the example, the data used is based on the following function based on the DAFESN paper [^1]. A full script of the example is available [here](https://github.com/MartinuzziFrancesco/reservoir-computing-examples/blob/main/change_drivers/mrnn/mrnn.jl). This example was run on Julia v1.7.2.
```@example mrnn
u(t) = sin(t)+sin(0.51*t)+sin(0.22*t)+sin(0.1002*t)+sin(0.05343*t)
```
-For this example the type of prediction will be one step ahead. The metric used to assure a good prediction is going to be the normalized root-mean-square deviation `rmsd` from [StatsBase](https://juliastats.org/StatsBase.jl/stable/). Like in the other examples first it is needed to gather the data:
+For this example, the type of prediction will be one step ahead. The metric used to assure a good prediction will be the normalized root-mean-square deviation `rmsd` from [StatsBase](https://juliastats.org/StatsBase.jl/stable/). Like in the other examples, the first step is to gather the data:
```@example mrnn
train_len = 3000
predict_len = 2000
@@ -28,14 +28,14 @@ testing_input = reduce(hcat, data[shift+train_len:shift+train_len+predict_len-1]
testing_target = reduce(hcat, data[shift+train_len+1:shift+train_len+predict_len])
```
-In order to follow the paper more closely it is necessary to define a couple of activation functions. The numbering of them follows the ones in the paper. Of course one can also use any function, custom defined, available in the base language or any activation function from [NNlib](https://fluxml.ai/Flux.jl/stable/models/nnlib/#Activation-Functions).
+To follow the paper more closely, it is necessary to define a couple of activation functions. The numbering of them follows the ones in the paper. Of course, one can also use any custom-defined function, any function available in the base language, or any activation function from [NNlib](https://fluxml.ai/Flux.jl/stable/models/nnlib/#Activation-Functions).
```@example mrnn
f2(x) = (1-exp(-x))/(2*(1+exp(-x)))
f3(x) = (2/pi)*atan((pi/2)*x)
f4(x) = x/sqrt(1+x*x)
```
-It is now possible to build different drivers, using the parameters suggested by the paper. Also in this instance the numbering follows the test cases of the paper. In the end a simple for loop is implemented to compare the different drivers and activation functions.
+It is now possible to build different drivers, using the parameters suggested by the paper. Also, in this instance, the numbering follows the test cases of the paper. In the end, a simple for loop is implemented to compare the different drivers and activation functions.
```@example mrnn
using ReservoirComputing, Random, StatsBase
@@ -73,10 +73,10 @@ for case in test_cases
end
```
-In this example it is also possible to observe the input of parameters to the methods `RNN()` `MRNN()` both by argument and by keyword argument.
+In this example, it is also possible to observe the input of parameters to the methods `RNN()` and `MRNN()`, both by argument and by keyword argument.
## Gated Recurrent Unit
-Gated Recurrent Units (GRUs) [^2] have been proposed in more recent years with the intent of limiting notable problems of RNNs, like the vanishing gradient. This change in the underlying equations can be easily transported in the Reservoir Computing paradigm, switching the RNN equations in the reservoir with the GRU equations. This approach has been explored in [^3] and [^4]. Different variations of GRU have been proposed [^5][^6]; this section is subdivided into different sections that go in detail about the governing equations and the implementation of them into ReservoirComputing.jl. Like before, to access the GRU reservoir driver it suffice to change the `reservoir_diver` keyword argument for `ESN` with `GRU()`. All the variations that are going to be presented can be used in this package by leveraging the keyword argument `variant` in the method `GRU()` and specifying the chosen variant: `FullyGated()` or `Minimal()`. Other variations are possible modifying the inner layers and reservoirs. The default is set to the standard version `FullyGated()`. The first section will go in more detail about the default of the `GRU()` method, and the following ones will refer to it to minimize repetitions. This example was run on Julia v1.7.2.
+Gated Recurrent Units (GRUs) [^2] have been proposed in more recent years with the intent of limiting notable problems of RNNs, like the vanishing gradient. This change in the underlying equations can be easily transported into the Reservoir Computing paradigm, by switching the RNN equations in the reservoir with the GRU equations. This approach has been explored in [^3] and [^4]. Different variations of GRU have been proposed [^5][^6]; this section is subdivided into subsections that go into detail about the governing equations and their implementation in ReservoirComputing.jl. Like before, to access the GRU reservoir driver, it suffices to change the `reservoir_driver` keyword argument of `ESN` to `GRU()`. All the variations that will be presented can be used in this package by leveraging the keyword argument `variant` in the method `GRU()` and specifying the chosen variant: `FullyGated()` or `Minimal()`. Other variations are possible by modifying the inner layers and reservoirs. The default is set to the standard version `FullyGated()`. The first section will go into more detail about the default of the `GRU()` method, and the following ones will refer to it to minimize repetitions. This example was run on Julia v1.7.2.
### Standard GRU
The equations for the standard GRU are as follows:
@@ -87,12 +87,12 @@ The equations for the standard GRU are as follows:
\mathbf{x}(t) = \mathbf{z}(t) \odot \mathbf{x}(t-1)+(1-\mathbf{z}(t)) \odot \tilde{\mathbf{x}}(t)
```
-Going over the `GRU` keyword argument it will be explained how to feed the desired input to the model.
- - `activation_function` is a vector with default values `[NNlib.sigmoid, NNlib.sigmoid, tanh]`. This argument controls the activation functions of the GRU, going from top to bottom. Changing the first element corresponds in changing the activation function for ``\mathbf{r}(t)`` and so on.
+Going over the `GRU` keyword argument, it will be explained how to feed the desired input to the model.
+ - `activation_function` is a vector with default values `[NNlib.sigmoid, NNlib.sigmoid, tanh]`. This argument controls the activation functions of the GRU, going from top to bottom. Changing the first element corresponds to changing the activation function for ``\mathbf{r}(t)`` and so on.
- `inner_layer` is a vector with default values `fill(DenseLayer(), 2)`. This keyword argument controls the ``\mathbf{W}_{\text{in}}``s going from top to bottom like before.
- `reservoir` is a vector with default value `fill(RandSparseReservoir(), 2)`. In a similar fashion to `inner_layer`, this keyword argument controls the reservoir matrix construction in a top to bottom order.
- `bias` is again a vector with default value `fill(DenseLayer(), 2)`. It is meant to control the ``\mathbf{b}``s, going as usual from top to bottom.
- - `variant` as already illustrated controls the GRU variant. The default value is set to `FullyGated()`.
+ - `variant` controls the GRU variant. The default value is set to `FullyGated()`.
It is important to notice that `inner_layer` and `reservoir` control every layer except ``\mathbf{W}_{in}`` and ``\mathbf{W}`` and ``\mathbf{b}``. These arguments are given as input to the `ESN()` call as `input_layer`, `reservoir` and `bias`.
@@ -105,7 +105,7 @@ The first variation of the GRU is dependent only on the previous hidden state an
\mathbf{z}(t) = \sigma (\mathbf{W}^z\mathbf{x}(t-1)+\mathbf{b}_z) \\
```
-To obtain this variation it will suffice to set `inner_layer = fill(NullLayer(), 2)` and leaving the `variant = FullyGated()`.
+To obtain this variation, it will suffice to set `inner_layer = fill(NullLayer(), 2)` and leave `variant = FullyGated()`.
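+A sketch of this configuration, assuming the `GRU` keyword arguments described above:
+```julia
+gru_type1 = GRU(variant = FullyGated(),
+    inner_layer = fill(NullLayer(), 2))
+```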
### Type 2
The second variation only depends on the previous hidden state:
@@ -114,10 +114,10 @@ The second variation only depends on the previous hidden state:
\mathbf{z}(t) = \sigma (\mathbf{W}^z\mathbf{x}(t-1)) \\
```
-Similarly to before, to obtain this variation it is only needed to set `inner_layer = fill(NullLayer(), 2)` and `bias = fill(NullLayer(), 2)` while keeping `variant = FullyGated()`.
+Similarly to before, to obtain this variation, it is only required to set `inner_layer = fill(NullLayer(), 2)` and `bias = fill(NullLayer(), 2)` while keeping `variant = FullyGated()`.
### Type 3
-The final variation before the minimal one depends only on the biases
+The final variation, before the minimal one, depends only on the biases:
```math
\mathbf{r}(t) = \sigma (\mathbf{b}_r) \\
\mathbf{z}(t) = \sigma (\mathbf{b}_z) \\
@@ -136,7 +136,7 @@ The minimal GRU variation merges two gates into one:
This variation can be obtained by setting `variation=Minimal()`. The `inner_layer`, `reservoir` and `bias` kwargs this time are **not** vectors, but must be defined like, for example `inner_layer = DenseLayer()` or `reservoir = SparseDenseReservoir()`.
### Examples
-To showcase the use of the `GRU()` method this section will only illustrate the standard `FullyGated()` version. The full script for this example with the data can be found [here](https://github.com/MartinuzziFrancesco/reservoir-computing-examples/blob/main/change_drivers/gru/l).
+To showcase the use of the `GRU()` method, this section will only illustrate the standard `FullyGated()` version. The full script for this example with the data can be found [here](https://github.com/MartinuzziFrancesco/reservoir-computing-examples/blob/main/change_drivers/gru/).
The data used for this example is the Santa Fe laser dataset [^7] retrieved from [here](https://web.archive.org/web/20160427182805/http://www-psych.stanford.edu/~andreas/Time-Series/SantaFe.html). The data is split to account for a next step prediction.
```@example gru
@@ -166,7 +166,7 @@ esn = ESN(training_input;
reservoir_driver = GRU())
```
-The default inner reservoir and input layer for the GRU are the same defaults for the `reservoir` and `input_layer` of the ESN. One can use the explicit call if they choose so.
+The default inner reservoir and input layer for the GRU are the same as the defaults for the `reservoir` and `input_layer` of the ESN. One can use the explicit call if they choose to.
```@example gru
gru = GRU(reservoir=[RandSparseReservoir(res_size),
RandSparseReservoir(res_size)],
diff --git a/docs/src/esn_tutorials/hybrid.md b/docs/src/esn_tutorials/hybrid.md
index 31ecae94..2a7b72b7 100644
--- a/docs/src/esn_tutorials/hybrid.md
+++ b/docs/src/esn_tutorials/hybrid.md
@@ -1,8 +1,8 @@
# Hybrid Echo State Networks
-Following the idea of giving physical information to machine learning models the hybrid echo state networks [^1] try to achieve this results by feeding model data into the ESN. In this example it is explained how to create and leverage such models in ReservoirComputing.jl. The full script for this example is available [here](https://github.com/MartinuzziFrancesco/reservoir-computing-examples/blob/main/hybrid/hybrid.jl). This example was run on Julia v1.7.2.
+Following the idea of giving physical information to machine learning models, the hybrid echo state networks [^1] try to achieve this result by feeding model data into the ESN. In this example, it is explained how to create and leverage such models in ReservoirComputing.jl. The full script for this example is available [here](https://github.com/MartinuzziFrancesco/reservoir-computing-examples/blob/main/hybrid/hybrid.jl). This example was run on Julia v1.7.2.
## Generating the data
-For this example we are going to forecast the Lorenz system. As usual the data is generated leveraging `DifferentialEquations.jl`:
+For this example, we are going to forecast the Lorenz system. As usual, the data is generated leveraging `DifferentialEquations.jl`:
```@example hybrid
using DifferentialEquations
@@ -33,7 +33,7 @@ tspan_train = (tspan[1], ode_sol.t[train_len])
```
## Building the Hybrid Echo State Network
-In order to feed the data to the ESN it is necessary to create a suitable function.
+To feed the data to the ESN, it is necessary to create a suitable function.
```@example hybrid
function prior_model_data_generator(u0, tspan, tsteps, model = lorenz)
prob = ODEProblem(lorenz, u0, tspan)
@@ -42,7 +42,7 @@ function prior_model_data_generator(u0, tspan, tsteps, model = lorenz)
end
```
-Given initial condition, time span and time steps this function returns the data for the chosen model. Now, using the `Hybrid` method it is possible to input all this information to the model
+Given the initial condition, time span, and time steps, this function returns the data for the chosen model. Now, using the `Hybrid` method, it is possible to input all this information to the model.
```@example hybrid
using ReservoirComputing, Random
Random.seed!(42)
diff --git a/docs/src/esn_tutorials/lorenz_basic.md b/docs/src/esn_tutorials/lorenz_basic.md
index ee21b6c4..84b53220 100644
--- a/docs/src/esn_tutorials/lorenz_basic.md
+++ b/docs/src/esn_tutorials/lorenz_basic.md
@@ -3,7 +3,7 @@
This example expands on the readme Lorenz system forecasting to better showcase how to use methods and functions provided in the library for Echo State Networks. Here the prediction method used is ```Generative```, for a more detailed explanation of the differences between ```Generative``` and ```Predictive``` please refer to the other examples given in the documentation. The full script for this example is available [here](https://github.com/MartinuzziFrancesco/reservoir-computing-examples/blob/main/lorenz_basic/lorenz_basic.jl). This example was run on Julia v1.7.2.
## Generating the data
-Starting off the workflow the first step is to obtain the data. Leveraging ```OrdinaryDiffEq``` it is possible to derive the Lorenz system data in the following way:
+Starting off the workflow, the first step is to obtain the data. Leveraging ```OrdinaryDiffEq```, it is possible to derive the Lorenz system data in the following way:
```@example lorenz
using OrdinaryDiffEq
@@ -19,7 +19,7 @@ prob = ODEProblem(lorenz!, [1.0,0.0,0.0], (0.0,200.0))
data = solve(prob, ABM54(), dt=0.02)
```
-After obtaining the data it is necessary to determine the kind of prediction for the model. Since this example is going to use the ```Generative``` prediction type, this means that the target data is foing to be the next step of the input data. In addition it is important to notice that the Lorenz system just obtained presents a transient period that is not representative of the general behavior of the system. This can easily be discarded setting a ```shift``` parameter.
+After obtaining the data, it is necessary to determine the kind of prediction for the model. Since this example will use the ```Generative``` prediction type, this means that the target data will be the next step of the input data. In addition, it is important to notice that the Lorenz system just obtained presents a transient period that is not representative of the general behavior of the system. This can easily be discarded by setting a ```shift``` parameter.
```@example lorenz
#determine shift length, training length and prediction length
shift = 300
@@ -32,10 +32,10 @@ target_data = data[:, shift+1:shift+train_len]
test_data = data[:,shift+train_len+1:shift+train_len+predict_len]
```
-It is *important* to notice that the data needs to be formatted in a matrix with the features as rows and time steps as columns like it is done in this example. This is needed even if the time series consists of single values.
+It is *important* to notice that the data needs to be formatted in a matrix with the features as rows and time steps as columns as in this example. This is needed even if the time series consists of single values.
## Building the Echo State Network
-Once the data is ready it is possible to define the parameters for the ESN and the ```ESN``` struct itself. In this example the values from [^1] are loosely followed as general guidelines.
+Once the data is ready, it is possible to define the parameters for the ESN and the ```ESN``` struct itself. In this example, the values from [^1] are loosely followed as general guidelines.
```@example lorenz
using ReservoirComputing
@@ -55,22 +55,22 @@ esn = ESN(input_data;
states_type = StandardStates())
```
-Most of the parameters here chosen mirror the default ones, so a direct call is not necessary. The readme example is identical to this one, except for the explicit call. Going line by line to see what is happening starting from ```res_size```: this value determines the dimensions of the reservoir matrix. In this case a size of 300 has been chosen, so the reservoir matrix is going to be 300 x 300. This is not always the case, since some input layer constructions can modify the dimensions of the reservoir, but in that case everything is taken care of internally.
+Most of the parameters chosen here mirror the default ones, so a direct call is not necessary. The readme example is identical to this one, except for the explicit call. Going line by line to see what is happening, starting from ```res_size```: this value determines the dimensions of the reservoir matrix. In this case, a size of 300 has been chosen, so the reservoir matrix will be 300 x 300. This is not always the case, since some input layer constructions can modify the dimensions of the reservoir, but in that case, everything is taken care of internally.
-The ```res_radius``` determines the scaling of the spectral radius of the reservoir matrix; a proper scaling is necessary to assure the Echo State Property. The default value in the ```RandSparseReservoir()``` method is 1.0 in accordance to the most followed guidelines found in the literature (see [^2] and references therein). The ```sparsity``` of the reservoir matrix in this case is obtained by choosing a degree of connections and dividing that by the reservoir size. Of course it is also possible to simply choose any value between 0.0 and 1.0 to test behaviors for different sparsity values. In this example the call to the parameters inside ```RandSparseReservoir()``` was done explicitly to showcase the meaning of each of them, but it is also possible to simply pass the values directly like so ```RandSparseReservoir(1.2, 6/300)```.
+The ```res_radius``` determines the scaling of the spectral radius of the reservoir matrix; a proper scaling is necessary to assure the Echo State Property. The default value in the ```RandSparseReservoir()``` method is 1.0 in accordance with the most commonly followed guidelines found in the literature (see [^2] and references therein). The ```sparsity``` of the reservoir matrix in this case is obtained by choosing a degree of connections and dividing that by the reservoir size. Of course, it is also possible to simply choose any value between 0.0 and 1.0 to test behaviors for different sparsity values. In this example, the call to the parameters inside ```RandSparseReservoir()``` was done explicitly to showcase the meaning of each of them, but it is also possible to simply pass the values directly, like so ```RandSparseReservoir(1.2, 6/300)```.
-The value of ```input_scaling``` determines the upper and lower bounds of the uniform distribution of the weights in the ```WeightedLayer()```. Like before this value can be passed either as argument or keyword argument ```WeightedLayer(0.1)```. The value of 0.1 represents the default. The default input layer is the ```DenseLayer```, a fully connected layer. The details of the weighted version can be found in [^3], for this example this version returns the best results.
+The value of ```input_scaling``` determines the upper and lower bounds of the uniform distribution of the weights in the ```WeightedLayer()```. Like before, this value can be passed either as an argument or as a keyword argument ```WeightedLayer(0.1)```. The value of 0.1 represents the default. The default input layer is the ```DenseLayer```, a fully connected layer. The details of the weighted version can be found in [^3]; for this example, this version returns the best results.
-The reservoir driver represents the dynamics of the reservoir. In the standard ESN definition these dynamics are obtained through a Recurrent Neural Network (RNN), and this is reflected by calling the ```RNN``` driver for the ```ESN``` struct. This option is set as the default and unless there is the need to change parameters it is not needed. The full equation is the following:
+The reservoir driver represents the dynamics of the reservoir. In the standard ESN definition, these dynamics are obtained through a Recurrent Neural Network (RNN), and this is reflected by calling the ```RNN``` driver for the ```ESN``` struct. This option is set as the default, and unless there is the need to change parameters, it is not needed. The full equation is the following:
```math
\textbf{x}(t+1) = (1-\alpha)\textbf{x}(t) + \alpha \cdot \text{tanh}(\textbf{W}\textbf{x}(t)+\textbf{W}_{\text{in}}\textbf{u}(t))
```
-where ``α`` represents the leaky coefficient and tanh can be any activation function. Also ``\textbf{x}`` represent the state vector, ``\textbf{u}`` the input data and ``\textbf{W}, \textbf{W}_{\text{in}}`` are the reservoir matrix and input matrix respectively. The default call to the RNN in the library is the following ```RNN(;activation_function=tanh, leaky_coefficient=1.0)```, where the meaning of the parameters is clear from the equation above. Instead og the hyperbolic tangent any activation function can be used, either leveraging external lybraries such as ```NNlib``` or creating a custom one.
+where ``α`` represents the leaky coefficient, and tanh can be any activation function. Also, ``\textbf{x}`` represents the state vector, ``\textbf{u}`` the input data, and ``\textbf{W}, \textbf{W}_{\text{in}}`` are the reservoir matrix and input matrix, respectively. The default call to the RNN in the library is the following ```RNN(;activation_function=tanh, leaky_coefficient=1.0)```, where the meaning of the parameters is clear from the equation above. Instead of the hyperbolic tangent, any activation function can be used, either leveraging external libraries such as ```NNlib``` or creating a custom one.
-The final calls are modifications to the states in training or prediction. The default calls, depicted in the example, do not make any modifications to the states. This is the safest bet is one is not sure on how these work. The ```nla_type``` applies a non linear algorithm to the states, while the ```states_type``` can expand them concatenating them with the input data, or padding them concatenating a constant value to all the states. More in depth descriptions about these parameters are given in other examples in the documentation.
+The final calls are modifications to the states in training or prediction. The default calls, depicted in the example, do not make any modifications to the states. This is the safest bet if one is not sure how these work. The ```nla_type``` applies a non-linear algorithm to the states, while the ```states_type``` can expand them by concatenating them with the input data, or padding them by concatenating a constant value to all the states. More in-depth descriptions of these parameters are given in other examples in the documentation.
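+As a sketch of a non-default choice (assuming, for illustration, the `NLAT2()` non-linear algorithm and the `PaddedStates()` option described in the general documentation):
+```julia
+esn = ESN(input_data;
+    reservoir = RandSparseReservoir(res_size),
+    nla_type = NLAT2(),
+    states_type = PaddedStates())
+```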
## Training and Prediction
-Now that the ESN has been created and all the parameters have been explained it is time to proceed with the training. The full call of the readme example follows this general idea:
+Now that the ESN has been created and all the parameters have been explained, it is time to proceed with the training. The full call of the readme example follows this general idea:
```@example lorenz
#define training method
training_method = StandardRidge(0.0)
@@ -79,18 +79,18 @@ training_method = StandardRidge(0.0)
output_layer = train(esn, target_data, training_method)
```
-The training returns an ```OutputLayer``` struct containing the trained output matrix and other informations needed for the prediction. The necessary elements in the ```train()``` call are the ```ESN``` struct created in the previous step and the ```target_data```, that in this case is the one step ahead evolution of the Lorenz system. The training method chosen in this example is the standard one, so an equivalent way of calling the ```train``` function here is ```output_layer = train(esn, target_data)``` like the readme basic version. Likewise the default value for the ridge regression parameter is set to zero, so the actual default training is Ordinary Least Squares regression. Other training methods are available and will be explained in following examples.
+The training returns an ```OutputLayer``` struct containing the trained output matrix and other information needed for the prediction. The necessary elements in the ```train()``` call are the ```ESN``` struct created in the previous step and the ```target_data```, which in this case is the one step ahead evolution of the Lorenz system. The training method chosen in this example is the standard one, so an equivalent way of calling the ```train``` function here is ```output_layer = train(esn, target_data)```, like the readme basic version. Likewise, the default value for the ridge regression parameter is set to zero, so the actual default training is Ordinary Least Squares regression. Other training methods are available and will be explained in the following examples.
-Once the ```OutputLayer``` has been obtained the prediction can be done following this procedure:
+Once the ```OutputLayer``` has been obtained, the prediction can be done following this procedure:
```@example lorenz
output = esn(Generative(predict_len), output_layer)
```
-both the training method and the output layer are needed in this call. The number of steps for the prediction must be specified to the ```Generative``` method. The output results are given in a matrix.
+Both the prediction method and the output layer are needed in this call. The number of steps for the prediction must be specified in the ```Generative``` method. The output results are given in a matrix.
!!! info "Saving the states during prediction"
- While the states are saved in the `ESN` struct for the training, for the prediction they are not saved by default. To inspect the states it is necessary to pass the boolean keyword argument `save_states` to the prediction call, in this example using `esn(... ; save_states=true)`. This returns a tuple `(output, states)` where `size(states) = res_size, prediction_len`
+ While the states are saved in the `ESN` struct for the training, for the prediction they are not saved by default. To inspect the states, it is necessary to pass the boolean keyword argument `save_states` to the prediction call, in this example using `esn(... ; save_states=true)`. This returns a tuple `(output, states)` where `size(states) = res_size, prediction_len`
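+    A sketch of such a call, reusing the names from this example:
+    ```julia
+    output, states = esn(Generative(predict_len), output_layer; save_states = true)
+    ```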
-To inspect the results they can easily be plotted using an external library. In this case ```Plots``` is adopted:
+To inspect the results, they can easily be plotted using an external library. In this case, ```Plots``` is adopted:
```@example lorenz
using Plots, Plots.PlotMeasures
diff --git a/docs/src/general/different_training.md b/docs/src/general/different_training.md
index 01c5a788..231389c1 100644
--- a/docs/src/general/different_training.md
+++ b/docs/src/general/different_training.md
@@ -1,14 +1,14 @@
# Changing Training Algorithms
-Notably Echo State Networks have been trained with Ridge Regression algorithms, but the range of useful algorithms to use is much greater. In this section of the documentation it is possible to explore how to use other training methods to obtain the readout layer. All the methods implemented in ReservoirComputing.jl can be used for all models in the library, not only ESNs. The general workflow illustrated in this section will be based on a dummy RC model `my_model = MyModel(...)` that need training in order to obtain the readout layer. The training is done following:
+Historically, Echo State Networks have been trained with Ridge Regression algorithms, but the range of useful algorithms is much greater. In this section of the documentation, it is possible to explore how to use other training methods to obtain the readout layer. All the methods implemented in ReservoirComputing.jl can be used for all models in the library, not only ESNs. The general workflow illustrated in this section will be based on a dummy RC model `my_model = MyModel(...)` that needs training to obtain the readout layer. The training is done as follows:
```julia
training_algo = TrainingAlgo()
readout_layer = train(my_model, train_data, training_algo)
```
-In this section it is possible to explore how to properly build the `training_algo` and all the possible choices available. In the example section of the documentation it will be provided copy-pastable code to better explore the training algorithms and their impact over the model.
+In this section, it is possible to explore how to properly build the `training_algo` and all the possible choices available. In the example section of the documentation, copy-pasteable code will be provided to better explore the training algorithms and their impact on the model.
## Linear Models
-The library includes a standard implementation of ridge regression, callable using `StandardRidge(regularization_coeff)` where the default value for the regularization coefficent is set to zero. This is also the default model called when no model is specified in `train()`. This makes the default call for traning `train(my_model, train_data)` use Ordinary Least Squares (OLS) for regression.
+The library includes a standard implementation of ridge regression, callable using `StandardRidge(regularization_coeff)` where the default value for the regularization coefficient is set to zero. This is also the default model called when no model is specified in `train()`. This makes the default call for training `train(my_model, train_data)` use Ordinary Least Squares (OLS) for regression.
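+For example, training the dummy model above with ridge regression and an arbitrary regularization coefficient:
+```julia
+training_algo = StandardRidge(0.1)
+readout_layer = train(my_model, train_data, training_algo)
+```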
Leveraging [MLJLinearModels](https://juliaai.github.io/MLJLinearModels.jl/stable/) it is possible to expand the choices of linear models used for the training. The wrappers provided are structured in the following way:
```julia
@@ -18,7 +18,7 @@ struct LinearModel
regression_kwargs
end
```
-to call the ridge regression using the MLJLinearModels APIs one can use `LinearModel(;regression=LinearRegression)`. It is also possible to use a specific solver, by calling `LinearModel(regression=LinearRegression, solver=Analytical())`. For all the available solvers please reref to the [MLJLinearModels documentation](https://juliaai.github.io/MLJLinearModels.jl/stable/models). To change the regularization coefficient in the ridge example, using for example `lambda = 0.1`, it is needed to pass it in the `regression_kwargs` like so `LinearModel(;regression=LinearRegression, solver=Analytical(), regression_kwargs=(lambda=lambda))`. The nomenclature of the coefficients must follow the MLJLinearModels APIs, using `lambda, gamma` for `LassoRegression` and `delta, lambda, gamma` for `HuberRegression`. Again, please check the [relevant documentation](https://juliaai.github.io/MLJLinearModels.jl/stable/api/) if in doubt. When using MLJLinearModels based regressors do remember to specify `using MLJLinearModels`.
+to call the ridge regression using the MLJLinearModels APIs, one can use `LinearModel(;regression=LinearRegression)`. It is also possible to use a specific solver, by calling `LinearModel(regression=LinearRegression, solver=Analytical())`. For all the available solvers, please refer to the [MLJLinearModels documentation](https://juliaai.github.io/MLJLinearModels.jl/stable/models/). To change the regularization coefficient in the ridge example, using for example `lambda = 0.1`, it needs to be passed in `regression_kwargs` as a named tuple, like so: `LinearModel(;regression=LinearRegression, solver=Analytical(), regression_kwargs=(lambda=lambda,))` (note the trailing comma, which makes `(lambda=lambda,)` a one-element named tuple). The nomenclature of the coefficients must follow the MLJLinearModels APIs, using `lambda, gamma` for `LassoRegression` and `delta, lambda, gamma` for `HuberRegression`. Again, please check the [relevant documentation](https://juliaai.github.io/MLJLinearModels.jl/stable/api/) if in doubt. When using MLJLinearModels based regressors, do remember to specify `using MLJLinearModels`.
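+Putting the above together, a sketch of a ridge call through the MLJLinearModels wrapper (with an arbitrary `lambda`) could look like:
+```julia
+using MLJLinearModels
+lambda = 0.1
+training_algo = LinearModel(regression = LinearRegression,
+    solver = Analytical(),
+    regression_kwargs = (lambda = lambda,))
+readout_layer = train(my_model, train_data, training_algo)
+```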
## Gaussian Processes
Another way to obtain the readout layer is possible using Gaussian regression. This is provided through a wrapper of [GaussianProcesses](http://stor-i.github.io/GaussianProcesses.jl/latest/) structured in the following way:
@@ -31,9 +31,9 @@ struct GaussianProcess
optimizer
end
```
-While it is necessary to specify a `mean` and a `kernel`, the other defaults are `lognoise=-2, optimize=false, optimizer=Optim.LBFGS()`. For the choice of means and kernels please refer to the proper documentation, [here](http://stor-i.github.io/GaussianProcesses.jl/latest/mean/) and [here](http://stor-i.github.io/GaussianProcesses.jl/latest/kernels/) respectively.
+While it is necessary to specify a `mean` and a `kernel`, the other defaults are `lognoise=-2, optimize=false, optimizer=Optim.LBFGS()`. For the choice of means and kernels, please refer to the proper documentation, [here](http://stor-i.github.io/GaussianProcesses.jl/latest/mean/) and [here](http://stor-i.github.io/GaussianProcesses.jl/latest/kernels/), respectively.
-Building on the simple example given in the GaussianProcesses documentation it is possible to build an intuition of how to use this algorithms for training ReservoirComputing.jl models.
+Building on the simple example given in the GaussianProcesses documentation, it is possible to build an intuition of how to use these algorithms for training ReservoirComputing.jl models.
```julia
mZero = MeanZero() #Zero mean function
kern = SE(0.0,0.0) #Squared exponential kernel (note that hyperparameters are on the log scale)
@@ -41,7 +41,7 @@ logObsNoise = -1.0
gp = GaussianProcess(mZero, kern, lognoise=logObsNoise)
```
-Like in the previous case, if one uses GaussianProcesses based regressors it is necessary to specify `using GaussianProcesses`. Additionally, if the optimizer chosen is from an external package, i.e. Optim, that package need to be used in the script as well adding `using Optim`.
+Like in the previous case, if one uses GaussianProcesses based regressors, it is necessary to specify `using GaussianProcesses`. Additionally, if the optimizer chosen is from an external package, i.e. Optim, that package needs to be used in the script as well by adding `using Optim`.
## Support Vector Regression
-Contrary to the `LinearModel`s and `GaussianProcess`es, no wrappers are needed for support vector regression. By using [LIBSVM.jl](https://github.com/JuliaML/LIBSVM.jl), LIBSVM wrappers in Julia, it is possible to call both `epsilonSVR()` or `nuSVR()` directly in `train()`. For the full range of kernel provided and the parameters to call we refer the user to the official [documentation](https://www.csie.ntu.edu.tw/~cjlin/libsvm/). Like before, if one intends to use LIBSVM regressors it is necessary to specify `using LIBSVM`.
+Contrary to the `LinearModel`s and `GaussianProcess`es, no wrappers are needed for support vector regression. By using [LIBSVM.jl](https://github.com/JuliaML/LIBSVM.jl), LIBSVM wrappers in Julia, it is possible to call either `epsilonSVR()` or `nuSVR()` directly in `train()`. For the full range of kernels provided and the parameters to call, we refer the user to the official [documentation](https://www.csie.ntu.edu.tw/~cjlin/libsvm/). Like before, if one intends to use LIBSVM regressors, it is necessary to specify `using LIBSVM`.
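+A minimal sketch, assuming the `EpsilonSVR` regressor type exported by LIBSVM.jl with default parameters:
+```julia
+using LIBSVM
+readout_layer = train(my_model, train_data, LIBSVM.EpsilonSVR())
+```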
diff --git a/docs/src/general/predictive_generative.md b/docs/src/general/predictive_generative.md
index dbc83184..0c61f08c 100644
--- a/docs/src/general/predictive_generative.md
+++ b/docs/src/general/predictive_generative.md
@@ -1,10 +1,10 @@
# Generative vs Predictive
-The library provides two different methods for prediction denoted as `Predictive()` and `Generative()`, following the two major applications of Reservoir Computing models found in the literature. Both these methods are given as argument for the trained model. While copy-pastable example will be provided further on in the documentation it is better the clarify the difference early on to focus more on the library implementation going forward.
+The library provides two different methods for prediction, denoted as `Predictive()` and `Generative()`, following the two major applications of Reservoir Computing models found in the literature. Both of these methods are given as arguments for the trained model. While copy-pasteable examples will be provided further on in the documentation, it is better to clarify the difference early on to focus more on the library implementation going forward.
## Predictive
-In the first method, the user user can use Reservoir Computing models in a similar fashion as standard Machine Learning models. This means using a set of features as input and a set of labels as outputs. In this case the features and labels can be vectors of different dimensions, as ``X=\{x_1,...,x_n\} \ x_i \in \mathbb{R}^{N}`` and ``Y=\{y_1,...,y_n\} \ y_i \in \mathbb{R}^{M}`` where ``X`` is the feature set and ``Y`` the label set. Given the difference in dimensionality for the prediction call it will be needed to feed to the function the feature set to be labeled, for example calling `Predictive(X)` using the set given in this example.
+In the first method, the user can use Reservoir Computing models in a similar fashion to standard Machine Learning models. This means using a set of features as input and a set of labels as outputs. In this case, the features and labels can be vectors of different dimensions, as ``X=\{x_1,...,x_n\} \ x_i \in \mathbb{R}^{N}`` and ``Y=\{y_1,...,y_n\} \ y_i \in \mathbb{R}^{M}``, where ``X`` is the feature set and ``Y`` the label set. Given the difference in dimensionality, the prediction call needs to be fed the feature set to be labeled, for example by calling `Predictive(X)` using the set given in this example.
-!this allows for one step ahaed or h steps ahaed prediction
+This allows for either one-step-ahead or h-steps-ahead prediction.
## Generative
-The generative method allows the user to extend the forecasting capabilities of the model, letting the predicted results to be fed back in the model to generate the next prediction. By doing so the model is able to run autonomously, without any feature dataset as input. The call for this model needs only the number of steps that the user intend to forecast, for example calling `Generative(100)` to generate one hundred time steps.
+The generative method allows the user to extend the forecasting capabilities of the model, letting the predicted results be fed back into the model to generate the next prediction. By doing so, the model can run autonomously, without any feature dataset as input. The call for this method needs only the number of steps that the user intends to forecast, for example calling `Generative(100)` to generate one hundred time steps.
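+
+As a quick sketch of the difference (assuming a trained model `esn` with its `output_layer`, and a feature set `X`; all three names are placeholders):
+
+```julia
+prediction = esn(Predictive(X), output_layer)  # label the given feature set
+forecast = esn(Generative(100), output_layer)  # generate 100 autonomous steps
+```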
diff --git a/docs/src/general/states_variation.md b/docs/src/general/states_variation.md
index d70a84ab..521fbdad 100644
--- a/docs/src/general/states_variation.md
+++ b/docs/src/general/states_variation.md
@@ -1,19 +1,19 @@
# Altering States
-In every ReservoirComputing model is posible to perform some alteration on the states in the training stage. Depending on the chosen modification this can improve the results for the prediction. Or more simply they can be used to reproduce results in the literature. The alterations are divided in two possibilities: the first concerns padding or extending the states, the second concerns non linear algorithms performed over the states.
+In every ReservoirComputing model it is possible to perform some alteration of the states during the training stage. Depending on the chosen modification, this can improve the results of the prediction, or, more simply, it can be used to reproduce results from the literature. The alterations are divided into two possibilities: the first concerns padding or extending the states, and the second concerns non-linear algorithms performed over the states.
## Padding and Extending States
Extending the states means appending to them the corresponding input values. If ``\textbf{x}(t)`` is the reservoir state at time t corresponding to the input ``\textbf{u}(t)`` the extended state will be represented as `` [\textbf{x}(t); \textbf{u}(t)]`` where ``[;]`` is intended as vertical concatenation. This procedure is, for example, used in [Jaeger's Scholarpedia](http://www.scholarpedia.org/article/Echo_state_network) description of Echo State Networks. The extension of the states can be obtained in every ReservoirComputing.jl model by using the keyword argument `states_type` and calling the method `ExtendedStates()`. No argument is needed.
-Padding the states is appending a constant value, 1.0 for example, to each state. Using the notation introduced before we can define the padded states as ``[\textbf{x}(t); 1.0]``. This approach is detailed in the [seminal guide](https://mantas.info/get-publication/?f=Practical_ESN.pdf) to Echo State Networks by Mantas Lukoševičius. By using the keyword argument `states_type` the user can call the method `PaddedStates(padding)` where `padding` represents the value that will be concatenated to the states. As default the value is set to unity, so the majority of times calling `PaddedStates()` will suffice.
+Padding the states means appending a constant value, 1.0 for example, to each state. Using the notation introduced before, we can define the padded states as ``[\textbf{x}(t); 1.0]``. This approach is detailed in the [seminal guide](https://mantas.info/get-publication/?f=Practical_ESN.pdf) to Echo State Networks by Mantas Lukoševičius. By using the keyword argument `states_type`, the user can call the method `PaddedStates(padding)`, where `padding` represents the value that will be concatenated to the states. By default, the value is set to unity, so most of the time calling `PaddedStates()` will suffice.
-Altough not found easily in the literature, it is also possible to pad the extended states by using the method `PaddedExtendedStates(padding)` that has unity as `padding` default as well.
+Though not easily found in the literature, it is also possible to pad the extended states by using the method `PaddedExtendedStates(padding)`, which also defaults `padding` to unity.
-Of course it is also possible to not apport any of these changes to the states by calling `StandardStates()`. This is also the default choice for the states.
+Of course, it is also possible not to apply any of these changes to the states by calling `StandardStates()`. This is also the default choice for the states.
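+
+A brief sketch of how these options are selected (the training data and the reservoir size are placeholders; every model in ReservoirComputing.jl accepts the same keyword):
+
+```julia
+using ReservoirComputing
+
+esn = ESN(input_data;
+    reservoir = RandSparseReservoir(100),
+    states_type = PaddedStates(padding = 1.0))
+```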
-## Non Linear Algorithms
-First introduced in [^1] and expanded in [^2] these are nonlinear combinations of the columns of the matrix states. There are three such algorithms implemented. Using the keyword argument `nla_type` it is possible to choose in every model in ReservoirComputing.jl the specific non linear algorithm to use. The defualt value is set to `NLADefault()`, where no non linear algorithm takes place.
+## Non-Linear Algorithms
+First introduced in [^1] and expanded in [^2], these are non-linear combinations of the columns of the states matrix. There are three such algorithms implemented. Using the keyword argument `nla_type`, it is possible to choose in every model of ReservoirComputing.jl the specific non-linear algorithm to use. The default value is set to `NLADefault()`, where no non-linear algorithm takes place.
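+
+As with the state alterations above, the algorithm is selected through a single keyword argument; a brief sketch with placeholder data:
+
+```julia
+esn = ESN(input_data;
+    reservoir = RandSparseReservoir(100),
+    nla_type = NLAT2())
+```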
-Following the nomenclature used in [^2] the algorithms can be called as `NLAT1()`, `NLAT2()` and `NLAT3()`. To better explain what they do, let ``\textbf{x}_{i, j}`` be elements of the states matrix, with ``i=1,...,T \ j=1,...,N`` where ``T`` is the length of the training and ``N`` is the reservoir size.
+Following the nomenclature used in [^2], the algorithms can be called as `NLAT1()`, `NLAT2()` and `NLAT3()`. To better explain what they do, let ``\textbf{x}_{i, j}`` be elements of the state matrix, with ``i=1,...,T \ j=1,...,N`` where ``T`` is the length of the training and ``N`` is the reservoir size.
**NLAT1**
```math
diff --git a/docs/src/index.md b/docs/src/index.md
index ee6c2b61..643451a1 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -1,10 +1,10 @@
# ReservoirComputing.jl
-ReservoirComputing.jl provides an efficient, modular and easy to use implementation of Reservoir Computing models such as Echo State Networks (ESNs). Reservoir Computing (RC) is an umbrella term used to describe a family of models such as ESNs and Liquid State Machines (LSMs). The key concept is to expand the input data into a higher dimension and use regression in order to train the model; in some ways Reservoir Computers can be considered similar to kernel methods.
+ReservoirComputing.jl provides an efficient, modular, and easy to use implementation of Reservoir Computing models such as Echo State Networks (ESNs). Reservoir Computing (RC) is an umbrella term used to describe a family of models such as ESNs and Liquid State Machines (LSMs). The key concept is to expand the input data into a higher dimension and use regression to train the model; in some ways, Reservoir Computers can be considered similar to kernel methods.
!!! info "Introductory material"
- This library assumes some basic knowledge of Reservoir Computing. For a good introduction, we suggest the following papers: the first two are the seminal papers about ESN and LSM, the others are in-depth review papers that should cover all the needed information. For the majority of the algorithms implemented in this library we cited in the documentation the original work introducing them. If you ever are in doubt about about a method or a function just type ```? function``` in the Julia REPL to read the relevant notes.
+ This library assumes some basic knowledge of Reservoir Computing. For a good introduction, we suggest the following papers: the first two are the seminal papers about ESN and LSM, the others are in-depth review papers that should cover all the needed information. For the majority of the algorithms implemented in this library, we cited in the documentation the original work introducing them. If you ever have doubts about a method or a function, just type ```? function``` in the Julia REPL to read the relevant notes.
* Jaeger, Herbert: The “echo state” approach to analyzing and training recurrent neural networks-with an erratum note.
* Maass W, Natschläger T, Markram H: Real-time computing without stable states: a new framework for neural computation based on perturbations.
@@ -12,39 +12,42 @@ ReservoirComputing.jl provides an efficient, modular and easy to use implementat
* Lukoševičius, Mantas, and Herbert Jaeger: Reservoir computing approaches to recurrent neural network training.
!!! info "Performance tip"
- For faster computations on the CPU it is suggested to add `using MKL` to the script. For clarity's sake this library will not be indicated under every example in the documentation.
+ For faster computations on the CPU, it is suggested to add `using MKL` to the script. For clarity's sake, this library will not be indicated under every example in the documentation.
## Installation
-ReservoirComputing.jl is registered in the General Julia Registry, so the installation of the package follows the usual procedure:
+
+To install ReservoirComputing.jl, use the Julia package manager:
+
```julia
-import Pkg; Pkg.add("ReservoirComputing")
+using Pkg
+Pkg.add("ReservoirComputing")
```
The support for this library is for Julia v1.6 or greater.
## Features Overview
-This library provides multiple ways of training the chosen RC model. More specifically the available algorithms are:
+This library provides multiple ways of training the chosen RC model. More specifically, the available algorithms are:
- ```StandardRidge```: a naive implementation of Ridge Regression. The default choice for training.
- ```LinearModel```: a wrap around [MLJLinearModels](https://juliaai.github.io/MLJLinearModels.jl/stable/).
- ```LIBSVM.AbstractSVR```: a direct call of [LIBSVM](https://github.com/JuliaML/LIBSVM.jl) regression methods.
-Also provided are two different ways of doing predictions using RC:
+Also provided are two different ways of making predictions using RC:
- ```Generative```: the algorithm uses the prediction of the model in the previous step to continue the prediction. It only needs the number of steps as input.
-- ```Predictive```: standard Machine Learning type of prediction. Given the features the RC model will return the label/prediction.
+- ```Predictive```: standard Machine Learning type of prediction. Given the features, the RC model will return the label/prediction.
-It is possible to modify the RC obtained states in the training and prediction step using the following:
+It is possible to modify the states obtained by the RC model in the training and prediction steps using the following:
- ```StandardStates```: default choice, no changes will be made to the states.
-- ```ExtendedStates```: the states are extended using a vertical concatenation with the input data.
-- ```PaddedStates```: the states are padded using a vertical concatenation with the choosing padding value
-- ```PaddedExtendedStates```: a combination of the first two. First the states are extended and then padded.
+- ```ExtendedStates```: the states are extended using a vertical concatenation with the input data.
+- ```PaddedStates```: the states are padded using a vertical concatenation with the chosen padding value.
+- ```PaddedExtendedStates```: a combination of the first two. First, the states are extended and then padded.
-In addition another modification is possible through the choice of non linear algorithms:
+In addition, another modification is possible through the choice of non-linear algorithms:
- ```NLADefault```: default choice, no changes will be made to the states.
- ```NLAT1```
- ```NLAT2```
- ```NLAT3```
### Echo State Networks
-Regarding ESNs in the library are implemented the following input layers:
+For ESNs, the following input layers are implemented:
- ```WeightedLayer```: weighted layer matrix with weights sampled from a uniform distribution.
- ```DenseLayer```: dense layer matrix with weights sampled from a uniform distribution.
- ```SparseLayer```: sparse layer matrix with weights sampled from a uniform distribution.
@@ -53,7 +56,7 @@ Regarding ESNs in the library are implemented the following input layers:
- ```IrrationalSample```
- ```InformedLayer```: special kind of weighted layer matrix for Hybrid ESNs.
-The package also contains multiple implementation of Reservoirs:
+The package also contains multiple implementations of Reservoirs:
- ```RandSparseReservoir```: random sparse matrix with scaling of spectral radius
- ```PseudoSVDReservoir```: Pseudo SVD construction of a random sparse matrix
- ```DelayLineReservoir```: minimal matrix with chosen weights
@@ -61,9 +64,9 @@ The package also contains multiple implementation of Reservoirs:
- ```SimpleCycleReservoir```: minimal matrix with chosen weights
- ```CycleJumpsReservoir```: minimal matrix with chosen weights
-In addition multiple ways of driving the reservoir states are also provided:
+In addition, multiple ways of driving the reservoir states are also provided:
- ```RNN```: standard Recurrent Neural Network driver.
-- ```MRNN```: Multiple RNN driver, it consists on a linear combination of RNNs
+- ```MRNN```: Multiple RNN driver; it consists of a linear combination of RNNs.
- ```GRU```: gated Recurrent Unit driver, with all the possible GRU variants available:
- ```FullyGated```
- ```Variant1```
@@ -71,15 +74,29 @@ In addition multiple ways of driving the reservoir states are also provided:
- ```Variant3```
- ```Minimal```
-An hybrid version of the model is also available through ```Hybrid```
+A hybrid version of the model is also available through ```Hybrid```. A short sketch of how the components above plug into the ```ESN``` constructor follows.
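+
+The sketch is only illustrative (placeholder data, sizes, and parameter values):
+
+```julia
+using ReservoirComputing
+
+esn = ESN(input_data;
+    input_layer = DenseLayer(scaling = 0.1),
+    reservoir = RandSparseReservoir(100, radius = 1.2),
+    reservoir_driver = RNN())
+```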
### Reservoir Computing with Cellular Automata
-The package provides also an implementation of Reservoir Computing models based on one dimensional Cellular Automata through the ```RECA``` call. For the moment the only input encoding available (an input encoding plays a similar role to the input matrix for ESNs) is a random mapping, called through ```RandomMapping```.
+The package also provides an implementation of Reservoir Computing models based on one-dimensional Cellular Automata through the ```RECA``` call. For the moment, the only input encoding available (an input encoding plays a role similar to that of the input matrix for ESNs) is a random mapping, called through ```RandomMapping```.
+
+All the training methods described above can be used, as can all the modifications to the states. Both prediction methods are also possible in theory, although in the literature only ```Predictive``` tasks have been explored.
+
+## Contributing
-All the training methods described above can be used, as well as all the modifications to the states. Both prediction methods are also possible in theory, although in the literature only ```Predictive``` tasks have been explored.
+ - Please refer to the
+ [SciML ColPrac: Contributor's Guide on Collaborative Practices for Community Packages](https://github.com/SciML/ColPrac/blob/master/README.md)
+ for guidance on PRs, issues, and other matters relating to contributing to SciML.
+
+ - See the [SciML Style Guide](https://github.com/SciML/SciMLStyle) for common coding practices and other style decisions.
+ - There are a few community forums:
+
+ + The #diffeq-bridged and #sciml-bridged channels in the
+ [Julia Slack](https://julialang.org/slack/)
+ + The #diffeq-bridged and #sciml-bridged channels in the
+ [Julia Zulip](https://julialang.zulipchat.com/#narrow/stream/279055-sciml-bridged)
+ + On the [Julia Discourse forums](https://discourse.julialang.org)
+ + See also [SciML Community page](https://sciml.ai/community/)
-### Contributing
-Contributions are very welcomed! Some interesting variation of RC models are posted in the issues, but everyone is free to just post relevant papers that could fit the scope of the library. Help with the documentation, providing new examples or application cases is also really important and appreciated. Everything that can make the package a little better is a great contribution, no matter how small. The API section of the documentation provides a more in depth look into how things work and are connected, so that is a good place to start exploring more the library. For every doubt that cannot be expressed in issues please feel free to contact any of the lead developers on Slack or by email.
## Citing
@@ -129,26 +146,19 @@ Pkg.status(;mode = PKGMODE_MANIFEST) # hide
```@raw html
```
-```@raw html
-You can also download the
-manifest file and the
-project file.
+using TOML
+using Markdown
+version = TOML.parse(read("../../Project.toml", String))["version"]
+name = TOML.parse(read("../../Project.toml", String))["name"]
+link_manifest = "https://github.com/SciML/" * name * ".jl/tree/gh-pages/v" * version *
+ "/assets/Manifest.toml"
+link_project = "https://github.com/SciML/" * name * ".jl/tree/gh-pages/v" * version *
+ "/assets/Project.toml"
+Markdown.parse("""You can also download the
+[manifest]($link_manifest)
+file and the
+[project]($link_project)
+file.
+""")
```
diff --git a/docs/src/reca_tutorials/reca.md b/docs/src/reca_tutorials/reca.md
index ccd06d3a..68410541 100644
--- a/docs/src/reca_tutorials/reca.md
+++ b/docs/src/reca_tutorials/reca.md
@@ -1,8 +1,8 @@
# Reservoir Computing using Cellular Automata
-Reservoir Computing based on Elementary Cellular Automata (ECA) has been recently introduced. Dubbed as ReCA [^1][^2] it proposed the advantage of storing the reservoir states as binary data. Less parameter tuning represents another advantage of this model. The architecture implemented in ReservoirComputing.jl follows [^3] which build over the original implementation, improving the results. It is strongly suggested to go through the paper to get a solid understanding of the model before delving into experimentation with the code.
+Reservoir Computing based on Elementary Cellular Automata (ECA) has recently been introduced. Dubbed ReCA [^1][^2], it offers the advantage of storing the reservoir states as binary data. Less parameter tuning represents another advantage of this model. The architecture implemented in ReservoirComputing.jl follows [^3], which builds on top of the original implementation, improving the results. It is strongly suggested to go through the paper to get a solid understanding of the model before delving into experimentation with the code.
-To showcase how to use this models this page illustrates the performance of ReCA in the 5 bit memory task [^4]. The script for the example and companion data can be found [here](https://github.com/MartinuzziFrancesco/reservoir-computing-examples/tree/main/reca).
+To showcase how to use these models, this page illustrates the performance of ReCA in the 5 bit memory task [^4]. The script for the example and companion data can be found [here](https://github.com/MartinuzziFrancesco/reservoir-computing-examples/tree/main/reca).
## 5 bit memory task
The data can be read as follows:
@@ -13,26 +13,26 @@ input = readdlm("./5bitinput.txt", ',', Float32)
output = readdlm("./5bitoutput.txt", ',', Float32)
```
-To use a ReCA model it is necessary to define the rule one intends to use. To do so ReservoirComputing.jl leverages [CellularAutomata.jl](https://github.com/MartinuzziFrancesco/CellularAutomata.jl) that needs to be called as well to define the `RECA` struct:
+To use a ReCA model, it is necessary to define the rule one intends to use. To do so, ReservoirComputing.jl leverages [CellularAutomata.jl](https://github.com/MartinuzziFrancesco/CellularAutomata.jl), which also needs to be loaded in order to define the `RECA` struct:
```@example reca
using ReservoirComputing, CellularAutomata
ca = DCA(90)
```
-To define the ReCA model it suffices to call:
+To define the ReCA model, it suffices to call:
```@example reca
reca = RECA(input, ca;
generations = 16,
input_encoding = RandomMapping(16, 40))
```
-After the training can be performed with the chosen method.
+After this, the training can be performed with the chosen method.
```@example reca
output_layer = train(reca, output, StandardRidge(0.00001))
```
-The prediction in this case will be a `Predictive()` with the input data equal to the training data. In addition, to test the 5 bit memory task, a conversion from Float to Bool is necessary (at the moment we are aware of a bug that doesn't allow to input boolean data to the RECA models):
+The prediction in this case will be a `Predictive()` with the input data equal to the training data. In addition, to test the 5 bit memory task, a conversion from Float to Bool is necessary (at the moment, we are aware of a bug that doesn't allow boolean input data to the RECA models):
```@example reca
prediction = reca(Predictive(input), output_layer)
final_pred = convert(AbstractArray{Float32}, prediction .> 0.5)
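+# a quick sanity check (illustrative): an exact match with the target output
+# indicates the 5 bit memory task has been reproduced
+final_pred == output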
diff --git a/src/ReservoirComputing.jl b/src/ReservoirComputing.jl
index d4dec0da..09743ddd 100644
--- a/src/ReservoirComputing.jl
+++ b/src/ReservoirComputing.jl
@@ -51,7 +51,7 @@ end
"""
Generative(prediction_len)
-This prediction methodology allows the models to produce an autonomous prediction, feeding the prediction into itself to generate the next step.
+This prediction methodology allows the models to produce an autonomous prediction, feeding the prediction back into the model to generate the next step.
The only parameter needed is the number of steps for the prediction.
"""
struct Generative{T} <: AbstractPrediction
@@ -66,7 +66,7 @@ end
"""
Predictive(prediction_data)
-Given a set of labels as ```prediction_data``` this method of prediction will return the correspinding labels in a standard Machine Learning fashion.
+Given a set of labels as ```prediction_data```, this method of prediction will return the corresponding labels in a standard Machine Learning fashion.
"""
function Predictive(prediction_data)
prediction_len = size(prediction_data, 2)
diff --git a/src/esn/echostatenetwork.jl b/src/esn/echostatenetwork.jl
index 79228c68..1230ee5f 100644
--- a/src/esn/echostatenetwork.jl
+++ b/src/esn/echostatenetwork.jl
@@ -31,7 +31,7 @@ end
"""
Hybrid(prior_model, u0, tspan, datasize)
-Given the model parameters returns an ```Hybrid``` variation of the ESN. This entails
+Given the model parameters, returns a ```Hybrid``` variation of the ESN. This entails
a different training and prediction. Construction based on [1].
[1] Jaideep Pathak et al. "Hybrid Forecasting of Chaotic Processes: Using Machine
@@ -62,9 +62,9 @@ end
Constructor for the Echo State Network model. It requires the reservoir size as the input
and the data for the training. It returns a struct ready to be trained with the states
-already harvested.
+already harvested.
-After the training this struct can be used for the prediction following the second
+After the training, this struct can be used for the prediction following the second
function call. This will take as input a prediction type and the output layer from the
training. The ```initial_conditions``` and ```last_state``` parameters can be left as
they are, unless there is a specific reason to change them. All the components are
@@ -190,7 +190,7 @@ end
train(esn::AbstractEchoStateNetwork, target_data, training_method=StandardRidge(0.0))
Training of the built ESN over the ```target_data```. The default training method is
-RidgeRegression. The output is an ```OutputLayer``` object to be fed at the esn call
+RidgeRegression. The output is an ```OutputLayer``` object to be fed to the esn call
for the prediction.
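+
+A minimal usage sketch (assuming an already built `esn` and a compatible `target_data`):
+
+```julia
+output_layer = train(esn, target_data, StandardRidge(0.0))
+```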
"""
function train(esn::AbstractEchoStateNetwork,
diff --git a/src/esn/esn_input_layers.jl b/src/esn/esn_input_layers.jl
index bc3f59d8..4794be43 100644
--- a/src/esn/esn_input_layers.jl
+++ b/src/esn/esn_input_layers.jl
@@ -8,12 +8,12 @@ end
WeightedInput(scaling)
WeightedInput(;scaling=0.1)
-Returns a weighted layer initializer object, that will produce a weighted input matrix with
-a with random non-zero elements drawn from [-```scaling```, ```scaling```], as described
+Returns a weighted layer initializer object, that will produce a weighted input matrix with
+random non-zero elements drawn from [-```scaling```, ```scaling```], as described
in [1]. The ```scaling``` factor can be given as arg or kwarg.
[1] Lu, Zhixin, et al. "_Reservoir observers: Model-free inference of unmeasured variables
-in chaotic systems._"
+in chaotic systems._"
Chaos: An Interdisciplinary Journal of Nonlinear Science 27.4 (2017): 041102.
"""
function WeightedLayer(; scaling = 0.1)
@@ -46,7 +46,7 @@ end
DenseLayer(;scaling=0.1)
Returns a fully connected layer initializer object, that will produce a weighted input
-matrix with a with random non-zero elements drawn from [-```scaling```, ```scaling```].
+matrix with random non-zero elements drawn from [-```scaling```, ```scaling```].
The ```scaling``` factor can be given as arg or kwarg. This is the default choice in the
```ESN``` construction.
"""
@@ -61,7 +61,7 @@ end
"""
create_layer(input_layer::AbstractLayer, res_size, in_size)
-Returns a ```res_size``` times ```in_size``` matrix layer, built accordingly to the
+Returns a ```res_size``` times ```in_size``` matrix layer, built according to the
```input_layer``` constructor.
"""
function create_layer(input_layer::DenseLayer,
@@ -78,8 +78,8 @@ end
SparseLayer(scaling; sparsity=0.1)
SparseLayer(;scaling=0.1, sparsity=0.1)
-Returns a sparsely connected layer initializer object, that will produce a random sparse
-input matrix with random non-zero elements drawn from [-```scaling```, ```scaling```] and
+Returns a sparsely connected layer initializer object, that will produce a random sparse
+input matrix with random non-zero elements drawn from [-```scaling```, ```scaling```] and
given sparsity. The ```scaling``` and ```sparsity``` factors can be given as args or kwargs.
"""
struct SparseLayer{T} <: AbstractLayer
@@ -118,7 +118,7 @@ end
BernoulliSample(;p=0.5)
Returns a Bernoulli sign constructor for the ```MinimumLayer``` call. The ```p``` factor
-determines the probability of the result as in the Distributions call. The value can be
+determines the probability of the result, as in the Distributions call. The value can be
passed as an arg or kwarg. This sign weight determination for input layers is introduced
in [1].
@@ -138,8 +138,8 @@ end
IrrationalSample(irrational, start)
IrrationalSample(;irrational=pi, start=1)
-Returns an irrational sign contructor for the '''MinimumLayer''' call. The values can be
-passed as args or kwargs. The sign of the weight are decided from the decimal expansion of
+Returns an irrational sign constructor for the ```MinimumLayer``` call. The values can be
+passed as args or kwargs. The sign of the weight is decided from the decimal expansion of
the given ```irrational```. The first ```start``` decimal digits are thresholded at 4.5,
then the n-th input sign will be + and - respectively.
diff --git a/src/esn/esn_reservoir_drivers.jl b/src/esn/esn_reservoir_drivers.jl
index e339a3c3..c775ebe7 100644
--- a/src/esn/esn_reservoir_drivers.jl
+++ b/src/esn/esn_reservoir_drivers.jl
@@ -127,13 +127,13 @@ end
"""
MRNN(activation_function, leaky_coefficient, scaling_factor)
- MRNN(;activation_function=[tanh, sigmoid], leaky_coefficient=1.0,
+ MRNN(;activation_function=[tanh, sigmoid], leaky_coefficient=1.0,
scaling_factor=fill(leaky_coefficient, length(activation_function)))
-Returns a Multiple RNN initializer, where multiple function are combined in a linear
+Returns a Multiple RNN initializer, where multiple functions are combined in a linear
combination with chosen parameters ```scaling_factor```. The ```activation_function```
-and ```scaling_factor``` arguments must vectors of the same size. Multiple combinations
-are possible, the implementation is based upon a double activation function idea,
+and ```scaling_factor``` arguments must be vectors of the same size. Multiple combinations
+are possible. The implementation is based upon the double activation function idea,
found in [1].
[1] Lun, Shu-Xian, et al. "_A novel model of leaky integrator echo state network for
@@ -195,7 +195,7 @@ end
Returns a standard Gated Recurrent Unit ESN initializer, as described in [1].
[1] Cho, Kyunghyun, et al. “_Learning phrase representations using RNN encoder-decoder
-for statistical machine translation._”
+for statistical machine translation._”
arXiv preprint arXiv:1406.1078 (2014).
"""
struct FullyGated <: AbstractGRUVariant end
@@ -205,7 +205,7 @@ struct FullyGated <: AbstractGRUVariant end
Returns a minimal GRU ESN initializer as described in [1].
-[1] Zhou, Guo-Bing, et al. "_Minimal gated unit for recurrent neural networks._"
+[1] Zhou, Guo-Bing, et al. "_Minimal gated unit for recurrent neural networks._"
International Journal of Automation and Computing 13.3 (2016): 226-234.
"""
struct Minimal <: AbstractGRUVariant end
diff --git a/src/esn/esn_reservoirs.jl b/src/esn/esn_reservoirs.jl
index 65bd3c40..90ddd344 100644
--- a/src/esn/esn_reservoirs.jl
+++ b/src/esn/esn_reservoirs.jl
@@ -30,8 +30,8 @@ end
create_reservoir(reservoir::AbstractReservoir, res_size)
create_reservoir(reservoir, args...)
-Given an ```AbstractReservoir` constructor and the reservoir size it returns the
-corresponding matrix. Alternatively it accepts a given matrix.
+Given an ```AbstractReservoir``` constructor and the reservoir size, it returns the
+corresponding matrix. Alternatively, it accepts a given matrix.
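+
+For example, a minimal sketch using one of the provided constructors:
+
+```julia
+reservoir_matrix = create_reservoir(RandSparseReservoir(100), 100)
+```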
"""
function create_reservoir(reservoir::RandSparseReservoir,
res_size;
@@ -43,7 +43,7 @@ function create_reservoir(reservoir::RandSparseReservoir,
reservoir_matrix .*= reservoir.radius / rho_w
#TODO: change to explicit if
Inf in unique(reservoir_matrix) || -Inf in unique(reservoir_matrix) ?
- error("Sparsity too low for size of the matrix.
+ error("Sparsity too low for size of the matrix.
Increase res_size or increase sparsity") : nothing
return Adapt.adapt(matrix_type, reservoir_matrix)
end
@@ -85,7 +85,7 @@ end
PseudoSVDReservoir(max_value, sparsity; sorted=true, reverse_sort=false)
Returns an initializer to build a sparse reservoir matrix, with given ```sparsity```
-created using SVD as described in [1].
+created using SVD as described in [1].
[1] Yang, Cuili, et al. "_Design of polynomial echo state networks for time
series prediction._" Neurocomputing 290 (2018): 148-160.
@@ -164,8 +164,8 @@ end
DelayLineReservoir(res_size, weight)
DelayLineReservoir(res_size; weight=0.1)
-Returns a Delay Line Reservoir matrix constructor to obtain a deterministi reservoir as
-described in [1]. The ```weight``` can be passed as arg or kwarg and it determines the
+Returns a Delay Line Reservoir matrix constructor to obtain a deterministic reservoir as
+described in [1]. The ```weight``` can be passed as arg or kwarg, and it determines the
absolute value of all the connections in the reservoir.
[1] Rodan, Ali, and Peter Tino. "_Minimum complexity echo state network._"
@@ -234,8 +234,8 @@ end
SimpleCycleReservoir(res_size, weight)
SimpleCycleReservoir(res_size; weight=0.1)
-Returns a Simple Cycle Reservoir Reservoir constructor to biuld a reservoir matrix as
-described in [1]. The ```weight``` can be passed as arg or kwarg and it determines the
+Returns a Simple Cycle Reservoir constructor to build a reservoir matrix as
+described in [1]. The ```weight``` can be passed as arg or kwarg, and it determines the
absolute value of all the connections in the reservoir.
[1] Rodan, Ali, and Peter Tino. "Minimum complexity echo state network."
@@ -272,9 +272,9 @@ end
CycleJumpsReservoir(res_size, cycle_weight, jump_weight, jump_size)
Return a Cycle Reservoir with Jumps constructor to create a reservoir matrix as described
-in [1]. The ```weight``` and ```jump_weight``` can be passed as args or kwargs and they
+in [1]. The ```weight``` and ```jump_weight``` can be passed as args or kwargs, and they
determine the absolute values of all the connections in the reservoir. The ```jump_size```
-can also be passed either as arg and kwarg and it detemines the jumps between
+can also be passed either as arg or kwarg, and it determines the jumps between
```jump_weight```s.
[1] Rodan, Ali, and Peter Tiňo. "_Simple deterministically constructed cycle reservoirs
@@ -310,7 +310,7 @@ end
"""
NullReservoir()
-Return a constructor for a matrix `zeros(res_size, res_size)`
+Return a constructor for a matrix `zeros(res_size, res_size)`.
"""
struct NullReservoir <: AbstractReservoir end
diff --git a/src/states.jl b/src/states.jl
index 768b5745..b5344601 100644
--- a/src/states.jl
+++ b/src/states.jl
@@ -41,7 +41,7 @@ end
PaddedStates(padding)
PaddedStates(;padding=1.0)
-The states are padded with a chosen value. Usually this value is set to one. The padding is obtained through a
+The states are padded with a chosen value. Usually, this value is set to one. The padding is obtained through a
vertical concatenation of the padding value and the states.
"""
function PaddedStates(; padding = 1.0)
@@ -52,9 +52,9 @@ end
PaddedExtendedStates(padding)
PaddedExtendedStates(;padding=1.0)
-The states are extended with the training data or predicted data and subsequently padded with a chosen value.
-Usually the padding value is set to one. The padding and the extension are obtained through a vertical concatenation
-of the padding value, the data and the states.
+The states are extended with the training data or predicted data and subsequently padded with a chosen value.
+Usually, the padding value is set to one. The padding and the extension are obtained through a vertical concatenation
+of the padding value, the data, and the states.
"""
function PaddedExtendedStates(; padding = 1.0)
return PaddedExtendedStates(padding)