diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
new file mode 100644
index 0000000..716254c
--- /dev/null
+++ b/.github/workflows/docs.yml
@@ -0,0 +1,18 @@
+name: Documentation
+
+on:
+ push:
+ branches:
+ - master
+ - doxygen
+
+jobs:
+ deploy:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: DenverCoder1/doxygen-github-pages-action@v1.2.0
+ with:
+ github_token: ${{ secrets.GITHUB_TOKEN }}
+ branch: gh-pages
+ folder: docs/html
+ config_file: Doxyfile
diff --git a/docs/Doxyfile b/Doxyfile
similarity index 99%
rename from docs/Doxyfile
rename to Doxyfile
index 3dbf330..69b58c7 100644
--- a/docs/Doxyfile
+++ b/Doxyfile
@@ -42,7 +42,7 @@ DOXYFILE_ENCODING = UTF-8
# title of most generated pages and in a few other places.
# The default value is: My Project.
-PROJECT_NAME = "Shkyera Tensor"
+PROJECT_NAME = "Shkyera Grad"
# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
# could be handy for archiving the generated documentation or if some version
@@ -54,7 +54,7 @@ PROJECT_NUMBER = 0.0.1
# for a project that appears at the top of each page and should give viewer a
# quick idea about the purpose of the project. Keep the description short.
-PROJECT_BRIEF = "Header-only C++ library for Deep Learning"
+PROJECT_BRIEF = "micrograd, but in C++ and better"
# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
# in the documentation. The maximum height of the logo should not exceed 55
@@ -918,7 +918,10 @@ WARN_LOGFILE =
# Note: If this tag is empty the current directory is searched.
INPUT = "README.md" \
- "include/src"
+ "docs/tutorials/Cheatsheet.md" \
+ "docs/tutorials/GetStarted.md" \
+ "examples/README.md" \
+ "include/src" \
# This tag can be used to specify the character encoding of the source files
# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
diff --git a/README.md b/README.md
index c805c20..ee9b45f 100644
--- a/README.md
+++ b/README.md
@@ -7,6 +7,7 @@ micrograd, but in C++ and better.
+[![Documentation](https://github.com/fszewczyk/shkyera-grad/actions/workflows/docs.yml/badge.svg)](https://fszewczyk.github.io/shkyera-grad/index.html)
[![LinuxBuild](https://github.com/fszewczyk/shkyera-grad/actions/workflows/linux.yml/badge.svg)](https://github.com/fszewczyk/shkyera-grad/actions/workflows/linux.yml)
[![MacOSBuild](https://github.com/fszewczyk/shkyera-grad/actions/workflows/macos.yml/badge.svg)](https://github.com/fszewczyk/shkyera-grad/actions/workflows/macos.yml)
[![WindowsBuild](https://github.com/fszewczyk/shkyera-grad/actions/workflows/windows.yml/badge.svg)](https://github.com/fszewczyk/shkyera-grad/actions/workflows/windows.yml)
@@ -16,21 +17,22 @@ micrograd, but in C++ and better.
This is a small header-only library implementing a scalar-valued autograd, based on [Andrej Karpathy's micrograd](https://github.com/karpathy/micrograd). It provides a high-level, PyTorch-like API for creating and training simple neural networks.
+It supports multiple optimizers, such as Adam and SGD, all of the most common activation functions, and the basic types of neural layers, all wrapped in a simple, header-only library.
+
## Usage
-Make sure your compiler supports C++17. Shkyera Grad is a header-only library, so the only thing you need to do is to include it in your project.
+Check out our [Get Started Guide](https://fszewczyk.github.io/shkyera-grad/md_docs_tutorials_GetStarted.html) to learn the basics of _Shkyera Grad_.
-```cpp
-#include "include/ShkyeraGrad.hpp"
-```
+## Showcase
-Check out the [examples](examples/README.md) for a quick start on Shkyera Grad. In the meantime, here's a neural network that learns the XOR function.
+Here's a small example showcasing a feed-forward network learning the XOR function. Check out the `examples/` folder for more.
```cpp
-#include "include/ShkyeraGrad.hpp"
+#include "shkyera-grad/include/ShkyeraGrad.hpp"
int main() {
using namespace shkyera;
+ using T = Type::float32;
std::vector<Vec32> xs;
std::vector<Vec32> ys;
@@ -41,32 +43,38 @@ int main() {
xs.push_back(Vec32::of({0, 1})); ys.push_back(Vec32::of({1}));
xs.push_back(Vec32::of({1, 1})); ys.push_back(Vec32::of({0}));
- auto mlp = SequentialBuilder<Type::float32>::begin()
- .add(Linear32::create(2, 15))
- .add(ReLU32::create())
- .add(Dropout32::create(15, 5, 0.2))
- .add(ReLU32::create())
- .add(Linear32::create(5, 1))
- .add(Sigmoid32::create())
- .build();
+ auto network = SequentialBuilder<T>::begin()
+ .add(Linear32::create(2, 15))
+ .add(ReLU32::create())
+ .add(Linear32::create(15, 5))
+ .add(ReLU32::create())
+ .add(Linear32::create(5, 1))
+ .add(Sigmoid32::create())
+ .build();
- Optimizer32 optimizer = Optimizer<Type::float32>(mlp->parameters(), 0.1);
- Loss::Function32 lossFunction = Loss::MSE<Type::float32>;
- // ------ TRAINING THE NETWORK ------- //
- for (size_t epoch = 0; epoch < 100; epoch++) {
+ auto optimizer = Adam32(network->parameters(), 0.05);
+ auto lossFunction = Loss::MSE<T>;
+
+ for (size_t epoch = 0; epoch < 100; epoch++) { // We train for 100 epochs
auto epochLoss = Val32::create(0);
- optimizer.reset();
- for (size_t sample = 0; sample < xs.size(); ++sample) {
- Vec32 pred = mlp->forward(xs[sample]);
- auto loss = lossFunction(pred, ys[sample]);
+ optimizer.reset(); // Reset the gradients
+ for (size_t sample = 0; sample < xs.size(); ++sample) { // We go through each sample
+ Vec32 pred = network->forward(xs[sample]); // We get some prediction
+ auto loss = lossFunction(pred, ys[sample]); // And calculate its error
- epochLoss = epochLoss + loss;
+ epochLoss = epochLoss + loss; // Store the loss for feedback
}
- optimizer.step();
+ optimizer.step(); // Update the parameters
+
+ auto averageLoss = epochLoss / Val32::create(xs.size());
+ std::cout << "Epoch: " << epoch + 1 << " Loss: " << averageLoss->getValue() << std::endl;
+ }
- std::cout << "Epoch: " << epoch + 1 << " Loss: " << epochLoss->getValue() << std::endl;
+ for (size_t sample = 0; sample < xs.size(); ++sample) { // Go through each example
+ Vec32 pred = network->forward(xs[sample]); // Predict result
+ std::cout << xs[sample] << " -> " << pred[0] << "\t| True: " << ys[sample][0] << std::endl;
}
}
```
diff --git a/docs/tutorials/Cheatsheet.md b/docs/tutorials/Cheatsheet.md
new file mode 100644
index 0000000..364df21
--- /dev/null
+++ b/docs/tutorials/Cheatsheet.md
@@ -0,0 +1,73 @@
+# Cheatsheet
+
+This page contains all the info you need to develop your models using Shkyera Grad.
+
+## Types
+
+Almost all of the classes in _Shkyera Grad_ are implemented using templates. To simplify creating these objects, the library provides shorthand aliases for the common floating-point template parameters:
+
+```cpp
+Linear32 = Linear<Type::float32>
+Optimizer32 = Optimizer<Type::float32>
+Loss::MSE64 = Loss::MSE<Type::float64>
+Adam64 = Adam<Type::float64>
+
+{Class}32 = {Class}<float> = {Class}<Type::float32>
+{Class}64 = {Class}<double> = {Class}<Type::float64>
+```
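+
+For instance, the suffixed alias and the explicit template form name the same type, so the following two layers are interchangeable (a small sketch based on the table above):
+
+```cpp
+auto layerA = Linear32::create(2, 4);
+auto layerB = Linear<Type::float32>::create(2, 4);
+```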
+
+## Layers
+
+Here's a full list of available layers:
+
+```cpp
+auto linear = Linear32::create(inputSize, outputSize);
+auto dropout = Dropout32::create(inputSize, outputSize, dropoutRate);
+```
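+
+The examples in this repository also use a handful of activation-function layers. A non-exhaustive list, limited to the ones that appear in these docs and examples:
+
+```cpp
+auto relu = ReLU32::create();
+auto sigmoid = Sigmoid32::create();
+auto tanhActivation = Tanh32::create();
+```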
+
+## Optimizers
+
+These are all of the implemented optimizers:
+
+```cpp
+auto simple = Optimizer32(network->parameters(), learningRate);
+auto sgdWithMomentum = SGD32(network->parameters(), learningRate, momentum);    // momentum defaults to 0.9
+auto adam = Adam32(network->parameters(), learningRate, beta1, beta2, epsilon); // defaults: beta1 = 0.9, beta2 = 0.999, epsilon = 1e-8
+```
+
+## Loss Functions
+
+Optimization can be performed using any of these predefined loss functions:
+
+```cpp
+auto L1 = Loss::MAE32;
+auto L2 = Loss::MSE32;
+auto crossEntropy = Loss::CrossEntropy32;
+```
+
+## Generic Training Loop
+
+Simply copy-paste this code to quickly train your network:
+
+```cpp
+using T = Type::float32; // feel free to change it to float64
+
+auto optimizer = Adam<T>(network->parameters(), 0.05);
+auto lossFunction = Loss::MSE<T>;
+
+for (size_t epoch = 0; epoch < 100; epoch++) {
+ auto epochLoss = Value<T>::create(0);
+
+ optimizer.reset();
+ for (size_t sample = 0; sample < xs.size(); ++sample) {
+ Vector<T> pred = network->forward(xs[sample]);
+ auto loss = lossFunction(pred, ys[sample]);
+
+ epochLoss = epochLoss + loss;
+ }
+ optimizer.step();
+
+ auto averageLoss = epochLoss / Value<T>::create(xs.size());
+ std::cout << "Epoch: " << epoch + 1 << " Loss: " << averageLoss->getValue() << std::endl;
+}
+```
diff --git a/docs/tutorials/GetStarted.md b/docs/tutorials/GetStarted.md
new file mode 100644
index 0000000..5894c85
--- /dev/null
+++ b/docs/tutorials/GetStarted.md
@@ -0,0 +1,253 @@
+# Get Started
+
+Hello! Let's get right into it. By the end of this guide, you will have created and trained your first neural network in _Shkyera Grad_!
+
+## Setup
+
+This is easy: _Shkyera Grad_ is a header-only library, so simply clone the repository into your project:
+
+```
+git clone https://github.com/fszewczyk/shkyera-grad.git
+```
+
+and include the library's main header in your own project:
+
+```cpp
+#include "shkyera-grad/include/ShkyeraGrad.hpp"
+```
+
+Now, you can use all the features of this small engine.
+
+@note _Shkyera Grad_ is tested in C++17. Make sure your compiler supports this version.
+
+## Scalars
+
+Internally, _Shkyera Grad_ **always** operates on individual scalars. For most purposes, you do not need to deal with them directly, but it's nice to understand how they work. Each scalar is wrapped inside a `Value` class. However, you should never construct objects of this type directly. Instead, use the provided `create` interface, as shown below.
+
+```cpp
+// Creates a floating-point scalar (the following declarations are equivalent)
+ValuePtr<float> a = Value<float>::create(5.2);
+ValuePtr<Type::float32> a = Value<Type::float32>::create(5.2);
+auto a = Value<float>::create(5.2);
+auto a = Value<Type::float32>::create(5.2);
+auto a = Val32::create(5.2);
+
+// Now with higher precision!
+ValuePtr<double> b = Value<double>::create(6.9);
+auto b = Value<Type::float64>::create(6.9);
+auto b = Val64::create(6.9);
+
+// You can also use integers, no clue why, but go for it!
+auto c = Value<int>::create(7);
+```
+
+You can also perform various operations directly on scalars!
+
+```cpp
+using T = Type::float32;
+
+auto a = Value<T>::create(2.1);
+auto b = Value<T>::create(3.7);
+auto c = a - b;
+auto d = a * b / c;
+c = d->log();
+auto e = (a + b - c)->pow(d);
+```
+
+@note Check out the cheatsheet for the list of all operations.
+
+The magic behind _Shkyera Grad_ is that it keeps track of all these operations, so that you can later calculate the derivatives of your expression.
+
+```cpp
+auto a = Value<Type::float32>::create(2.0);
+auto b = Value<Type::float32>::create(3.0);
+auto c = a * b;
+
+c->getValue(); // c = 6.0
+c->backward(); // Calculate the gradients of the expression
+
+a->getGradient(); // dc/da = 3.0
+b->getGradient(); // dc/db = 2.0
+```
+
+If you want some refreshment on derivatives, check out [this wonderful video](https://www.youtube.com/watch?v=9vKqVkMQHKk).
+
+## Vector
+
+Multiple scalars can be grouped together in a `Vector` to simplify operating on them. The input to any `Module` (more on these later) is a `Vector`. This abstraction provides functionality that lets you compute, for example, a dot product.
+
+```cpp
+// The easiest way to create a Vector
+auto a = Vector<Type::float32>::of({1, 2, 3});
+
+// The hard way to create a Vector
+auto b = Vector<Type::float32>(Value<Type::float32>::create(2), Value<Type::float32>::create(3), Value<Type::float32>::create(4));
+
+// You can access elements in a vector
+auto c = Vector<Type::float32>::of({a[0]*b[0], a[1]*b[1], a[2]*b[2]});
+
+// And even iterate over it
+for(auto &entry : c)
+ std::cout << entry << std::endl; // prints: 2 6 12
+
+auto d = a.dot(b); // d = 1 * 2 + 2 * 3 + 3 * 4 = 20
+d->backward(); // You can compute gradients of this result since it's a scalar!
+```
+
+`Vectors` are very useful since this is the way both the input and the output data are represented. Each sample consists of an input `Vector` and a target output `Vector`.
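+
+For example, a single training sample pairs an input `Vector` with its target output. A minimal sketch, using the `Vec32` alias that appears later in this guide:
+
+```cpp
+auto input = Vec32::of({0, 1}); // two input features
+auto target = Vec32::of({1});   // the desired output for this input
+```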
+
+## Sequential
+
+Nice! You've got the basics! Let's build a network. The best way to create a model is through the `Sequential` interface. Each function that transforms an input `Vector` into some output `Vector` is implemented as a `Module`. This includes neural layers as well as activation functions. Hey, even `Sequential` is a `Module`. This allows for creating complex structures while using a common, simple interface.
+
+You can create your first neural network using `SequentialBuilder` in the following way.
+
+```cpp
+auto network = SequentialBuilder<Type::float32>::begin()
+ .add(Linear<Type::float32>::create(2, 15)) // Adds a layer with 2 inputs and 15 outputs
+ .add(ReLU<Type::float32>::create()) // Adds a ReLU activation function
+ .add(Linear32::create(15, 10)) // You can use the {Layer}32 or {Layer}64 aliases instead
+ .add(Sigmoid32::create()) // More fancy activation :0
+ .add(Dropout32::create(10, 2, 0.5)) // We use the dropout rate of 0.5
+ .build(); // Don't forget to actually build your network
+```
+
+@warning Remember that subsequent layers have to have matching input and output sizes.
+
+@note For the full list of available layers and activation functions, check out the Cheat Sheet.
+
+## Training
+
+To train our network, we need to define an `Optimizer` that will optimize the parameters, as well as the `Loss` function that we will minimize. _Shkyera Grad_ comes with a set of well-known optimizers and loss functions. Again, check out the Cheat Sheet for a complete list.
+
+```cpp
+// Simple stochastic gradient descent optimizer with 0.01 learning rate
+auto optimizer = Optimizer<Type::float32>(network->parameters(), 0.01);
+
+// Stochastic gradient descent, but with momentum of 0.99!
+// If you provide no parameter for momentum, it defaults to 0.9
+auto betterOptimizer = SGD32(network->parameters(), 0.01, 0.99);
+
+// Recommended optimizer: Adam as described in the original paper
+// Again, 0.01 is the learning rate.
+auto awesomeOptimizer = Adam32(network->parameters(), 0.01);
+
+// By default, it comes with the recommended parameters,
+// but they can be changed if you feel like it in the following way:
+auto awesomeCustomOptimizer = Adam32(network->parameters(), 0.01, beta1, beta2, epsilon);
+```
+
+Here's a list of some available `Loss` functions:
+
+```cpp
+Loss::MAE // Mean Absolute Error
+Loss::MSE // Mean Squared Error
+Loss::CrossEntropy // Cross Entropy Loss - good for classification
+```
+
+They are implemented as lambda functions, not as objects, so they do not need to be instantiated.
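+
+Since they are just callables, you can also invoke a loss function directly. A minimal sketch, assuming `pred` is the network's output and `target` is the corresponding label, both `Vec32` objects of the same size:
+
+```cpp
+auto lossFunction = Loss::MSE<Type::float32>; // pick a loss
+auto loss = lossFunction(pred, target);       // returns a scalar Value, just like in the training loop below
+```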
+
+## Learning XOR
+
+XOR (Exclusive OR) is a simple Boolean function that maps two values to one:
+
+| X1 | X2 | Result |
+| --- | --- | ------ |
+| 0 | 0 | 0 |
+| 0 | 1 | 1 |
+| 1 | 0 | 1 |
+| 1 | 1 | 0 |
+
+### Let's define our dataset
+
+Here, we basically paste the table above into `Vector`s.
+
+```cpp
+std::vector<Vec32> xs;
+std::vector<Vec32> ys;
+
+// ---------- INPUT ----------- | -------- OUTPUT --------- //
+xs.push_back(Vec32::of({0, 0})); ys.push_back(Vec32::of({0}));
+xs.push_back(Vec32::of({1, 0})); ys.push_back(Vec32::of({1}));
+xs.push_back(Vec32::of({0, 1})); ys.push_back(Vec32::of({1}));
+xs.push_back(Vec32::of({1, 1})); ys.push_back(Vec32::of({0}));
+```
+
+### Neural Network
+
+We define a simple neural network to predict this function. Our network has a total of three layers. It is a bit of overkill for this task, but we will use it for learning purposes.
+
+```cpp
+auto network = SequentialBuilder<Type::float32>::begin()
+ .add(Linear32::create(2, 15))
+ .add(ReLU32::create())
+ .add(Linear32::create(15, 5))
+ .add(ReLU32::create())
+ .add(Linear32::create(5, 1))
+ .add(Sigmoid32::create())
+ .build();
+```
+
+### Training Loop
+
+Now, we just need to specify the optimizer and the loss function we want to use:
+
+```cpp
+auto optimizer = Adam32(network->parameters(), 0.05);
+auto lossFunction = Loss::MSE<Type::float32>;
+```
+
+We train our model for 100 epochs. After each epoch, we print the average loss.
+
+```cpp
+for (size_t epoch = 0; epoch < 100; epoch++) { // We train for 100 epochs
+ auto epochLoss = Val32::create(0);
+
+ optimizer.reset(); // Reset the gradients
+ for (size_t sample = 0; sample < xs.size(); ++sample) { // We go through each sample
+ Vec32 pred = network->forward(xs[sample]); // We get some prediction
+ auto loss = lossFunction(pred, ys[sample]); // And calculate its error
+
+ epochLoss = epochLoss + loss; // Store the loss for feedback
+ }
+ optimizer.step(); // Update the parameters
+
+ auto averageLoss = epochLoss / Val32::create(xs.size());
+ std::cout << "Epoch: " << epoch + 1 << " Loss: " << averageLoss->getValue() << std::endl;
+}
+```
+
+### Verifying the results
+
+After the training, let's inspect how our network behaves.
+
+```cpp
+for (size_t sample = 0; sample < xs.size(); ++sample) { // Go through each example
+ Vec32 pred = network->forward(xs[sample]); // Predict result
+ std::cout << xs[sample] << " -> " << pred[0] << "\t| True: " << ys[sample][0] << std::endl;
+}
+```
+
+In case you got lost along the way, check out the `examples/xor_regression.cpp` file. It contains the exact same code and is ready to run :)
+
+### Results
+
+Nice! After compiling and running this code (make sure to use C++17), you should see something like this:
+
+```
+Epoch: 1 Loss: 0.263062
+Epoch: 2 Loss: 0.211502
+(...)
+Epoch: 99 Loss: 0.000222057
+Epoch: 100 Loss: 0.00020191
+Vector(size=2, data={Value(data=0) Value(data=0) }) -> Value(data=0.0191568) | True: Value(data=0)
+Vector(size=2, data={Value(data=1) Value(data=0) }) -> Value(data=0.99998) | True: Value(data=1)
+Vector(size=2, data={Value(data=0) Value(data=1) }) -> Value(data=0.999984) | True: Value(data=1)
+Vector(size=2, data={Value(data=1) Value(data=1) }) -> Value(data=0.0191568) | True: Value(data=0)
+```
+
+WOW! The network actually learned the XOR function.
+
+This is it. You should have enough knowledge to start experimenting with _Shkyera Grad_. Let us know on GitHub what you think about this project :)
diff --git a/examples/README.md b/examples/README.md
index c6de2ac..ff64fd9 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -1,4 +1,4 @@
-## Shkyera Grad Examples
+## Examples
To compile an example, simply run the following command:
diff --git a/examples/xor_regression.cpp b/examples/xor_regression.cpp
index 9836426..46be3e9 100644
--- a/examples/xor_regression.cpp
+++ b/examples/xor_regression.cpp
@@ -2,6 +2,7 @@
int main() {
using namespace shkyera;
+ using T = Type::float32;
// clang-format off
std::vector<Vec32> xs;
@@ -13,38 +14,37 @@ int main() {
xs.push_back(Vec32::of({0, 1})); ys.push_back(Vec32::of({1}));
xs.push_back(Vec32::of({1, 1})); ys.push_back(Vec32::of({0}));
- auto mlp = SequentialBuilder<Type::float32>::begin()
- .add(Linear32::create(2, 15))
- .add(ReLU32::create())
- .add(Dropout32::create(15, 5, 0.2))
- .add(Tanh32::create())
- .add(Linear32::create(5, 1))
- .add(Sigmoid32::create())
- .build();
+ auto network = SequentialBuilder<T>::begin()
+ .add(Linear32::create(2, 15))
+ .add(ReLU32::create())
+ .add(Linear32::create(15, 5))
+ .add(ReLU32::create())
+ .add(Linear32::create(5, 1))
+ .add(Sigmoid32::create())
+ .build();
// clang-format on
- Optimizer32 optimizer = Optimizer<Type::float32>(mlp->parameters(), 0.2);
- Loss::Function32 lossFunction = Loss::MSE<Type::float32>;
+ auto optimizer = Adam32(network->parameters(), 0.05);
+ auto lossFunction = Loss::MSE<T>;
- // ------ TRAINING THE NETWORK ------- //
- for (size_t epoch = 0; epoch < 100; epoch++) {
+ for (size_t epoch = 0; epoch < 100; epoch++) { // We train for 100 epochs
auto epochLoss = Val32::create(0);
- optimizer.reset();
- for (size_t sample = 0; sample < xs.size(); ++sample) {
- Vec32 pred = mlp->forward(xs[sample]);
- auto loss = lossFunction(pred, ys[sample]);
+ optimizer.reset(); // Reset the gradients
+ for (size_t sample = 0; sample < xs.size(); ++sample) { // We go through each sample
+ Vec32 pred = network->forward(xs[sample]); // We get some prediction
+ auto loss = lossFunction(pred, ys[sample]); // And calculate its error
- epochLoss = epochLoss + loss;
+ epochLoss = epochLoss + loss; // Store the loss for feedback
}
- optimizer.step();
+ optimizer.step(); // Update the parameters
- std::cout << "Epoch: " << epoch + 1 << " Loss: " << epochLoss->getValue() << std::endl;
+ auto averageLoss = epochLoss / Val32::create(xs.size());
+ std::cout << "Epoch: " << epoch + 1 << " Loss: " << averageLoss->getValue() << std::endl;
}
- // ------ VERIFYING THAT IT WORKS ------//
- for (size_t sample = 0; sample < xs.size(); ++sample) {
- Vec32 pred = mlp->forward(xs[sample]);
+ for (size_t sample = 0; sample < xs.size(); ++sample) { // Go through each example
+ Vec32 pred = network->forward(xs[sample]); // Predict result
std::cout << xs[sample] << " -> " << pred[0] << "\t| True: " << ys[sample][0] << std::endl;
}
}