diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
new file mode 100644
index 0000000..716254c
--- /dev/null
+++ b/.github/workflows/docs.yml
@@ -0,0 +1,18 @@
+name: Documentation
+
+on:
+  push:
+    branches:
+      - master
+      - doxygen
+
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: DenverCoder1/doxygen-github-pages-action@v1.2.0
+        with:
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+          branch: gh-pages
+          folder: docs/html
+          config_file: Doxyfile
diff --git a/docs/Doxyfile b/Doxyfile
similarity index 99%
rename from docs/Doxyfile
rename to Doxyfile
index 3dbf330..69b58c7 100644
--- a/docs/Doxyfile
+++ b/Doxyfile
@@ -42,7 +42,7 @@ DOXYFILE_ENCODING      = UTF-8
 # title of most generated pages and in a few other places.
 # The default value is: My Project.
 
-PROJECT_NAME           = "Shkyera Tensor"
+PROJECT_NAME           = "Shkyera Grad"
 
 # The PROJECT_NUMBER tag can be used to enter a project or revision number. This
 # could be handy for archiving the generated documentation or if some version
@@ -54,7 +54,7 @@ PROJECT_NUMBER         = 0.0.1
 # for a project that appears at the top of each page and should give viewer a
 # quick idea about the purpose of the project. Keep the description short.
 
-PROJECT_BRIEF          = "Header-only C++ library for Deep Learning"
+PROJECT_BRIEF          = "micrograd, but in C++ and better"
 
 # With the PROJECT_LOGO tag one can specify a logo or an icon that is included
 # in the documentation. The maximum height of the logo should not exceed 55
@@ -918,7 +918,10 @@ WARN_LOGFILE           =
 # Note: If this tag is empty the current directory is searched.
 
 INPUT                  = "README.md" \
-                         "include/src"
+                         "docs/tutorials/Cheatsheet.md" \
+                         "docs/tutorials/GetStarted.md" \
+                         "examples/README.md" \
+                         "include/src"
 
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
diff --git a/README.md b/README.md
index 36f5b1a..9d2a109 100644
--- a/README.md
+++ b/README.md
@@ -3,19 +3,21 @@

Shkyera Grad

- micrograd, but in C++ and with more functionality.
+ micrograd, but in C++ and better.

-![LinuxBuild](https://github.com/fszewczyk/shkyera-engine/actions/workflows/linux.yml/badge.svg)
-![MacOSBuild](https://github.com/fszewczyk/shkyera-engine/actions/workflows/macos.yml/badge.svg)
-![WindowsBuild](https://github.com/fszewczyk/shkyera-grad/actions/workflows/windows.yml/badge.svg)
+![LinuxBuild](https://github.com/fszewczyk/shkyera-engine/actions/workflows/linux.yml/badge.svg)
+![MacOSBuild](https://github.com/fszewczyk/shkyera-engine/actions/workflows/macos.yml/badge.svg)
+![WindowsBuild](https://github.com/fszewczyk/shkyera-grad/actions/workflows/windows.yml/badge.svg)
 ![LICENSE](https://img.shields.io/badge/license-Beerware-yellow)
 
 This is a small header-only library of a scalar-valued autograd based on [Andrej Karpathy's micrograd](https://github.com/karpathy/micrograd). It provides a high-level, PyTorch-like API for creating and training simple neural networks.
 
+It supports multiple optimizers, such as Adam and SGD, the most common activation functions, and the basic types of neural layers, all wrapped in a simple, header-only library.
+
 ## Usage
 
 Make sure your compiler supports C++17. Shkyera Grad is a header-only library, so the only thing you need to do is to include it in your project.
diff --git a/docs/tutorials/Cheatsheet.md b/docs/tutorials/Cheatsheet.md
new file mode 100644
index 0000000..c3c17cb
--- /dev/null
+++ b/docs/tutorials/Cheatsheet.md
@@ -0,0 +1,3 @@
+# Cheatsheet
+
+This page contains all the info you need to develop your models using Shkyera Grad.
diff --git a/docs/tutorials/GetStarted.md b/docs/tutorials/GetStarted.md
new file mode 100644
index 0000000..78d45ad
--- /dev/null
+++ b/docs/tutorials/GetStarted.md
@@ -0,0 +1,253 @@
+# Get Started
+
+Hello! Let's get right into it. By the end of this guide, you will have created and trained your first neural network in _Shkyera Grad_!
+
+## Setup
+
+This is easy: _Shkyera Grad_ is a header-only library, so simply clone the repository into your project:
+
+```
+git clone https://github.com/fszewczyk/shkyera-grad.git
+```
+
+and import the main file of the library inside your own project.
+
+```cpp
+#include "shkyera-grad/include/ShkyeraGrad.hpp"
+```
+
+Now, you can use all the features of this small engine.
+
+@note _Shkyera Grad_ is tested in C++17. Make sure your compiler supports this version.
+
+## Scalars
+
+Internally, _Shkyera Grad_ **always** operates on individual scalars. For most purposes, you do not need to deal with them directly, but it's nice to understand how they work. Each scalar is wrapped inside a `Value` class. However, you should never instantiate objects of this type yourself. Instead, you should use the provided interface in the following way.
+
+```cpp
+// Creates a floating-point scalar
+ValuePtr<Type::float32> a = Value<Type::float32>::create(5.2);
+auto a = Value<Type::float32>::create(5.2);
+auto a = Val32::create(5.2);
+
+// Now with higher precision!
+ValuePtr<Type::float64> b = Value<Type::float64>::create(6.9);
+auto b = Value<Type::float64>::create(6.9);
+auto b = Val64::create(6.9);
+
+// You can also use integers, no clue why, but go for it!
+auto c = Value<int>::create(7);
+```
+
+You can also perform various operations directly on scalars!
+
+```cpp
+using T = Type::float32;
+
+auto a = Value<T>::create(2.1);
+auto b = Value<T>::create(3.7);
+auto c = a - b;
+auto d = a * b / c;
+c = d->log();
+auto e = (a + b - c)->pow(d);
+```
+
+@note Check out the Cheatsheet for the list of all operations.
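+
+If you want to double-check that everything is wired up, here is a minimal, self-contained sketch of a complete program. It simply combines the include from the Setup section with the scalar API shown above, so treat the file path as an assumption about where you cloned the repository.
+
+```cpp
+#include <iostream>
+
+#include "shkyera-grad/include/ShkyeraGrad.hpp" // adjust the path to wherever you cloned the repository
+
+int main() {
+    using namespace shkyera;
+
+    auto x = Val32::create(2.0);
+    auto y = Val32::create(3.0);
+    auto z = x * y + x; // z = 2 * 3 + 2 = 8
+
+    std::cout << z->getValue() << std::endl; // prints 8
+    return 0;
+}
+```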
+
+The magic behind _Shkyera Grad_ is that it keeps track of all the operations, so that you can later calculate the derivatives of your expression.
+
+```cpp
+auto a = Value<Type::float32>::create(2.0);
+auto b = Value<Type::float32>::create(3.0);
+auto c = a * b;
+
+c->getValue();    // c = 6.0
+c->backward();    // Calculate the gradients of the expression
+
+a->getGradient(); // dc/da = 3.0
+b->getGradient(); // dc/db = 2.0
+```
+
+If you want some refreshment on derivatives, check out [this wonderful video](https://www.youtube.com/watch?v=9vKqVkMQHKk).
+
+## Vector
+
+Multiple scalars can be grouped together in a `Vector` to simplify operating on them. The input to any `Module` (more on them later) is a `Vector`. This abstraction provides some functionality that allows you to compute, for example, a dot product.
+
+```cpp
+// The easiest way to create a Vector
+auto a = Vector<Type::float32>::of({1, 2, 3});
+
+// The hard way to create a Vector
+auto b = Vector<Type::float32>(Value<Type::float32>::create(2), Value<Type::float32>::create(3), Value<Type::float32>::create(4));
+
+// You can access elements in a vector
+auto c = Vector<Type::float32>::of({a[0] * b[0], a[1] * b[1], a[2] * b[2]});
+
+// And even iterate over it
+for (auto &entry : c)
+    std::cout << entry << std::endl; // prints: 2 6 12
+
+auto d = a.dot(b); // d = 1 * 2 + 2 * 3 + 3 * 4 = 20
+d->backward();     // You can compute the gradients of this result since it's a scalar!
+```
+
+`Vector`s are very useful since this is how both the input and the output data are represented. Each sample consists of an input `Vector` and a target output `Vector`.
+
+## Sequential
+
+Nice! You got the basics! Let's build a network. The best way to create a model is through the `Sequential` interface. Each function that transforms an input `Vector` into some output `Vector` is implemented as a `Module`. This includes neural layers as well as activation functions. Hey, even `Sequential` is a `Module`. This allows for creating complex structures while using a common, simple interface.
+
+You can create your first neural network using `SequentialBuilder` in the following way.
+
+```cpp
+auto network = SequentialBuilder<Type::float32>::begin()
+                .add(Linear<Type::float32>::create(2, 15)) // Adds a layer with 2 inputs and 15 outputs
+                .add(ReLU<Type::float32>::create())        // Adds a ReLU activation function
+                .add(Linear32::create(15, 10))             // You can use the {Layer}32 or {Layer}64 aliases
+                .add(Sigmoid32::create())                  // More fancy activation :0
+                .add(Dropout32::create(10, 2, 0.5))        // We use the dropout rate of 0.5
+                .build();                                  // Don't forget to actually build your network
+```
+
+@warning Remember that subsequent layers have to have matching input and output sizes.
+
+@note For the full list of available layers and activation functions, check out the Cheatsheet.
+
+## Training
+
+To train our network, we need to define an `Optimizer` that will optimize the parameters, as well as the `Loss` function that we will minimize. _Shkyera Grad_ comes with a set of well-known optimizers and loss functions. Again, check out the Cheatsheet for a complete list.
+
+```cpp
+// Simple stochastic gradient descent optimizer with a 0.01 learning rate
+auto optimizer = Optimizer<Type::float32>(network->parameters(), 0.01);
+
+// Stochastic gradient descent, but with a momentum of 0.99!
+// If you provide no parameter for momentum, it defaults to 0.9
+auto betterOptimizer = SGD32(network->parameters(), 0.01, 0.99);
+
+// Recommended optimizer: Adam, as described in the original paper
+// Again, 0.01 is the learning rate.
+auto awesomeOptimizer = Adam32(network->parameters(), 0.01);
+
+// By default, it comes with the recommended parameters,
+// but they can be changed if you feel like it in the following way:
+auto awesomeCustomOptimizer = Adam32(network->parameters(), 0.01, beta1, beta2, epsilon);
+```
+
+Here's a list of some available `Loss` functions:
+
+```cpp
+Loss::MAE          // Mean Absolute Error
+Loss::MSE          // Mean Squared Error
+Loss::CrossEntropy // Cross Entropy Loss - good for classification
+```
+
+They are implemented as lambda functions, not as objects, so they do not need to be instantiated.
+
+## Learning XOR
+
+XOR (Exclusive OR) is a simple Boolean function that maps two values to one:
+
+| X1  | X2  | Result |
+| --- | --- | ------ |
+| 0   | 0   | 0      |
+| 0   | 1   | 1      |
+| 1   | 0   | 1      |
+| 1   | 1   | 0      |
+
+### Let's define our dataset
+
+Here, we basically paste the table above into `Vector`s.
+
+```cpp
+std::vector<Vec32> xs;
+std::vector<Vec32> ys;
+
+// ---------- INPUT ----------- | -------- OUTPUT --------- //
+xs.push_back(Vec32::of({0, 0})); ys.push_back(Vec32::of({0}));
+xs.push_back(Vec32::of({1, 0})); ys.push_back(Vec32::of({1}));
+xs.push_back(Vec32::of({0, 1})); ys.push_back(Vec32::of({1}));
+xs.push_back(Vec32::of({1, 1})); ys.push_back(Vec32::of({0}));
+```
+
+### Neural Network
+
+We define a simple neural network to predict this function. Our network has a total of three `Linear` layers. It is a bit of overkill for this task, but we will use it for learning purposes.
+
+```cpp
+auto network = SequentialBuilder<Type::float32>::begin()
+                .add(Linear32::create(2, 15))
+                .add(ReLU32::create())
+                .add(Linear32::create(15, 5))
+                .add(ReLU32::create())
+                .add(Linear32::create(5, 1))
+                .add(Sigmoid32::create())
+                .build();
+```
+
+### Training Loop
+
+Now, we just need to specify the optimizer and the loss function we want to use:
+
+```cpp
+auto optimizer = Adam32(network->parameters(), 0.05);
+auto lossFunction = Loss::MSE<Type::float32>;
+```
+
+We train our model for 100 epochs. After each epoch, we print the average loss.
+
+```cpp
+for (size_t epoch = 0; epoch < 100; epoch++) {              // We train for 100 epochs
+    auto epochLoss = Val32::create(0);
+
+    optimizer.reset();                                      // Reset the gradients
+    for (size_t sample = 0; sample < xs.size(); ++sample) { // We go through each sample
+        Vec32 pred = network->forward(xs[sample]);          // We get some prediction
+        auto loss = lossFunction(pred, ys[sample]);         // And calculate its error
+
+        epochLoss = epochLoss + loss;                       // Store the loss for feedback
+    }
+    optimizer.step();                                       // Update the parameters
+
+    auto averageLoss = epochLoss / Val32::create(xs.size());
+    std::cout << "Epoch: " << epoch + 1 << " Loss: " << averageLoss->getValue() << std::endl;
+}
+```
+
+### Verifying the results
+
+After the training, let's inspect how our network behaves.
+
+```cpp
+for (size_t sample = 0; sample < xs.size(); ++sample) { // Go through each example
+    Vec32 pred = network->forward(xs[sample]);          // Predict the result
+    std::cout << xs[sample] << " -> " << pred << "\t| True: " << ys[sample] << std::endl;
+}
+```
+
+In case you got lost along the way, check out the `examples/xor_regression.cpp` file. It contains the exact same code and is ready to run :)
+
+### Results
+
+Nice! After compiling and running this code (make sure to use C++17), you should see something like this:
+
+```
+Epoch: 1 Loss: 0.263062
+Epoch: 2 Loss: 0.211502
+(...)
+Epoch: 99 Loss: 0.000222057
+Epoch: 100 Loss: 0.00020191
+Vector(size=2, data={Value(data=0) Value(data=0) }) -> Value(data=0.0191568) | True: Value(data=0)
+Vector(size=2, data={Value(data=1) Value(data=0) }) -> Value(data=0.99998) | True: Value(data=1)
+Vector(size=2, data={Value(data=0) Value(data=1) }) -> Value(data=0.999984) | True: Value(data=1)
+Vector(size=2, data={Value(data=1) Value(data=1) }) -> Value(data=0.0191568) | True: Value(data=0)
+```
+
+WOW! The network actually learned the XOR function.
+
+This is it. You should have enough knowledge to start experimenting with _Shkyera Grad_. Let us know on GitHub what you think about this project :)
diff --git a/examples/README.md b/examples/README.md
index c6de2ac..ff64fd9 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -1,4 +1,4 @@
-## Shkyera Grad Examples
+## Examples
 
 To compile an example, simply run the following command:
diff --git a/examples/xor_regression.cpp b/examples/xor_regression.cpp
index 9836426..46be3e9 100644
--- a/examples/xor_regression.cpp
+++ b/examples/xor_regression.cpp
@@ -2,6 +2,7 @@
 
 int main() {
     using namespace shkyera;
+    using T = Type::float32;
 
     // clang-format off
     std::vector<Vec32> xs;
@@ -13,38 +14,37 @@ int main() {
     xs.push_back(Vec32::of({0, 1})); ys.push_back(Vec32::of({1}));
     xs.push_back(Vec32::of({1, 1})); ys.push_back(Vec32::of({0}));
 
-    auto mlp = SequentialBuilder<Type::float32>::begin()
-                .add(Linear32::create(2, 15))
-                .add(ReLU32::create())
-                .add(Dropout32::create(15, 5, 0.2))
-                .add(Tanh32::create())
-                .add(Linear32::create(5, 1))
-                .add(Sigmoid32::create())
-                .build();
+    auto network = SequentialBuilder<T>::begin()
+                    .add(Linear32::create(2, 15))
+                    .add(ReLU32::create())
+                    .add(Linear32::create(15, 5))
+                    .add(ReLU32::create())
+                    .add(Linear32::create(5, 1))
+                    .add(Sigmoid32::create())
+                    .build();
     // clang-format on
 
-    Optimizer32 optimizer = Optimizer<Type::float32>(mlp->parameters(), 0.2);
-    Loss::Function32 lossFunction = Loss::MSE<Type::float32>;
+    auto optimizer = Adam32(network->parameters(), 0.05);
+    auto lossFunction = Loss::MSE<T>;
 
-    // ------ TRAINING THE NETWORK ------- //
-    for (size_t epoch = 0; epoch < 100; epoch++) {
+    for (size_t epoch = 0; epoch < 100; epoch++) { // We train for 100 epochs
         auto epochLoss = Val32::create(0);
 
-        optimizer.reset();
-        for (size_t sample = 0; sample < xs.size(); ++sample) {
-            Vec32 pred = mlp->forward(xs[sample]);
-            auto loss = lossFunction(pred, ys[sample]);
+        optimizer.reset();                                      // Reset the gradients
+        for (size_t sample = 0; sample < xs.size(); ++sample) { // We go through each sample
+            Vec32 pred = network->forward(xs[sample]);          // We get some prediction
+            auto loss = lossFunction(pred, ys[sample]);         // And calculate its error
 
-            epochLoss = epochLoss + loss;
+            epochLoss = epochLoss + loss; // Store the loss for feedback
         }
-        optimizer.step();
+        optimizer.step(); // Update the parameters
 
-        std::cout << "Epoch: " << epoch + 1 << " Loss: " << epochLoss->getValue() << std::endl;
+        auto averageLoss = epochLoss / Val32::create(xs.size());
+        std::cout << "Epoch: " << epoch + 1 << " Loss: " << averageLoss->getValue() << std::endl;
     }
 
-    // ------ VERIFYING THAT IT WORKS ------//
-    for (size_t sample = 0; sample < xs.size(); ++sample) {
-        Vec32 pred = mlp->forward(xs[sample]);
+    for (size_t sample = 0; sample < xs.size(); ++sample) { // Go through each example
+        Vec32 pred = network->forward(xs[sample]);          // Predict the result
         std::cout << xs[sample] << " -> " << pred[0] << "\t| True: " << ys[sample][0] << std::endl;
     }
 }