From 1f49913c0cf83feceadf1b4ae859f8a09cbf1961 Mon Sep 17 00:00:00 2001
From: "szewczyk.franciszek02" <szewczyk.franciszek02@gmail.com>
Date: Wed, 8 Nov 2023 14:26:12 +0100
Subject: [PATCH] Softmax and Cross Entropy Loss

---
 examples/xor_classification.cpp             | 50 ++++++++++++++
 examples/{xor_nn.cpp => xor_regression.cpp} |  0
 include/ShkyeraGrad.hpp                     |  1 +
 include/core/Value.hpp                      | 13 +++-
 include/core/Vector.hpp                     | 73 ++++++++++++++++++++-
 include/nn/Loss.hpp                         | 28 ++++++++
 include/nn/activation/Softmax.hpp           | 50 ++++++++++++++
 include/nn/layers/Dropout.hpp               |  7 +-
 8 files changed, 215 insertions(+), 7 deletions(-)
 create mode 100644 examples/xor_classification.cpp
 rename examples/{xor_nn.cpp => xor_regression.cpp} (100%)
 create mode 100644 include/nn/activation/Softmax.hpp

diff --git a/examples/xor_classification.cpp b/examples/xor_classification.cpp
new file mode 100644
index 0000000..d4961c5
--- /dev/null
+++ b/examples/xor_classification.cpp
@@ -0,0 +1,50 @@
+#include "../include/ShkyeraGrad.hpp"
+
+int main() {
+    using namespace shkyera;
+
+    // clang-format off
+    std::vector<Vec32> xs;
+    std::vector<Vec32> ys;
+
+    // ---------- INPUT ----------- | -------- OUTPUT --------- //
+    xs.push_back(Vec32::of({0, 0})); ys.push_back(Vec32::of({1, 0}));
+    xs.push_back(Vec32::of({1, 0})); ys.push_back(Vec32::of({0, 1}));
+    xs.push_back(Vec32::of({0, 1})); ys.push_back(Vec32::of({0, 1}));
+    xs.push_back(Vec32::of({1, 1})); ys.push_back(Vec32::of({1, 0}));
+
+    auto mlp = SequentialBuilder<Type::float32>::begin()
+                   .add(Linear32::create(2, 15))
+                   .add(ReLU32::create())
+                   .add(Dropout32::create(15, 5, 0.2))
+                   .add(Tanh32::create())
+                   .add(Linear32::create(5, 2))
+                   .add(Softmax32::create())
+                   .build();
+    // clang-format on
+
+    Optimizer32 optimizer = Optimizer<Type::float32>(mlp->parameters(), 0.1);
+    Loss::Function32 lossFunction = Loss::CrossEntropy<Type::float32>;
+
+    // ------ TRAINING THE NETWORK ------- //
+    for (size_t epoch = 0; epoch < 200; epoch++) {
+        auto epochLoss = Val32::create(0);
+
+        optimizer.reset();
+        for (size_t sample = 0; sample < xs.size(); ++sample) {
+            Vec32 pred = mlp->forward(xs[sample]);
+            auto loss = lossFunction(pred, ys[sample]);
+
+            epochLoss = epochLoss + loss;
+        }
+        optimizer.step();
+
+        std::cout << "Epoch: " << epoch + 1 << " Loss: " << epochLoss->getValue() / xs.size() << std::endl;
+    }
+
+    // ------ VERIFYING THAT IT WORKS ------//
+    for (size_t sample = 0; sample < xs.size(); ++sample) {
+        Vec32 pred = mlp->forward(xs[sample]);
+        std::cout << xs[sample] << " -> " << pred << "\t| True: " << ys[sample] << std::endl;
+    }
+}
diff --git a/examples/xor_nn.cpp b/examples/xor_regression.cpp
similarity index 100%
rename from examples/xor_nn.cpp
rename to examples/xor_regression.cpp
diff --git a/include/ShkyeraGrad.hpp b/include/ShkyeraGrad.hpp
index 47f785f..275e810 100644
--- a/include/ShkyeraGrad.hpp
+++ b/include/ShkyeraGrad.hpp
@@ -21,6 +21,7 @@
 #include "nn/activation/Exp.hpp"
 #include "nn/activation/ReLU.hpp"
 #include "nn/activation/Sigmoid.hpp"
+#include "nn/activation/Softmax.hpp"
 #include "nn/activation/Tanh.hpp"
 
 #include "nn/layers/Dropout.hpp"
diff --git a/include/core/Value.hpp b/include/core/Value.hpp
index 109099d..c7f9007 100644
--- a/include/core/Value.hpp
+++ b/include/core/Value.hpp
@@ -50,6 +50,7 @@ template <typename T> class Value : public std::enable_shared_from_this<Value<T>> {
     ValuePtr<T> relu();
     ValuePtr<T> sigmoid();
     ValuePtr<T> exp();
+    ValuePtr<T> log();
     ValuePtr<T> pow(ValuePtr<T> exponent);
 
     template <typename U> friend ValuePtr<U> operator+(ValuePtr<U> a, ValuePtr<U> b);
@@ -157,6 +158,16 @@ template <typename T> ValuePtr<T> Value<T>::exp() {
     return result;
 }
 
+template <typename T> ValuePtr<T> Value<T>::log() {
+    auto thisValue = this->shared_from_this();
+
+    ValuePtr<T> result = Value<T>::create(std::log(_data));
+    result->_children = {thisValue};
+    result->_backward = [thisValue, result]() { thisValue->_gradient += (1 / thisValue->_data) * result->_gradient; };
+
+    return result;
+}
+
 template <typename T> ValuePtr<T> Value<T>::pow(ValuePtr<T> exponent) {
     auto thisValue = this->shared_from_this();
 
@@ -165,7 +176,7 @@ template <typename T> ValuePtr<T> Value<T>::pow(ValuePtr<T> exponent) {
     result->_backward = [thisValue, exponent, result]() {
         thisValue->_gradient += (exponent->_data * std::pow(thisValue->_data, exponent->_data - 1)) * result->_gradient;
         exponent->_gradient +=
-            (std::pow(thisValue->_data, exponent->_data) * log(thisValue->_data)) * result->_gradient;
+            (std::pow(thisValue->_data, exponent->_data) * std::log(thisValue->_data)) * result->_gradient;
     };
 
     return result;
diff --git a/include/core/Vector.hpp b/include/core/Vector.hpp
index 994f700..b13783a 100644
--- a/include/core/Vector.hpp
+++ b/include/core/Vector.hpp
@@ -25,14 +25,24 @@ template <typename T> class Vector {
   public:
     Vector() = default;
     Vector(std::vector<ValuePtr<T>> values);
-    static Vector of(const std::vector<T> &values);
+    static Vector<T> of(const std::vector<T> &values);
 
     ValuePtr<T> dot(const Vector<T> &other) const;
-    ValuePtr<T> operator[](size_t index) const;
-
+    ValuePtr<T> sum() const;
     size_t size() const;
 
     template <typename U> friend std::ostream &operator<<(std::ostream &os, const Vector<U> &vector);
+
+    template <typename U> friend Vector<U> operator/(Vector<U> x, U val);
+    template <typename U> friend Vector<U> operator*(Vector<U> x, U val);
+    template <typename U> friend Vector<U> operator/(Vector<U> x, ValuePtr<U> val);
+    template <typename U> friend Vector<U> operator*(Vector<U> x, ValuePtr<U> val);
+    Vector<T> &operator/=(T val);
+    Vector<T> &operator*=(T val);
+    Vector<T> &operator/=(ValuePtr<T> val);
+    Vector<T> &operator*=(ValuePtr<T> val);
+
+    ValuePtr<T> operator[](size_t index) const;
 };
 
 template <typename T> Vector<T>::Vector(std::vector<ValuePtr<T>> values) { _values = values; }
@@ -62,6 +72,63 @@ template <typename T> ValuePtr<T> Vector<T>::dot(const Vector<T> &other) const {
     return result;
 }
 
+template <typename T> ValuePtr<T> Vector<T>::sum() const {
+    auto sum = Value<T>::create(0);
+    for (const auto &entry : _values)
+        sum = sum + entry;
+    return sum;
+}
+
+template <typename T> Vector<T> operator/(Vector<T> x, T val) {
+    x /= val;
+    return x;
+}
+
+template <typename T> Vector<T> operator*(Vector<T> x, T val) {
+    x *= val;
+    return x;
+}
+
+template <typename T> Vector<T> operator/(Vector<T> x, ValuePtr<T> val) {
+    auto out = x;
+    for (size_t i = 0; i < out._values.size(); ++i)
+        out._values[i] = out._values[i] / val;
+    return out;
+}
+
+template <typename T> Vector<T> operator*(Vector<T> x, ValuePtr<T> val) {
+    auto out = x;
+    for (size_t i = 0; i < out._values.size(); ++i)
+        out._values[i] = out._values[i] * val;
+    return out;
+}
+
+template <typename T> Vector<T> &Vector<T>::operator/=(T val) {
+    auto divisor = Value<T>::create(val);
+    for (size_t i = 0; i < _values.size(); ++i)
+        _values[i] = _values[i] / divisor;
+    return *this;
+}
+
+template <typename T> Vector<T> &Vector<T>::operator*=(T val) {
+    auto multiplier = Value<T>::create(val);
+    for (size_t i = 0; i < _values.size(); ++i)
+        _values[i] = _values[i] * multiplier;
+    return *this;
+}
+
+template <typename T> Vector<T> &Vector<T>::operator/=(ValuePtr<T> val) {
+    for (size_t i = 0; i < _values.size(); ++i)
+        _values[i] = _values[i] / val;
+    return *this;
+}
+
+template <typename T> Vector<T> &Vector<T>::operator*=(ValuePtr<T> val) {
+    for (size_t i = 0; i < _values.size(); ++i)
+        _values[i] = _values[i] * val;
+    return *this;
+}
+
 template <typename T> ValuePtr<T> Vector<T>::operator[](size_t index) const { return _values[index]; }
 
 template <typename T> std::ostream &operator<<(std::ostream &os, const Vector<T> &vector) {
diff --git a/include/nn/Loss.hpp b/include/nn/Loss.hpp
index 122d018..9d598ae 100644
--- a/include/nn/Loss.hpp
+++ b/include/nn/Loss.hpp
@@ -57,4 +57,32 @@ Function<T> MAE = [](Vector<T> a, Vector<T> b) {
     return loss;
 };
 
+template <typename T>
+Function<T> CrossEntropy = [](Vector<T> a, Vector<T> b) {
+    if (a.size() != b.size()) {
+        throw std::invalid_argument(
+            "Vectors need to be of the same size to compute the Cross Entropy loss. Sizes are " +
+            std::to_string(a.size()) + " and " + std::to_string(b.size()) + ".");
+    }
+
+    auto aSum = a.sum();
+    auto bSum = b.sum();
+
+    if (aSum->getValue() < 0.99 || aSum->getValue() > 1.01 || bSum->getValue() < 0.99 || bSum->getValue() > 1.01) {
+        throw std::invalid_argument("To compute Cross Entropy Loss, the elements of both vectors need to sum to 1 (+/- "
+                                    "0.01). Currently, they sum to: " +
+                                    std::to_string(aSum->getValue()) + " and " + std::to_string(bSum->getValue()) +
+                                    ".");
+    }
+
+    auto loss = Value<T>::create(0);
+    for (size_t i = 0; i < a.size(); ++i) {
+        loss = loss - (b[i] * (a[i]->log()));
+    }
+
+    loss->backward();
+
+    return loss;
+};
+
 } // namespace shkyera::Loss
diff --git a/include/nn/activation/Softmax.hpp b/include/nn/activation/Softmax.hpp
new file mode 100644
index 0000000..6f80c09
--- /dev/null
+++ b/include/nn/activation/Softmax.hpp
@@ -0,0 +1,50 @@
+/**
+ * Copyright © 2023 Franciszek Szewczyk. None of the rights reserved.
+ * This code is released under the Beerware License. If you find this code useful or you appreciate the work, you are
+ * encouraged to buy the author a beer in return.
+ * Contact the author at szewczyk.franciszek02@gmail.com for inquiries and support.
+ */
+
+#pragma once
+
+#include "Activation.hpp"
+
+namespace shkyera {
+
+template <typename T> class Softmax;
+using Softmax32 = Softmax<Type::float32>;
+using Softmax64 = Softmax<Type::float64>;
+
+template <typename T> class Softmax : public Activation<T> {
+  public:
+    static std::shared_ptr<Softmax<T>> create();
+
+    virtual Vector<T> operator()(const Vector<T> &x) const override;
+};
+
+template <typename T> std::shared_ptr<Softmax<T>> Softmax<T>::create() {
+    return std::shared_ptr<Softmax<T>>(new Softmax<T>());
+}
+
+template <typename T> Vector<T> Softmax<T>::operator()(const Vector<T> &x) const {
+    std::vector<ValuePtr<T>> out;
+    out.reserve(x.size());
+
+    auto maxValue = Value<T>::create(x[0]->getValue());
+    for (size_t i = 1; i < x.size(); ++i)
+        if (x[i] > maxValue)
+            maxValue = x[i];
+
+    auto sumExponentiated = Value<T>::create(0);
+    for (size_t i = 0; i < x.size(); ++i) {
+        auto exponentiated = (x[i] - maxValue)->exp();
+        out.emplace_back(exponentiated);
+        sumExponentiated = sumExponentiated + exponentiated;
+    }
+
+    auto vectorizedOut = Vector<T>(out) / sumExponentiated;
+
+    return vectorizedOut;
+}
+
+} // namespace shkyera
diff --git a/include/nn/layers/Dropout.hpp b/include/nn/layers/Dropout.hpp
index fb90769..c3a6063 100644
--- a/include/nn/layers/Dropout.hpp
+++ b/include/nn/layers/Dropout.hpp
@@ -45,15 +45,16 @@ template <typename T> DropoutPtr<T> Dropout<T>::create(size_t input, size_t size
 template <typename T> Vector<T> Dropout<T>::operator()(const Vector<T> &x) const {
     std::vector<ValuePtr<T>> alteredInput;
     alteredInput.reserve(x.size());
-    auto scaling = Value<T>::create(1.0 / (1 - _dropout));
     for (size_t i = 0; i < x.size(); ++i)
-        alteredInput.push_back(x[i] * scaling);
+        alteredInput.push_back(x[i]);
 
     std::vector<size_t> indicesToRemove = utils::sample<size_t>(0, x.size() - 1, _dropout * x.size(), false);
     for (size_t idxToRemove : indicesToRemove)
         alteredInput[idxToRemove] = Value<T>::create(0);
 
-    return Linear<T>::operator()(Vector<T>(alteredInput));
+    auto transformedInput = Vector<T>(alteredInput) * static_cast<T>(1.0 / (1 - _dropout));
+
+    return Linear<T>::operator()(transformedInput);
 }
 
 } // namespace shkyera
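
Below is a minimal usage sketch of the new Softmax activation and CrossEntropy loss, condensed from examples/xor_classification.cpp in this patch. It is illustrative only: Vec32, Softmax32, Optimizer32 and Loss::CrossEntropy come from the patch itself, while the Type::float32 alias is assumed from the surrounding library headers.

    #include "include/ShkyeraGrad.hpp"

    int main() {
        using namespace shkyera;

        // One training pair: two inputs and a one-hot target over two classes.
        Vec32 x = Vec32::of({1, 0});
        Vec32 y = Vec32::of({0, 1});

        // Softmax as the last layer makes the prediction sum to ~1,
        // which Loss::CrossEntropy verifies before taking logarithms.
        auto net = SequentialBuilder<Type::float32>::begin()
                       .add(Linear32::create(2, 2))
                       .add(Softmax32::create())
                       .build();

        Optimizer32 optimizer = Optimizer<Type::float32>(net->parameters(), 0.1);
        Loss::Function32 loss = Loss::CrossEntropy<Type::float32>;

        optimizer.reset();
        auto l = loss(net->forward(x), y); // CrossEntropy already calls backward()
        optimizer.step();
    }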