Merge pull request #8 from fszewczyk/cross-entropy-loss
Softmax and Cross Entropy Loss
fszewczyk authored Nov 8, 2023
2 parents c681626 + 67dfd68 commit b54001a
Showing 12 changed files with 235 additions and 18 deletions.
8 changes: 6 additions & 2 deletions .github/workflows/linux.yml
@@ -30,5 +30,9 @@ jobs:
       env:
         CXX: ${{matrix.conf.compiler}}
       run: |
-        g++ examples/scalars.cpp --std=c++17
-        g++ examples/xor_nn.cpp --std=c++17
+        g++ examples/scalars.cpp -O3 --std=c++17
+        ./a.out
+        g++ examples/xor_classification.cpp -O3 --std=c++17
+        ./a.out
+        g++ examples/xor_regression.cpp -O3 --std=c++17
+        ./a.out
10 changes: 7 additions & 3 deletions .github/workflows/macos.yml
@@ -26,9 +26,13 @@ jobs:
       run: |
         g++ include/ShkyeraGrad.hpp --std=c++17
-      - name: Build examples
+      - name: Build and run examples
       env:
         CXX: ${{matrix.conf.compiler}}
       run: |
-        g++ examples/scalars.cpp --std=c++17
-        g++ examples/xor_nn.cpp --std=c++17
+        g++ examples/scalars.cpp -O3 --std=c++17
+        ./a.out
+        g++ examples/xor_classification.cpp -O3 --std=c++17
+        ./a.out
+        g++ examples/xor_regression.cpp -O3 --std=c++17
+        ./a.out
5 changes: 3 additions & 2 deletions .github/workflows/windows.yml
@@ -29,5 +29,6 @@ jobs:
       env:
         CXX: ${{matrix.conf.compiler}}
       run: |
-        g++ examples/scalars.cpp --std=c++17
-        g++ examples/xor_nn.cpp --std=c++17
+        g++ -o out examples/scalars.cpp -O3 --std=c++17
+        g++ -o out examples/xor_classification.cpp -O3 --std=c++17
+        g++ -o out examples/xor_regression.cpp -O3 --std=c++17
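Net effect on CI: the Linux and macOS jobs now build every example with -O3 and immediately execute the resulting a.out, so a crashing example fails the workflow; the Windows job still only compiles the examples (it names the binaries with -o out but never runs them).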
2 changes: 1 addition & 1 deletion examples/README.md
@@ -27,5 +27,5 @@ Epoch: 100 Loss: 0.0371898
 Vector(size=2, data={Value(data=0) Value(data=0) }) -> Value(data=0.115728)| True: Value(data=0)
 Vector(size=2, data={Value(data=1) Value(data=0) }) -> Value(data=0.93215) | True: Value(data=1)
 Vector(size=2, data={Value(data=0) Value(data=1) }) -> Value(data=0.937625)| True: Value(data=1)
-Vector(size=2, data={Value(data=0) Value(data=0) }) -> Value(data=0.115728)| True: Value(data=0)
+Vector(size=2, data={Value(data=1) Value(data=1) }) -> Value(data=0.115728)| True: Value(data=0)
 ```
50 changes: 50 additions & 0 deletions examples/xor_classification.cpp
@@ -0,0 +1,50 @@
+#include "../include/ShkyeraGrad.hpp"
+
+int main() {
+    using namespace shkyera;
+
+    // clang-format off
+    std::vector<Vec32> xs;
+    std::vector<Vec32> ys;
+
+    // ---------- INPUT ----------- | -------- OUTPUT --------- //
+    xs.push_back(Vec32::of({0, 0})); ys.push_back(Vec32::of({1, 0}));
+    xs.push_back(Vec32::of({1, 0})); ys.push_back(Vec32::of({0, 1}));
+    xs.push_back(Vec32::of({0, 1})); ys.push_back(Vec32::of({0, 1}));
+    xs.push_back(Vec32::of({1, 1})); ys.push_back(Vec32::of({1, 0}));
+
+    auto mlp = SequentialBuilder<Type::float32>::begin()
+                   .add(Linear32::create(2, 15))
+                   .add(ReLU32::create())
+                   .add(Dropout32::create(15, 5, 0.2))
+                   .add(Tanh32::create())
+                   .add(Linear32::create(5, 2))
+                   .add(Softmax32::create())
+                   .build();
+    // clang-format on
+
+    Optimizer32 optimizer = Optimizer<Type::float32>(mlp->parameters(), 0.1);
+    Loss::Function32 lossFunction = Loss::CrossEntropy<Type::float32>;
+
+    // ------ TRAINING THE NETWORK ------- //
+    for (size_t epoch = 0; epoch < 200; epoch++) {
+        auto epochLoss = Val32::create(0);
+
+        optimizer.reset();
+        for (size_t sample = 0; sample < xs.size(); ++sample) {
+            Vec32 pred = mlp->forward(xs[sample]);
+            auto loss = lossFunction(pred, ys[sample]);
+
+            epochLoss = epochLoss + loss;
+        }
+        optimizer.step();
+
+        std::cout << "Epoch: " << epoch + 1 << " Loss: " << epochLoss->getValue() / xs.size() << std::endl;
+    }
+
+    // ------ VERIFYING THAT IT WORKS ------//
+    for (size_t sample = 0; sample < xs.size(); ++sample) {
+        Vec32 pred = mlp->forward(xs[sample]);
+        std::cout << xs[sample] << " -> " << pred << "\t| True: " << ys[sample] << std::endl;
+    }
+}
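A detail worth noticing in this example: the training loop never calls backward() on the loss. As the Loss.hpp change later in this diff shows, CrossEntropy runs the backward pass itself, and since optimizer.reset() is called once per epoch, the gradients from all four samples accumulate before the single optimizer.step(): in effect, full-batch gradient descent.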
6 changes: 3 additions & 3 deletions examples/xor_nn.cpp → examples/xor_regression.cpp
@@ -11,19 +11,19 @@ int main() {
     xs.push_back(Vec32::of({0, 0})); ys.push_back(Vec32::of({0}));
     xs.push_back(Vec32::of({1, 0})); ys.push_back(Vec32::of({1}));
     xs.push_back(Vec32::of({0, 1})); ys.push_back(Vec32::of({1}));
-    xs.push_back(Vec32::of({0, 0})); ys.push_back(Vec32::of({0}));
+    xs.push_back(Vec32::of({1, 1})); ys.push_back(Vec32::of({0}));

     auto mlp = SequentialBuilder<Type::float32>::begin()
                    .add(Linear32::create(2, 15))
                    .add(ReLU32::create())
                    .add(Dropout32::create(15, 5, 0.2))
-                   .add(ReLU32::create())
+                   .add(Tanh32::create())
                    .add(Linear32::create(5, 1))
                    .add(Sigmoid32::create())
                    .build();
     // clang-format on

-    Optimizer32 optimizer = Optimizer<Type::float32>(mlp->parameters(), 0.1);
+    Optimizer32 optimizer = Optimizer<Type::float32>(mlp->parameters(), 0.2);
     Loss::Function32 lossFunction = Loss::MSE<Type::float32>;

     // ------ TRAINING THE NETWORK ------- //
1 change: 1 addition & 0 deletions include/ShkyeraGrad.hpp
@@ -21,6 +21,7 @@
 #include "nn/activation/Exp.hpp"
 #include "nn/activation/ReLU.hpp"
 #include "nn/activation/Sigmoid.hpp"
+#include "nn/activation/Softmax.hpp"
 #include "nn/activation/Tanh.hpp"

 #include "nn/layers/Dropout.hpp"
13 changes: 12 additions & 1 deletion include/core/Value.hpp
@@ -50,6 +50,7 @@ template <typename T> class Value : public std::enable_shared_from_this<Value<T
     ValuePtr<T> relu();
     ValuePtr<T> sigmoid();
     ValuePtr<T> exp();
+    ValuePtr<T> log();
     ValuePtr<T> pow(ValuePtr<T> exponent);

     template <typename U> friend ValuePtr<U> operator+(ValuePtr<U> a, ValuePtr<U> b);

@@ -157,6 +158,16 @@ template <typename T> ValuePtr<T> Value<T>::exp() {
     return result;
 }

+template <typename T> ValuePtr<T> Value<T>::log() {
+    auto thisValue = this->shared_from_this();
+
+    ValuePtr<T> result = Value<T>::create(std::log(_data));
+    result->_children = {thisValue};
+    result->_backward = [thisValue, result]() { thisValue->_gradient += (1 / thisValue->_data) * result->_gradient; };
+
+    return result;
+}
+
 template <typename T> ValuePtr<T> Value<T>::pow(ValuePtr<T> exponent) {
     auto thisValue = this->shared_from_this();

@@ -165,7 +176,7 @@ template <typename T> ValuePtr<T> Value<T>::pow(ValuePtr<T> exponent) {
     result->_backward = [thisValue, exponent, result]() {
         thisValue->_gradient += (exponent->_data * std::pow(thisValue->_data, exponent->_data - 1)) * result->_gradient;
         exponent->_gradient +=
-            (std::pow(thisValue->_data, exponent->_data) * log(thisValue->_data)) * result->_gradient;
+            (std::pow(thisValue->_data, exponent->_data) * std::log(thisValue->_data)) * result->_gradient;
     };

     return result;
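The new log node backpropagates with d/dx ln x = 1/x, mirroring the exp and pow nodes around it. Below is a minimal sketch of how it behaves, assuming the Val32 alias used in the examples above and a getGradient() accessor (hypothetical; the gradient getter's name does not appear anywhere in this diff):

```cpp
#include "../include/ShkyeraGrad.hpp"
#include <iostream>

int main() {
    using namespace shkyera;

    auto x = Val32::create(2.0f);
    auto y = x->log();   // y = ln(2) ≈ 0.693

    y->backward();       // backpropagate through the log node

    std::cout << y->getValue() << std::endl;
    // dy/dx = 1/x = 0.5; getGradient() is an assumed accessor name.
    std::cout << x->getGradient() << std::endl;
}
```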
73 changes: 70 additions & 3 deletions include/core/Vector.hpp
@@ -25,14 +25,24 @@ template <typename T> class Vector {
   public:
     Vector() = default;
     Vector(std::vector<ValuePtr<T>> values);
-    static Vector<T> of(const std::vector<T> &values);

+    static Vector<T> of(const std::vector<T> &values);
     ValuePtr<T> dot(const Vector<T> &other) const;
-    ValuePtr<T> operator[](size_t index) const;

+    ValuePtr<T> sum() const;
     size_t size() const;

     template <typename U> friend std::ostream &operator<<(std::ostream &os, const Vector<U> &vector);

+    template <typename U> friend Vector<U> operator/(Vector<U> x, U val);
+    template <typename U> friend Vector<U> operator*(Vector<U> x, U val);
+    template <typename U> friend Vector<U> operator/(Vector<U> x, ValuePtr<U> val);
+    template <typename U> friend Vector<U> operator*(Vector<U> x, ValuePtr<U> val);
+    Vector<T> &operator/=(T val);
+    Vector<T> &operator*=(T val);
+    Vector<T> &operator/=(ValuePtr<T> val);
+    Vector<T> &operator*=(ValuePtr<T> val);
+
+    ValuePtr<T> operator[](size_t index) const;
 };

 template <typename T> Vector<T>::Vector(std::vector<ValuePtr<T>> values) { _values = values; }

@@ -62,6 +72,63 @@ template <typename T> ValuePtr<T> Vector<T>::dot(const Vector<T> &other) const
     return result;
 }

+template <typename T> ValuePtr<T> Vector<T>::sum() const {
+    auto sum = Value<T>::create(0);
+    for (const auto &entry : _values)
+        sum = sum + entry;
+    return sum;
+}
+
+template <typename T> Vector<T> operator/(Vector<T> x, T val) {
+    x /= val;
+    return x;
+}
+
+template <typename T> Vector<T> operator*(Vector<T> x, T val) {
+    x *= val;
+    return x;
+}
+
+template <typename T> Vector<T> operator/(Vector<T> x, ValuePtr<T> val) {
+    auto out = x;
+    for (size_t i = 0; i < out._values.size(); ++i)
+        out._values[i] = out._values[i] / val;
+    return out;
+}
+
+template <typename T> Vector<T> operator*(Vector<T> x, ValuePtr<T> val) {
+    auto out = x;
+    for (size_t i = 0; i < out._values.size(); ++i)
+        out._values[i] = out._values[i] * val;
+    return out;
+}
+
+template <typename T> Vector<T> &Vector<T>::operator/=(T val) {
+    auto divisor = Value<T>::create(val);
+    for (size_t i = 0; i < _values.size(); ++i)
+        _values[i] = _values[i] / divisor;
+    return *this;
+}
+
+template <typename T> Vector<T> &Vector<T>::operator*=(T val) {
+    auto factor = Value<T>::create(val);
+    for (size_t i = 0; i < _values.size(); ++i)
+        _values[i] = _values[i] * factor;
+    return *this;
+}
+
+template <typename T> Vector<T> &Vector<T>::operator/=(ValuePtr<T> val) {
+    for (size_t i = 0; i < _values.size(); ++i)
+        _values[i] = _values[i] / val;
+    return *this;
+}
+
+template <typename T> Vector<T> &Vector<T>::operator*=(ValuePtr<T> val) {
+    for (size_t i = 0; i < _values.size(); ++i)
+        _values[i] = _values[i] * val;
+    return *this;
+}
+
 template <typename T> ValuePtr<T> Vector<T>::operator[](size_t index) const { return _values[index]; }

 template <typename T> std::ostream &operator<<(std::ostream &os, const Vector<T> &vector) {
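Together, sum() and the new scalar/ValuePtr arithmetic let a whole vector be rescaled in one differentiable expression, which is exactly the shape of the normalization Softmax performs below. A rough usage sketch, assuming Type::float32 is plain float (as the f-suffixed literals in the examples suggest):

```cpp
#include "../include/ShkyeraGrad.hpp"
#include <iostream>

int main() {
    using namespace shkyera;

    auto v = Vec32::of({1, 2, 3});

    auto total = v.sum();         // ValuePtr holding 6, part of the autodiff graph
    auto normalized = v / total;  // entries {1/6, 2/6, 3/6}, still gradient-tracked

    v *= 2.0f;                    // in-place scaling by a plain scalar

    std::cout << normalized << std::endl;
    std::cout << v << std::endl;  // entries {2, 4, 6}
}
```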
28 changes: 28 additions & 0 deletions include/nn/Loss.hpp
@@ -57,4 +57,32 @@ Function<T> MAE = [](Vector<T> a, Vector<T> b) {
     return loss;
 };

+template <typename T>
+Function<T> CrossEntropy = [](Vector<T> a, Vector<T> b) {
+    if (a.size() != b.size()) {
+        throw std::invalid_argument(
+            "Vectors need to be of the same size to compute the Cross Entropy loss. Sizes are " +
+            std::to_string(a.size()) + " and " + std::to_string(b.size()) + ".");
+    }
+
+    auto aSum = a.sum();
+    auto bSum = b.sum();
+
+    if (aSum->getValue() < 0.99 || aSum->getValue() > 1.01 || bSum->getValue() < 0.99 || bSum->getValue() > 1.01) {
+        throw std::invalid_argument("To compute Cross Entropy Loss, the elements of each vector need to sum to 1 (+/- "
+                                    "0.01). Currently, they sum to: " +
+                                    std::to_string(aSum->getValue()) + " and " + std::to_string(bSum->getValue()) +
+                                    ".");
+    }
+
+    auto loss = Value<T>::create(0);
+    for (size_t i = 0; i < a.size(); ++i) {
+        loss = loss - (b[i] * (a[i]->log()));
+    }
+
+    loss->backward();
+
+    return loss;
+};
+
 } // namespace shkyera::Loss
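The loss implemented here is the standard cross entropy L(a, b) = -sum_i b_i * log(a_i), where a is the predicted distribution (e.g. a Softmax output) and b is the target; for a one-hot b it reduces to -log of the probability assigned to the true class. Note the backward() call inside the lambda: evaluating the loss also backpropagates it. A small sketch under those assumptions:

```cpp
#include "../include/ShkyeraGrad.hpp"
#include <iostream>

int main() {
    using namespace shkyera;

    auto pred   = Vec32::of({0.7f, 0.2f, 0.1f}); // must sum to 1 (+/- 0.01)
    auto target = Vec32::of({1, 0, 0});          // one-hot label

    // L = -log(0.7) ≈ 0.357; gradients are populated as a side effect.
    auto loss = Loss::CrossEntropy<Type::float32>(pred, target);
    std::cout << loss->getValue() << std::endl;
}
```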
50 changes: 50 additions & 0 deletions include/nn/activation/Softmax.hpp
@@ -0,0 +1,50 @@
+/**
+ * Copyright © 2023 Franciszek Szewczyk. None of the rights reserved.
+ * This code is released under the Beerware License. If you find this code useful or you appreciate the work, you are
+ * encouraged to buy the author a beer in return.
+ * Contact the author at [email protected] for inquiries and support.
+ */
+
+#pragma once
+
+#include "Activation.hpp"
+
+namespace shkyera {
+
+template <typename T> class Softmax;
+using Softmax32 = Softmax<Type::float32>;
+using Softmax64 = Softmax<Type::float64>;
+
+template <typename T> class Softmax : public Activation<T> {
+  public:
+    static std::shared_ptr<Softmax<T>> create();
+
+    virtual Vector<T> operator()(const Vector<T> &x) const override;
+};
+
+template <typename T> std::shared_ptr<Softmax<T>> Softmax<T>::create() {
+    return std::shared_ptr<Softmax<T>>(new Softmax<T>());
+}
+
+template <typename T> Vector<T> Softmax<T>::operator()(const Vector<T> &x) const {
+    std::vector<ValuePtr<T>> out;
+    out.reserve(x.size());
+
+    auto maxValue = Value<T>::create(x[0]->getValue());
+    for (size_t i = 1; i < x.size(); ++i)
+        if (x[i]->getValue() > maxValue->getValue())
+            maxValue = x[i];
+
+    auto sumExponentiated = Value<T>::create(0);
+    for (size_t i = 0; i < x.size(); ++i) {
+        auto exponentiated = (x[i] - maxValue)->exp();
+        out.emplace_back(exponentiated);
+        sumExponentiated = sumExponentiated + exponentiated;
+    }
+
+    auto vectorizedOut = Vector<T>(out) / sumExponentiated;
+
+    return vectorizedOut;
+}
+
+} // namespace shkyera
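Subtracting the running maximum before exponentiating is the usual numerical-stability trick: softmax is shift-invariant, softmax(x) = softmax(x - c) for any constant c, because the e^(-c) factors cancel between numerator and denominator, so only the overflow behavior changes. (The max comparison above uses getValue(); comparing the ValuePtr smart pointers directly would compare addresses, not values.) A quick sketch of calling the activation directly; the examples in this diff only use it through SequentialBuilder, so invoking operator() like this is an assumption:

```cpp
#include "../include/ShkyeraGrad.hpp"
#include <iostream>

int main() {
    using namespace shkyera;

    auto softmax = Softmax32::create();

    // Without the max-subtraction, exp(1002.0f) would overflow float to inf.
    auto logits = Vec32::of({1000, 1001, 1002});
    auto probs = (*softmax)(logits);

    std::cout << probs << std::endl; // ≈ {0.090, 0.245, 0.665}
}
```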
7 changes: 4 additions & 3 deletions include/nn/layers/Dropout.hpp
@@ -45,15 +45,16 @@ template <typename T> DropoutPtr<T> Dropout<T>::create(size_t input, size_t siz
 template <typename T> Vector<T> Dropout<T>::operator()(const Vector<T> &x) const {
     std::vector<ValuePtr<T>> alteredInput;
     alteredInput.reserve(x.size());
-    auto scaling = Value<T>::create(1.0 / (1 - _dropout));
     for (size_t i = 0; i < x.size(); ++i)
-        alteredInput.push_back(x[i] * scaling);
+        alteredInput.push_back(x[i]);

     std::vector<size_t> indicesToRemove = utils::sample<size_t>(0, x.size() - 1, _dropout * x.size(), false);
     for (size_t idxToRemove : indicesToRemove)
         alteredInput[idxToRemove] = Value<T>::create(0);

-    return Linear<T>::operator()(Vector<T>(alteredInput));
+    auto transformedInput = Vector<T>(alteredInput) * static_cast<T>(1.0 / (1 - _dropout));
+
+    return Linear<T>::operator()(transformedInput);
 }

 } // namespace shkyera
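The rewrite keeps the inverted-dropout invariant while replacing the per-element scaling pass with a single Vector multiplication from the new operator*: entries are zeroed first, then the surviving vector is scaled by 1/(1 - p). With p = 0.2 each input survives with probability 0.8 and is scaled by 1.25, so the expected pre-activation is 0.8 * 1.25 * x_i = x_i, the same as with dropout disabled.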
