From f7292de4e7d34ca54c03cd90b066bc65eac725bf Mon Sep 17 00:00:00 2001
From: Roman Gershman <roman@dragonflydb.io>
Date: Mon, 18 Mar 2024 13:51:33 +0200
Subject: [PATCH] chore: Introduce fiber stack allocator (#2730)

1. Use clib malloc for allocating fiber stacks but reduce the fiber stack size.
   clib malloc uses default 4K OS pages when reserving memory from the OS.
   The reason for not using mi_malloc, because we use 2MB large OS pages with mimalloc.
   However, allocating stacks is one of the cases, when using smaller 4KB memory pages is actually more
   RSS efficient because memory pages become hot at better granularity.

2. Add "memory_fiberstack_vms_bytes" metric exposing fiber stack vm usage.
3. Fix macos dependencies & update ci versions.

Signed-off-by: Roman Gershman <roman@dragonflydb.io>
---
 .github/actions/lint-test-chart/action.yml    |  4 +--
 .github/workflows/ci.yml                      |  2 +-
 .github/workflows/daily-builds.yml            |  9 +++--
 .github/workflows/docker-release.yml          |  2 +-
 .github/workflows/regression-tests.yml        |  4 +--
 .github/workflows/release.yml                 |  4 +--
 .../reusable-container-workflow.yaml          |  2 +-
 .../ci/priorityclassname-values.golden.yaml   | 18 +++++-----
 helio                                         |  2 +-
 src/core/mi_memory_resource.cc                | 11 +++---
 src/core/mi_memory_resource.h                 |  1 +
 src/server/CMakeLists.txt                     |  2 +-
 src/server/dfly_main.cc                       | 34 ++++++++++++++-----
 src/server/memory_cmd.cc                      |  6 +++-
 src/server/replica.cc                         |  1 -
 src/server/server_family.cc                   | 13 +++++--
 src/server/server_family.h                    |  2 ++
 17 files changed, 78 insertions(+), 39 deletions(-)

diff --git a/.github/actions/lint-test-chart/action.yml b/.github/actions/lint-test-chart/action.yml
index 5fd8e34d369f..a0583d31571a 100644
--- a/.github/actions/lint-test-chart/action.yml
+++ b/.github/actions/lint-test-chart/action.yml
@@ -10,7 +10,7 @@ runs:
           fetch-depth: 0
 
       - name: Set up Helm
-        uses: azure/setup-helm@v3
+        uses: azure/setup-helm@v4
 
       - uses: actions/setup-python@v5
         with:
@@ -43,7 +43,7 @@ runs:
             ${{github.event_name == 'workflow_dispatch' && '--all'}} ;
 
       - name: Create kind cluster
-        uses: helm/kind-action@v1.8.0
+        uses: helm/kind-action@v1
 
       - name: Getting cluster ready
         shell: bash
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 2ccf0f7aacc1..15b6cebfc82c 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -176,5 +176,5 @@ jobs:
   lint-test-chart:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - uses: ./.github/actions/lint-test-chart
diff --git a/.github/workflows/daily-builds.yml b/.github/workflows/daily-builds.yml
index d56857bf95dd..8dd5b4177331 100644
--- a/.github/workflows/daily-builds.yml
+++ b/.github/workflows/daily-builds.yml
@@ -85,8 +85,13 @@ jobs:
 
       - name: Install dependencies
         run: |
-          brew update && brew install ninja boost openssl automake gcc zstd bison c-ares \
-              autoconf libtool automake flatbuffers
+          brew uninstall --formula node kotlin harfbuzz sbt selenium-server imagemagick \
+              gradle maven openjdk postgresql r ant mongodb-community@5.0 mongosh \
+              node@18 php composer
+
+          brew update && brew install --force ninja boost openssl automake gcc zstd bison c-ares \
+              autoconf libtool automake
+
           # brew info icu4c
           mkdir -p $GITHUB_WORKSPACE/build
 
diff --git a/.github/workflows/docker-release.yml b/.github/workflows/docker-release.yml
index dab5b6342d9e..6cd4428a2be6 100644
--- a/.github/workflows/docker-release.yml
+++ b/.github/workflows/docker-release.yml
@@ -68,7 +68,7 @@ jobs:
         fetch-depth: 0
 
     - name: Install helm
-      uses: azure/setup-helm@v3
+      uses: azure/setup-helm@v4
 
     - name: Setup Go
       uses: actions/setup-go@v4
diff --git a/.github/workflows/regression-tests.yml b/.github/workflows/regression-tests.yml
index 69b9d32ea837..baf75b403ffc 100644
--- a/.github/workflows/regression-tests.yml
+++ b/.github/workflows/regression-tests.yml
@@ -18,7 +18,7 @@ jobs:
     container:
       image: ghcr.io/romange/${{ matrix.container }}
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
         with:
           submodules: true
 
@@ -57,5 +57,5 @@ jobs:
   lint-test-chart:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - uses: ./.github/actions/lint-test-chart
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 3cf69afc95a7..a82b5b9a130d 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -29,7 +29,7 @@ jobs:
     name: Build aarch64 on ubuntu20.04
     needs: create-release
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
       with:
         submodules: true
     - name: Cache build deps
@@ -111,7 +111,7 @@ jobs:
     container:
       image: ghcr.io/romange/${{ matrix.container }}
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
       with:
         submodules: true
     - name: Configure
diff --git a/.github/workflows/reusable-container-workflow.yaml b/.github/workflows/reusable-container-workflow.yaml
index e526b19e80f2..35981ce1c4ed 100644
--- a/.github/workflows/reusable-container-workflow.yaml
+++ b/.github/workflows/reusable-container-workflow.yaml
@@ -60,7 +60,7 @@ jobs:
             tag_main: true
     steps:
       - name: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           fetch-depth: 1
           submodules: true
diff --git a/contrib/charts/dragonfly/ci/priorityclassname-values.golden.yaml b/contrib/charts/dragonfly/ci/priorityclassname-values.golden.yaml
index 793d917329ae..3e50dbfc52e7 100644
--- a/contrib/charts/dragonfly/ci/priorityclassname-values.golden.yaml
+++ b/contrib/charts/dragonfly/ci/priorityclassname-values.golden.yaml
@@ -1,4 +1,13 @@
 ---
+# Source: dragonfly/templates/extra-manifests.yaml
+apiVersion: scheduling.k8s.io/v1
+description: This priority class should be used only for tests.
+globalDefault: false
+kind: PriorityClass
+metadata:
+  name: high-priority
+value: 1000000
+---
 # Source: dragonfly/templates/serviceaccount.yaml
 apiVersion: v1
 kind: ServiceAccount
@@ -92,12 +101,3 @@ spec:
           resources:
             limits: {}
             requests: {}
----
-# Source: dragonfly/templates/extra-manifests.yaml
-apiVersion: scheduling.k8s.io/v1
-description: This priority class should be used only for tests.
-globalDefault: false
-kind: PriorityClass
-metadata:
-  name: high-priority
-value: 1000000
diff --git a/helio b/helio
index b3ed89d13ea3..91c4f48d025b 160000
--- a/helio
+++ b/helio
@@ -1 +1 @@
-Subproject commit b3ed89d13ea3b7095e3a48915ec9c8cd01223039
+Subproject commit 91c4f48d025bccd6fc3ff14e471accf1c7801f38
diff --git a/src/core/mi_memory_resource.cc b/src/core/mi_memory_resource.cc
index 37ed7d81ac33..48c1dec44d87 100644
--- a/src/core/mi_memory_resource.cc
+++ b/src/core/mi_memory_resource.cc
@@ -3,18 +3,21 @@
 //
 #include "core/mi_memory_resource.h"
 
+#include <sys/mman.h>
+
 #include "base/logging.h"
 
 namespace dfly {
 
-void* MiMemoryResource::do_allocate(std::size_t size, std::size_t align) {
+using namespace std;
+
+void* MiMemoryResource::do_allocate(size_t size, size_t align) {
   DCHECK(align);
 
   void* res = mi_heap_malloc_aligned(heap_, size, align);
 
   if (!res)
-    throw std::bad_alloc{};
-
+    throw bad_alloc{};
 
   // It seems that mimalloc has a bug with larger allocations that causes
   // mi_heap_contains_block to lie. See https://github.com/microsoft/mimalloc/issues/587
@@ -28,7 +31,7 @@ void* MiMemoryResource::do_allocate(std::size_t size, std::size_t align) {
   return res;
 }
 
-void MiMemoryResource::do_deallocate(void* ptr, std::size_t size, std::size_t align) {
+void MiMemoryResource::do_deallocate(void* ptr, size_t size, size_t align) {
   DCHECK(size > 33554400 || mi_heap_contains_block(heap_, ptr));
 
   size_t usable = mi_usable_size(ptr);
diff --git a/src/core/mi_memory_resource.h b/src/core/mi_memory_resource.h
index a1194336ff27..fd842eddbb0d 100644
--- a/src/core/mi_memory_resource.h
+++ b/src/core/mi_memory_resource.h
@@ -10,6 +10,7 @@
 
 namespace dfly {
 
+// Per thread memory resource that uses mimalloc.
 class MiMemoryResource : public PMR_NS::memory_resource {
  public:
   explicit MiMemoryResource(mi_heap_t* heap) : heap_(heap) {
diff --git a/src/server/CMakeLists.txt b/src/server/CMakeLists.txt
index 0e0422d53667..df40cf6cee3b 100644
--- a/src/server/CMakeLists.txt
+++ b/src/server/CMakeLists.txt
@@ -7,7 +7,7 @@ cxx_link(dragonfly base dragonfly_lib)
 if (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND CMAKE_BUILD_TYPE STREQUAL "Release")
   # Add core2 only to this file, thus avoiding instructions in this object file that
   # can cause SIGILL.
-  set_source_files_properties(dfly_main.cc PROPERTIES COMPILE_FLAGS -march=core2)
+  set_source_files_properties(dfly_main.cc PROPERTIES COMPILE_FLAGS "-march=core2")
 endif()
 
 set_property(SOURCE dfly_main.cc APPEND PROPERTY COMPILE_DEFINITIONS
diff --git a/src/server/dfly_main.cc b/src/server/dfly_main.cc
index a2ec48e67f57..3e7dcc366ab2 100644
--- a/src/server/dfly_main.cc
+++ b/src/server/dfly_main.cc
@@ -90,6 +90,11 @@ namespace dfly {
 
 namespace {
 
+// Default stack size for fibers. We decrease it by 16 bytes because some allocators
+// need additional 8-16 bytes for their internal structures, thus over reserving additional
+// memory pages if using round sizes.
+constexpr size_t kFiberDefaultStackSize = 32_KB - 16;
+
 using util::http::TlsClient;
 
 enum class TermColor { kDefault, kRed, kGreen, kYellow };
@@ -151,6 +156,12 @@ string NormalizePaths(std::string_view path) {
   return string(path);
 }
 
+template <typename... Args> unique_ptr<Listener> MakeListener(Args&&... args) {
+  auto res = make_unique<Listener>(forward<Args>(args)...);
+  res->SetConnFiberStackSize(kFiberDefaultStackSize);
+  return res;
+}
+
 bool RunEngine(ProactorPool* pool, AcceptServer* acceptor) {
   uint64_t maxmemory = GetMaxMemoryFlag();
   if (maxmemory > 0 && maxmemory < pool->size() * 256_MB) {
@@ -165,12 +176,14 @@ bool RunEngine(ProactorPool* pool, AcceptServer* acceptor) {
   Listener* main_listener = nullptr;
 
   std::vector<facade::Listener*> listeners;
+
   // If we ever add a new listener, plz don't change this,
   // we depend on tcp listener to be at the front since we later
   // need to pass it to the AclFamily::Init
   if (!tcp_disabled) {
-    main_listener = new Listener{Protocol::REDIS, &service, Listener::Role::MAIN};
-    listeners.push_back(main_listener);
+    auto listener = MakeListener(Protocol::REDIS, &service, Listener::Role::MAIN);
+    main_listener = listener.get();
+    listeners.push_back(listener.release());
   }
 
   Service::InitOpts opts;
@@ -218,7 +231,7 @@ bool RunEngine(ProactorPool* pool, AcceptServer* acceptor) {
     }
     unlink(unix_sock.c_str());
 
-    auto uds_listener = std::make_unique<Listener>(Protocol::REDIS, &service);
+    auto uds_listener = MakeListener(Protocol::REDIS, &service);
     error_code ec =
         acceptor->AddUDSListener(unix_sock.c_str(), unix_socket_perm, uds_listener.get());
     if (ec) {
@@ -248,8 +261,8 @@ bool RunEngine(ProactorPool* pool, AcceptServer* acceptor) {
     const char* interface_addr = admin_bind.empty() ? nullptr : admin_bind.c_str();
     const std::string printable_addr =
         absl::StrCat("admin socket ", interface_addr ? interface_addr : "any", ":", admin_port);
-    auto admin_listener =
-        std::make_unique<Listener>(Protocol::REDIS, &service, Listener::Role::PRIVILEGED);
+    auto admin_listener = MakeListener(Protocol::REDIS, &service, Listener::Role::PRIVILEGED);
+
     error_code ec = acceptor->AddListener(interface_addr, admin_port, admin_listener.get());
 
     if (ec) {
@@ -260,7 +273,7 @@ bool RunEngine(ProactorPool* pool, AcceptServer* acceptor) {
     }
   }
 
-  if (!tcp_disabled) {
+  if (main_listener) {
     error_code ec = acceptor->AddListener(bind_addr, port, main_listener);
 
     if (ec) {
@@ -274,7 +287,8 @@ bool RunEngine(ProactorPool* pool, AcceptServer* acceptor) {
   }
 
   if (mc_port > 0 && !tcp_disabled) {
-    acceptor->AddListener(mc_port, new Listener{Protocol::MEMCACHE, &service});
+    auto listener = MakeListener(Protocol::MEMCACHE, &service);
+    acceptor->AddListener(mc_port, listener.release());
   }
 
   service.Init(acceptor, listeners, opts);
@@ -690,10 +704,14 @@ Usage: dragonfly [FLAGS]
     LOG(INFO) << "Max memory limit is: " << hr_limit;
   }
 
+  // Initialize mi_malloc options
+  // export MIMALLOC_VERBOSE=1 to see the options before the override.
   mi_option_enable(mi_option_show_errors);
   mi_option_set(mi_option_max_warnings, 0);
   mi_option_set(mi_option_decommit_delay, 1);
 
+  fb2::SetDefaultStackResource(&fb2::std_malloc_resource, kFiberDefaultStackSize);
+
   unique_ptr<util::ProactorPool> pool;
 
 #ifdef __linux__
@@ -716,7 +734,7 @@ Usage: dragonfly [FLAGS]
 
   pool->Run();
 
-  AcceptServer acceptor(pool.get());
+  AcceptServer acceptor(pool.get(), &fb2::std_malloc_resource, true);
   acceptor.set_back_log(absl::GetFlag(FLAGS_tcp_backlog));
 
   int res = dfly::RunEngine(pool.get(), &acceptor) ? 0 : -1;
diff --git a/src/server/memory_cmd.cc b/src/server/memory_cmd.cc
index 228ed1ff0f1c..03d0c6f00e3f 100644
--- a/src/server/memory_cmd.cc
+++ b/src/server/memory_cmd.cc
@@ -1,11 +1,15 @@
-// Copyright 2022, DragonflyDB authors.  All rights reserved.
+// Copyright 2024, DragonflyDB authors.  All rights reserved.
 // See LICENSE for licensing terms.
 //
 
 #include "server/memory_cmd.h"
 
 #include <absl/strings/str_cat.h>
+
+#ifdef __linux__
 #include <malloc.h>
+#endif
+
 #include <mimalloc.h>
 
 #include "base/io_buf.h"
diff --git a/src/server/replica.cc b/src/server/replica.cc
index ad832a249c40..a5051a81739b 100644
--- a/src/server/replica.cc
+++ b/src/server/replica.cc
@@ -6,7 +6,6 @@
 #include <chrono>
 
 #include "absl/strings/match.h"
-#include "util/fibers/fiber2.h"
 
 extern "C" {
 #include "redis/rdb.h"
diff --git a/src/server/server_family.cc b/src/server/server_family.cc
index c242146dbd11..41aaf52d3920 100644
--- a/src/server/server_family.cc
+++ b/src/server/server_family.cc
@@ -1023,8 +1023,11 @@ void PrintPrometheusMetrics(const Metrics& m, StringResponse* resp) {
                             &resp->body());
   AppendMetricWithoutLabels("memory_used_peak_bytes", "", used_mem_peak.load(memory_order_relaxed),
                             MetricType::GAUGE, &resp->body());
-  AppendMetricWithoutLabels("comitted_memory", "", GetMallocCurrentCommitted(), MetricType::GAUGE,
+  AppendMetricWithoutLabels("memory_fiberstack_vms_bytes", "", m.worker_fiber_stack_size,
+                            MetricType::GAUGE, &resp->body());
+  AppendMetricWithoutLabels("fibers_count", "", m.worker_fiber_count, MetricType::GAUGE,
                             &resp->body());
+
   AppendMetricWithoutLabels("memory_max_bytes", "", max_memory_limit, MetricType::GAUGE,
                             &resp->body());
 
@@ -1773,6 +1776,8 @@ Metrics ServerFamily::GetMetrics() const {
     result.fiber_switch_delay_usec += fb2::FiberSwitchDelayUsec();
     result.fiber_longrun_cnt += fb2::FiberLongRunCnt();
     result.fiber_longrun_usec += fb2::FiberLongRunSumUsec();
+    result.worker_fiber_stack_size += fb2::WorkerFibersStackSize();
+    result.worker_fiber_count += fb2::WorkerFibersCount();
 
     result.coordinator_stats.Add(ss->stats);
 
@@ -1888,13 +1893,15 @@ void ServerFamily::Info(CmdArgList args, ConnectionContext* cntx) {
     append("used_memory_peak", ump);
     append("used_memory_peak_human", HumanReadableNumBytes(ump));
 
+    // Virtual memory size, upper bound estimation on the RSS memory used by the fiber stacks.
+    append("fibers_stack_vms", m.worker_fiber_stack_size);
+    append("fibers_count", m.worker_fiber_count);
+
     size_t rss = rss_mem_current.load(memory_order_relaxed);
     append("used_memory_rss", rss);
     append("used_memory_rss_human", HumanReadableNumBytes(rss));
     append("used_memory_peak_rss", rss_mem_peak.load(memory_order_relaxed));
 
-    append("comitted_memory", GetMallocCurrentCommitted());
-
     append("maxmemory", max_memory_limit);
     append("maxmemory_human", HumanReadableNumBytes(max_memory_limit));
 
diff --git a/src/server/server_family.h b/src/server/server_family.h
index f589924ad12b..a33c05b62cea 100644
--- a/src/server/server_family.h
+++ b/src/server/server_family.h
@@ -99,6 +99,8 @@ struct Metrics {
 
   // Max length of the all the tx shard-queues.
   uint32_t tx_queue_len = 0;
+  uint32_t worker_fiber_count = 0;
+  size_t worker_fiber_stack_size = 0;
 
   // command call frequencies (count, aggregated latency in usec).
   std::map<std::string, std::pair<uint64_t, uint64_t>> cmd_stats_map;