Skip to content

Commit

Permalink
add malloc cases
Browse files Browse the repository at this point in the history
  • Loading branch information
archibate committed Jan 21, 2022
1 parent aed3b01 commit e9270dd
Show file tree
Hide file tree
Showing 100 changed files with 5,349 additions and 54 deletions.
22 changes: 22 additions & 0 deletions 07/02_cache/03/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,4 +72,26 @@ void BM_aosoa(benchmark::State &bm) {
}
BENCHMARK(BM_aosoa);

// AOSOA benchmark with 16-element blocks: each block stores 16 x values,
// then 16 y values, then 16 z values. Every timed iteration adds y into x
// for all n / 16 blocks; z is allocated but never touched.
void BM_aosoa_16(benchmark::State &bm) {
    constexpr size_t kBlockLen = 16;

    struct Block {
        float x[kBlockLen];
        float y[kBlockLen];
        float z[kBlockLen];
    };

    std::vector<Block> blocks(n / kBlockLen);

    for (auto _: bm) {
        // Outer loop: blocks are distributed over the OpenMP threads.
#pragma omp parallel for
        for (size_t blk = 0; blk < n / kBlockLen; blk++) {
            Block &cur = blocks[blk];
            // Inner loop: the lanes of one block are marked for SIMD execution.
#pragma omp simd
            for (size_t lane = 0; lane < kBlockLen; lane++) {
                cur.x[lane] = cur.x[lane] + cur.y[lane];
            }
        }
        // Keep the compiler from discarding the stores into `blocks`.
        benchmark::DoNotOptimize(blocks);
    }
}
BENCHMARK(BM_aosoa_16);

BENCHMARK_MAIN();
23 changes: 23 additions & 0 deletions 07/02_cache/04/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
cmake_minimum_required(VERSION 3.10)

set(CMAKE_CXX_STANDARD 17)

# Default to an optimized build, but do not override a build type that was
# chosen on the command line (e.g. -DCMAKE_BUILD_TYPE=Debug).
if (NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE Release)
endif()

project(main LANGUAGES CXX)

add_executable(main main.cpp)

# main.cpp parallelizes its loops with `#pragma omp`.
find_package(OpenMP REQUIRED)
target_link_libraries(main PUBLIC OpenMP::OpenMP_CXX)

# TBB is not used by this example; kept for reference.
#find_package(TBB REQUIRED)
#target_link_libraries(main PUBLIC TBB::tbb)

# Google Benchmark provides BENCHMARK() / BENCHMARK_MAIN().
find_package(benchmark REQUIRED)
target_link_libraries(main PUBLIC benchmark::benchmark)

# Relaxed floating-point semantics plus wide vector instructions.
if (MSVC)
    target_compile_options(main PUBLIC /fp:fast /arch:AVX)
else()
    target_compile_options(main PUBLIC -ffast-math -march=native)
endif()
81 changes: 81 additions & 0 deletions 07/02_cache/04/main.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#include <iostream>
#include <vector>
#include <cmath>
#include <cstring>
#include <cstdlib>
#include <array>
#include <benchmark/benchmark.h>
#include <x86intrin.h>
#include <omp.h>

// Reference cache sizes (presumably those of the author's machine -- confirm
// for the machine the benchmarks run on):
// L1: 32KB
// L2: 256KB
// L3: 12MB

// Element count shared by every benchmark in this file: 1<<26 = 67,108,864.
// Assuming a 4-byte float, that is 256 MiB per float array and 768 MiB for
// each three-component data set below.
constexpr size_t n = 1<<26; // 2^26 elements

// AOS benchmark: a single vector of {x, y, z} records. Every timed iteration
// adds 1 to all three fields of each of the n records.
void BM_aos(benchmark::State &bm) {
    struct Point {
        float x;
        float y;
        float z;
    };

    std::vector<Point> points(n);

    for (auto _: bm) {
        // Records are distributed over the OpenMP threads.
#pragma omp parallel for
        for (size_t idx = 0; idx < n; idx++) {
            Point &p = points[idx];
            p.x += 1;
            p.y += 1;
            p.z += 1;
        }
        // Keep the compiler from discarding the stores into `points`.
        benchmark::DoNotOptimize(points);
    }
}
BENCHMARK(BM_aos);

// SOA benchmark: x, y and z live in three separate float vectors of n
// elements each. Every timed iteration adds 1 to each element of all three.
void BM_soa(benchmark::State &bm) {
    std::vector<float> xs(n);
    std::vector<float> ys(n);
    std::vector<float> zs(n);

    for (auto _: bm) {
        // Indices are distributed over the OpenMP threads.
#pragma omp parallel for
        for (size_t idx = 0; idx < n; idx++) {
            xs[idx] += 1;
            ys[idx] += 1;
            zs[idx] += 1;
        }
        // Keep the compiler from discarding the stores into the three arrays.
        benchmark::DoNotOptimize(xs);
        benchmark::DoNotOptimize(ys);
        benchmark::DoNotOptimize(zs);
    }
}
BENCHMARK(BM_soa);

// AOSOA benchmark with 1024-element blocks: each block stores 1024 x values,
// then 1024 y values, then 1024 z values. Every timed iteration adds 1 to all
// three components of every element in all n / 1024 blocks.
void BM_aosoa(benchmark::State &bm) {
    constexpr size_t kBlockLen = 1024;

    struct Block {
        float x[kBlockLen];
        float y[kBlockLen];
        float z[kBlockLen];
    };

    std::vector<Block> blocks(n / kBlockLen);

    for (auto _: bm) {
        // Outer loop: blocks are distributed over the OpenMP threads.
#pragma omp parallel for
        for (size_t blk = 0; blk < n / kBlockLen; blk++) {
            Block &cur = blocks[blk];
            // Inner loop: the lanes of one block are marked for SIMD execution.
#pragma omp simd
            for (size_t lane = 0; lane < kBlockLen; lane++) {
                cur.x[lane] += 1;
                cur.y[lane] += 1;
                cur.z[lane] += 1;
            }
        }
        // Keep the compiler from discarding the stores into `blocks`.
        benchmark::DoNotOptimize(blocks);
    }
}
BENCHMARK(BM_aosoa);

BENCHMARK_MAIN();
File renamed without changes.
File renamed without changes.
36 changes: 36 additions & 0 deletions 07/02_cache/04/pseudo.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
// Layout sketch 1 -- fully SOA: every scalar component of every particle
// attribute gets its own std::vector (six parallel arrays in total).
// Author's verdict: **bad** (original note: "for small size simd use").
struct ParticleList {
std::vector<float> pos_x;
std::vector<float> pos_y;
std::vector<float> pos_z;
std::vector<float> vel_x;
std::vector<float> vel_y;
std::vector<float> vel_z;
};

// Layout sketch 2 -- partially SOA, partially AOS: one array per attribute
// (position, velocity), with each element holding a whole glm::vec3.
// Author's verdict: **good**, recommended for daily use.
struct ParticleList {
std::vector<glm::vec3> pos;
std::vector<glm::vec3> vel;
};

// Layout sketch 3 -- AOSOA (littlemine's favourite): particles are grouped
// into fixed blocks of 1024; inside a block each scalar component is stored
// as its own contiguous array, and the particle list is a vector of blocks.
// Author's verdict: **good**, aimed at HPC experts.
struct ParticleBlock {
float pos_x[1024];
float pos_y[1024];
float pos_z[1024];
float vel_x[1024];
float vel_y[1024];
float vel_z[1024];
};
using ParticleList = std::vector<ParticleBlock>;

// Layout sketch 4 -- fully AOS: one struct per particle holding all of its
// attributes, stored in a single vector.
// Author's verdict: **bad** (described in the original note as the naive
// approach).
struct Particle {
glm::vec3 pos;
glm::vec3 vel;
};
using ParticleList = std::vector<Particle>;



File renamed without changes.
2 changes: 1 addition & 1 deletion 07/06_stencil/01/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ constexpr size_t nx = 1<<13;
constexpr size_t ny = 1<<13;
constexpr int nblur = 8;

ndarray<2, float, nblur> a(nx, ny);
ndarray<2, float, 16> a(nx, ny);
ndarray<2, float> b(nx, ny);

void BM_copy(benchmark::State &bm) {
Expand Down
2 changes: 1 addition & 1 deletion 07/06_stencil/02/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ constexpr size_t nx = 1<<13;
constexpr size_t ny = 1<<13;
constexpr int nblur = 8;

ndarray<2, float, nblur, nblur, 32> a(nx, ny);
ndarray<2, float, 16, 16, 32> a(nx, ny);
ndarray<2, float, 0, 0, 32> b(nx, ny);

void BM_copy(benchmark::State &bm) {
Expand Down
4 changes: 4 additions & 0 deletions 07/06_stencil/03/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ int main() {
read_image(a, "original.jpg");
TOCK(read);

//TICK(boxblur);
//boxblur(a, 32, 32);
//TOCK(boxblur);

TICK(gaussblur);
gaussblur(a, 32, 12);
TOCK(gaussblur);
Expand Down
28 changes: 28 additions & 0 deletions 07/07_morton/02/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
cmake_minimum_required(VERSION 3.10)

set(CMAKE_CXX_STANDARD 17)

# Default to an optimized build, but do not override a build type that was
# chosen on the command line (e.g. -DCMAKE_BUILD_TYPE=Debug).
if (NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE Release)
endif()

project(main LANGUAGES CXX)

add_executable(main main.cpp)

# main.cpp parallelizes its loops with `#pragma omp`.
find_package(OpenMP REQUIRED)
target_link_libraries(main PUBLIC OpenMP::OpenMP_CXX)

# TBB is optional: main.cpp only includes the TBB headers when WITH_TBB is
# defined, so a missing TBB merely produces a warning.
find_package(TBB)
if (NOT TARGET TBB::tbb)
    message(WARNING "TBB not found")
else()
    target_link_libraries(main PUBLIC TBB::tbb)
    # Bare macro name; target_compile_definitions adds the -D / /D itself.
    target_compile_definitions(main PUBLIC WITH_TBB)
endif()

# Google Benchmark provides BENCHMARK() / BENCHMARK_MAIN().
find_package(benchmark REQUIRED)
target_link_libraries(main PUBLIC benchmark::benchmark)

# Relaxed floating-point semantics plus wide vector instructions.
if (MSVC)
    target_compile_options(main PUBLIC /fp:fast /arch:AVX)
else()
    target_compile_options(main PUBLIC -ffast-math -march=native -Wno-narrowing)
endif()
83 changes: 83 additions & 0 deletions 07/07_morton/02/main.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#include <iostream>
#include <vector>
#include <cmath>
#include <cstring>
#include <cstdlib>
#include <array>
#include <benchmark/benchmark.h>
#include <x86intrin.h>
#include <omp.h>
#include "ndarray.h"
#include "morton.h"
#ifdef WITH_TBB
#include <tbb/parallel_for.h>
#include <tbb/blocked_range2d.h>
#endif

// Reference cache sizes (presumably those of the author's machine -- confirm
// for the machine the benchmarks run on):
// L1: 32KB
// L2: 256KB
// L3: 12MB

// Side length of the square n x n grids below (1<<10 = 1024).
constexpr int n = 1<<10;
// Blur reach: the tiled kernels in this file read rows y - nblur .. y + nblur.
constexpr int nblur = 8;

// Blur input. The kernels index it at y + t, which falls outside [0, n) near
// the top and bottom edges, so the extra template argument (16) is presumably
// boundary padding -- confirm in ndarray.h.
ndarray<2, float, 16> a(n, n);
// Blur output; the kernels in this file only index it inside [0, n).
ndarray<2, float> b(n, n);

// Baseline (untiled) vertical blur benchmark: for every cell (x, y) of the
// n x n grid, accumulates the input column a(x, y - nblur .. y + nblur) into
// b(x, y). The (y, x) loop nest is collapsed and distributed over the OpenMP
// threads.
//
// Note: b is accumulated with += and is never reset, so its contents keep
// growing across benchmark iterations; only the memory traffic is of interest.
void BM_y_blur(benchmark::State &bm) {
    for (auto _: bm) {
#pragma omp parallel for collapse(2)
        for (int y = 0; y < n; y++) {
            for (int x = 0; x < n; x++) {
                // Use the same 2 * nblur + 1 tap window as BM_y_blur_tiled
                // and BM_y_blur_morton_tiled, so that all three benchmarks
                // perform the same amount of work and their timings are
                // directly comparable.
                for (int t = -nblur; t <= nblur; t++) {
                    b(x, y) += a(x, y + t);
                }
            }
        }
        // Keep the compiler from discarding the stores into `b`.
        benchmark::DoNotOptimize(b);
    }
}
BENCHMARK(BM_y_blur);

// Tiled vertical blur benchmark: the n x n grid is cut into 32 x 32 tiles
// that are visited in row-major tile order. For every cell of a tile the
// input column a(x, y - nblur .. y + nblur) is accumulated into b(x, y).
void BM_y_blur_tiled(benchmark::State &bm) {
    for (auto _: bm) {
        constexpr int tile = 32;
        // The two tile loops are collapsed and distributed over the OpenMP
        // threads; each thread then walks whole tiles.
#pragma omp parallel for collapse(2)
        for (int tileY = 0; tileY < n; tileY += tile) {
            for (int tileX = 0; tileX < n; tileX += tile) {
                const int yEnd = tileY + tile;
                const int xEnd = tileX + tile;
                for (int y = tileY; y < yEnd; y++) {
                    for (int x = tileX; x < xEnd; x++) {
                        for (int t = -nblur; t <= nblur; t++) {
                            b(x, y) += a(x, y + t);
                        }
                    }
                }
            }
        }
        // Keep the compiler from discarding the stores into `b`.
        benchmark::DoNotOptimize(b);
    }
}
BENCHMARK(BM_y_blur_tiled);

// Tiled vertical blur benchmark with tiles visited in the order produced by
// morton2d::decode: each loop index is decoded into a pair of tile
// coordinates, which are scaled by the tile size to obtain the tile origin.
// Inside a tile, the input column a(x, y - nblur .. y + nblur) is accumulated
// into b(x, y) for every cell.
void BM_y_blur_morton_tiled(benchmark::State &bm) {
    for (auto _: bm) {
        constexpr int tile = 32;
        // Number of tiles covering the n x n grid.
        constexpr int tileCount = n * n / tile / tile;
        // Tile codes are distributed over the OpenMP threads.
#pragma omp parallel for
        for (int code = 0; code < tileCount; code++) {
            // Scale in place so the origin keeps the type returned by decode().
            auto [originX, originY] = morton2d::decode(code);
            originX *= tile;
            originY *= tile;
            for (int y = originY; y < originY + tile; y++) {
                for (int x = originX; x < originX + tile; x++) {
                    for (int t = -nblur; t <= nblur; t++) {
                        b(x, y) += a(x, y + t);
                    }
                }
            }
        }
        // Keep the compiler from discarding the stores into `b`.
        benchmark::DoNotOptimize(b);
    }
}
BENCHMARK(BM_y_blur_morton_tiled);

BENCHMARK_MAIN();
File renamed without changes.
13 changes: 13 additions & 0 deletions 07/07_morton/02/mtprint.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#pragma once

#include <iostream>
#include <sstream>
#include <utility>

/// Prints all arguments on one line of std::cout, separated by single spaces
/// and terminated by '\n'.
///
/// The whole line is first assembled in a private string buffer and then
/// handed to std::cout with a single insertion (presumably so that lines
/// printed from several threads are less likely to interleave mid-line).
/// std::cout itself is not flushed here.
///
/// @param t   first value to print; at least one argument is required
/// @param ts  remaining values, each preceded by one space
template <class T, class ...Ts>
static void mtprint(T &&t, Ts &&...ts) {
    std::ostringstream line;
    line << std::forward<T>(t);
    ((line << ' ' << std::forward<Ts>(ts)), ...);
    // Plain newline: std::endl would only flush the in-memory buffer.
    line << '\n';
    std::cout << line.str();
}
File renamed without changes.
66 changes: 66 additions & 0 deletions 07/07_morton/02/pod.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#pragma once

#include <new>
#include <utility>

/// Thin wrapper around a single T whose default constructor leaves the
/// wrapped value default-initialized instead of value-initialized.
///
/// For trivial T (int, float, ...) this means that value-initializing a
/// pod<T> -- e.g. the elements created by std::vector<pod<T>>(count) -- does
/// not zero the storage. The wrapper converts implicitly to and from T, so it
/// can be used almost everywhere a T is expected.
///
/// Copy and move operations are compiler-generated, so pod<T> is trivially
/// copyable whenever T is, and its moves are noexcept whenever T's are.
template <class T>
struct pod {
private:
    T m_t;
public:
    // Deliberately user-provided and empty: m_t is default-initialized, which
    // leaves trivial types uninitialized. Do NOT replace with `= default`,
    // otherwise value-initialization would zero the member again.
    pod() {}

    pod(pod &&p) = default;

    pod(pod const &p) = default;

    pod &operator=(pod &&p) = default;

    pod &operator=(pod const &p) = default;

    // Implicit construction from a T (intentionally not explicit).
    pod(T &&t) : m_t(std::move(t)) {}

    pod(T const &t) : m_t(t) {}

    // Assignment from a T.
    pod &operator=(T &&t) {
        m_t = std::move(t);
        return *this;
    }

    pod &operator=(T const &t) {
        m_t = t;
        return *this;
    }

    // Implicit access to the wrapped value.
    operator T const &() const {
        return m_t;
    }

    operator T &() {
        return m_t;
    }

    // Explicit access to the wrapped value.
    T const &get() const {
        return m_t;
    }

    T &get() {
        return m_t;
    }

    /// Constructs a new T in place over the wrapped storage from `ts`.
    /// The previous value is NOT destroyed first; for a T with a non-trivial
    /// destructor the caller is expected to call destroy() beforehand.
    template <class ...Ts>
    pod &emplace(Ts &&...ts) {
        ::new (&m_t) T(std::forward<Ts>(ts)...);
        return *this;
    }

    /// Runs T's destructor on the wrapped value. The pod's own destructor
    /// will destroy m_t again, so for a non-trivial T a new value must be
    /// emplace()d before the pod goes out of scope.
    void destroy() {
        m_t.~T();
    }
};
Loading

0 comments on commit e9270dd

Please sign in to comment.