Merge pull request #27 from t-hishinuma/dev

Support AVX512
t-hishinuma · Oct 10, 2020 · c5acc25 · c5acc25
2 parents 7ae7150 + a8c00ae
commit c5acc25
Show file tree

Hide file tree

Showing 6 changed files with 591 additions and 211 deletions.
diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml
@@ -62,3 +62,16 @@ jobs:
 
     - name: test cg
       run: cd sample/; make; make test
+
+  avx512:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: make
+      run: make avx512
+    - name: make install
+      run: make install
+
+    - name: make test
+      run: cd test; make
diff --git a/README.md b/README.md
@@ -3,13 +3,14 @@
 # DD-AVX Library: Library of High Precision Sparse Matrix Operations Accelerated by SIMD
 
 ## About
-DD-AVX_v3 is SIMD accelerated simple interface high precision BLAS Lv.1 and Sparse BLAS Library.
+DD-AVX_v3 is a SIMD-accelerated, simple-interface, high-precision library of BLAS Lv.1 and Sparse BLAS operations.
 
-BLAS Lv.1 and Sparse BLAS operations can be performed by combining double and double-double precision.
+These operations can be performed by combining double and double-double precision.
+
+**Multi-threading** (OpenMP) and **SIMD AVX / AVX2 / AVX512** acceleration are available.
 
 This library provides an easy way to implement a fast and accurate Krylov subspace method.
 
-OpenMP and SIMD AVX / AVX2 acceleration are available.
 
 This library is an extension of 
 [Lis_DD_AVXkernels](https://github.com/t-hishinuma/Lis_DD_AVXkernels) and 
@@ -40,30 +41,38 @@ for more information on how to use it.
 This library requires the QD library for scalar operations as a submodule.
 The QD library is downloaded and built automatically by `make`.
 
-You can specify the destination directory with `DDAVX_DIR` and `make` .
+You can specify the destination directory with `DD_AVX_DIR` and `make`.
 The QD libraries are installed in the same directory.
 
 You can build and install the QD library with the following commands:
 
-## AVX
+## System Requirements
+* g++ 7.1 or higher (C++17)
+* GNU make
+
+## Build for each SIMD instruction
+### AVX
 > make avx
 > 
 > make install
 
-## AVX2
+### AVX2
 > make avx2
 > 
 > make install
 
-## AVX512 (not yet implemented)
-In the future, AVX512 can be built with the following command.
+### AVX512
 > make avx512
 > 
 > make install
 
-# System Requirements
-* g++ 7.1 or higher
-* GNU make
+#### AVX512 testing is inadequate.
+Since the GitHub Actions runner does not support AVX512, GitHub Actions only verifies the build.
+We performed manual testing on an Intel® Xeon® Platinum 8280.
+
+
+# Document
+It can be generated using Doxygen.
 
 # Current Status and Restrictions
 This is a beta version, and there are some restrictions and changes planned.
@@ -75,14 +84,21 @@ The detailed todo is discussed in [Issue](https://github.com/t-hishinuma/DD-AVX_
 * (SIMD_REG class is difficult to share with Scalar, so I want to change it to REG class.)
 * The conversion routine to BCRS format doesn't work because I'm currently trying to rework it to make it multi-threaded.
 
-# Document
-It can be generated using Doxygen.
 
-# Testing
-We have a complete set of tests for each feature in the test directory. You can find them in the test directory.
+# Test and sample
+## test
+We have a complete set of tests for each feature in the `test/` directory. 
 
 > cd test/
 > 
 > make
 > 
-> make test
+> make test_all
+
+## sample
+We have implemented an example of the CG method, which is available in `sample/`.
+
+# Publications
+- [Hishinuma_t's doctoral thesis](https://hishinuma-t.dev/papers/dr_thesis/)
+- [Implementation of AVX512](https://hishinuma-t.dev/papers/wo_review/jsiam_os2020_doi/) thanks to Mr. [Itsuki Doi](https://github.com/doi-master)
+- [and more](https://hishinuma-t.dev/papers/list/)
diff --git a/src/Makefile.avx512 b/src/Makefile.avx512
@@ -1,7 +1,7 @@
 TARGET=../lib/libdd-avx.so
 LINK=-I../include/ -I./core/ -L../lib/ -lqd
 
-CXXFLAGS+=-O3 -std=c++17 -fopenmp -lm -g -fPIC 
+CXXFLAGS= -O3 -std=c++17 -fopenmp -lm -fPIC 
 CXXFLAGS+= -DUSE_AVX512 -DUSE_FMA -mavx512f -mfma
 LIBFLAGS=-shared
 
@@ -31,4 +31,4 @@ $(TARGET): $(OBJS)
 	g++ $(CXXFLAGS) $(LIBFLAGS) $(LINK) $(OBJS) -o $(TARGET)
 
 $(OBJDIR)/%.o: %.cpp 
-	g++ $(CXXFLAGS) $(LINK) -c $< -o $@
+	g++ $(CXXFLAGS) $(LINK) -I../include -c $< -o $@
diff --git a/src/core/AVX512_core.hpp b/src/core/AVX512_core.hpp
@@ -3,18 +3,18 @@
 #define DD_AVX_CORE_HPP_
 
 #include <immintrin.h>
-#define SIMD_FUNC(NAME) _mm256_##NAME
-using reg = __m256d;
+#define SIMD_FUNC(NAME) _mm512_##NAME
+using reg = __m512d;
 struct dd_real;
 
 namespace ddavx_core{
-	const int SIMD_Length=4;
+	const int SIMD_Length=8;
 	class registers{
 		public:
 			double splitter = 134217729.0;
-			reg sp = SIMD_FUNC(broadcast_sd)(&splitter); 
-			reg minus = SIMD_FUNC(set_pd)(-1.0,-1.0,-1.0,-1.0);
-			reg zeros = SIMD_FUNC(set_pd)(0,0,0,0);
+			reg sp = SIMD_FUNC(set1_pd)(splitter);
+			reg minus = SIMD_FUNC(set_pd)(-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0);
+			reg zeros = SIMD_FUNC(set_pd)(0,0,0,0,0,0,0,0);
 			reg one,bh,bl,ch,cl,sh,sl,wh,wl,th,tl,p1,p2,t0,t1,t2,eh,t3; 
 	};
 
@@ -29,14 +29,14 @@ namespace ddavx_core{
 	}
 
 	//set
-	inline reg set(const double a, const double b, const double c, const double d){
-		reg ret = SIMD_FUNC(set_pd)(a, b, c, d);
+	inline reg set(const double a, const double b, const double c, const double d, const double e, const double f, const double g, const double h){
+		reg ret = SIMD_FUNC(set_pd)(a, b, c, d, e, f, g, h);
 		return ret;
 	}
 
 	//broadcast
 	inline reg broadcast(const double a){
-		reg ret = SIMD_FUNC(broadcast_sd)(&a); 
+		reg ret = SIMD_FUNC(set1_pd)(a); 
 		return ret;
 	}
 
@@ -77,27 +77,35 @@ namespace ddavx_core{
 
 	// need to change for AVX512
 	inline dd_real reduction(reg a_hi, reg a_lo){
-		double hi[4];
-		double lo[4];
+		double hi[8];
+		double lo[8];
 		store(*hi, a_hi); 
 		store(*lo, a_lo); 
 
 		dd_real a; a.x[0] = hi[0]; a.x[1] = lo[0];
 		dd_real b; b.x[0] = hi[1]; b.x[1] = lo[1];
 		dd_real c; c.x[0] = hi[2]; c.x[1] = lo[2];
 		dd_real d; d.x[0] = hi[3]; d.x[1] = lo[3];
+		dd_real e; e.x[0] = hi[4]; e.x[1] = lo[4];
+		dd_real f; f.x[0] = hi[5]; f.x[1] = lo[5];
+		dd_real g; g.x[0] = hi[6]; g.x[1] = lo[6];
+		dd_real h; h.x[0] = hi[7]; h.x[1] = lo[7];
 
-		return a + b + c + d;
+		return a + b + c + d + e + f + g + h;
 	}
 
 	// need to change for AVX512
-	inline void store(double& ret1, double& ret2, double& ret3, double& ret4, const reg val){
-		double tmp[4];
+	inline void store(double& ret1, double& ret2, double& ret3, double& ret4,double& ret5, double& ret6, double& ret7, double& ret8, const reg val){
+		double tmp[8];
 		store(*tmp, val); 
 		ret1 = tmp[0];
 		ret2 = tmp[1];
 		ret3 = tmp[2];
 		ret4 = tmp[3];
+		ret5 = tmp[4];
+		ret6 = tmp[5];
+		ret7 = tmp[6];
+		ret8 = tmp[7];
 	}
 
 	inline void print(reg a){