-
Notifications
You must be signed in to change notification settings - Fork 26
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Integrate Modified AC-SpGEMM / GALATIC #26
base: master
Are you sure you want to change the base?
Changes from 11 commits
b9bd4bc
179666b
5080cc3
4cb56c7
ca71c72
b9e075c
2c7bdb9
99c74ba
04138ae
50fa929
ce4a855
dac2a22
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
[submodule "ext/moderngpu"] | ||
path = ext/moderngpu | ||
url = https://ctcyang@github.com/ctcyang/moderngpu.git | ||
[submodule "ext/cub"] | ||
path = ext/cub | ||
url = https://ctcyang@github.com/NVlabs/cub.git | ||
url = git@github.com:ctcyang/moderngpu.git | ||
[submodule "ext/GALATIC"] | ||
path = ext/GALATIC | ||
url = git@github.com:richardlett/GALATIC.git |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,22 +13,22 @@ set( PROJ_PATH ${CMAKE_SOURCE_DIR}) | |
set( PROJ_OUT_PATH ${CMAKE_BINARY_DIR}) | ||
set( PROJ_HEADERS "" ) | ||
set( PROJ_LIBRARIES "" ) | ||
set( PROJ_INCLUDES "./" "ext/moderngpu/include" "ext/cub/cub") | ||
set( PROJ_INCLUDES "./" "ext/moderngpu/include" "ext") | ||
set( mgpu_SRC_FILES "ext/moderngpu/src/mgpucontext.cu" "ext/moderngpu/src/mgpuutil.cpp") | ||
set( CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/../bin ) | ||
#set( CUDA_CURAND_LIBRARY "$ENV{CUDA_HOME}/lib64/libcurand.so" ) | ||
#set( CUDA_CUBLAS_LIBRARY "$ENV{CUDA_HOME}/lib64/libcublas.so" ) | ||
set( CUDA_CUSPARSE_LIBRARY "$ENV{CUDA_HOME}/lib64/libcusparse.so" ) | ||
#FILE( GLOB_RECURSE PROJ_SOURCES graphblas/*.cu ../graphblas/*.cpp ) | ||
#FILE( G LOB_RECURSE PROJ_SOURCES graphblas/*.cu ../graphblas/*.cpp ) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: extra tab here |
||
#FILE( GLOB_RECURSE PROJ_LIBRARIES ext/cublas1.1/*.cu ) | ||
FILE( GLOB_RECURSE PROJ_HEADERS graphblas/*.hpp) | ||
# nvcc flags | ||
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -arch=sm_35 -lineinfo -O3 -use_fast_math -Xptxas=-v") | ||
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -arch=sm_35 -lineinfo -O3 -use_fast_math -Xptxas=-v --expt-relaxed-constexpr ") | ||
#set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-fpermissive;-arch=sm_35;-lineinfo;-Xptxas=-v;-dlcm=ca;-maxrregcount=64) | ||
#set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_20,code=sm_21) | ||
# needed for cudamalloc | ||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") | ||
set(CMAKE_CXX_FLAGS "-fpermissive -g -m64 -std=c++11" ) | ||
set(CMAKE_CXX_FLAGS "-fpermissive -g -std=c++14" ) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: one space instead of two between |
||
#set(CMAKE_CXX_FLAGS "-fpermissive -pg -m64 -std=c++11" ) | ||
#set(CMAKE_CXX_FLAGS "-fpermissive -g -m64 -std=c++11 -H" ) | ||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,7 +5,7 @@ include common.mk | |
#------------------------------------------------------------------------------- | ||
|
||
# Includes | ||
INC += -I$(MGPU_DIR) -I$(CUB_DIR) -I$(BOOST_DIR) -I$(GRB_DIR) | ||
INC += -I$(MGPU_DIR) -I$(BOOST_DIR) -I$(GRB_DIR) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: one space instead of two between |
||
|
||
#------------------------------------------------------------------------------- | ||
# Dependency Lists | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -43,11 +43,30 @@ Info mxm(Matrix<c>* C, | |
desc)); | ||
} else if (typeid(c) == typeid(float) && typeid(a) == typeid(float) && | ||
typeid(b) == typeid(float)) { | ||
CHECK(cusparse_spgemm2(&C->sparse_, mask, accum, op, &A->sparse_, | ||
&B->sparse_, desc)); | ||
|
||
Desc_value s_mode; | ||
CHECK(desc->get(GrB_MODE, &s_mode)); | ||
|
||
if (s_mode == GrB_CUSPARSE2) | ||
CHECK(cusparse_spgemm2(&C->sparse_, mask, accum, op, &A->sparse_, | ||
&B->sparse_, desc)); | ||
else { | ||
if (s_mode != GrB_GALATIC) { | ||
std::cout << R"(Unknown mode (options are: "cusparse2" and "galatic"); defaulting to galatic)" << std::endl; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: cusparse2 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🤦 wow |
||
} | ||
CHECK(GALATIC_spgemm(&C->sparse_, | ||
op, | ||
&A->sparse_, | ||
&B->sparse_, | ||
desc)); | ||
|
||
} | ||
} else { | ||
std::cout << "Error: Unmasked SpGEMM not implemented yet!\n"; | ||
return GrB_NOT_IMPLEMENTED; | ||
CHECK(GALATIC_spgemm(&C->sparse_, | ||
op, | ||
&A->sparse_, | ||
&B->sparse_, | ||
desc)); | ||
} | ||
} else { | ||
std::cout << "Error: SpMM and GEMM not implemented yet!\n"; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,8 @@ | ||
#ifndef GRAPHBLAS_BACKEND_CUDA_SPGEMM_HPP_ | ||
#define GRAPHBLAS_BACKEND_CUDA_SPGEMM_HPP_ | ||
|
||
#include <GALATIC/GALATICMinimumIncludes.cuh> | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you mind changing this to That way, the PROJ_INCLUDES can be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Whoops, bad inference made me think that was for files that were resolved based on relative path to current file |
||
|
||
#include "graphblas/backend/cuda/sparse_matrix.hpp" | ||
|
||
#include <cuda.h> | ||
|
@@ -108,6 +110,182 @@ Info spgemmMasked(SparseMatrix<c>* C, | |
C->csc_initialized_ = false; | ||
return GrB_SUCCESS; | ||
} | ||
// Shallow copy graphblast sparsematrix -> Galatic dCSR format | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: please add a newline between following template function and above function. |
||
template<typename T> | ||
static void matrixToGalatic(const SparseMatrix<T> *input , dCSR<T>& output) { | ||
output.col_ids = reinterpret_cast<unsigned int*>(input->d_csrColInd_); | ||
output.data = input->d_csrVal_; | ||
output.row_offsets = reinterpret_cast<unsigned int*>(input->d_csrRowPtr_); | ||
output.rows = input->nrows_; | ||
output.cols = input->ncols_; | ||
output.nnz = input->nvals_; | ||
} | ||
|
||
// Shallow copy Galatic dCSR format -> graphblast sparsematrix | ||
template<typename T> | ||
static void galaticToSparse(SparseMatrix<T> *output , const dCSR<T>& input) { | ||
output->d_csrColInd_ = reinterpret_cast<Index*>(input.col_ids); | ||
output->d_csrVal_ = input.data; | ||
output->d_csrRowPtr_ = reinterpret_cast<Index*>(input.row_offsets); | ||
output->nvals_ = input.nnz; | ||
output->ncapacity_ = input.nnz; | ||
} | ||
|
||
// Nullize pointers in Galatic's sparse matrices; | ||
// Galatic's destructors check for null. Doing this will prevent double | ||
// freeing when shallow copying with matrixToGalatic & galaticToSparse | ||
template<typename T> | ||
static void nullizeGalaticMatrix(dCSR<T>& m) { | ||
m.data = nullptr; | ||
m.col_ids = nullptr; | ||
m.row_offsets = nullptr; | ||
} | ||
|
||
// A generic shim between graphblast's and GALATIC's semiring interfaces | ||
template<typename NativeSR, typename a, typename b, typename c> | ||
struct GalaticSemiring : SemiRing<a, b, c> { | ||
NativeSR nativeSemiring; | ||
|
||
__device__ c multiply(const a& left, const b& right) const | ||
{ return nativeSemiring.mul_op(left, right); } | ||
__device__ c add(const c& left,const c& right) const | ||
{ return nativeSemiring.add_op(left, right); } | ||
__device__ static c AdditiveIdentity() | ||
{ return NativeSR::identity(); } | ||
}; | ||
|
||
template <typename c, typename a, typename b, typename SemiringT> | ||
Info GALATIC_spgemm(SparseMatrix<c>* C, | ||
SemiringT op, | ||
const SparseMatrix<a>* A, | ||
const SparseMatrix<b>* B, | ||
Descriptor* desc) { | ||
|
||
Index A_nrows, A_ncols, A_nvals; | ||
Index B_nrows, B_ncols, B_nvals; | ||
Index C_nrows, C_ncols, C_nvals; | ||
|
||
A_nrows = A->nrows_; | ||
A_ncols = A->ncols_; | ||
A_nvals = A->nvals_; | ||
B_nrows = B->nrows_; | ||
B_ncols = B->ncols_; | ||
B_nvals = B->nvals_; | ||
C_nrows = C->nrows_; | ||
C_ncols = C->ncols_; | ||
|
||
// Dimension compatibility check | ||
if ((A_ncols != B_nrows) || (C_ncols != B_ncols) || (C_nrows != A_nrows)) { | ||
std::cout << "Dim mismatch mxm" << std::endl; | ||
std::cout << A_ncols << " " << B_nrows << std::endl; | ||
std::cout << C_ncols << " " << B_ncols << std::endl; | ||
std::cout << C_nrows << " " << A_nrows << std::endl; | ||
return GrB_DIMENSION_MISMATCH; | ||
} | ||
|
||
if (C->d_csrColInd_ != NULL) { | ||
CUDA_CALL(cudaFree(C->d_csrColInd_)); | ||
CUDA_CALL(cudaFree(C->d_csrVal_)); | ||
C->d_csrColInd_ = NULL; | ||
C->d_csrVal_ = NULL; | ||
} | ||
|
||
if (C->d_csrRowPtr_ != NULL) { | ||
CUDA_CALL(cudaFree(C->d_csrRowPtr_)); | ||
C->d_csrRowPtr_ = NULL; | ||
} | ||
|
||
if (C->h_csrColInd_ != NULL) { | ||
free(C->h_csrColInd_); | ||
free(C->h_csrVal_); | ||
C->h_csrColInd_ = NULL; | ||
C->h_csrVal_ = NULL; | ||
} | ||
|
||
dCSR<c> outMatrixGPU; | ||
dCSR<a> leftInputMatrixGPU; | ||
dCSR<b> rightInputMatrixGPU; | ||
|
||
//shallow copy input matrices to galatic format | ||
matrixToGalatic(A, leftInputMatrixGPU); | ||
matrixToGalatic(B, rightInputMatrixGPU); | ||
|
||
GPUMatrixMatrixMultiplyTraits DefaultTraits; | ||
|
||
// GALATIC has its own semiring interface; | ||
// GalaticSemiring is a shim here for conversion of graphblast-style | ||
// SemiringT type. GalaticSemiring definition is above this function | ||
GalaticSemiring<SemiringT, a, b, c> semiring_shim; | ||
semiring_shim.nativeSemiring = op; | ||
|
||
ExecutionStats stats; | ||
try { | ||
Desc_value nt_mode; | ||
CHECK(desc->get(GrB_NT, &nt_mode)); | ||
const int num_threads = static_cast<int>(nt_mode); | ||
|
||
switch (num_threads) { | ||
case 64: | ||
ACSpGEMM::MultiplyImplementation<GalaticSemiring<SemiringT, a, b, c>, | ||
64, 4, 2, 8, 4, 16, 512, 8, 0, a, b, c, | ||
GalaticSemiring<SemiringT, a, b, c>> | ||
(leftInputMatrixGPU, rightInputMatrixGPU, | ||
outMatrixGPU, DefaultTraits, stats, semiring_shim); | ||
break; | ||
case 128: | ||
ACSpGEMM::MultiplyImplementation<GalaticSemiring<SemiringT, a, b, c>, | ||
128, 4, 2, 4, 4, 16, 512, 8, 0, a, b, c, | ||
GalaticSemiring<SemiringT, a, b, c>> | ||
( leftInputMatrixGPU, rightInputMatrixGPU, | ||
outMatrixGPU, DefaultTraits, stats, semiring_shim); | ||
break; | ||
case 512: | ||
ACSpGEMM::MultiplyImplementation<GalaticSemiring<SemiringT, a, b, c>, | ||
512, 1, 1, 1, 2, 16, 512, 8, 0, a, b, c, | ||
GalaticSemiring<SemiringT, a, b, c>> | ||
(leftInputMatrixGPU, rightInputMatrixGPU, | ||
outMatrixGPU, DefaultTraits, stats, semiring_shim); | ||
break; | ||
default: // 256 | ||
ACSpGEMM::MultiplyImplementation<GalaticSemiring<SemiringT, a, b, c>, | ||
256, 4, 2, 4, 4, 16, 512, 8, 0, a, b, c, | ||
GalaticSemiring<SemiringT, a, b, c>> | ||
(leftInputMatrixGPU, rightInputMatrixGPU, | ||
outMatrixGPU, DefaultTraits, stats, semiring_shim); | ||
break; | ||
} | ||
} catch(std::exception& e) { | ||
std::cerr | ||
<< "Exception occurred in GALATIC SpGEMM, called from GALATIC_spgemm\n" | ||
<< "Exception:\n" | ||
<< e.what() | ||
<< std::endl; | ||
return GrB_OUT_OF_MEMORY; //the most likely issue, fixme | ||
} | ||
|
||
// shallow copy to native format. | ||
galaticToSparse(C , outMatrixGPU); | ||
|
||
// prevent allocations being freed twice when destructors are run, | ||
// as we are doing shallow copies: | ||
// | ||
// A, B -> leftInputMatrixGPU, rightInputMatrixGPU | ||
// outMatrixGPU -> C. | ||
nullizeGalaticMatrix(outMatrixGPU); | ||
nullizeGalaticMatrix(leftInputMatrixGPU); | ||
nullizeGalaticMatrix(rightInputMatrixGPU); | ||
|
||
if (C->h_csrRowPtr_ == NULL) | ||
C->h_csrRowPtr_ = reinterpret_cast<Index*>(malloc((A_nrows+1)* | ||
sizeof(Index))); | ||
C->h_csrColInd_ = reinterpret_cast<Index*>(malloc(C->ncapacity_*sizeof(Index))); | ||
C->h_csrVal_ = reinterpret_cast<c*>(malloc(C->ncapacity_*sizeof(c))); | ||
|
||
C->need_update_ = true; // Set flag that we need to copy data from GPU | ||
C->csr_initialized_ = true; | ||
C->csc_initialized_ = false; | ||
return GrB_SUCCESS; | ||
} | ||
|
||
template <typename c, typename a, typename b, typename m, | ||
typename BinaryOpT, typename SemiringT> | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: Kind of a personal preference thing, but could you change these to use HTTPS rather than SSH? I usually don't have an SSH key installed, so I typically use HTTPS.
I've tried it with HTTPS: for people with access to the private GALATIC repo, as long as they enter my GitHub login and password, it lets them pull in the submodule.