Merge pull request #546 from DrTimothyAldenDavis/dev2

GraphBLAS: CUDA fixes, demo output
DrTimothyAldenDavis · Nov 29, 2023 · 2895129 · 2895129
2 parents b0c0d61 + ea2d215
commit 2895129
Show file tree

Hide file tree

Showing 29 changed files with 5,890 additions and 3,263 deletions.
diff --git a/GraphBLAS/CMakeLists.txt b/GraphBLAS/CMakeLists.txt
@@ -71,7 +71,7 @@ if ( SUITESPARSE_CUDA )
     # with CUDA and RMM
     add_subdirectory ( CUDA )
     set ( GB_CUDA GraphBLAS_CUDA  ${CUDA_LIBRARIES} )
-    set ( GB_RMM rmm_wrap ${CUDA_LIBRARIES} )
+    set ( GB_RMM RMM_wrap ${CUDA_LIBRARIES} )
     add_subdirectory ( rmm_wrap )
     include_directories ( "rmm_wrap" ${CUDA_INCLUDE_DIRS}
         "/usr/local/cuda/include/cub" )
@@ -259,7 +259,7 @@ if ( BUILD_SHARED_LIBS )
 
     if ( SUITESPARSE_CUDA )
         add_dependencies ( GraphBLAS GraphBLAS_CUDA )
-        # add_dependencies ( GraphBLAS rmm_wrap )
+        add_dependencies ( GraphBLAS RMM_wrap )
         target_compile_definitions ( GraphBLAS PRIVATE "SUITESPARSE_CUDA" )
     endif ( )
 
@@ -300,7 +300,7 @@ if ( BUILD_STATIC_LIBS )
     if ( SUITESPARSE_CUDA )
         add_dependencies ( GraphBLAS_static GraphBLAS_CUDA )
         set ( GRAPHBLAS_STATIC_MODULES "${GRAPHBLAS_STATIC_MODULES} GraphBLAS_CUDA" )
-        # add_dependencies ( GraphBLAS_static rmm_wrap )
+        add_dependencies ( GraphBLAS_static RMM_wrap )
         target_compile_definitions ( GraphBLAS_static PRIVATE "SUITESPARSE_CUDA" )
     endif ( )
 

diff --git a/GraphBLAS/CUDA/CMakeLists.txt b/GraphBLAS/CUDA/CMakeLists.txt
@@ -29,6 +29,7 @@ set ( CMAKE_CUDA_FLAGS "-cudart=static -lineinfo " )
 set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --std=c++17 -fPIC " )
 
 add_compile_definitions ( GBNCPUFEAT )
+add_compile_definitions ( GBCUDA_CPLUSPLUS )
 
 message ( STATUS "C++ flags for CUDA: ${CMAKE_CXX_FLAGS}" )
 
@@ -54,6 +55,7 @@ set ( GRAPHBLAS_CUDA_INCLUDES
         ../Source/Shared
         ../Source/Template
         ../Source/Factories
+        Template
         ../Include
         ../CUDA )
 
@@ -149,6 +151,8 @@ endif ( )
 # test suite for the CUDA kernels
 #-------------------------------------------------------------------------------
 
+if ( 0 )
+
 # 1. Execute enumify/stringify/jitify logic to compile ptx kernels and
 # compile/link w/ relevant *.cu files.
 
@@ -266,3 +270,4 @@ target_include_directories ( graphblascuda_test
     ${CUDAToolkit_INCLUDE_DIRS}
     ${GRAPHBLAS_CUDA_INCLUDES} )
 
+endif ( )
diff --git a/GraphBLAS/CUDA/GB_AxB_dot3_cuda_branch.cpp b/GraphBLAS/CUDA/GB_AxB_dot3_cuda_branch.cpp
@@ -9,6 +9,7 @@
 
 // Decide branch direction for GPU use for the dot-product MxM
 
+#include "GraphBLAS.h"
 extern "C" 
 {
   #include "GB_mxm.h"

diff --git a/GraphBLAS/CUDA/GB_cuda.h b/GraphBLAS/CUDA/GB_cuda.h
@@ -18,10 +18,11 @@ extern "C"
     #include "GB_compiler.h"
     #include "GB_cpu_features.h"
     #include "GB_warnings.h"
-    #define GB_LIBRARY
-    #include "GraphBLAS.h"
 }
 
+#define GB_LIBRARY
+#include "GraphBLAS.h"
+
 extern "C"
 {
     #include <cassert>

diff --git a/GraphBLAS/CUDA/GB_cuda_init.c b/GraphBLAS/CUDA/GB_cuda_init.c
@@ -1,8 +1,8 @@
 //------------------------------------------------------------------------------
-// GB_cuda_init: initialize the GPUs for use by GraphBLAS
+// GraphBLAS/CUDA/GB_cuda_init: initialize the GPUs for use by GraphBLAS
 //------------------------------------------------------------------------------
 
-// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2022, All Rights Reserved.
+// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.
 // SPDX-License-Identifier: Apache-2.0
 
 //------------------------------------------------------------------------------
@@ -12,6 +12,7 @@
 // assumed.  Then each GPU is "warmed up" by allocating a small amount of
 // memory.
 
+#undef GBCUDA_CPLUSPLUS
 #include "GB.h"
 
 GrB_Info GB_cuda_init (void)

diff --git a/GraphBLAS/CUDA/GB_cuda_type_bits.c b/GraphBLAS/CUDA/GB_cuda_type_bits.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 
+#if 0
 #include "GB.h"
 
 size_t GB_cuda_type_bits (GB_Type_code);
@@ -25,3 +26,4 @@ size_t GB_cuda_type_bits (GB_Type_code type_code)
     }
 }
 
+#endif
diff --git a/GraphBLAS/CUDA/GB_cuda_type_branch.cpp b/GraphBLAS/CUDA/GB_cuda_type_branch.cpp
@@ -14,6 +14,7 @@
 
 // All built-in types pass this rule.
 
+#include "GraphBLAS.h"
 extern "C" 
 {
     #include "GB.h"

diff --git a/GraphBLAS/CUDA/GB_reduce_to_scalar_cuda.cpp b/GraphBLAS/CUDA/GB_reduce_to_scalar_cuda.cpp
@@ -14,6 +14,7 @@
 // threadblock.  Then GB_reduce_to_scalar on the CPU sees this V as the result,
 // and calls itself recursively to continue the reduction.
 
+#include "GraphBLAS.h"
 extern "C"
 {
     #include "GB_reduce.h"

diff --git a/GraphBLAS/Config/GraphBLAS.h.in b/GraphBLAS/Config/GraphBLAS.h.in
@@ -19,7 +19,7 @@
 // This GraphBLAS.h file contains GraphBLAS definitions for user applications
 // to #include.  A few functions and variables with the prefix GB_ need to be
 // defined in this file and are thus technically visible to the user, but they
-// must not be accessed in user code.  They are here only so that the ANSI C11
+// must not be accessed in user code.  They are here only so that the C11
 // _Generic feature can be used in the user-accessible polymorphic functions,
 // or to implement a fast GxB_Iterator using macros.
 
@@ -110,24 +110,32 @@
     #define GB_GLOBAL extern
 #endif
 
-// GraphBLAS requires an ANSI C11 compiler for its polymorphic functions (using
+// GraphBLAS requires a C11 compiler for its polymorphic functions (using
 // the _Generic keyword), but it can be used in an C90 compiler if those
 // functions are disabled.
 
-// With ANSI C11 and later, _Generic keyword and polymorphic functions can be
+// With C11 and later, the _Generic keyword and polymorphic functions can be
 // used.  Earlier versions of the language do not have this feature.
 
 #ifdef __STDC_VERSION__
-// ANSI C17: 201710L
-// ANSI C11: 201112L
-// ANSI C99: 199901L
-// ANSI C95: 199409L
+// C17: 201710L
+// C11: 201112L
+// C99: 199901L
+// C95: 199409L
 #define GxB_STDC_VERSION __STDC_VERSION__
 #else
-// assume ANSI C90 / C89
+// assume C90 / C89
 #define GxB_STDC_VERSION 199001L
 #endif
 
+//------------------------------------------------------------------------------
+// CUDA (currently experimental, not for production use)
+//------------------------------------------------------------------------------
+
+#ifndef SUITESPARSE_CUDA
+#cmakedefine SUITESPARSE_CUDA
+#endif
+
 //------------------------------------------------------------------------------
 // definitions for complex types
 //------------------------------------------------------------------------------
@@ -138,7 +146,19 @@
 #ifndef GXB_COMPLEX_H
 #define GXB_COMPLEX_H
 
-    #if defined (_MSC_VER) && !(defined (__INTEL_COMPILER) || defined(__INTEL_CLANG_COMPILER))
+    #if defined ( GBCUDA_CPLUSPLUS )
+
+        // C++ complex types for CUDA
+        #include <cmath>
+        #include <complex>
+        #undef I
+        typedef std::complex<float>  GxB_FC32_t ;
+        typedef std::complex<double> GxB_FC64_t ;
+        #define GxB_CMPLXF(r,i) GxB_FC32_t(r,i)
+        #define GxB_CMPLX(r,i)  GxB_FC64_t(r,i)
+        #define GB_HAS_CMPLX_MACROS 1
+
+    #elif defined (_MSC_VER) && !(defined (__INTEL_COMPILER) || defined(__INTEL_CLANG_COMPILER))
 
         // Microsoft Windows complex types for C
         #include <complex.h>
@@ -151,13 +171,13 @@
 
     #else
 
-        // ANSI C11 complex types
+        // C11 complex types
         #include <complex.h>
         #undef I
         typedef float  _Complex GxB_FC32_t ;
         typedef double _Complex GxB_FC64_t ;
         #if (defined (CMPLX) && defined (CMPLXF))
-            // use the ANSI C11 CMPLX and CMPLXF macros
+            // use the C11 CMPLX and CMPLXF macros
             #define GxB_CMPLX(r,i) CMPLX (r,i)
             #define GxB_CMPLXF(r,i) CMPLXF (r,i)
             #define GB_HAS_CMPLX_MACROS 1
@@ -185,10 +205,10 @@
     // NVIDIA nvcc
     #define GB_restrict __restrict__
 #elif GxB_STDC_VERSION >= 199901L
-    // ANSI C99 or later
+    // C99 or later
     #define GB_restrict restrict
 #else
-    // ANSI C95 and earlier: no restrict keyword
+    // C95 and earlier: no restrict keyword
     #define GB_restrict
 #endif
 
@@ -885,7 +905,7 @@ GB_GLOBAL GrB_UnaryOp
 // Unary operators for floating-point types only
 //------------------------------------------------------------------------------
 
-// The following floating-point unary operators and their ANSI C11 equivalents,
+// The following floating-point unary operators and their C11 equivalents,
 // are only defined for floating-point (real and complex) types.
 
 GB_GLOBAL GrB_UnaryOp
@@ -949,7 +969,7 @@ GB_GLOBAL GrB_UnaryOp
     GxB_CBRT_FP64,
 
     // frexpx and frexpe return the mantissa and exponent, respectively,
-    // from the ANSI C11 frexp function.  The exponent is returned as a
+    // from the C11 frexp function.  The exponent is returned as a
     // floating-point value, not an integer.
 
     // z = frexpx (x)   z = frexpe (x)
@@ -10737,7 +10757,7 @@ GrB_Info GrB_Matrix_exportHint  // suggest the best export format
 
 // GrB_Matrix_serialize/deserialize are slightly different from their GxB*
 // counterparts.  The blob is allocated by GxB_Matrix_serialize, and must be
-// freed by the same free() method passed to GxB_init (or the ANSI C11 free()
+// freed by the same free() method passed to GxB_init (or the C11 free()
 // if GrB_init was used).  By contrast, the GrB* methods require the user
 // application to pass in a preallocated blob to GrB_Matrix_serialize, whose
 // size can be given by GrB_Matrix_serializeSize (as a loose upper bound).

diff --git a/GraphBLAS/Config/README.md.in b/GraphBLAS/Config/README.md.in
@@ -36,8 +36,8 @@ QUICK START: To compile and install, do these commands in this directory:
     make
     sudo make install
 
-Please be patient; some files can take several minutes to compile.  Requires an
-ANSI C11 compiler, so cmake will fail if your compiler is not C11 compliant.
+Please be patient; some files can take several minutes to compile.  Requires a
+C11 compiler, so cmake will fail if your compiler is not C11 compliant.
 See the User Guide PDF in Doc/ for directions on how to use another compiler.
 
 For faster compilation, do this instead of just "make", which uses 32