diff --git a/MANIFEST.in b/MANIFEST.in index bbb89e4..1108773 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,6 @@ include AUTHORS CHANGES INSTALL LICENSE README.rst tox.ini recursive-include scikits *.h +recursive-include scikits *.json recursive-include demos *.py recursive-include tests *.py recursive-include docs * diff --git a/scikits/cuda/_cusparse_cffi.py b/scikits/cuda/_cusparse_cffi.py new file mode 100644 index 0000000..afd5117 --- /dev/null +++ b/scikits/cuda/_cusparse_cffi.py @@ -0,0 +1,101 @@ +""" +Autogenerate Python interface to cuSPARSE functions. + +""" +from __future__ import absolute_import, print_function + +import os +import re +import numpy as np + +from os.path import join as pjoin + +from ._cusparse_cffi_autogen import (generate_cffi_cdef, + ffi_init_cusparse, + generate_func_descriptions_json, + generate_cusparse_python_wrappers) + +base_dir = os.path.dirname(__file__) +cffi_file = pjoin(base_dir, '_cusparse.cffi') +python_wrapper_file = pjoin(base_dir, '_cusparse_python.py') +variable_defs_json = pjoin(base_dir, 'cusparse_variable_descriptions.json') +func_defs_json = pjoin(base_dir, 'cusparse_func_descriptions.json') + +if not os.path.exists(cffi_file): + print("first import of cusparse: cffi interface file being created." + "This may take several seconds") + cffi_cdef = generate_cffi_cdef(cffi_out_file=cffi_file) +else: + with open(cffi_file, 'r') as f: + cffi_cdef = f.read() + +ffi, ffi_lib = ffi_init_cusparse(cffi_cdef) + +if not os.path.exists(python_wrapper_file): + print("first import of cusparse: python wrappers being created.") + if not os.path.exists(func_defs_json): + generate_func_descriptions_json(ffi_lib, json_file=func_defs_json) + generate_cusparse_python_wrappers(cffi_cdef, + variable_defs_json=variable_defs_json, + func_defs_json=func_defs_json, + python_wrapper_file=python_wrapper_file) + + +class CUSPARSE_ERROR(Exception): + """CUSPARSE error""" + pass + +# Use CUSPARSE_STATUS* definitions to dynamically create corresponding +# exception classes and populate dictionary used to raise appropriate +# exception in response to the corresponding CUSPARSE error code: +CUSPARSE_STATUS_SUCCESS = ffi_lib.CUSPARSE_STATUS_SUCCESS +CUSPARSE_EXCEPTIONS = {-1: CUSPARSE_ERROR} +for k, v in ffi_lib.__dict__.items(): + # Skip CUSPARSE_STATUS_SUCCESS: + if re.match('CUSPARSE_STATUS.*', k) and v != CUSPARSE_STATUS_SUCCESS: + CUSPARSE_EXCEPTIONS[v] = vars()[k] = type(k, (CUSPARSE_ERROR,), {}) + + +# Import various other enum values into module namespace: +regex = 'CUSPARSE_(?!STATUS).*' +for k, v in ffi_lib.__dict__.items(): + if re.match(regex, k): + # print("k={}, v={}".format(k,v)) + vars()[k] = v + + +def cusparseCheckStatus(status): + """ + Raise CUSPARSE exception + + Raise an exception corresponding to the specified CUSPARSE error + code. + + Parameters + ---------- + status : int + CUSPARSE error code. 
+ + See Also + -------- + CUSPARSE_EXCEPTIONS + """ + + if status != 0: + try: + raise CUSPARSE_EXCEPTIONS[status] + except KeyError: + raise CUSPARSE_ERROR + +# execute the python wrapper code +with open(python_wrapper_file) as f: + code = compile(f.read(), python_wrapper_file, 'exec') + exec(code) + + +__all__ = [k for k, v in ffi_lib.__dict__.items()] +__all__.append('CUSPARSE_ERROR') +__all__.append('CUSPARSE_EXCEPTIONS') +__all__.append('cusparseCheckStatus') +__all__.append('ffi') +__all__.append('ffi_lib') diff --git a/scikits/cuda/_cusparse_cffi_autogen.py b/scikits/cuda/_cusparse_cffi_autogen.py new file mode 100644 index 0000000..1a1d210 --- /dev/null +++ b/scikits/cuda/_cusparse_cffi_autogen.py @@ -0,0 +1,714 @@ +""" +Support functions for autogenerating Python interface to the cuSPARSE library. + +""" + +""" +Developed on linux for CUDA v6.5, but should support any version >3.1 where +cusparse.h or cusparse_v2.h can be found in CUDA_ROOT/include. + +Set the environment variable CUDA_ROOT to the base of your CUDA installation + +Note from the NVIDIA CUSPARSE release notes: + +"The csr2csc() and bsr2bsc() routines contain a bug in the CUDA 6.0 and 6.5 +releases. As a consequence, csrsv(), csrsv2(), csrsm(), bsrsv2(), bsrsm2(), +and csrgemm() may produce incorrect results when working with transpose +(CUSPARSE_OPERATION_TRANSPOSE) or conjugate-transpose +(CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE) operations. These routines work +correctly when the non-transpose (CUSPARSE_OPERATION_NON_TRANSPOSE) operation +is selected. The bug has been fixed in the CUDA 7.0 release." +""" + +import os +import re +import json +import numpy as np + +import cffi + +# TODO: improve autodetection of cuda include folder +CUDA_ROOT = os.environ.get('CUDA_ROOT', None) or '/usr/local/cuda' +cuda_include_path = os.path.join(CUDA_ROOT, 'include') + +# on versions where cusparse_v2.h exists, use it +cusparse_header = os.path.join(cuda_include_path, 'cusparse_v2.h') +if not os.path.exists(cusparse_header): + # on old versions there was only cusparse.h + cusparse_header = os.path.join(cuda_include_path, 'cusparse.h') + if not os.path.exists(cusparse_header): + raise IOError("cusparse header file not found in expected " + "location. Try defining CUDA_ROOT") + + +base_dir = os.path.dirname(__file__) + + +def ffi_init_cusparse(cffi_cdef, cusparse_header=cusparse_header): + ffi = cffi.FFI() + ffi.cdef(cffi_cdef) + + # Get the address in a cdata pointer: + __verify_scr = """ + #include <{}> + #include + """.format(os.path.basename(cusparse_header)) + + ffi_lib = ffi.verify(__verify_scr, libraries=['cusparse'], + include_dirs=['/usr/local/cuda/include'], + library_dirs=['/usr/local/cuda/lib64/']) + return ffi, ffi_lib + + +def generate_cffi_cdef( + cuda_include_path=cuda_include_path, cusparse_header=cusparse_header, + cffi_out_file=None): + """ generate the CUSPARSE FFI definition + + Parameters + ---------- + cuda_include_path : str + CUDA include path + cffi_out_file : str, optional + if provided, write the definition out to a file + + Returns + ------- + cffi_cdef : str + function definitions for use with cffi. e.g. 
input to FFI.verify() + + """ + + with open(cusparse_header, 'r') as f: + cusparse_hdr = f.readlines() + + # in some versions cusparse_v2.h just points to cusparse.h, so read it + # instead + for line in cusparse_hdr: + # if v2 header includes cusparse.h, read that one instead + if line.startswith('#include "cusparse.h"'): + cusparse_header = os.path.join(cuda_include_path, 'cusparse.h') + with open(cusparse_header, 'r') as f: + cusparse_hdr = f.readlines() + + # skip lines leading up to first typedef + for idx, line in enumerate(cusparse_hdr): + if line.startswith('typedef'): + start_line = idx + break + + # skip closing #if defined logic + for idx, line in enumerate(cusparse_hdr[start_line:]): + if line.startswith('#if defined(__cplusplus)') or \ + 'Define the following symbols for the new API' in line: + # second match is to avoid CFFI compilation error due to the final + # define statements in v4.1 through v5.5 + end_line = start_line + idx + break + + # define other data types needed by FFI + # ... will be filled in from cuComplex.h by the C compiler + cffi_cdef = """ + typedef struct CUstream_st *cudaStream_t; + + typedef struct float2 { + ...; + } float2; + typedef float2 cuFloatComplex; + typedef float2 cuComplex; + + typedef struct double2 { + ...; + } double2; + typedef double2 cuDoubleComplex; + + typedef float cufftReal; + typedef double cufftDoubleReal; + + typedef cuComplex cufftComplex; + typedef cuDoubleComplex cufftDoubleComplex; + + /* definitions from cusparse header below this point */ + """ + + cffi_cdef += ''.join(cusparse_hdr[start_line:end_line]) + + """ + don't use the _v2 versions of the function names defined in CUDA v4.1 + through v5.5 + """ + cffi_cdef = cffi_cdef.replace('_v2(', '(') + + if os.name == 'nt': # Win + cffi_cdef = cffi_cdef.replace('CUSPARSEAPI', '__stdcall') + else: # posix, etc + cffi_cdef = cffi_cdef.replace('CUSPARSEAPI', '') + + if cffi_out_file is not None: + # create specified output directory if it doesn't already exist + out_dir = os.path.dirname(cffi_out_file) + if out_dir and not os.path.exists(out_dir): + os.makedirs(out_dir) + with open(cffi_out_file, 'w') as f: + f.write(cffi_cdef) + + return cffi_cdef + +# sourcefilename = ffi.verifier.sourcefilename +# modulefilename = ffi.verifier.modulefilename + + +def reindent(s, numSpaces=4, lstrip=True): + """add indentation to a multiline string. + + Parameters + ---------- + s : str + string to reformat + numSpaces : int, optional + number of spaces to indent each line by + lstrip : bool, optional + if True, lstrip() prior to adding numSpaces + + Returns + ------- + s : str + reformatted str + """ + s = s.split('\n') + if lstrip: + s = [line.lstrip() for line in s] + + for idx, line in enumerate(s): + if line.strip() == '': + # don't indent empty lines + s[idx] = '' + else: + s[idx] = (numSpaces * ' ') + line + s = '\n'.join(s) + return s + + +def _find_breakpoint(line, break_pattern=', ', nmax=80): + """ determine where to break the line """ + locs = [m.start() for m in re.finditer(break_pattern, line)] + if len(locs) > 0: + break_loc = locs[np.where( + np.asarray(locs) < (nmax - len(break_pattern)))[0][-1]] + break_loc += len(break_pattern) + else: + break_loc = None + return locs, break_loc + +
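# Editorial sketch (not part of the autogenerated wrappers): how the formatting
# helpers defined here are typically combined when emitting wrapper source.
# _split_line is defined immediately below; cusparseDcsrmv is only an example name.
#
#     sig = "def cusparseDcsrmv(handle, transA, m, n, nnz, alpha, descrA, csrValA, csrRowPtrA, csrColIndA, x, beta, y):\n"
#     sig = _split_line(sig, break_pattern=', ', nmax=79)
#     # -> broken at the last ', ' before column 79; continuation lines are
#     #    padded out to the column following the first '('
#     body = "status = ffi_lib.cusparseDcsrmv(...)\ncusparseCheckStatus(status)\n"
#     body = reindent(body, numSpaces=4, lstrip=False)
#     # -> every non-empty line gains four leading spaces so it sits inside the
#     #    generated function body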
+def _split_line(line, break_pattern=', ', nmax=80, pad_char='('): + """ split a line (repeatedly) until length < nmax chars. + + the split will occur at the last occurrence of break_pattern occurring before nmax + characters + + subsequent lines will be indented until the first occurrence of pad_char + in the initial line + + Parameters + ---------- + line : str + line to reformat + break_pattern : str, optional + break line only where this pattern occurs + nmax : int, optional + max number of characters to allow + pad_char : str, optional + auto-indent subsequent lines up to the first occurrence of pad_char + + Returns + ------- + new_line : str + reformatted line + """ + if len(line) < nmax: + return line.rstrip() + '\n' + locs, break_loc = _find_breakpoint(line, + break_pattern=break_pattern, + nmax=nmax) + if break_loc is None: + return line.rstrip() + '\n' + if pad_char is not None: + npad = line.find(pad_char) + 1 + else: + npad = 0 + + lines = [] + lines.append(line[:break_loc].rstrip()) + line = ' ' * npad + line[break_loc:] + while (len(line) > nmax - 1) and (break_loc is not None): + locs, break_loc = _find_breakpoint(line, + break_pattern=break_pattern, + nmax=nmax) + lines.append(line[:break_loc].rstrip()) + line = ' ' * npad + line[break_loc:] + lines.append(line.rstrip()) + return '\n'.join(lines) + '\n' + + +def _build_func_sig(func_name, arg_dict, return_type): + """ generate the python wrapper function signature line(s). """ + + if 'Create' in func_name: + # don't pass in any argument to creation functions + return "def %s():\n" % func_name + + if ('Get' in func_name) and (return_type == 'cusparseStatus_t') and \ + len(arg_dict) == 2: + basic_getter = True + else: + basic_getter = False + + sig = "def %s(" % func_name + for k, v in arg_dict.items(): + is_ptr = '*' in v + if is_ptr and basic_getter: + continue + sig += k + ", " + sig = sig[:-2] + "):\n" + # wrap to 2nd line if too long + return _split_line(sig, break_pattern=', ', nmax=79) + + +def _build_doc_str(arg_dict, func_description='', variable_descriptions={}): + """ generate python wrapper docstring """ + docstr = '"""' + func_description + '\n' + docstr += 'Parameters\n----------\n' + for k, v in arg_dict.items(): + docstr += k + " : " + v + "\n" + if k in variable_descriptions: + docstr += reindent(variable_descriptions[k], + numSpaces=4, + lstrip=True) + else: + print("no variable description provided for {}".format(k)) + docstr += '"""\n' + return reindent(docstr, numSpaces=4, lstrip=False) + + +def _build_body(func_name, arg_dict, return_type): + """ generate python_wrapper function body + + Note: this code is highly specific to the particulars of the cuSPARSE + library + + """ + body = "" + arg_list = "" + + # the following are pointers to scalar outputs + # Note: pBufferSize was renamed pBufferSizeInBytes in v6.5 + scalar_ptr_outputs = ['nnzTotalDevHostPtr', + 'pBufferSize', + 'pBufferSizeInBytes', + 'resultDevHostPtr'] + + is_creator = 'cusparseCreate' in func_name + is_getter = 'cusparseGet' in func_name + + if return_type == 'cusparseStatus_t' and not (is_creator or is_getter): + is_return = False + else: + is_return = True + + # else: + return_str = '' + for k, v in arg_dict.items(): + + """ + set some flags based on the name/type of the argument + will use these flags to determine whether and how to call ffi.new or + ffi.cast on each variable + """ + is_ptr = '*' in v + is_cusparse_type = '_t' in v + is_cusparse_ptr = is_ptr and is_cusparse_type + is_output_scalar = k in scalar_ptr_outputs + if k in ['alpha', 'beta']: + is_scalar = True + else: + is_scalar = False + if is_getter: + is_gpu_array = False + else: + is_gpu_array = is_ptr 
and (not is_cusparse_ptr) and (not is_scalar) + if 'Complex' in v: + is_complex = True + else: + is_complex = False + + # convert variable to appropriate type for the FFI + if is_output_scalar: + # for scalar outputs make a new pointer + body += "%s = ffi.cast('%s', %s)\n" % (k, v, k) + elif is_getter and is_ptr and (return_type == 'cusparseStatus_t'): + # any pointers in cusparseGet* are new outputs to be created + body += "%s = ffi.new('%s')\n" % (k, v) + elif is_gpu_array: + # pass pointer to GPU array data (use either .ptr or .gpudata) + body += "%s = ffi.cast('%s', %s.ptr)\n" % (k, v, k) + elif is_cusparse_ptr: + if is_creator: + # generate custom cusparse type + body += "%s = ffi.new('%s')\n" % (k, v) + else: + # cast to the custom cusparse type + body += "%s = ffi.cast('%s', %s)\n" % (k, v, k) + elif is_ptr and is_scalar: + # create new pointer, with value initialized to scalar + if is_complex: + # complex case is a bit tricky. requires ffi.buffer + body += "%sffi = ffi.new('%s')\n" % (k, v) + if 'cusparseC' in func_name: + body += "ffi.buffer(%sffi)[:] = \ + np.complex64(%s).tostring()\n" % (k, k) + elif 'cusparseZ' in func_name: + body += "ffi.buffer(%sffi)[:] = \ + np.complex128(%s).tostring()\n" % (k, k) + else: + body += "%s = ffi.new('%s', %s)\n" % (k, v, k) + elif is_ptr or v == 'cudaStream_t': + # case non-scalar pointer to appropriate type + body += "%s = ffi.cast('%s', %s)\n" % (k, v, k) + else: + # don't need explicit cast for plain int, float, etc + pass + + # build the list of arguments to pass to the API + if is_ptr and is_scalar and is_complex: + # take into account modified argument name for complex scalars + arg_list += "%sffi, " % k + else: + arg_list += "%s, " % k + + # add the function call and optionally return the result + last_key = k + arg_list = arg_list[:-2] # remove trailing ", " + if is_getter and return_type != 'cusparseStatus_t': + body += "return ffi_lib.%s(%s)\n" % (func_name, arg_list) + else: + # check cusparseStatus_t state before returning + call_str = "status = ffi_lib.%s(%s)\n" % (func_name, arg_list) + body += _split_line(call_str, break_pattern=', ', nmax=76) + body += "cusparseCheckStatus(status)\n" + if is_return: + # len(arg_dict) == 2) is to avoid return for cusparseGetLevelInfo + if is_creator or (is_getter and (len(arg_dict) == 2)): + body += "return %s[0]\n" % last_key + else: + body += "#TODO: return the appropriate result" + body += '\n\n' + return reindent(body, numSpaces=4, lstrip=False) + + +def _func_str(func_name, arg_dict, return_type, + variable_descriptions={}, func_description=''): + """ build a single python wrapper """ + fstr = _build_func_sig(func_name, arg_dict, return_type) + fstr += _build_doc_str(arg_dict, func_description=func_description, + variable_descriptions=variable_descriptions) + fstr += _build_body(func_name, arg_dict, return_type) + return fstr + + +def build_python_func(cdef, variable_descriptions={}, func_descriptions={}): + """ wrap a single python function corresponding to the given cdef C + function string. 
+ + Parameters + ---------- + cdef : str + single line string containing a C function definition + variable_descriptions : dict + dictionary of variable descriptions for the docstring + func_descriptions : dict + dictionary of function descriptions for the docstring + + Returns + ------- + str corresponding to the python_wrapper + """ + cdef_regex = "(\w*)\s*(\w*)\s*\((.*)\).*" + p = re.compile(cdef_regex) + match = p.search(cdef) + (return_type, func_name, func_args) = match.group(1, 2, 3) + func_args = func_args.split(', ') + + from collections import OrderedDict + arg_dict = OrderedDict() + for arg in func_args: + substr = arg.split() + if len(substr) == 2: + val = substr[0] + else: + val = substr[-2] + key = substr[-1] + # handle pointer + if key[0] == '*': + val += ' *' + key = key[1:] + # handle pointer to pointer + if key[0] == '*': + val += '*' + key = key[1:] + arg_dict[key] = val + + func_description = func_descriptions.get(func_name, '') + return _func_str(func_name, arg_dict, return_type, + variable_descriptions=variable_descriptions, + func_description=func_description) + + +# with open('cusparse_variable_descriptions.json', 'w') as fid: +# json.dump(variable_descriptions, fid, sort_keys=True, indent=4) + +def get_variable_descriptions(var_def_json): + """ load variable description dictionary from .json file""" + with open(var_def_json, 'r') as fid: + variable_descriptions = json.load(fid) + for k, v in variable_descriptions.items(): + variable_descriptions[k] = _split_line(v, break_pattern=' ', nmax=72, + pad_char=None) + return variable_descriptions + + +def get_function_descriptions(func_def_json): + """ load function description dictionary from .json file""" + with open(func_def_json, 'r') as fid: + func_descriptions = json.load(fid) + for k, v in func_descriptions.items(): + func_descriptions[k] = _split_line(v, break_pattern=' ', nmax=72, + pad_char=None) + return func_descriptions + + +def generate_func_descriptions_json(ffi_lib, json_file): + func_descriptions = {} + for t in ['S', 'D', 'C', 'Z']: + # functions introduced in CUDA 3.2 + func_descriptions['cusparse' + t + 'axpyi'] = 'scalar multiply and add: y = y + alpha * x' + func_descriptions['cusparse' + t + 'coo2csr'] = 'convert sparse matrix formats: COO to CSR' + func_descriptions['cusparse' + t + 'csc2dense'] = 'convert sparse matrix formats: CSC to dense' + func_descriptions['cusparse' + t + 'csr2coo'] = 'convert sparse matrix formats: CSR to COO' + func_descriptions['cusparse' + t + 'csr2csc'] = 'convert sparse matrix formats: CSR to CSC' + func_descriptions['cusparse' + t + 'csr2dense'] = 'convert sparse matrix formats: CSR to dense' + func_descriptions['cusparse' + t + 'csrmv'] = 'sparse CSR matrix vector product: y = alpha * op(A)*x + beta * y' + func_descriptions['cusparse' + t + 'csrmm'] = 'sparse CSR matrix-matrix product: C = alpha * op(A) * B + beta * C' + func_descriptions['cusparse' + t + 'dense2csc'] = 'convert sparse matrix formats: dense to CSC' + func_descriptions['cusparse' + t + 'dense2csr'] = 'convert sparse matrix formats: dense to CSR' + func_descriptions['cusparse' + t + 'doti'] = 'dot product: result = y.T * x' + func_descriptions['cusparse' + t + 'dotci'] = 'complex conjugate dot product: result = y.H * x' + func_descriptions['cusparse' + t + 'gthr'] = 'gather elements of y at locations xInd into data array xVal' + func_descriptions['cusparse' + t + 'gthrz'] = 'gather elements of y at locations xInd into data array xVal. 
Also zeros the gathered elements in y' + func_descriptions['cusparse' + t + 'nnz'] = 'compute number of nonzero elements per row or column and the total number of nonzero elements' + func_descriptions['cusparse' + t + 'roti'] = 'applies Givens rotation matrix to sparse x and dense y' + func_descriptions['cusparse' + t + 'sctr'] = 'scatters elements of vector x into the vector y (at locations xInd)' + # for CUDA 4.0+ below here + func_descriptions['cusparse' + t + 'csrsv_analysis'] = 'perform analysis phase of csrsv' + func_descriptions['cusparse' + t + 'csrsv_solve'] = 'perform solve phase of csrsv' + # for CUDA 4.1+ below here + func_descriptions['cusparse' + t + 'csr2hyb'] = 'convert sparse matrix formats: CSR to HYB' + func_descriptions['cusparse' + t + 'csrsm_analysis'] = 'perform analysis phase of csrsm' + func_descriptions['cusparse' + t + 'csrsm_solve'] = 'perform solve phase of csrsm' + func_descriptions['cusparse' + t + 'dense2hyb'] = 'convert sparse matrix formats: dense to HYB' + func_descriptions['cusparse' + t + 'gtsv'] = 'solve tridiagonal system with multiple right-hand sides: A * Y = alpha * X' + func_descriptions['cusparse' + t + 'gtsvStridedBatch'] = 'solve multiple tridiagonal systems for i = 0, ..., batchCount: A_i * y_i = alpha * x_i' + func_descriptions['cusparse' + t + 'hyb2dense'] = 'convert sparse matrix formats: HYB to dense' + func_descriptions['cusparse' + t + 'hybmv'] = 'sparse HYB matrix vector product: y = alpha * op(A)*x + beta * y' + func_descriptions['cusparse' + t + 'hybsv_analysis'] = 'perform analysis phase of hybsv' + func_descriptions['cusparse' + t + 'hybsv_solve'] = 'perform solve phase of hybsv' + # for CUDA 5.0+ below here + func_descriptions['cusparse' + t + 'bsr2csr'] = 'convert sparse matrix formats: BSR to CSR' + func_descriptions['cusparse' + t + 'bsrmv'] = 'sparse BSR matrix vector product: y = alpha * op(A)*x + beta * y' + func_descriptions['cusparse' + t + 'bsrxmv'] = 'sparse BSRX matrix vector product: y = alpha * op(A)*x + beta * y' + func_descriptions['cusparse' + t + 'csr2bsr'] = 'convert sparse matrix formats: CSR to BSR' + func_descriptions['cusparse' + t + 'csrgeam'] = 'sparse CSR matrix-matrix operation: C = alpha * A + beta * B' + func_descriptions['cusparse' + t + 'csrgemm'] = 'sparse CSR matrix-matrix operation: C = op(A) * op(B)' + func_descriptions['cusparse' + t + 'csric0'] = 'CSR incomplete-Cholesky factorization: op(A) ~= R.T * R' + func_descriptions['cusparse' + t + 'csrilu0'] = 'CSR incomplete-LU factorization: op(A) ~= LU' + func_descriptions['cusparse' + t + 'hyb2csr'] = 'convert sparse matrix formats: HYB to CSR' + # for CUDA 5.5+ below here + func_descriptions['cusparse' + t + 'csc2hyb'] = 'convert sparse matrix formats: CSC to HYB' + func_descriptions['cusparse' + t + 'csrmm2'] = 'sparse CSR matrix-matrix product type 2: C = alpha * op(A) * op(B) + beta * C' + func_descriptions['cusparse' + t + 'gtsv_nopivot'] = 'solve tridiagonal system with multiple right-hand sides: A * Y = alpha * X' + func_descriptions['cusparse' + t + 'hyb2csc'] = 'convert sparse matrix formats: HYB to CSC' + # for CUDA 6.0+ below here + func_descriptions['cusparse' + t + 'bsric02_bufferSize'] = 'return bsric02 (A ~= L * L.H) buffer size' + func_descriptions['cusparse' + t + 'bsric02_analysis'] = 'perform bsric02 (A ~= L * L.H) analysis phase' + func_descriptions['cusparse' + t + 'bsric02'] = 'perform bsric02 (A ~= L * L.H) solve phase' + func_descriptions['cusparse' + t + 'bsrilu02'] = 'perform bsrilu02 (A ~= LU) solve phase' + 
func_descriptions['cusparse' + t + 'bsrilu02_analysis'] = 'perform bsrilu02 (A ~= LU) analysis phase' + func_descriptions['cusparse' + t + 'bsrilu02_bufferSize'] = 'return bsrilu02 (A ~= LU) buffer size' + func_descriptions['cusparse' + t + 'bsrilu02_numericBoost'] = 'use a boost value to replace a numerical value in incomplete LU factorization' + func_descriptions['cusparse' + t + 'bsrsv2_analysis'] = 'perform analysis phase of bsrsv2' + func_descriptions['cusparse' + t + 'bsrsv2_bufferSize'] = 'return size of buffer used in bsrsv2' + func_descriptions['cusparse' + t + 'bsrsv2_solve'] = 'perform solve phase of bsrsv2' + func_descriptions['cusparse' + t + 'csr2gebsr'] = 'convert sparse matrix formats: CSR to GEBSR' + func_descriptions['cusparse' + t + 'csr2gebsr_bufferSize'] = 'return csr2gebsr buffer size' + func_descriptions['cusparse' + t + 'csrsv2_analysis'] = 'perform analysis phase of csrsv2' + func_descriptions['cusparse' + t + 'csrsv2_bufferSize'] = 'return size of buffer used in csrsv2' + func_descriptions['cusparse' + t + 'csrsv2_solve'] = 'perform solve phase of csrsv2' + func_descriptions['cusparse' + t + 'csric02_analysis'] = 'perform csric02 (A ~= L * L.H) analysis phase' + func_descriptions['cusparse' + t + 'csric02_bufferSize'] = 'return csric02 (A ~= L * L.H) buffer size' + func_descriptions['cusparse' + t + 'csric02'] = 'perform csric02 (A ~= L * L.H) solve phase' + func_descriptions['cusparse' + t + 'csrilu02'] = 'perform csrilu02 (A ~= LU) solve phase' + func_descriptions['cusparse' + t + 'csrilu02_analysis'] = 'perform csrilu02 (A ~= LU) analysis phase' + func_descriptions['cusparse' + t + 'csrilu02_bufferSize'] = 'return csrilu02 (A ~= LU) buffer size' + func_descriptions['cusparse' + t + 'csrilu02_numericBoost'] = 'use a boost value to replace a numerical value in incomplete LU factorization' + func_descriptions['cusparse' + t + 'gebsr2csr'] = 'convert sparse matrix formats: GEBSR to CSR' + func_descriptions['cusparse' + t + 'gebsr2gebsc'] = 'convert sparse matrix formats: GEBSR to GEBSC' + func_descriptions['cusparse' + t + 'gebsr2gebsc_bufferSize'] = 'return gebsr2gebsc buffer size' + func_descriptions['cusparse' + t + 'gebsr2gebsr'] = 'convert sparse matrix formats: GEBSR to GEBSR' + func_descriptions['cusparse' + t + 'gebsr2gebsr_bufferSize'] = 'return gebsr2gebsr or gebsr2gebsrNnz buffer size' + # for CUDA 6.5+ below here + func_descriptions['cusparse' + t + 'bsrmm'] = 'sparse BSR matrix-matrix product: C = alpha * op(A) * B + beta * C' + func_descriptions['cusparse' + t + 'bsrsm2_analysis'] = 'perform analysis phase of bsrsm2' + func_descriptions['cusparse' + t + 'bsrsm2_bufferSize'] = 'return size of buffer used in bsrsm2' + func_descriptions['cusparse' + t + 'bsrsm2_solve'] = 'perform solve phase of bsrsm2' + # for CUDA 7.0+ below here + func_descriptions['cusparse' + t + 'coosort_bufferSizeExt'] = 'determine buffer size for coosort' + func_descriptions['cusparse' + t + 'coosortByColumn'] = 'in-place sort of COO format by column' + func_descriptions['cusparse' + t + 'coosortByRow'] = 'in-place sort of COO format by row' + func_descriptions['cusparse' + t + 'cscsort'] = 'in-place sort of CSC' + func_descriptions['cusparse' + t + 'cscsort_bufferSizeExt'] = 'determine buffer size for cscsort' + func_descriptions['cusparse' + t + 'csr2csru'] = 'convert sorted CSR to unsorted CSR' + func_descriptions['cusparse' + t + 'csrcolor'] = 'performs the coloring of the adjacency graph associated with the matrix A stored in CSR format' + func_descriptions['cusparse' + t + 
'csrgemm2'] = 'generalization of the csrgemm. a matrix-matrix operation: C=alpha*A*B + beta*D' + func_descriptions['cusparse' + t + 'csrgemm2_bufferSizeExt'] = 'returns the buffer size required by csrgemm2' + func_descriptions['cusparse' + t + 'csrsort'] = 'in-place sort of CSR' + func_descriptions['cusparse' + t + 'csrsort_bufferSizeExt'] = 'determine buffer size for csrsort' + func_descriptions['cusparse' + t + 'csru2csr_bufferSizeExt'] = 'determine buffer size for csru2csr and csr2csru' + func_descriptions['cusparse' + t + 'csru2csr'] = 'convert unsorted CSR to sorted CSR' + # CUDA 6.0 functions renamed in CUDA 7.0+ below here + func_descriptions['cusparse' + t + 'bsric02_bufferSizeExt'] = 'return bsric02 (A ~= L * L.H) buffer size' + func_descriptions['cusparse' + t + 'bsrilu02_bufferSizeExt'] = 'return bsrilu02 (A ~= LU) buffer size' + func_descriptions['cusparse' + t + 'bsrsm2_bufferSizeExt'] = 'return size of buffer used in bsrsm2' + func_descriptions['cusparse' + t + 'bsrsv2_bufferSizeExt'] = 'return size of buffer used in bsrsv2' + func_descriptions['cusparse' + t + 'csr2gebsr_bufferSizeExt'] = 'return csr2gebsr buffer size' + func_descriptions['cusparse' + t + 'csric02_bufferSizeExt'] = 'return csric02 (A ~= L * L.H) buffer size' + func_descriptions['cusparse' + t + 'csrilu02_bufferSizeExt'] = 'return csrilu02 (A ~= LU) buffer size' + func_descriptions['cusparse' + t + 'csrsv2_bufferSizeExt'] = 'return size of buffer used in csrsv2' + func_descriptions['cusparse' + t + 'gebsr2gebsc_bufferSizeExt'] = 'return gebsr2gebsc buffer size' + func_descriptions['cusparse' + t + 'gebsr2gebsr_bufferSizeExt'] = 'return gebsr2gebsr or gebsr2gebsrNnz buffer size' + + # operations common across all precisions below here + # for CUDA 5.0+ + func_descriptions['cusparseXcsr2bsrNnz'] = 'determine the number of nonzero block columns per block row for csr2bsr' + func_descriptions['cusparseXcsrgemmNnz'] = 'determine csrRowPtrC and the total number of nonzero elements for gemm' + func_descriptions['cusparseXcsrgeamNnz'] = 'determine csrRowPtrC and the total number of nonzero elements for geam' + # for CUDA 6.0+ + func_descriptions['cusparseXbsrsv2_zeroPivot'] = 'return numerical zero location for bsrsv2' + func_descriptions['cusparseXbsric02_zeroPivot'] = 'return numerical zero location for bsric02' + func_descriptions['cusparseXbsrilu02_zeroPivot'] = 'return numerical zero location for bsrilu02' + func_descriptions['cusparseXcsr2gebsrNnz'] = 'determine the number of nonzero block columns per block row for csr2gebsr' + func_descriptions['cusparseXcsric02_zeroPivot'] = 'return numerical zero location for csric02' + func_descriptions['cusparseXcsrilu02_zeroPivot'] = 'return numerical zero location for csrilu02' + func_descriptions['cusparseXcsrsv2_zeroPivot'] = 'return numerical zero location for csrsv2' + func_descriptions['cusparseXgebsr2gebsrNnz'] = 'determine the number of nonzero block columns per block row for gebsr2gebsr' + # for CUDA 6.5+ + func_descriptions['cusparseXbsrsm2_zeroPivot'] = 'return numerical zero location for bsrsm2' + # for CUDA 7.0+ + func_descriptions['cusparseCreateIdentityPermutation'] = 'creates an identity map for use with coosort, csrsort, cscsort, csr2csc_indexOnly' + func_descriptions['cusparseXcsrgemm2Nnz'] = 'determine csrRowPtrC and the number of non-zero elements for csrgemm2' + + create_funcs = [cdef for cdef in ffi_lib.__dict__ if 'Create' in cdef] + for func in create_funcs: + tmp, obj = func.split('Create') + if obj: + func_descriptions[func] = "Create cuSPARSE {} 
structure.".format(obj) + else: + func_descriptions[func] = "Create cuSPARSE context." + destroy_funcs = [cdef for cdef in ffi_lib.__dict__ if 'Destroy' in cdef] + for func in destroy_funcs: + tmp, obj = func.split('Destroy') + if obj: + func_descriptions[func] = "Destroy cuSPARSE {} structure.".format(obj) + else: + func_descriptions[func] = "Destroy cuSPARSE context." + get_funcs = [cdef for cdef in ffi_lib.__dict__ if 'Get' in cdef] + for func in get_funcs: + tmp, obj = func.split('Get') + func_descriptions[func] = "Get cuSPARSE {}.".format(obj) + + set_funcs = [cdef for cdef in ffi_lib.__dict__ if 'Set' in cdef] + for func in set_funcs: + tmp, obj = func.split('Set') + func_descriptions[func] = "Set cuSPARSE {}.".format(obj) + + # prune any of the above that aren't in ffi_lib: + func_descriptions = dict( + (k, v) for k, v in func_descriptions.items( + ) if k in ffi_lib.__dict__) + + with open(json_file, 'w') as fid: + json.dump(func_descriptions, fid, sort_keys=True, indent=4) + + +def generate_cusparse_python_wrappers(cffi_cdef=None, variable_defs_json='', + func_defs_json='', + python_wrapper_file=None): + """ generate python wrappers for all functions within cffi_cdef. + + Parameters + ---------- + cffi_cdef : str + cffi definition string as generated by `generate_cffi_cdef` + variable_defs_json : str, optional + filename of .json file containing dictionary of variable descriptions + func_defs_json : str, optional + filename of .json file containing dictionary of function descriptions + python_wrapper_file : str, optional + file to output the generated python wrappers to + + Returns + ------- + python_wrappers : str + string containing all of the python wrappers + + """ + cffi_cdef_list = cffi_cdef.split('\n') + + # find lines containing a function definition + func_def_lines = [] + for idx, line in enumerate(cffi_cdef_list): + if line.startswith('cusparse'): + func_def_lines.append(idx) + + # reformat each definition into a single line for easier string processing + n_funcs = len(func_def_lines) + cdef_list = [] + for i in range(len(func_def_lines)): + loc1 = func_def_lines[i] + if i < n_funcs - 1: + loc2 = func_def_lines[i + 1] + cdef = ' '.join([l.strip() for l in cffi_cdef_list[loc1:loc2]]) + else: + cdef = ' '.join([l.strip() for l in cffi_cdef_list[loc1:]]) + # strip any remaining comments after the semicolon + cdef = cdef[:cdef.find(';') + 1] + cdef_list.append(cdef) + + # read function and variable definition strings to use when building the + # the Python doc strings + if variable_defs_json: + variable_descriptions = get_function_descriptions(variable_defs_json) + if func_defs_json: + func_descriptions = get_function_descriptions(func_defs_json) + + # build the wrappers + python_wrappers = '' + for cdef in cdef_list: + python_wrappers += build_python_func( + cdef, + variable_descriptions=variable_descriptions, + func_descriptions=func_descriptions) + + if python_wrapper_file is not None: + with open(python_wrapper_file, 'w') as f: + f.write(python_wrappers) + return python_wrappers diff --git a/scikits/cuda/cusparse.py b/scikits/cuda/cusparse.py index 97cf0ff..038a972 100644 --- a/scikits/cuda/cusparse.py +++ b/scikits/cuda/cusparse.py @@ -1,387 +1,1455 @@ #!/usr/bin/env python +from __future__ import division, print_function, absolute_import +import functools +import numpy as np +import pycuda.autoinit +import pycuda.gpuarray as gpuarray +import pycuda.driver as drv -""" -Python interface to CUSPARSE functions. +from . 
import misc -Note: this module does not explicitly depend on PyCUDA. -""" +from .misc import init -import atexit -import ctypes.util -import platform -from string import Template -import sys -import warnings - -import numpy as np +try: + import scipy.sparse + from scipy.sparse.sputils import isscalarlike + has_scipy = True +except ImportError: + has_scipy = False -import cuda - -# Load library: -_version_list = [6.5, 6.0, 5.5, 5.0, 4.0] -if 'linux' in sys.platform: - _libcusparse_libname_list = ['libcusparse.so'] + \ - ['libcusparse.so.%s' % v for v in _version_list] -elif sys.platform == 'darwin': - _libcusparse_libname_list = ['libcusparse.dylib'] -elif sys.platform == 'win32': - if platform.machine().endswith('64'): - _libcusparse_libname_list = ['cusparse.dll'] + \ - ['cusparse64_%s.dll' % int(10*v) for v in _version_list] - else: - _libcusparse_libname_list = ['cusparse.dll'] + \ - ['cusparse32_%s.dll' % int(10*v) for v in _version_list] -else: - raise RuntimeError('unsupported platform') - -# Print understandable error message when library cannot be found: -_libcusparse = None -for _libcusparse_libname in _libcusparse_libname_list: - try: - if sys.platform == 'win32': - _libcusparse = ctypes.windll.LoadLibrary(_libcusparse_libname) - else: - _libcusparse = ctypes.cdll.LoadLibrary(_libcusparse_libname) - except OSError: - pass - else: - break -if _libcusparse == None: - OSError('CUDA sparse library not found') - -class cusparseError(Exception): - """CUSPARSE error""" - pass - -class cusparseStatusNotInitialized(cusparseError): - """CUSPARSE library not initialized""" - pass - -class cusparseStatusAllocFailed(cusparseError): - """CUSPARSE resource allocation failed""" - pass - -class cusparseStatusInvalidValue(cusparseError): - """Unsupported value passed to the function""" - pass - -class cusparseStatusArchMismatch(cusparseError): - """Function requires a feature absent from the device architecture""" - pass - -class cusparseStatusMappingError(cusparseError): - """An access to GPU memory space failed""" - pass - -class cusparseStatusExecutionFailed(cusparseError): - """GPU program failed to execute""" - pass - -class cusparseStatusInternalError(cusparseError): - """An internal CUSPARSE operation failed""" - pass - -class cusparseStatusMatrixTypeNotSupported(cusparseError): - """The matrix type is not supported by this function""" - pass - -cusparseExceptions = { - 1: cusparseStatusNotInitialized, - 2: cusparseStatusAllocFailed, - 3: cusparseStatusInvalidValue, - 4: cusparseStatusArchMismatch, - 5: cusparseStatusMappingError, - 6: cusparseStatusExecutionFailed, - 7: cusparseStatusInternalError, - 8: cusparseStatusMatrixTypeNotSupported, - } - -# Matrix types: -CUSPARSE_MATRIX_TYPE_GENERAL = 0 -CUSPARSE_MATRIX_TYPE_SYMMETRIC = 1 -CUSPARSE_MATRIX_TYPE_HERMITIAN = 2 -CUSPARSE_MATRIX_TYPE_TRIANGULAR = 3 - -CUSPARSE_FILL_MODE_LOWER = 0 -CUSPARSE_FILL_MODE_UPPER = 1 - -# Whether or not a matrix' diagonal entries are unity: -CUSPARSE_DIAG_TYPE_NON_UNIT = 0 -CUSPARSE_DIAG_TYPE_UNIT = 1 - -# Matrix index bases: -CUSPARSE_INDEX_BASE_ZERO = 0 -CUSPARSE_INDEX_BASE_ONE = 1 - -# Operation types: -CUSPARSE_OPERATION_NON_TRANSPOSE = 0 -CUSPARSE_OPERATION_TRANSPOSE = 1 -CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE = 2 - -# Whether or not to parse elements of a dense matrix row or column-wise. 
-CUSPARSE_DIRECTION_ROW = 0 -CUSPARSE_DIRECTION_COLUMN = 1 - -# Helper functions: -class cusparseMatDescr(ctypes.Structure): - _fields_ = [ - ('MatrixType', ctypes.c_int), - ('FillMode', ctypes.c_int), - ('DiagType', ctypes.c_int), - ('IndexBase', ctypes.c_int) - ] - -def cusparseCheckStatus(status): - """ - Raise CUSPARSE exception + # copy of isscalarlike from scipy.sparse.sputils + def isscalarlike(x): + """Is x either a scalar, an array scalar, or a 0-dim array?""" + return np.isscalar(x) or (isdense(x) and x.ndim == 0) - Raise an exception corresponding to the specified CUSPARSE error - code. +toolkit_version = drv.get_version() - Parameters - ---------- - status : int - CUSPARSE error code. +if toolkit_version < (3, 2, 0): + raise ImportError("cuSPARSE not present prior to v3.2 of the CUDA toolkit") - See Also - -------- - cusparseExceptions +""" +Python interface to cuSPARSE functions. +Note: You may need to set the environment variable CUDA_ROOT to the base of +your CUDA installation. +""" +# import low level cuSPARSE python wrappers and constants + +try: + from ._cusparse_cffi import * +except Exception as e: + print(repr(e)) + estr = "autogeneration and import of cuSPARSE wrappers failed\n" + estr += ("Try setting the CUDA_ROOT environment variable to the base of " + "your CUDA installation. The autogeneration script tries to " + "find the CUSPARSE header at CUDA_ROOT/include/cusparse_v2.h or " + "CUDA_ROOT/include/cusparse.h\n") + raise ImportError(estr) + +# define higher level wrappers for common functions +# will check dimensions, autoset some variables and call the appropriate +# function based on the input dtype + +def defineIf(condition): + def decorator(func): + @functools.wraps(func) + def func_wrapper(*args, **kwargs): + if condition: + return func(*args, **kwargs) + else: + raise NotImplementedError("requested cuSPARSE function not " + "available for your CUDA version") + return func_wrapper + return decorator + + +def copyMatDescr(descr): + """ create a new copy of Matrix Descriptor, descr """ + descr_copy = cusparseCreateMatDescr() + cusparseSetMatType(descr_copy, cusparseGetMatType(descr)) + cusparseSetMatIndexBase(descr_copy, cusparseGetMatIndexBase(descr)) + cusparseSetMatDiagType(descr_copy, cusparseGetMatDiagType(descr)) + cusparseSetMatFillMode(descr_copy, cusparseGetMatFillMode(descr)) + return descr_copy + + +def dense_nnz(descrA, A, handle=None, dirA=CUSPARSE_DIRECTION_ROW, lda=None, + nnzPerRowCol=None, nnzTotalDevHostPtr=None): + """ higher level wrapper to cusparsennz routines """ + if not isinstance(A, pycuda.gpuarray.GPUArray): + raise ValueError("A must be a pyCUDA gpuarray") + if len(A.shape) != 2: + raise ValueError("A must be 2D") + if lda is None: + lda = A.shape[0] + + if handle is None: + handle = misc._global_cusparse_handle + + m, n = A.shape + assert lda >= m + dtype = A.dtype + + alloc = misc._global_cusparse_allocator + + if nnzPerRowCol is None: + if dirA == CUSPARSE_DIRECTION_ROW: + nnzPerRowCol = gpuarray.zeros((m, ), dtype=np.int32, + allocator=alloc) + elif dirA == CUSPARSE_DIRECTION_COLUMN: + nnzPerRowCol = gpuarray.zeros((n, ), dtype=np.int32, + allocator=alloc) + else: + raise ValueError("Invalid dirA") + if nnzTotalDevHostPtr is None: + nnzTotalDevHostPtr = ffi.new('int *', 0) + if dtype == np.float32: + fn = cusparseSnnz + elif dtype == np.float64: + fn = cusparseDnnz + elif dtype == np.complex64: + fn = cusparseCnnz + elif dtype == np.complex128: + fn = cusparseZnnz + else: + raise ValueError("unsupported sparse matrix dtype: 
%s" % dtype) + fn(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol, + nnzTotalDevHostPtr) + return nnzPerRowCol, nnzTotalDevHostPtr[0] + + +def dense2csr(A, handle=None, descrA=None, lda=None, check_inputs=True): + """ Convert dense matrix to CSR. """ + if not isinstance(A, pycuda.gpuarray.GPUArray): + # try moving list or numpy array to GPU + A = np.asfortranarray(np.atleast_2d(A)) + A = gpuarray.to_gpu(A) + if check_inputs: + if not isinstance(A, pycuda.gpuarray.GPUArray): + raise ValueError("A must be a pyCUDA gpuarray") + if len(A.shape) != 2: + raise ValueError("A must be 2D") + if descrA is not None: + if cusparseGetMatType(descrA) != CUSPARSE_MATRIX_TYPE_GENERAL: + raise ValueError("Only general matrix type supported") + if not A.flags.f_contiguous: + raise ValueError("Dense matrix A must be in column-major order") + + if lda is None: + lda = A.shape[0] + m, n = A.shape + assert lda >= m + dtype = A.dtype + + if handle is None: + handle = misc._global_cusparse_handle + + if descrA is None: + descrA = cusparseCreateMatDescr() + cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL) + cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO) + + nnzPerRow, nnz = dense_nnz( + descrA, A, handle=handle, dirA=CUSPARSE_DIRECTION_ROW, lda=lda) + + alloc = misc._global_cusparse_allocator + + csrRowPtrA = gpuarray.zeros((m+1, ), dtype=np.int32, allocator=alloc) + csrColIndA = gpuarray.zeros((nnz, ), dtype=np.int32, allocator=alloc) + csrValA = gpuarray.zeros((nnz, ), dtype=dtype, allocator=alloc) + + if dtype == np.float32: + fn = cusparseSdense2csr + elif dtype == np.float64: + fn = cusparseDdense2csr + elif dtype == np.complex64: + fn = cusparseCdense2csr + elif dtype == np.complex128: + fn = cusparseZdense2csr + else: + raise ValueError("unsupported sparse matrix dtype: %s" % dtype) + + fn(handle, m, n, descrA, A, lda, nnzPerRow, csrValA, csrRowPtrA, + csrColIndA) + return (descrA, csrValA, csrRowPtrA, csrColIndA) + + +def csr2dense(m, n, descrA, csrValA, csrRowPtrA, csrColIndA, A=None, + handle=None, lda=None, check_inputs=True): + """ convert CSR matrix to dense """ + if check_inputs: + if A is not None: + if not isinstance(A, pycuda.gpuarray.GPUArray): + raise ValueError("A must be a pyCUDA gpuarray") + if len(A.shape) != 2: + raise ValueError("A must be 2D") + if cusparseGetMatType(descrA) != CUSPARSE_MATRIX_TYPE_GENERAL: + raise ValueError("Only general matrix type supported") + if cusparseGetMatIndexBase(descrA) != CUSPARSE_INDEX_BASE_ZERO: + raise ValueError("Only base 0 matrix supported") + for arr in [csrValA, csrRowPtrA, csrColIndA]: + if not isinstance(arr, pycuda.gpuarray.GPUArray): + raise ValueError("csr* inputs must be a pyCUDA gpuarrays") + if (csrRowPtrA.size != m + 1): + raise ValueError("A: inconsistent size") + + if handle is None: + handle = misc._global_cusparse_handle + + if lda is None: + lda = m + assert lda >= m + dtype = csrValA.dtype + + alloc = misc._global_cusparse_allocator + A = gpuarray.zeros((m, n), dtype=dtype, order='F', allocator=alloc) + + if dtype == np.float32: + fn = cusparseScsr2dense + elif dtype == np.float64: + fn = cusparseDcsr2dense + elif dtype == np.complex64: + fn = cusparseCcsr2dense + elif dtype == np.complex128: + fn = cusparseZcsr2dense + else: + raise ValueError("unsupported sparse matrix dtype: %s" % dtype) + + fn(handle, m, n, descrA, csrValA, csrRowPtrA, csrColIndA, A, lda) + return A + + +def dense2csc(A, handle=None, descrA=None, lda=None, check_inputs=True): + """ Convert dense matrix to CSC. 
""" + if not isinstance(A, pycuda.gpuarray.GPUArray): + # try moving list or numpy array to GPU + A = np.asfortranarray(np.atleast_2d(A)) + A = gpuarray.to_gpu(A) + if check_inputs: + if not isinstance(A, pycuda.gpuarray.GPUArray): + raise ValueError("A must be a pyCUDA gpuarray") + if len(A.shape) != 2: + raise ValueError("A must be 2D") + if descrA is not None: + if cusparseGetMatType(descrA) != CUSPARSE_MATRIX_TYPE_GENERAL: + raise ValueError("Only general matrix type supported") + if not A.flags.f_contiguous: + raise ValueError("Dense matrix A must be in column-major order") + + if lda is None: + lda = A.shape[0] + m, n = A.shape + assert lda >= m + dtype = A.dtype + + if handle is None: + handle = misc._global_cusparse_handle + + if descrA is None: + descrA = cusparseCreateMatDescr() + cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL) + cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO) + + nnzPerCol, nnz = dense_nnz( + descrA, A, handle=handle, dirA=CUSPARSE_DIRECTION_COLUMN, lda=lda) + + alloc = misc._global_cusparse_allocator + cscColPtrA = gpuarray.zeros((n+1, ), dtype=np.int32, allocator=alloc) + cscRowIndA = gpuarray.zeros((nnz, ), dtype=np.int32, allocator=alloc) + cscValA = gpuarray.zeros((nnz, ), dtype=dtype, allocator=alloc) + if dtype == np.float32: + fn = cusparseSdense2csc + elif dtype == np.float64: + fn = cusparseDdense2csc + elif dtype == np.complex64: + fn = cusparseCdense2csc + elif dtype == np.complex128: + fn = cusparseZdense2csc + else: + raise ValueError("unsupported sparse matrix dtype: %s" % dtype) + + fn(handle, m, n, descrA, A, lda, nnzPerCol, cscValA, cscRowIndA, + cscColPtrA) + return (descrA, cscValA, cscColPtrA, cscRowIndA) + + +def csc2dense(m, n, descrA, cscValA, cscColPtrA, cscRowIndA, A=None, + handle=None, lda=None, check_inputs=True): + """ convert CSC matrix to dense """ + if check_inputs: + if A is not None: + if not isinstance(A, pycuda.gpuarray.GPUArray): + raise ValueError("A must be a pyCUDA gpuarray") + if len(A.shape) != 2: + raise ValueError("A must be 2D") + if cusparseGetMatType(descrA) != CUSPARSE_MATRIX_TYPE_GENERAL: + raise ValueError("Only general matrix type supported") + if cusparseGetMatIndexBase(descrA) != CUSPARSE_INDEX_BASE_ZERO: + raise ValueError("Only base 0 matrix supported") + for arr in [cscValA, cscColPtrA, cscRowIndA]: + if not isinstance(arr, pycuda.gpuarray.GPUArray): + raise ValueError("csc* inputs must be a pyCUDA gpuarrays") + if (cscColPtrA.size != n + 1): + raise ValueError("A: inconsistent size") + + if handle is None: + handle = misc._global_cusparse_handle + + if lda is None: + lda = m + assert lda >= m + dtype = cscValA.dtype + alloc = misc._global_cusparse_allocator + A = gpuarray.zeros((m, n), dtype=dtype, order='F', allocator=alloc) + + if dtype == np.float32: + fn = cusparseScsc2dense + elif dtype == np.float64: + fn = cusparseDcsc2dense + elif dtype == np.complex64: + fn = cusparseCcsc2dense + elif dtype == np.complex128: + fn = cusparseZcsc2dense + else: + raise ValueError("unsupported sparse matrix dtype: %s" % dtype) + + fn(handle, m, n, descrA, cscValA, cscRowIndA, cscColPtrA, A, lda) + return A + + +def csr2coo(csrRowPtr, nnz, handle=None, m=None, cooRowInd=None, + idxBase=CUSPARSE_INDEX_BASE_ZERO, check_inputs=True): + """ convert CSR to COO """ + if check_inputs: + if cooRowInd is not None: + if not isinstance(cooRowInd, pycuda.gpuarray.GPUArray): + raise ValueError("cooRowInd must be a pyCUDA gpuarray") + if not isinstance(csrRowPtr, pycuda.gpuarray.GPUArray): + raise 
ValueError("csrRowPtr must be a pyCUDA gpuarraya") + if handle is None: + handle = misc._global_cusparse_handle + if m is None: + m = csrRowPtr.size - 1 + if cooRowInd is None: + alloc = misc._global_cusparse_allocator + cooRowInd = gpuarray.zeros((nnz, ), dtype=np.int32, allocator=alloc) + cusparseXcsr2coo(handle, csrRowPtr, nnz, m, cooRowInd, idxBase) + return cooRowInd + + +# define with alternate naming for convenience +def csc2coo(cscColPtr, nnz, handle=None, m=None, cooColInd=None, + idxBase=CUSPARSE_INDEX_BASE_ZERO, check_inputs=True): + """ convert CSC to COO """ + # if m is None: + # m = cooColPtr.size - 1 + cooColInd = csr2coo(csrRowPtr=cscColPtr, nnz=nnz, handle=handle, m=m, + cooRowInd=cooColInd, idxBase=idxBase, + check_inputs=check_inputs) + return cooColInd + + +def coo2csr(cooRowInd, m, handle=None, nnz=None, csrRowPtr=None, + idxBase=CUSPARSE_INDEX_BASE_ZERO, check_inputs=True): + """ convert COO to CSR """ + if check_inputs: + if csrRowPtr is not None: + if not isinstance(csrRowPtr, pycuda.gpuarray.GPUArray): + raise ValueError("csrRowPtr must be a pyCUDA gpuarray") + if not isinstance(cooRowInd, pycuda.gpuarray.GPUArray): + raise ValueError("cooRowInd must be a pyCUDA gpuarraya") + if handle is None: + handle = misc._global_cusparse_handle + if nnz is None: + nnz = cooRowInd.size + if csrRowPtr is None: + alloc = misc._global_cusparse_allocator + csrRowPtr = gpuarray.zeros((m+1, ), dtype=np.int32, allocator=alloc) + cusparseXcoo2csr(handle, cooRowInd, nnz, m, csrRowPtr, idxBase) + return csrRowPtr + + +# define with alternate naming for convenience +def coo2csc(cooColInd, m, handle=None, nnz=None, cscColPtr=None, + idxBase=CUSPARSE_INDEX_BASE_ZERO, check_inputs=True): + """ convert COO to CSC""" + cscColPtr = coo2csr(cooRowInd=cooColInd, m=m, handle=handle, nnz=nnz, + csrRowPtr=cscColPtr, idxBase=idxBase, + check_inputs=check_inputs) + return cscColPtr + + +def csr2csc(m, n, csrVal, csrRowPtr, csrColInd, handle=None, nnz=None, + cscVal=None, cscColPtr=None, cscRowInd=None, A=None, + copyValues=CUSPARSE_ACTION_NUMERIC, + idxBase=CUSPARSE_INDEX_BASE_ZERO, check_inputs=True): + """ convert CSR to CSC """ + if check_inputs: + if (cscVal is not None) or (cscColPtr is not None) or \ + (cscRowInd is not None): + for arr in [cscVal, cscColPtr, csrRowInd]: + if not isinstance(arr, pycuda.gpuarray.GPUArray): + raise ValueError("csc* inputs must all be pyCUDA gpuarrays" + " or None") + for arr in [csrVal, csrRowPtr, csrColInd]: + if not isinstance(arr, pycuda.gpuarray.GPUArray): + raise ValueError("csr* inputs must be a pyCUDA gpuarrays") + if (csrRowPtr.size != m + 1): + raise ValueError("A: inconsistent size") + if handle is None: + handle = misc._global_cusparse_handle + dtype = csrVal.dtype + nnz = csrVal.size + if cscVal is None: + alloc = misc._global_cusparse_allocator + cscVal = gpuarray.zeros((nnz, ), dtype=dtype, allocator=alloc) + cscColPtr = gpuarray.zeros((n+1, ), dtype=np.int32, allocator=alloc) + cscRowInd = gpuarray.zeros((nnz, ), dtype=np.int32, allocator=alloc) + if dtype == np.float32: + fn = cusparseScsr2csc + elif dtype == np.float64: + fn = cusparseDcsr2csc + elif dtype == np.complex64: + fn = cusparseCcsr2csc + elif dtype == np.complex128: + fn = cusparseZcsr2csc + else: + raise ValueError("unsupported sparse matrix dtype: %s" % dtype) + fn(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscRowInd, + cscColPtr, copyValues, idxBase) + return (cscVal, cscColPtr, cscRowInd) + + +# also define csc2csr as a convenience +def csc2csr(m, n, cscVal, 
+# also define csc2csr as a convenience +def csc2csr(m, n, cscVal, cscColPtr, cscRowInd, handle=None, nnz=None, + csrVal=None, csrRowPtr=None, csrColInd=None, + copyValues=CUSPARSE_ACTION_NUMERIC, + idxBase=CUSPARSE_INDEX_BASE_ZERO, check_inputs=True): + """ convert CSC to CSR """ + csrVal, csrRowPtr, csrColInd = csr2csc( + m, n, cscVal, cscColPtr, cscRowInd, handle=handle, nnz=nnz, + cscVal=csrVal, cscColPtr=csrRowPtr, cscRowInd=csrColInd, + copyValues=copyValues, idxBase=idxBase, check_inputs=check_inputs) + return csrVal, csrRowPtr, csrColInd + + +def csrmv(descrA, csrValA, csrRowPtrA, csrColIndA, m, n, x, handle=None, + nnz=None, transA=CUSPARSE_OPERATION_NON_TRANSPOSE, alpha=1.0, + beta=0.0, y=None, check_inputs=True): + """ multiply a sparse matrix, A, by a dense vector, x: + y = alpha * transA(A)*x + beta*y + + higher level wrapper to cusparsecsrmv routines """ + if check_inputs: + if not isinstance(csrValA, pycuda.gpuarray.GPUArray): + raise ValueError("csrValA must be a pyCUDA gpuarray") + if not isinstance(csrRowPtrA, pycuda.gpuarray.GPUArray): + raise ValueError("csrRowPtrA must be a pyCUDA gpuarray") + if not isinstance(csrColIndA, pycuda.gpuarray.GPUArray): + raise ValueError("csrColIndA must be a pyCUDA gpuarray") + if not isinstance(x, pycuda.gpuarray.GPUArray): + raise ValueError("x must be a pyCUDA gpuarray") + + if handle is None: + handle = misc._global_cusparse_handle + if nnz is None: + nnz = csrValA.size + dtype = csrValA.dtype + if y is None: + alloc = misc._global_cusparse_allocator + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + y = gpuarray.zeros((m, ), dtype=dtype, allocator=alloc) + else: + y = gpuarray.zeros((n, ), dtype=dtype, allocator=alloc) + # perform some basic sanity checks + if check_inputs: + if csrValA.size != nnz: + raise ValueError("length of csrValA array must match nnz") + if (x.dtype != dtype) or (y.dtype != dtype): + raise ValueError("incompatible dtypes") + if csrRowPtrA.size != (m+1): + raise ValueError("length of csrRowPtrA array must be m+1") + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + if x.size != n: + raise ValueError("sizes of x, A incompatible") + if y.size != m: + raise ValueError("sizes of y, A incompatible") + else: + if x.size != m: + raise ValueError("sizes of x, A incompatible") + if y.size != n: + raise ValueError("sizes of y, A incompatible") + if dtype == np.float32: + fn = cusparseScsrmv + elif dtype == np.float64: + fn = cusparseDcsrmv + elif dtype == np.complex64: + fn = cusparseCcsrmv + elif dtype == np.complex128: + fn = cusparseZcsrmv + else: + raise ValueError("unsupported sparse matrix dtype: %s" % dtype) + if toolkit_version >= (4, 1, 0): - if status != 0: - try: - raise cusparseExceptions[status] - except KeyError: - raise cusparseError + fn(handle, transA, m, n, nnz, alpha, descrA, csrValA, csrRowPtrA, + csrColIndA, x, beta, y) else: + fn(handle, transA, m, n, alpha, descrA, csrValA, csrRowPtrA, + csrColIndA, x, beta, y) -_libcusparse.cusparseCreate.restype = int -_libcusparse.cusparseCreate.argtypes = [ctypes.c_void_p] -def cusparseCreate(): - """ - Initialize CUSPARSE. + return y - Initializes CUSPARSE and creates a handle to a structure holding - the CUSPARSE library context. - Returns - ------- - handle : int - CUSPARSE library context. +def csrmm(m, n, k, descrA, csrValA, csrRowPtrA, csrColIndA, B, handle=None, + C=None, nnz=None, transA=CUSPARSE_OPERATION_NON_TRANSPOSE, alpha=1.0, + beta=0.0, ldb=None, ldc=None, check_inputs=True): + """ multiply a sparse matrix, A, by dense matrix B: + C = alpha * transA(A) * B + beta * C. 
+ higher level wrapper to cusparsecsrmm routines """ + if check_inputs: + for item in [csrValA, csrRowPtrA, csrColIndA, B]: + if not isinstance(item, pycuda.gpuarray.GPUArray): + raise ValueError("csr*, B, must be pyCUDA gpuarrays") + if C is not None: + if not isinstance(C, pycuda.gpuarray.GPUArray): + raise ValueError("C must be a pyCUDA gpuarray or None") + # dense matrices must be in column-major order + if not B.flags.f_contiguous: + raise ValueError("Dense matrix B must be in column-major order") + if handle is None: + handle = misc._global_cusparse_handle + + dtype = csrValA.dtype + if C is None: + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + ldc = m + else: + ldc = k + alloc = misc._global_cusparse_allocator + C = gpuarray.zeros((ldc, n), dtype=dtype, order='F', allocator=alloc) + elif not C.flags.f_contiguous: + raise ValueError("Dense matrix C must be in column-major order") + if nnz is None: + nnz = csrValA.size + if ldb is None: + ldb = B.shape[0] + if ldc is None: + ldc = C.shape[0] + + # perform some basic sanity checks + if check_inputs: + if csrValA.size != nnz: + raise ValueError("length of csrValA array must match nnz") + if (B.dtype != dtype) or (C.dtype != dtype): + raise ValueError("A, B, C must share a common dtype") + if ldb < B.shape[0]: + raise ValueError("ldb invalid for matrix B") + if ldc < C.shape[0]: + raise ValueError("ldc invalid for matrix C") + if (C.shape[1] != n) or (B.shape[1] != n): + raise ValueError("bad shape for B or C") + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + if (ldb != k) or (ldc != m): + raise ValueError("size of A incompatible with B or C") + else: + if (ldb != m) or (ldc != k): + raise ValueError("size of A incompatible with B or C") + if csrRowPtrA.size != m+1: + raise ValueError("length of csrRowPtrA invalid") + if dtype == np.float32: + fn = cusparseScsrmm + elif dtype == np.float64: + fn = cusparseDcsrmm + elif dtype == np.complex64: + fn = cusparseCcsrmm + elif dtype == np.complex128: + fn = cusparseZcsrmm + else: + raise ValueError("unsupported sparse matrix dtype: %s" % dtype) + if toolkit_version >= (4, 1, 0): + fn(handle, transA, m, n, k, nnz, alpha, descrA, csrValA, csrRowPtrA, + csrColIndA, B, ldb, beta, C, ldc) + else: + fn(handle, transA, m, n, k, alpha, descrA, csrValA, csrRowPtrA, + csrColIndA, B, ldb, beta, C, ldc) + return C - handle = ctypes.c_int() - status = _libcusparse.cusparseCreate(ctypes.byref(handle)) - cusparseCheckStatus(status) - return handle.value - -_libcusparse.cusparseDestroy.restype = int -_libcusparse.cusparseDestroy.argtypes = [ctypes.c_int] -def cusparseDestroy(handle): - """ - Release CUSPARSE resources. - Releases hardware resources used by CUSPARSE - Parameters - ---------- - handle : int - CUSPARSE library context. +@defineIf(toolkit_version >= (5, 5, 0)) +def csrmm2(m, n, k, descrA, csrValA, csrRowPtrA, csrColIndA, B, handle=None, + C=None, nnz=None, transA=CUSPARSE_OPERATION_NON_TRANSPOSE, + transB=CUSPARSE_OPERATION_NON_TRANSPOSE, alpha=1.0, beta=0.0, + ldb=None, ldc=None, check_inputs=True): + """ multiply two sparse matrices: C = transA(A) * transB(B) + higher level wrapper to cusparsecsrmm2 routines. 
""" - - status = _libcusparse.cusparseDestroy(handle) - cusparseCheckStatus(status) - -_libcusparse.cusparseGetVersion.restype = int -_libcusparse.cusparseGetVersion.argtypes = [ctypes.c_int, - ctypes.c_void_p] -def cusparseGetVersion(handle): + if check_inputs: + for item in [csrValA, csrRowPtrA, csrColIndA, B]: + if not isinstance(item, pycuda.gpuarray.GPUArray): + raise ValueError("csr*, B, must be pyCUDA gpuarrays") + if C is not None: + if not isinstance(C, pycuda.gpuarray.GPUArray): + raise ValueError("C must be a pyCUDA gpuarray or None") + # dense matrices must be in column-major order + if not B.flags.f_contiguous: + raise ValueError("Dense matrix B must be column-major order") + + if transB == CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE: + raise ValueError("Conjugate transpose operation not supported " + "for dense matrix B") + + if (transB == CUSPARSE_OPERATION_TRANSPOSE) and \ + (transA != CUSPARSE_OPERATION_NON_TRANSPOSE): + raise ValueError("if B is transposed, only A non-transpose is " + "supported") + if handle is None: + handle = misc._global_cusparse_handle + + dtype = csrValA.dtype + if C is None: + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + ldc = m + else: + ldc = k + alloc = misc._global_cusparse_allocator + C = gpuarray.zeros((ldc, n), dtype=dtype, order='F', + allocator=alloc) + elif not C.flags.f_contiguous: + raise ValueError("Dense matrix C must be in column-major order") + if nnz is None: + nnz = csrValA.size + if ldb is None: + ldb = B.shape[0] + if ldc is None: + ldc = C.shape[0] + + # perform some basic sanity checks + if check_inputs: + if csrValA.size != nnz: + raise ValueError("length of csrValA array must match nnz") + if (B.dtype != dtype) or (C.dtype != dtype): + raise ValueError("A, B, C must share a common dtype") + if ldb < B.shape[0]: + raise ValueError("ldb invalid for matrix B") + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + ldOpA = m # leading dimension for op(A) + tdOpA = k # trailing dimension for op(A) + else: + ldOpA = k + tdOpA = m + if transB == CUSPARSE_OPERATION_NON_TRANSPOSE: + if B.shape[1] != n: + raise ValueError("B, n incompatible") + if (ldb < tdOpA): + raise ValueError("size of A incompatible with B") + else: + if ldb < n: + raise ValueError("B, n incompatible") + if (B.shape[1] != tdOpA): + raise ValueError("size of A incompatible with B") + if (C.shape[1] != n): + raise ValueError("bad shape for C") + if (ldc != ldOpA): + raise ValueError("size of A incompatible with C") + if csrRowPtrA.size != m+1: + raise ValueError("length of csrRowPtrA invalid") + if dtype == np.float32: + fn = cusparseScsrmm2 + elif dtype == np.float64: + fn = cusparseDcsrmm2 + elif dtype == np.complex64: + fn = cusparseCcsrmm2 + elif dtype == np.complex128: + fn = cusparseZcsrmm2 + else: + raise ValueError("unsupported sparse matrix dtype: %s" % dtype) + transa = transA + transb = transB + try: + fn(handle, transa, transb, m, n, k, nnz, alpha, descrA, csrValA, + csrRowPtrA, csrColIndA, B, ldb, beta, C, ldc) + except CUSPARSE_STATUS_INVALID_VALUE as e: + print("m={}, n={}, k={}, nnz={}, ldb={}, ldc={}".format( + m, n, k, nnz, ldb, ldc)) + raise(e) + return C + + +@defineIf(toolkit_version >= (5, 0, 0)) +def _csrgeamNnz(m, n, descrA, csrRowPtrA, csrColIndA, descrB, csrRowPtrB, + csrColIndB, handle=None, descrC=None, csrRowPtrC=None, + nnzA=None, nnzB=None, check_inputs=True): + """ support routine for csrgeam + + higher level wrapper to cusparseXcsrgeamNnz. """ - Return CUSPARSE library version. 
+ if check_inputs: + for array in [csrRowPtrA, csrColIndA, csrRowPtrB, csrColIndB]: + if not isinstance(array, pycuda.gpuarray.GPUArray): + raise ValueError("all csr* inputs must be a pyCUDA gpuarray") + if cusparseGetMatType(descrA) != CUSPARSE_MATRIX_TYPE_GENERAL: + raise ValueError("Only general matrix type supported") + if cusparseGetMatType(descrB) != CUSPARSE_MATRIX_TYPE_GENERAL: + raise ValueError("Only general matrix type supported") + if descrC is not None: + if not isinstance(csrRowPtrC, pycuda.gpuarray.GPUArray): + raise ValueError("csrRowPtrC must be a gpuarray or None") + if cusparseGetMatType(descrC) != CUSPARSE_MATRIX_TYPE_GENERAL: + raise ValueError("Only general matrix type supported") + + if handle is None: + handle = misc._global_cusparse_handle + if nnzA is None: + nnzA = csrColIndA.size + if nnzB is None: + nnzB = csrColIndB.size + if descrC is None: + return_descrC = True + descrC = cusparseCreateMatDescr() + cusparseSetMatType(descrC, CUSPARSE_MATRIX_TYPE_GENERAL) + else: + return_descrC = False + if csrRowPtrC is None: + alloc = misc._global_cusparse_allocator + csrRowPtrC = gpuarray.zeros((m+1, ), dtype=np.int32, + allocator=alloc) + nnzTotalDevHostPtr = ffi.new('int *', 0) + + # perform some basic sanity checks + if check_inputs: + if csrColIndA.size != nnzA: + raise ValueError("length of csrValA array must match nnzA") + if csrColIndB.size != nnzB: + raise ValueError("length of csrValB array must match nnzB") + if csrRowPtrA.size != m+1: + raise ValueError("length of csrRowPtrA array must be m+1") + if csrRowPtrB.size != m+1: + raise ValueError("length of csrRowPtrB array must be m+1") + + cusparseXcsrgeamNnz(handle, m, n, descrA, nnzA, csrRowPtrA, csrColIndA, + descrB, nnzB, csrRowPtrB, csrColIndB, descrC, + csrRowPtrC, nnzTotalDevHostPtr) + nnzC = nnzTotalDevHostPtr[0] + if return_descrC: + return descrC, csrRowPtrC, nnzC + else: + return nnzC - Returns the version number of the CUSPARSE library. - Parameters - ---------- - handle : int - CUSPARSE library context. +@defineIf(toolkit_version >= (5, 0, 0)) +def csrgeam(m, n, descrA, csrValA, csrRowPtrA, csrColIndA, descrB, csrValB, + csrRowPtrB, csrColIndB, handle=None, alpha=1.0, beta=0.0, + nnzA=None, nnzB=None, check_inputs=True): + """ add two sparse matrices: C = alpha*A + beta*B. + higher level wrapper to cusparsecsrgemm routines. 
+ """ + if check_inputs: + for array in [csrValA, csrRowPtrA, csrColIndA, csrValB, csrRowPtrB, + csrColIndB]: + if not isinstance(array, pycuda.gpuarray.GPUArray): + raise ValueError("all csr* inputs must be a pyCUDA gpuarray") + if cusparseGetMatType(descrA) != CUSPARSE_MATRIX_TYPE_GENERAL: + raise ValueError("Only general matrix type supported") + if cusparseGetMatType(descrB) != CUSPARSE_MATRIX_TYPE_GENERAL: + raise ValueError("Only general matrix type supported") + + if handle is None: + handle = misc._global_cusparse_handle + if nnzA is None: + nnzA = csrValA.size + if nnzB is None: + nnzB = csrValB.size + dtype = csrValA.dtype + + # perform some basic sanity checks + if check_inputs: + if csrValA.size != nnzA: + raise ValueError("length of csrValA array must match nnzA") + if csrValB.size != nnzB: + raise ValueError("length of csrValB array must match nnzB") + if (dtype != csrValB.dtype): + raise ValueError("incompatible dtypes") + if csrRowPtrA.size != m + 1: + raise ValueError("bad csrRowPtrA size") + if csrRowPtrB.size != m + 1: + raise ValueError("bad csrRowPtrB size") + + # allocate output matrix C descr and row pointers + descrC = cusparseCreateMatDescr() + cusparseSetMatType(descrC, CUSPARSE_MATRIX_TYPE_GENERAL) + alloc = misc._global_cusparse_allocator + csrRowPtrC = gpuarray.zeros((m+1, ), dtype=np.int32, allocator=alloc) + + # call csrgemmNnz to determine nnzC and fill in csrRowPtrC + nnzC = _csrgeamNnz(m, n, descrA, csrRowPtrA, csrColIndA, descrB, + csrRowPtrB, csrColIndB, handle=handle, descrC=descrC, + csrRowPtrC=csrRowPtrC, nnzA=nnzA, nnzB=nnzB, + check_inputs=False) + + # allocated rest of C based on nnzC + csrValC = gpuarray.zeros((nnzC, ), dtype=dtype, allocator=alloc) + csrColIndC = gpuarray.zeros((nnzC, ), dtype=np.int32, allocator=alloc) + if dtype == np.float32: + fn = cusparseScsrgeam + elif dtype == np.float64: + fn = cusparseDcsrgeam + elif dtype == np.complex64: + fn = cusparseCcsrgeam + elif dtype == np.complex128: + fn = cusparseZcsrgeam + else: + raise ValueError("unsupported sparse matrix dtype: %s" % dtype) - Returns - ------- - version : int - CUSPARSE library version number. + fn(handle, m, n, alpha, descrA, nnzA, csrValA, csrRowPtrA, csrColIndA, + beta, descrB, nnzB, csrValB, csrRowPtrB, csrColIndB, descrC, + csrValC, csrRowPtrC, csrColIndC) + return (descrC, csrValC, csrRowPtrC, csrColIndC) - """ - version = ctypes.c_int() - status = _libcusparse.cusparseGetVersion(handle, - ctypes.byref(version)) - cusparseCheckStatus(status) - return version.value +@defineIf(toolkit_version >= (5, 0, 0)) +def _csrgemmNnz(m, n, k, descrA, csrRowPtrA, csrColIndA, descrB, csrRowPtrB, + csrColIndB, handle=None, descrC=None, csrRowPtrC=None, + nnzA=None, nnzB=None, transA=CUSPARSE_OPERATION_NON_TRANSPOSE, + transB=CUSPARSE_OPERATION_NON_TRANSPOSE, check_inputs=True): + """ support routine for csrgemm. -_libcusparse.cusparseSetStream.restype = int -_libcusparse.cusparseSetStream.argtypes = [ctypes.c_int, - ctypes.c_int] -def cusparseSetStream(handle, id): + higher level wrapper to cusparseXcsrgemmNnz. """ - Sets the CUSPARSE stream in which kernels will run. 
+ if check_inputs: + for array in [csrRowPtrA, csrColIndA, csrRowPtrB, csrColIndB]: + if not isinstance(array, pycuda.gpuarray.GPUArray): + raise ValueError("all csr* inputs must be a pyCUDA gpuarray") + if cusparseGetMatType(descrA) != CUSPARSE_MATRIX_TYPE_GENERAL: + raise ValueError("Only general matrix type supported") + if cusparseGetMatType(descrB) != CUSPARSE_MATRIX_TYPE_GENERAL: + raise ValueError("Only general matrix type supported") + if descrC is not None: + if not isinstance(csrRowPtrC, pycuda.gpuarray.GPUArray): + raise ValueError("csrRowPtrC must be a gpuarray or None") + if cusparseGetMatType(descrC) != CUSPARSE_MATRIX_TYPE_GENERAL: + raise ValueError("Only general matrix type supported") + if handle is None: + handle = misc._global_cusparse_handle + if nnzA is None: + nnzA = csrColIndA.size + if nnzB is None: + nnzB = csrColIndB.size + if descrC is None: + return_descrC = True + descrC = cusparseCreateMatDescr() + cusparseSetMatType(descrC, CUSPARSE_MATRIX_TYPE_GENERAL) + else: + return_descrC = False + if csrRowPtrC is None: + alloc = misc._global_cusparse_allocator + csrRowPtrC = gpuarray.zeros((m+1, ), dtype=np.int32, + allocator=alloc) + nnzTotalDevHostPtr = ffi.new('int *', 0) + + # perform some basic sanity checks + if check_inputs: + if csrColIndA.size != nnzA: + raise ValueError("length of csrValA array must match nnzA") + if csrColIndB.size != nnzB: + raise ValueError("length of csrValB array must match nnzB") + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + ptrA_size = m + 1 + else: + ptrA_size = k + 1 + if transB == CUSPARSE_OPERATION_NON_TRANSPOSE: + ptrB_size = k + 1 + else: + ptrB_size = n + 1 + if csrRowPtrA.size != ptrA_size: + raise ValueError("length of csrRowPtrA array must be m+1") + if csrRowPtrB.size != ptrB_size: + raise ValueError("length of csrRowPtrB array must be n+1") + + cusparseXcsrgemmNnz(handle, transA, transB, m, n, k, descrA, nnzA, + csrRowPtrA, csrColIndA, descrB, nnzB, csrRowPtrB, + csrColIndB, descrC, csrRowPtrC, nnzTotalDevHostPtr) + nnzC = nnzTotalDevHostPtr[0] + if return_descrC: + return descrC, csrRowPtrC, nnzC + else: + return nnzC - Parameters - ---------- - handle : int - CUSPARSE library context. - id : int - Stream ID. - """ +@defineIf(toolkit_version >= (5, 0, 0)) +def csrgemm(m, n, k, descrA, csrValA, csrRowPtrA, csrColIndA, descrB, csrValB, + csrRowPtrB, csrColIndB, handle=None, nnzA=None, nnzB=None, + transA=CUSPARSE_OPERATION_NON_TRANSPOSE, + transB=CUSPARSE_OPERATION_NON_TRANSPOSE, check_inputs=True): + """ multiply two sparse matrices: C = transA(A) * transB(B) - status = _libcusparse.cusparseSetStream(handle, id) - cusparseCheckStatus(status) + higher level wrapper to cusparsecsrgemm routines. -_libcusparse.cusparseCreateMatDescr.restype = int -_libcusparse.cusparseCreateMatDescr.argtypes = [cusparseMatDescr] -def cusparseCreateMatDescr(): - """ - Initialize a sparse matrix descriptor. + Note + ---- + transA(A) is shape m x k. transB(B) is shape k x n. C is shape m x n - Initializes the `MatrixType` and `IndexBase` fields of the matrix - descriptor to the default values `CUSPARSE_MATRIX_TYPE_GENERAL` - and `CUSPARSE_INDEX_BASE_ZERO`. + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + m, k = A.shape + else: + k, m = A.shape - Returns - ------- - desc : cusparseMatDescr - Matrix descriptor. 
+ if transB == CUSPARSE_OPERATION_NON_TRANSPOSE: + k, n = B.shape + else: + n, k = B.shape """ - desc = cusparseMatrixDesc() - status = _libcusparse.cusparseCreateMatDescr(ctypes.byref(desc)) - cusparseCheckStatus(status) - return desc + if check_inputs: + for array in [csrValA, csrRowPtrA, csrColIndA, csrValB, csrRowPtrB, + csrColIndB]: + if not isinstance(array, pycuda.gpuarray.GPUArray): + raise ValueError("all csr* inputs must be a pyCUDA gpuarray") + if cusparseGetMatType(descrA) != CUSPARSE_MATRIX_TYPE_GENERAL: + raise ValueError("Only general matrix type supported") + if cusparseGetMatType(descrB) != CUSPARSE_MATRIX_TYPE_GENERAL: + raise ValueError("Only general matrix type supported") + + if handle is None: + handle = misc._global_cusparse_handle + if nnzA is None: + nnzA = csrValA.size + if nnzB is None: + nnzB = csrValB.size + dtype = csrValA.dtype + + # perform some basic sanity checks + if check_inputs: + if csrValA.size != nnzA: + raise ValueError("length of csrValA array must match nnzA") + if csrValB.size != nnzB: + raise ValueError("length of csrValB array must match nnzB") + if (dtype != csrValB.dtype): + raise ValueError("incompatible dtypes") + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + ptrA_size = m + 1 + else: + ptrA_size = k + 1 + if transB == CUSPARSE_OPERATION_NON_TRANSPOSE: + ptrB_size = k + 1 + else: + ptrB_size = n + 1 + if csrRowPtrA.size != ptrA_size: + raise ValueError("bad csrRowPtrA size") + if csrRowPtrB.size != ptrB_size: + raise ValueError("bad csrRowPtrB size") + + # allocate output matrix C descr and row pointers + descrC = cusparseCreateMatDescr() + cusparseSetMatType(descrC, CUSPARSE_MATRIX_TYPE_GENERAL) + alloc = misc._global_cusparse_allocator + csrRowPtrC = gpuarray.zeros((m+1, ), dtype=np.int32, allocator=alloc) + + # call csrgemmNnz to determine nnzC and fill in csrRowPtrC + nnzC = _csrgemmNnz(m, n, k, descrA, csrRowPtrA, csrColIndA, descrB, + csrRowPtrB, csrColIndB, handle=handle, descrC=descrC, + csrRowPtrC=csrRowPtrC, nnzA=nnzA, nnzB=nnzB, + transA=transA, transB=transB, check_inputs=False) + + # allocated rest of C based on nnzC + csrValC = gpuarray.zeros((nnzC, ), dtype=dtype, allocator=alloc) + csrColIndC = gpuarray.zeros((nnzC, ), dtype=np.int32, allocator=alloc) + + if dtype == np.float32: + fn = cusparseScsrgemm + elif dtype == np.float64: + fn = cusparseDcsrgemm + elif dtype == np.complex64: + fn = cusparseCcsrgemm + elif dtype == np.complex128: + fn = cusparseZcsrgemm + else: + raise ValueError("unsupported sparse matrix dtype: %s" % dtype) + + fn(handle, transA, transB, m, n, k, descrA, nnzA, csrValA, csrRowPtrA, + csrColIndA, descrB, nnzB, csrValB, csrRowPtrB, csrColIndB, descrC, + csrValC, csrRowPtrC, csrColIndC) + return (descrC, csrValC, csrRowPtrC, csrColIndC) + + +class CSR(object): + """ cuSPARSE CSR (compressed sparse row) matrix object """ + def __init__(self, descr, csrVal, csrRowPtr, csrColInd, shape, + handle=None): + + if csrRowPtr.size != (shape[0] + 1): + raise ValueError("size of RowPtr inconsistent with shape") + if csrVal.size != csrColInd.size: + raise ValueError("size of csrVal and csrColInd inconsistent") + if csrColInd.dtype != np.int32: + raise ValueError("csrColInd must be a 32-bit integer array") + if csrRowPtr.dtype != np.int32: + raise ValueError("csrRowPtr must be a 32-bit integer array") + + # if input arrays are on the host, transfer them to the GPU + self._alloc = misc._global_cusparse_allocator + + if isinstance(csrVal, np.ndarray): + csrVal = gpuarray.to_gpu(csrVal, allocator=self._alloc) + if 
isinstance(csrRowPtr, np.ndarray): + csrRowPtr = gpuarray.to_gpu(csrRowPtr, allocator=self._alloc) + if isinstance(csrColInd, np.ndarray): + csrColInd = gpuarray.to_gpu(csrColInd, allocator=self._alloc) + + if handle is None: + self.handle = misc._global_cusparse_handle + else: + self.handle = handle + self.descr = descr + self.Val = csrVal + self.RowPtr = csrRowPtr + self.ColInd = csrColInd + self.dtype = csrVal.dtype + self.shape = shape + + # also mirror scipy.sparse.csr_matrix property names for convenience + self.data = csrVal + self.indices = csrColInd + self.indptr = csrRowPtr + + # properties + self.__matrix_type = None + self.__index_base = None + self.__diag_type = None + self.__fill_mode = None + + # alternative constructor from dense ndarray, gpuarray or cuSPARSE matrix + @classmethod + def to_CSR(cls, A, handle=None): + """ convert dense numpy or gpuarray matrices as well as any + scipy.sparse matrix formats to cuSPARSE CSR. + """ + alloc = misc._global_cusparse_allocator + if has_scipy and isinstance(A, scipy.sparse.spmatrix): + """Convert scipy.sparse CSR, COO, BSR, etc to cuSPARSE CSR""" + # converting BSR, COO, etc to CSR + if A.dtype.char not in ['f', 'd', 'F', 'D']: + raise ValueError("unsupported numpy dtype {}".format(A.dtype)) + + if not isinstance(A, scipy.sparse.csr_matrix): + A = A.tocsr() + + # avoid .astype() calls if possible for speed + if A.indptr.dtype != np.int32: + csrRowPtr = gpuarray.to_gpu(A.indptr.astype(np.int32), + allocator=alloc) + else: + csrRowPtr = gpuarray.to_gpu(A.indptr, allocator=alloc) + + if A.indices.dtype != np.int32: + csrColInd = gpuarray.to_gpu(A.indices.astype(np.int32), + allocator=alloc) + else: + csrColInd = gpuarray.to_gpu(A.indices, allocator=alloc) + + csrVal = gpuarray.to_gpu(A.data, allocator=alloc) + descr = cusparseCreateMatDescr() + cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL) + cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO) + else: + """Take dense numpy array or pyCUDA gpuarray and convert to CSR """ + if not isinstance(A, pycuda.gpuarray.GPUArray): + A = np.asfortranarray(np.atleast_2d(A)) + A = gpuarray.to_gpu(A, allocator=alloc) + else: + # dense matrix must be column-major + if not A.flags.f_contiguous: + # TODO :an change to Fortran ordering be done directly on + # the gpuarray without going back to numpy? 
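+                    # Current approach: copy back to the host, convert to
+                    # column-major (Fortran) order with numpy, then transfer
+                    # to the GPU again.  This costs two extra host/device
+                    # copies but keeps the code simple; the dense2csr call
+                    # below expects column-major input.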
+ A = A.get() + A = np.asfortranarray(A) + A = gpuarray.to_gpu(A, allocator=alloc) + + (descr, csrVal, csrRowPtr, csrColInd) = dense2csr(A, handle=handle) + return cls(descr, csrVal, csrRowPtr, csrColInd, A.shape) + + @property + def matrix_type(self): + """ matrix type """ + if self.__matrix_type is None: + return cusparseGetMatType(self.descr) + else: + return self.__matrix_type + + @matrix_type.setter + def matrix_type(self, matrix_type): + """ matrix type """ + self.__matrix_type = cusparseSetMatType(self.descr, matrix_type) + + @property + def index_base(self): + """ matrix index base """ + if self.__index_base is None: + return cusparseGetMatIndexBase(self.descr) + else: + return self.__index_base + + @index_base.setter + def index_base(self, index_base): + """ matrix index base """ + self.__index_base = cusparseSetMatIndexBase(self.descr, index_base) + + @property + def diag_type(self): + """ matrix diag type """ + if self.__diag_type is None: + return cusparseGetMatDiagType(self.descr) + else: + return self.__diag_type + + @diag_type.setter + def diag_type(self, diag_type): + """matrix diag type """ + self.__diag_type = cusparseSetMatDiagType(self.descr, diag_type) + + @property + def fill_mode(self): + """matrix fill mode """ + if self.__fill_mode is None: + return cusparseGetMatFillMode(self.descr) + else: + return self.__fill_mode + + @fill_mode.setter + def fill_mode(self, fill_mode): + """matrix fill mode """ + self.__fill_mode = cusparseSetMatFillMode(self.descr, fill_mode) + + @property + def nnz(self): + """ number of non-zeros """ + return self.Val.size + + # mirror the function name from scipy + def getnnz(self): + """ return number of non-zeros""" + return self.nnz + + def get(self): + """ return as scipy csr_matrix in host memory """ + from scipy.sparse import csr_matrix + return csr_matrix((self.data.get(), + self.indices.get(), + self.indptr.get()), + shape=self.shape) + + def todense(self, lda=None, to_cpu=False, handle=None, stream=None, + autosync=True): + """ return dense gpuarray if to_cpu=False, numpy ndarray if to_cpu=True + """ + m, n = self.shape + if lda is None: + lda = m + else: + assert lda >= m + if handle is None: + handle = self.handle + if stream is not None: + cusparseSetStream(handle, stream.handle) + A = csr2dense(m, n, self.descr, self.Val, self.RowPtr, + self.ColInd, handle=handle, lda=lda) + if autosync: + drv.Context.synchronize() + if to_cpu: + return A.get() + else: + return A + + def mv(self, x, transA=CUSPARSE_OPERATION_NON_TRANSPOSE, alpha=1.0, + beta=0.0, y=None, check_inputs=True, to_cpu=False, + autosync=True, handle=None, stream=None): + """ multiplication by dense vector x: y = alpha*transA(A)*x + beta*y. + """ + m, n = self.shape + + # try moving list or numpy array to GPU + if not isinstance(x, pycuda.gpuarray.GPUArray): + x = np.atleast_1d(x) # .astype(self.dtype) + x = gpuarray.to_gpu(x, allocator=self._alloc) + if handle is None: + handle = self.handle + if stream is not None: + cusparseSetStream(handle, stream.handle) + y = csrmv(self.descr, self.Val, self.RowPtr, self.ColInd, m, n, + x, handle=handle, transA=transA, alpha=alpha, beta=beta, y=y, + check_inputs=check_inputs) + if autosync: + drv.Context.synchronize() + if to_cpu: + return y.get() + else: + return y + + def mm(self, B, transA=CUSPARSE_OPERATION_NON_TRANSPOSE, alpha=1.0, + beta=0.0, C=None, ldb=None, ldc=None, check_inputs=True, + to_cpu=False, autosync=True, handle=None, stream=None): + """ multiplication by dense matrix B: C = alpha*transA(A)*B + beta*C. 
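+
+        Example (illustrative; assumes cusparse.init() has been called so
+        that a global handle and allocator exist):
+
+            import numpy as np
+            A_csr = CSR.to_CSR(np.asarray([[1., 0.], [0., 2.]]))
+            B = np.ones((2, 3), order='F')
+            C = A_csr.mm(B, alpha=2.0, to_cpu=True)  # C = 2 * A * B as numpy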
+ """ + m, k = self.shape + # try moving list or numpy array to GPU + if not isinstance(B, pycuda.gpuarray.GPUArray): + B = np.atleast_2d(B) # .astype(self.dtype) + B = gpuarray.to_gpu(B, allocator=self._alloc) + n = B.shape[1] + if handle is None: + handle = self.handle + if stream is not None: + cusparseSetStream(handle, stream.handle) + C = csrmm(m=m, n=n, k=k, descrA=self.descr, csrValA=self.Val, + csrRowPtrA=self.RowPtr, csrColIndA=self.ColInd, B=B, + handle=handle, C=C, transA=transA, alpha=alpha, beta=beta, + ldb=ldb, ldc=ldc, check_inputs=check_inputs) + if autosync: + drv.Context.synchronize() + if to_cpu: + return C.get() + else: + return C + + @defineIf(toolkit_version >= (5, 5, 0)) + def mm2(self, B, transA=CUSPARSE_OPERATION_NON_TRANSPOSE, + transB=CUSPARSE_OPERATION_NON_TRANSPOSE, alpha=1.0, beta=0.0, + C=None, ldb=None, ldc=None, check_inputs=True, to_cpu=False, + autosync=True, handle=None, stream=None): + """ multiplication by dense matrix B: C = alpha*transA(A)*B + beta*C. + version 2 + """ + if toolkit_version < (5, 5, 0): + raise ImportError("mm2 not implemented prior to CUDA v5.5") + m, k = self.shape + # try moving list or numpy array to GPU + if not isinstance(B, pycuda.gpuarray.GPUArray): + B = np.atleast_2d(B) # .astype(self.dtype) + B = gpuarray.to_gpu(B, allocator=self._alloc) + if transB == CUSPARSE_OPERATION_NON_TRANSPOSE: + n = B.shape[1] + else: + n = B.shape[0] + if handle is None: + handle = self.handle + if stream is not None: + cusparseSetStream(handle, stream.handle) + C = csrmm2(handle=handle, m=m, n=n, k=k, descrA=self.descr, + csrValA=self.Val, csrRowPtrA=self.RowPtr, + csrColIndA=self.ColInd, B=B, C=C, transA=transA, + transB=transB, alpha=alpha, beta=beta, ldb=ldb, ldc=ldc, + check_inputs=check_inputs) + if autosync: + drv.Context.synchronize() + if to_cpu: + return C.get() + else: + return C + + @defineIf(toolkit_version >= (5, 0, 0)) + def geam(self, B, alpha=1.0, beta=1.0, check_inputs=True, autosync=True, + handle=None, stream=None): + """ addition of sparse matrix B: C = alpha*A + beta*B """ + if toolkit_version < (5, 0, 0): + raise ImportError("geam not implemented prior to CUDA v5.0") + m, n = self.shape + if not isinstance(B, CSR): + # try converting B to cuSPARSE CSR + B = CSR.to_CSR(B, handle=self.handle) + if self.shape != B.shape: + raise ValueError("Incompatible shapes") + if handle is None: + handle = self.handle + if stream is not None: + cusparseSetStream(handle, stream.handle) + descrC, ValC, RowPtrC, ColIndC = csrgeam( + handle=handle, m=m, n=n, descrA=self.descr, csrValA=self.Val, + csrRowPtrA=self.RowPtr, csrColIndA=self.ColInd, descrB=B.descr, + csrValB=B.Val, csrRowPtrB=B.RowPtr, csrColIndB=B.ColInd, + alpha=alpha, beta=beta, nnzA=self.nnz, nnzB=B.nnz, + check_inputs=True) + C = CSR(descr=descrC, csrVal=ValC, csrRowPtr=RowPtrC, + csrColInd=ColIndC, shape=self.shape, handle=self.handle) + if autosync: + drv.Context.synchronize() + return C + + @defineIf(toolkit_version >= (5, 0, 0)) + def gemm(self, B, transA=CUSPARSE_OPERATION_NON_TRANSPOSE, + transB=CUSPARSE_OPERATION_NON_TRANSPOSE, check_inputs=True, + autosync=True, handle=None, stream=None): + """ multiplication by sparse matrix B: C = transA(A) * transB(B) """ + if toolkit_version < (5, 0, 0): + raise ImportError("gemm not implemented prior to CUDA v5.0") + + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + m, k = self.shape + else: + k, m = self.shape + if not isinstance(B, CSR): + # try converting B to cuSPARSE CSR + B = CSR.to_CSR(B, handle=self.handle) + if transB == 
CUSPARSE_OPERATION_NON_TRANSPOSE:
+            n = B.shape[1]
+        else:
+            n = B.shape[0]
+        if handle is None:
+            handle = self.handle
+        if stream is not None:
+            cusparseSetStream(handle, stream.handle)
+        descrC, ValC, RowPtrC, ColIndC = csrgemm(
+            handle=handle, m=m, n=n, k=k, descrA=self.descr,
+            csrValA=self.Val, csrRowPtrA=self.RowPtr, csrColIndA=self.ColInd,
+            descrB=B.descr, csrValB=B.Val, csrRowPtrB=B.RowPtr,
+            csrColIndB=B.ColInd, nnzA=self.nnz, nnzB=B.nnz, transA=transA,
+            transB=transB, check_inputs=True)
+        if autosync:
+            drv.Context.synchronize()
+        C = CSR(descr=descrC, csrVal=ValC, csrRowPtr=RowPtrC,
+                csrColInd=ColIndC, shape=(m, n), handle=self.handle)
+        return C
-_libcusparse.cusparseDestroyMatDescr.restype = int
-_libcusparse.cusparseDestroyMatDescr.argtypes = [ctypes.c_int]
-def cusparseDestroyMatDescr(desc):
     """
-    Releases the memory allocated for the matrix descriptor.
+    start of: subset of methods in scipy.sparse.compressed._cs_matrix
+    """
+    @property
+    def A(self):
+        "The dense array representation."
+        return self.todense()
+
+    @property
+    def T(self):
+        "The transpose operator."
+        return self.transpose()
+
+    @property
+    def H(self):
+        "The adjoint operator."
+        return self.getH()
+
+    @property
+    def real(self):
+        "The real part."
+        return self._real()
+
+    @property
+    def imag(self):
+        "The imaginary part."
+        return self._imag()
+
+    @property
+    def size(self):
+        "The number of stored values (nnz)."
+        return self.getnnz()
+
+    @property
+    def nbytes(self):
+        """ approximate object size in bytes (size of data,
+        column indices and row pointers only). """
+        nbytes = self.data.nbytes + self.indptr.nbytes + self.indices.nbytes
+        return nbytes
+
+    def transpose(self):
+        m, n = self.shape
+        # use csr2csc to perform the transpose
+        cscVal, cscColPtr, cscRowInd = csr2csc(
+            m, n, self.Val, self.RowPtr, self.ColInd, handle=self.handle,
+            nnz=self.nnz)
+        drv.Context.synchronize()
+        return CSR(copyMatDescr(self.descr), cscVal, cscColPtr,
+                   cscRowInd, (n, m), handle=self.handle)
+
+    def getH(self):
+        return self.transpose().conj()
+
+    def conjugate(self):
+        return self.conj()
+
+    # implement _with_data similar to scipy.sparse.data._data_matrix
+    def _with_data(self, data, copy=True):
+        """Returns a matrix with the same sparsity structure as self,
+        but with different data. By default the structure arrays
+        (i.e. .indptr and .indices) are copied.
+        """
+        if copy:
+            return self.__class__(copyMatDescr(self.descr), data,
+                                  self.RowPtr.copy(), self.ColInd.copy(),
+                                  self.shape, handle=self.handle)
+        else:
+            return self.__class__(self.descr, data,
+                                  self.RowPtr, self.ColInd,
+                                  self.shape, handle=self.handle)
-    Parameters
-    ----------
-    desc : cusparseMatDescr
-        Matrix descriptor.
+    """
+    end of: subset of methods in scipy.sparse.compressed._cs_matrix
+    """
     """
+    start of: subset of methods in scipy.sparse.data._data_matrix
+    """
+    def conj(self):
+        return self._with_data(self.data.conj())
-    status = _libcusparse.cusparseDestroyMatDescr(desc)
-    cusparseCheckStatus(status)
+    def _real(self):
+        return self._with_data(self.data.real)
-_libcusparse.cusparseSetMatType.restype = int
-_libcusparse.cusparseSetMatType.argtypes = [cusparseMatDescr,
-                                            ctypes.c_int]
-def cusparseSetMatType(desc, type):
-    """
-    Sets the matrix type of the specified matrix.
+    def _imag(self):
+        return self._with_data(self.data.imag)
-    Parameters
-    ----------
-    desc : cusparseMatDescr
-        Matrix descriptor.
-    type : int
-        Matrix type.
+    def __abs__(self):
+        return self._with_data(abs(self.data))
-    """
+    def __neg__(self):
+        return self._with_data(-self.data)
-    status = _libcusparse.cusparseSetMatType(desc, type)
-    cusparseCheckStatus(status)
+    def __imul__(self, other):  # self *= other
+        if isscalarlike(other):
+            self.data *= other
+            return self
+        else:
+            return NotImplemented
-_libcusparse.cusparseGetMatType.restype = int
-_libcusparse.cusparseGetMatType.argtypes = [cusparseMatDescr]
-def cusparseGetMatType(desc):
-    """
-    Gets the matrix type of the specified matrix.
+    def __itruediv__(self, other):  # self /= other
+        if isscalarlike(other):
+            recip = 1.0 / other
+            self.data *= recip
+            return self
+        else:
+            return NotImplemented
-    Parameters
-    ----------
-    desc : cusparseMatDescr
-        Matrix descriptor.
+    def astype(self, t):
+        return self._with_data(self.data.astype(t))
-    Returns
-    -------
-    type : int
-        Matrix type.
+    def copy(self):
+        return self._with_data(self.data.copy(), copy=True)
-    """
+    def _mul_scalar(self, other):
+        return self._with_data(self.data * other)
-    return _libcusparse.cusparseGetMatType(desc)
-
-# Format conversion functions:
-_libcusparse.cusparseSnnz.restype = int
-_libcusparse.cusparseSnnz.argtypes = [ctypes.c_int,
-                                      ctypes.c_int,
-                                      ctypes.c_int,
-                                      ctypes.c_int,
-                                      cusparseMatDescr,
-                                      ctypes.c_void_p,
-                                      ctypes.c_int,
-                                      ctypes.c_void_p,
-                                      ctypes.c_void_p]
-def cusparseSnnz(handle, dirA, m, n, descrA, A, lda,
-                 nnzPerRowColumn, nnzTotalDevHostPtr):
     """
-    Compute number of non-zero elements per row, column, or dense matrix.
-
-    Parameters
-    ----------
-    handle : int
-        CUSPARSE library context.
-    dirA : int
-        Data direction of elements.
-    m : int
-        Rows in A.
-    n : int
-        Columns in A.
-    descrA : cusparseMatDescr
-        Matrix descriptor.
-    A : pycuda.gpuarray.GPUArray
-        Dense matrix of dimensions (lda, n).
-    lda : int
-        Leading dimension of A.
-
-    Returns
-    -------
-    nnzPerRowColumn : pycuda.gpuarray.GPUArray
-        Array of length m or n containing the number of
-        non-zero elements per row or column, respectively.
-    nnzTotalDevHostPtr : pycuda.gpuarray.GPUArray
-        Total number of non-zero elements in device or host memory.
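+    # The data methods above mirror scipy.sparse.data._data_matrix, so code
+    # like the following illustrative sketch works on a CSR instance A_csr
+    # (built e.g. with CSR.to_CSR after cusparse.init()):
+    #
+    #     B_csr = A_csr.copy()                    # same sparsity, copied data
+    #     A_csr *= 2.0                            # in-place scalar scaling
+    #     A_dense = A_csr.todense(to_cpu=True)    # dense numpy copy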
- + end of: subset of methods in scipy.sparse.data._data_matrix """ - # Unfinished: - nnzPerRowColumn = gpuarray.empty() - nnzTotalDevHostPtr = gpuarray.empty() - - status = _libcusparse.cusparseSnnz(handle, dirA, m, n, - descrA, int(A), lda, - int(nnzPerRowColumn), int(nnzTotalDevHostPtr)) - cusparseCheckStatus(status) - return nnzPerVector, nnzHost - -_libcusparse.cusparseSdense2csr.restype = int -_libcusparse.cusparseSdense2csr.argtypes = [ctypes.c_int, - ctypes.c_int, - ctypes.c_int, - cusparseMatDescr, - ctypes.c_void_p, - ctypes.c_int, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p] -def cusparseSdense2csr(handle, m, n, descrA, A, lda, - nnzPerRow, csrValA, csrRowPtrA, csrColIndA): - # Unfinished - pass + def __del__(self): + """ cleanup descriptor upon object deletion """ + cusparseDestroyMatDescr(self.descr) + # don't destroy the handle as other objects may be using it + + def __repr__(self): + rstr = "CSR matrix:\n" + rstr += "\tshape = {}\n".format(self.shape) + rstr += "\tdtype = {}\n".format(self.dtype) + rstr += "\tMatrixType = {}\n".format(self.matrix_type) + rstr += "\tIndexBase = {}\n".format(self.index_base) + rstr += "\tDiagType = {}\n".format(self.diag_type) + rstr += "\tFillMode = {}\n".format(self.fill_mode) + rstr += "\tcontext = {}\n\n".format(self.handle) + rstr += "\tnnz = {}\n".format(self.nnz) + rstr += "\tRowPtr = {}\n".format(self.RowPtr) + rstr += "\tVal = {}\n".format(self.Val) + return rstr diff --git a/scikits/cuda/cusparse_variable_descriptions.json b/scikits/cuda/cusparse_variable_descriptions.json new file mode 100644 index 0000000..bd4df60 --- /dev/null +++ b/scikits/cuda/cusparse_variable_descriptions.json @@ -0,0 +1,184 @@ +{ + "A": "array of dimensions (lda, n)", + "B": "array of dimensions (ldb, n)", + "C": "array of dimensions (ldc, n)", + "X": "array of dimensions (ldx, n)", + "Y": "array of dimensions (ldy, n)", + "alpha": "scalar used for multiplication", + "base": "enumerated indexBase type", + "baseIdx": "enumerated indexBase type", + "batchCount": "number of systems to solve", + "batchStride": "stride (number of elements) that separates the vectors of every system (must be at least m)", + "beta": "scalar used for multiplication", + "blockDim": "block dimension of sparse matrix A, larger than zero.", + "blockSize": "block dimension of sparse matrix A, larger than zero.", + "boost_val": "boost value to replace a numerical zero", + "bscColPtr": "integer array of nb+1 elements that contains the start of every block column and the end of the last block column plus one", + "bscRowInd": "integer array of nnzb row indices of the non-zero blocks of matrix A", + "bscVal": "array of nnzb*rowBlockDim*colBlockDim non-zero elements of matrix A. It is only filled-in if copyValues is set to CUSPARSE_ACTION_NUMERIC", + "bscSortedVal": "array of nnzb*rowBlockDim*colBlockDim non-zero elements of matrix A. 
It is only filled-in if copyValues is set to CUSPARSE_ACTION_NUMERIC", + "bsrColInd": "integer array of nnz ( = csrRowPtrA(mb) - csrRowPtrA(0) ) column indices of the nonzero blocks of matrix A", + "bsrColIndA": "integer array of nnz ( = csrRowPtrA(mb) - csrRowPtrA(0) ) column indices of the nonzero blocks of matrix A", + "bsrColIndC": "integer array of nnz ( = csrRowPtrC(mb) - csrRowPtrC(0) ) column indices of the nonzero blocks of matrix C", + "bsrEndPtr": "integer array of mb elements that contains the end of the every block row plus one", + "bsrEndPtrA": "integer array of mb elements that contains the end of the every block row plus one", + "bsrMaskPtr": "integer array of sizeOfMask elements that contains the indices corresponding to updated block rows", + "bsrMaskPtrA": "integer array of sizeOfMask elements that contains the indices corresponding to updated block rows", + "bsrRowPtr": "integer array of mb + 1 elements that contains the start of every block row and the end of the last block row plus one", + "bsrRowPtrA": "integer array of mb + 1 elements that contains the start of every block row and the end of the last block row plus one", + "bsrRowPtrC": "integer array of mb + 1 elements that contains the start of every block row and the end of the last block row plus one", + "bsrVal": "array of nnz ( = csrRowPtrA(mb) - csrRowPtrA(0) ) nonzero blocks of matrix A", + "bsrValA": "array of nnz ( = csrRowPtrA(mb) - csrRowPtrA(0) ) nonzero blocks of matrix A", + "bsrValC": "array of nnz ( = csrRowPtrC(mb) - csrRowPtrC(0) ) nonzero blocks of matrix C", + "bsrSortedColInd": "integer array of nnz ( = csrSortedRowPtrA(mb) - csrSortedRowPtrA(0) ) column indices of the nonzero blocks of matrix A", + "bsrSortedColIndA": "integer array of nnz ( = csrSortedRowPtrA(mb) - csrSortedRowPtrA(0) ) column indices of the nonzero blocks of matrix A", + "bsrSortedColIndC": "integer array of nnz ( = csrSortedRowPtrC(mb) - csrSortedRowPtrC(0) ) column indices of the nonzero blocks of matrix C", + "bsrSortedEndPtr": "integer array of mb elements that contains the end of the every block row plus one", + "bsrSortedEndPtrA": "integer array of mb elements that contains the end of the every block row plus one", + "bsrSortedMaskPtr": "integer array of sizeOfMask elements that contains the indices corresponding to updated block rows", + "bsrSortedMaskPtrA": "integer array of sizeOfMask elements that contains the indices corresponding to updated block rows", + "bsrSortedRowPtr": "integer array of mb + 1 elements that contains the start of every block row and the end of the last block row plus one", + "bsrSortedRowPtrA": "integer array of mb + 1 elements that contains the start of every block row and the end of the last block row plus one", + "bsrSortedRowPtrC": "integer array of mb + 1 elements that contains the start of every block row and the end of the last block row plus one", + "bsrSortedVal": "array of nnz ( = csrSortedRowPtrA(mb) - csrSortedRowPtrA(0) ) nonzero blocks of matrix A", + "bsrSortedValA": "array of nnz ( = csrSortedRowPtrA(mb) - csrSortedRowPtrA(0) ) nonzero blocks of matrix A", + "bsrSortedValC": "array of nnz ( = csrSortedRowPtrC(mb) - csrSortedRowPtrC(0) ) nonzero blocks of matrix C", + "c": "cosine element of the rotation matrix", + "colBlockDim": "number of columns within a block of A", + "colBlockDimA": "number of columns within a block of A", + "colBlockDimC": "number of columns within a block of C", + "cooRowInd": "integer array of nnz uncompressed row indices.", + "copyValues": "enumerated 
CUSPARSE_ACTION", + "cscColPtr": "integer array of n+1 elements that contains the start of every row and the end of the last column plus one", + "cscColPtrA": "integer array of n+1 elements that contains the start of every row and the end of the last column plus one", + "cscRowInd": "integer array of nnz ( = cscColPtrA(m) - cscColPtrA(0) ) row indices of the nonzero elements of matrix A", + "cscRowIndA": "integer array of nnz ( = cscColPtrA(m) - cscColPtrA(0) ) row indices of the nonzero elements of matrix A", + "cscVal": "array of nnz ( = cscColPtrA(m) - cscColPtrA(0) ) nonzero elements of matrix A", + "cscValA": "array of nnz ( = cscColPtrA(m) - cscColPtrA(0) ) nonzero elements of matrix A", + "csrColInd": "integer array of nnz ( = csrRowPtrA(m) - csrRowPtrA(0) ) column indices of the nonzero elements of matrix A", + "csrColIndA": "integer array of nnz ( = csrRowPtrA(m) - csrRowPtrA(0) ) column indices of the nonzero elements of matrix A", + "csrColIndB": "integer array of nnz ( = csrRowPtrB(m) - csrRowPtrB(0) ) column indices of the nonzero elements of matrix B", + "csrColIndC": "integer array of nnz ( = csrRowPtrC(m) - csrRowPtrC(0) ) column indices of the nonzero elements of matrix C", + "csrRowPtr": "integer array of m+1 elements that contains the start of every row and the end of the last row plus one", + "csrRowPtrA": "integer array of m+1 elements that contains the start of every row and the end of the last row plus one", + "csrRowPtrB": "integer array of m+1 elements that contains the start of every row and the end of the last row plus one", + "csrRowPtrC": "integer array of m+1 elements that contains the start of every row and the end of the last row plus one", + "csrVal": " array of nnz ( = csrRowPtrA(m) - csrRowPtrA(0) ) nonzero elements of matrix A", + "csrValA": " array of nnz ( = csrRowPtrA(m) - csrRowPtrA(0) ) nonzero elements of matrix A", + "csrValA_ValM": "array of nnz ( = csrRowPtrA(m) - csrRowPtrA(0) ) nonzero elements of matrix A", + "csrValA_valM": "array of nnz ( = csrRowPtrA(m) - csrRowPtrA(0) ) nonzero elements of matrix A", + "csrValB": " array of nnz ( = csrRowPtrB(m) - csrRowPtrB(0) ) nonzero elements of matrix B", + "csrValC": " array of nnz ( = csrRowPtrC(m) - csrRowPtrC(0) ) nonzero elements of matrix C", + "csrValM": "array of nnz ( = csrRowPtrA(m) - csrRowPtrA(0) ) nonzero elements of matrix A", + "cscSortedColPtr": "integer array of n+1 elements that contains the start of every row and the end of the last column plus one", + "cscSortedColPtrA": "integer array of n+1 elements that contains the start of every row and the end of the last column plus one", + "cscSortedRowInd": "integer array of nnz ( = cscSortedColPtrA(m) - cscSortedColPtrA(0) ) row indices of the nonzero elements of matrix A", + "cscSortedRowIndA": "integer array of nnz ( = cscSortedColPtrA(m) - cscSortedColPtrA(0) ) row indices of the nonzero elements of matrix A", + "cscSortedVal": "array of nnz ( = cscSortedColPtrA(m) - cscSortedColPtrA(0) ) nonzero elements of matrix A", + "cscSortedValA": "array of nnz ( = cscSortedColPtrA(m) - cscSortedColPtrA(0) ) nonzero elements of matrix A", + "csrSortedColInd": "integer array of nnz ( = csrSortedRowPtrA(m) - csrSortedRowPtrA(0) ) column indices of the nonzero elements of matrix A", + "csrSortedColIndA": "integer array of nnz ( = csrSortedRowPtrA(m) - csrSortedRowPtrA(0) ) column indices of the nonzero elements of matrix A", + "csrSortedColIndB": "integer array of nnz ( = csrSortedRowPtrB(m) - csrSortedRowPtrB(0) ) column indices of the nonzero 
elements of matrix B", + "csrSortedColIndC": "integer array of nnz ( = csrSortedRowPtrC(m) - csrSortedRowPtrC(0) ) column indices of the nonzero elements of matrix C", + "csrSortedColIndD": "integer array of nnz ( = csrSortedRowPtrC(m) - csrSortedRowPtrC(0) ) column indices of the nonzero elements of matrix C", + "csrSortedRowPtr": "integer array of m+1 elements that contains the start of every row and the end of the last row plus one", + "csrSortedRowPtrA": "integer array of m+1 elements that contains the start of every row and the end of the last row plus one", + "csrSortedRowPtrB": "integer array of m+1 elements that contains the start of every row and the end of the last row plus one", + "csrSortedRowPtrC": "integer array of m+1 elements that contains the start of every row and the end of the last row plus one", + "csrSortedRowPtrD": "integer array of m+1 elements that contains the start of every row and the end of the last row plus one", + "csrSortedVal": " array of nnz ( = csrSortedRowPtrA(m) - csrSortedRowPtrA(0) ) nonzero elements of matrix A", + "csrSortedValA": " array of nnz ( = csrSortedRowPtrA(m) - csrSortedRowPtrA(0) ) nonzero elements of matrix A", + "csrSortedValD": " array of nnz ( = csrSortedRowPtrD(m) - csrSortedRowPtrD(0) ) nonzero elements of matrix A", + "csrSortedValA_ValM": "array of nnz ( = csrSortedRowPtrA(m) - csrSortedRowPtrA(0) ) nonzero elements of matrix A", + "csrSortedValA_valM": "array of nnz ( = csrSortedRowPtrA(m) - csrSortedRowPtrA(0) ) nonzero elements of matrix A", + "csrSortedValB": " array of nnz ( = csrSortedRowPtrB(m) - csrSortedRowPtrB(0) ) nonzero elements of matrix B", + "csrSortedValC": " array of nnz ( = csrSortedRowPtrC(m) - csrSortedRowPtrC(0) ) nonzero elements of matrix C", + "csrSortedValM": "array of nnz ( = csrSortedRowPtrA(m) - csrSortedRowPtrA(0) ) nonzero elements of matrix A", + "d": "dense array containing the main diagonal of the tri-diagonal linear system", + "descr": "matrix descriptor", + "descrA": "matrix descriptor of sparse matrix A", + "descrB": "matrix descriptor of sparse matrix B", + "descrC": "matrix descriptor of sparse matrix C", + "descrD": "matrix descriptor of sparse matrix D", + "descra": "matrix descriptor of sparse matrix A", + "descrb": "matrix descriptor of sparse matrix B", + "diagType": "enumerated diagType", + "dir": "enumerated storage format direction", + "dirA": "enumerated storage format direction", + "dl": "dense array containing the lower diagonal of the tri-diagonal linear system. The first element of each lower diagonal must be zero", + "du": "dense array containing the upper diagonal of the tri-diagonal linear system. 
The last element of each upper diagonal must be zero", + "enable_boost": "disable boost by enable_boost=0; otherwise, boost is enabled", + "fillMode": "enumerated fill mode", + "handle": "cuSPARSE context handle", + "hybA": "cuSPARSE HYB data structure", + "idxBase": "enumerated indexBase type", + "info": "solve and analysis structure", + "k": "number of columns of A", + "kb": "number of block columns of sparse matrix A", + "lda": "leading dimension of A", + "ldb": "leading dimension of B", + "ldc": "leading dimension of C", + "ldx": "leading dimension of X", + "ldy": "leading dimension of Y", + "levelInd": "integer array of m (number of rows in the matrix) elements that contains the row indices belonging to every level", + "levelPtr": "integer array of nlevels+1 elements that contains the start of every level and the end of the last level plus one", + "m": "number of rows of A", + "mb": "number of block rows of matrix A", + "mode": "enumerated pointer mode", + "n": "number of columns of ", + "nb": "number of block columns of matrix A", + "nlevels": "number of levels", + "nnz": "number of non-zero elements", + "nnzA": "number of non-zero elements of matrix A", + "nnzB": "number of non-zero elements of matrix B", + "nnzD": "number of non-zero elements of matrix D", + "nnzPerCol": "array of size n containing the number of nonzero elements per column", + "nnzPerRow": "array of size n containing the number of nonzero elements per row", + "nnzPerRowCol": "array of size m or n containing the number of nonzero elements per row or column, respectively", + "nnzPerRowColumn": "array of size m or n containing the number of nonzero elements per row or column, respectively", + "nnzTotalDevHostPtr": "total number of nonzero elements in device or host memory", + "nnzTotalHostPtr": "total number of nonzero elements in host memory", + "nnzHostPtr": "total number of nonzero elements in memory", + "nnzb": "number of non-zero blocks of matrix A", + "pBuffer": "buffer allocated by the user", + "pBufferSize": "number of bytes of the buffer used", + "pBufferSizeInBytes": "number of bytes of the buffer used", + "pInputBuffer": "library version number", + "partitionType": "partitioning method to be used in the conversion", + "policy": "enumerated cuSPARSE solve policy", + "position": "if no structural or numerical zero, position is -1; otherwise if A(j,j) is missing or U(j,j) is zero, position=j", + "resultHostPtr": "pointer to result in the host memory", + "resultDevHostPtr": "pointer to result in the device or host memory", + "rowBlockDim": "number of rows within a block of A", + "rowBlockDimA": "number of rows within a block of A", + "rowBlockDimC": "number of rows within a block of C", + "s": "sine element of the rotation matrix", + "sizeOfMask": "number of updated block rows of y ", + "streamId": "stream to be used by the library", + "tol": "tolerance to determine a numerical zero", + "trans": "enumerated operation type, op(A)", + "transA": "enumerated operation type, op(A)", + "transB": "enumerated operation type, op(B)", + "transX": "enumerated operation type, op(X)", + "transXY": "enumerated operation type, op(X) and op(Y)", + "transa": "enumerated operation type, op(A)", + "transb": "enumerated operation type, op(B)", + "type": "enumerated matrix type", + "userEllWidth": "width of the regular (ELL) part of the matrix in HYB format, which should be less than the maximum number of nonzeros per row and is only required if partitionType == CUSPARSE_HYB_PARTITION_USER", + "version": "library version number", + 
"x": "vector", + "f": "vector", + "xInd": "integer vector of nnz indices corresponding to the nonzero values", + "xVal": "vector containing nnz data values", + "y": "vector", + "F": "array of dimensions (ldf, n)", + "ldf": "leading dimension of F", + "fractionToColor": "fraction of nodes to be colored, which should be in the interval [0.0,1.0], for example 0.8 implies that 80 percent of nodes will be colored.", + "ncolors": "The number of distinct colors used (at most the size of the matrix, but likely much smaller).", + "coloring": "The resulting coloring permutation", + "reordering": "The resulting reordering permutation (untouched if NULL)", + "p": "integer array of nnz unsorted map indices. To construct csrVal, the user has to set P=0:1:(nnz-1).", + "P": "integer array of nnz unsorted map indices. To construct cooVal, the user has to set P=0:1:(nnz-1).", + "cooRowsA": "integer array of nnz unsorted row indices of A. ", + "cooColsA": "integer array of nnz unsorted column indices of A. " +} \ No newline at end of file diff --git a/scikits/cuda/misc.py b/scikits/cuda/misc.py index d5a2f2d..ce48e3b 100644 --- a/scikits/cuda/misc.py +++ b/scikits/cuda/misc.py @@ -59,7 +59,7 @@ Result. """ - + def init_device(n=0): """ Initialize a GPU device. @@ -127,6 +127,10 @@ def done_context(ctx): _global_cublas_handle = None global _global_cublas_allocator _global_cublas_allocator = None +global _global_cusparse_handle +_global_cusparse_handle = None +global _global_cusparse_allocator +_global_cusparse_allocator = None def init(allocator=drv.mem_alloc): """ Initialize libraries used by scikits.cuda. @@ -145,8 +149,10 @@ def init(allocator=drv.mem_alloc): and context were initialized in the current host thread. """ - # CUBLAS uses whatever device is being used by the host thread: + # CUBLAS & CUSPARSE use whatever device is being used by the host thread: global _global_cublas_handle, _global_cublas_allocator + global _global_cusparse_handle, _global_cusparse_allocator + if not _global_cublas_handle: from . import cublas # nest to avoid requiring cublas e.g. for FFT _global_cublas_handle = cublas.cublasCreate() @@ -154,6 +160,16 @@ def init(allocator=drv.mem_alloc): if _global_cublas_allocator is None: _global_cublas_allocator = allocator + if not _global_cusparse_handle: + from . import cusparse # nest to avoid requiring cublas e.g. 
for FFT + _global_cusparse_handle = cusparse.cusparseCreate() + # set so that scalar values are passed by reference on the host + cusparse.cusparseSetPointerMode(_global_cusparse_handle, + cusparse.CUSPARSE_POINTER_MODE_HOST) + + if _global_cusparse_allocator is None: + _global_cusparse_allocator = allocator + # culaSelectDevice() need not (and, in fact, cannot) be called # here because the host thread has already been bound to a GPU # device: @@ -321,7 +337,7 @@ def select_block_grid_sizes(dev, data_shape, threads_per_block=None): # Actual number of thread blocks needed: blocks_needed = iceil(N/float(max_threads_per_block)) - + if blocks_needed <= max_grid_dim[0]: return (max_threads_per_block, 1, 1), (blocks_needed, 1, 1) elif blocks_needed > max_grid_dim[0] and \ @@ -331,7 +347,7 @@ def select_block_grid_sizes(dev, data_shape, threads_per_block=None): elif blocks_needed > max_grid_dim[0]*max_grid_dim[1] and \ blocks_needed <= max_grid_dim[0]*max_grid_dim[1]*max_grid_dim[2]: return (max_threads_per_block, 1, 1), \ - (max_grid_dim[0], max_grid_dim[1], + (max_grid_dim[0], max_grid_dim[1], iceil(blocks_needed/float(max_grid_dim[0]*max_grid_dim[1]))) else: raise ValueError('array size too large') diff --git a/tests/test_cusparse.py b/tests/test_cusparse.py new file mode 100644 index 0000000..799b328 --- /dev/null +++ b/tests/test_cusparse.py @@ -0,0 +1,1059 @@ +from __future__ import division + +from scikits.cuda.cusparse import * +from scikits.cuda.cusparse import (_csrgeamNnz, _csrgemmNnz) + +from scikits.cuda import cusparse +cusparse.init() + +import numpy as np +from numpy.testing import assert_raises, assert_equal, assert_almost_equal + +from unittest import skipIf + +import pycuda.autoinit +import pycuda.gpuarray as gpuarray +import pycuda.driver as drv + +import scipy.sparse # TODO: refactor to remove this + +cusparse_real_dtypes = [np.float32, np.float64] +cusparse_complex_dtypes = [np.complex64, np.complex128] +cusparse_dtypes = cusparse_real_dtypes + cusparse_complex_dtypes +trans_list = [CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_TRANSPOSE, + CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE] + + +def test_context_create_destroy(): + handle = cusparseCreate() + cusparseDestroy(handle) + + +def test_get_version(): + handle = cusparseCreate() + try: + version = cusparseGetVersion(handle) + assert type(version) == int + finally: + cusparseDestroy(handle) + +@skipIf(toolkit_version < (4, 1, 0), "HyB format added in CUDA Toolkit v4.1") +def test_create_destroy_hyb(): + HybA = cusparseCreateHybMat() + cusparseDestroyHybMat(HybA) + + +@skipIf(toolkit_version < (4, 0, 0), "Analysis Info added in CUDA v4.0") +def test_create_destroy_AnalysisInfo(): + info = cusparseCreateSolveAnalysisInfo() + cusparseDestroySolveAnalysisInfo(info) + + +@skipIf(toolkit_version < (6, 0, 0), + "additional solve info types introduced in CUDA v6.0") +def test_create_destroy_SolveInfo_v6(): + # CSR cases + info = cusparse.cusparseCreateCsrsv2Info() + cusparse.cusparseDestroyCsrsv2Info(info) + + info = cusparse.cusparseCreateCsrilu02Info() + cusparse.cusparseDestroyCsrilu02Info(info) + + info = cusparse.cusparseCreateCsric02Info() + cusparse.cusparseDestroyCsric02Info(info) + + # BSR cases + info = cusparse.cusparseCreateBsrsv2Info() + cusparse.cusparseDestroyBsrsv2Info(info) + + info = cusparse.cusparseCreateBsrilu02Info() + cusparse.cusparseDestroyBsrilu02Info(info) + + info = cusparse.cusparseCreateBsric02Info() + cusparse.cusparseDestroyBsric02Info(info) + + +@skipIf(toolkit_version < (6, 5, 0), + 
"additional solve info types introduced in CUDA v6.5") +def test_create_destroy_SolveInfo_v65(): + info = cusparse.cusparseCreateBsrsm2Info() + cusparse.cusparseDestroyBsrsm2Info(info) + +@skipIf(toolkit_version < (7, 0, 0), "ColorInfo introduced in CUDA v7.0") +def test_create_destroy_ColorInfo(): + # CSR cases + info = cusparse.cusparseCreateColorInfo() + cusparse.cusparseDestroyColorInfo(info) + + +def test_set_stream(): + handle = cusparseCreate() + stream = drv.Stream() + try: + cusparseSetStream(handle, stream.handle) + finally: + cusparseDestroy(handle) + +@skipIf(toolkit_version < (4, 1, 0), "skip for CUDA < v4.1") +def test_get_set_PointerMode(): + handle = cusparseCreate() + try: + # test default mode + mode = cusparseGetPointerMode(handle) + assert mode == CUSPARSE_POINTER_MODE_HOST + + # test setting/getting new mode + cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_DEVICE) + mode = cusparseGetPointerMode(handle) + assert mode == CUSPARSE_POINTER_MODE_DEVICE + + # can't set outside enumerated range + assert_raises(CUSPARSE_STATUS_INVALID_VALUE, cusparseSetPointerMode, + handle, 2) + finally: + cusparseDestroy(handle) + + +def test_matrix_descriptor_create_get_set_destroy(): + # create matrix description + descrA = cusparseCreateMatDescr() + + try: + # get default values/set + assert cusparseGetMatType(descrA) == CUSPARSE_MATRIX_TYPE_GENERAL + assert cusparseGetMatDiagType(descrA) == CUSPARSE_DIAG_TYPE_NON_UNIT + assert cusparseGetMatIndexBase(descrA) == CUSPARSE_INDEX_BASE_ZERO + assert cusparseGetMatFillMode(descrA) == CUSPARSE_FILL_MODE_LOWER + + # test set/get new values + cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_HERMITIAN) + assert cusparseGetMatType(descrA) == CUSPARSE_MATRIX_TYPE_HERMITIAN + cusparseSetMatDiagType(descrA, CUSPARSE_DIAG_TYPE_UNIT) + assert cusparseGetMatDiagType(descrA) == CUSPARSE_DIAG_TYPE_UNIT + cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE) + assert cusparseGetMatIndexBase(descrA) == CUSPARSE_INDEX_BASE_ONE + cusparseSetMatFillMode(descrA, CUSPARSE_FILL_MODE_UPPER) + assert cusparseGetMatFillMode(descrA) == CUSPARSE_FILL_MODE_UPPER + + # can't set outside enumerated range + assert_raises( + OverflowError, cusparseSetMatType, descrA, -1) + assert_raises( + CUSPARSE_STATUS_INVALID_VALUE, cusparseSetMatType, descrA, 100) + assert_raises( + CUSPARSE_STATUS_INVALID_VALUE, cusparseSetMatDiagType, descrA, 100) + assert_raises( + CUSPARSE_STATUS_INVALID_VALUE, cusparseSetMatIndexBase, descrA, + 100) + assert_raises( + CUSPARSE_STATUS_INVALID_VALUE, cusparseSetMatFillMode, descrA, 100) + + # OLD BEHAVIOR: float input gets cast to int + # cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL + 0.5) + # assert cusparseGetMatType(descrA) == CUSPARSE_MATRIX_TYPE_GENERAL + assert_raises(TypeError, cusparseSetMatType, descrA, + CUSPARSE_MATRIX_TYPE_GENERAL + 0.5) + finally: + cusparseDestroyMatDescr(descrA) + + +def test_dense_nnz(): + A_cpu = np.asarray([[1, 0, 0], [0, 1, 0], [1, 0, 1], [0, 0, 3]]) + A = gpuarray.to_gpu(A_cpu) + + descrA = cusparseCreateMatDescr() + cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL) + cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO) + + # loop over all directions and dtypes + try: + cusparse_dtypes = [np.float32, np.float64, np.complex64, np.complex128] + for dirA in [CUSPARSE_DIRECTION_ROW, CUSPARSE_DIRECTION_COLUMN]: + for dtype in cusparse_dtypes: + nnzRowCol, nnzTotal = dense_nnz( + descrA, A.astype(dtype), dirA=dirA) + assert nnzTotal == 5 + if dirA == CUSPARSE_DIRECTION_ROW: + 
assert_equal(nnzRowCol.get(), [3, 0, 1, 1]) + else: + assert_equal(nnzRowCol.get(), [1, 2, 2]) + finally: + cusparseDestroyMatDescr(descrA) + + +def test_dense2csr_csr2dense(): + A = np.asarray([[1, 0, 0], [0, 1, 0], [1, 0, 1], [0, 0, 3]]) + m, n = A.shape + for dtype in cusparse_dtypes: + A = A.astype(dtype) + A_csr_scipy = scipy.sparse.csr_matrix(A) + (descrA, csrValA, csrRowPtrA, csrColIndA) = dense2csr(A) + try: + assert_equal(csrValA.get(), A_csr_scipy.data) + assert_equal(csrRowPtrA.get(), A_csr_scipy.indptr) + assert_equal(csrColIndA.get(), A_csr_scipy.indices) + + A_dense = csr2dense(m, n, descrA, csrValA, csrRowPtrA, csrColIndA) + assert_equal(A, A_dense.get()) + finally: + # release descrA that was generated within dense2csr + cusparseDestroyMatDescr(descrA) + + +def test_dense2csc_csc2dense(): + # comparison to scipy ColPtr/RowInd currently known to fail + # is this a bug or a different coordinate convention? + A_cpu = np.asarray([[1, 0, 0], [0, 1, 0], [1, 0, 1], [0, 0, 3]], order='F') + m, n = A_cpu.shape + for dtype in cusparse_dtypes: + A_cpu = A_cpu.astype(dtype) + A_csc_scipy = scipy.sparse.csc_matrix(A_cpu) + A = gpuarray.to_gpu(A_cpu) + (descrA, cscValA, cscColPtrA, cscRowIndA) = dense2csc(A) + try: + assert_equal(cscValA.get(), A_csc_scipy.data) + assert_equal(cscColPtrA.get(), A_csc_scipy.indptr) + assert_equal(cscRowIndA.get(), A_csc_scipy.indices) + + A_dense = csc2dense(m, n, descrA, cscValA, cscColPtrA, cscRowIndA) + assert_equal(A_cpu, A_dense.get()) + finally: + # release descrA that was generated within dense2csc + cusparseDestroyMatDescr(descrA) + + +def test_csr2csc_csc2csr(): + A_cpu = np.asarray([[1, 0, 0], [0, 1, 0], [1, 0, 1], [0, 0, 3]], order='F') + m, n = A_cpu.shape + + for dtype in cusparse_dtypes: + A = gpuarray.to_gpu(A_cpu.astype(dtype)) + A_csr_scipy = scipy.sparse.csr_matrix(A_cpu) + A_csc_scipy = A_csr_scipy.tocsc() + (descr, csrVal, csrRowPtr, csrColInd) = dense2csr(A) + + try: + cscVal, cscColPtr, cscRowInd = csr2csc(m, n, csrVal, csrRowPtr, + csrColInd) + # verify match to scipy + assert_equal(cscVal.get(), A_csc_scipy.data) + assert_equal(cscColPtr.get(), A_csc_scipy.indptr) + assert_equal(cscRowInd.get(), A_csc_scipy.indices) + + # repeat for inverse operation + csrVal, csrRowPtr, csrColInd = csc2csr(n, m, cscVal, cscColPtr, + cscRowInd) + # verify match to scipy + assert_equal(csrVal.get(), A_csr_scipy.data) + assert_equal(csrRowPtr.get(), A_csr_scipy.indptr) + assert_equal(csrColInd.get(), A_csr_scipy.indices) + + finally: + cusparseDestroyMatDescr(descr) + + +def test_csr2coo_coo2csr(): + A_cpu = np.asarray([[1, 0, 0], [0, 2, 0], [3, 0, 4], [0, 0, 5]], order='F') + m, n = A_cpu.shape + + for dtype in cusparse_dtypes: + A = gpuarray.to_gpu(A_cpu.astype(dtype)) + (descr, csrVal, csrRowPtr, csrColInd) = dense2csr(A) + try: + nnz = csrVal.size + cscVal, cscColPtr, cscRowInd = csr2csc(m, n, csrVal, csrRowPtr, + csrColInd) + + cooRowInd = csr2coo(csrRowPtr, nnz) + + # couldn't compare to scipy due to different ordering, so check the + # values directly + vals = csrVal.get() + rows = cooRowInd.get() + cols = csrColInd.get() + for idx in range(nnz): + assert A_cpu[rows[idx], cols[idx]] == vals[idx] + + # repeat for inverse operation + csrRowPtr_v2 = coo2csr(cooRowInd, m) + assert_equal(csrRowPtr_v2.get(), csrRowPtr.get()) + + finally: + cusparseDestroyMatDescr(descr) + + +def test_csc2coo_coo2csc(): + A_cpu = np.asarray([[1, 0, 0], [0, 1, 0], [1, 0, 1], [0, 0, 3]], order='F') + m, n = A_cpu.shape + + for dtype in cusparse_dtypes: + A = 
gpuarray.to_gpu(A_cpu.astype(dtype)) + (descr, csrVal, csrRowPtr, csrColInd) = dense2csr(A) + + try: + nnz = csrVal.size + + cscVal, cscColPtr, cscRowInd = csr2csc(m, n, csrVal, csrRowPtr, + csrColInd) + cooColInd = csc2coo(cscColPtr, nnz) + # couldn't compare to scipy due to different ordering, so check the + # values directly + vals = csrVal.get() + rows = cscRowInd.get() + cols = cooColInd.get() + for idx in range(nnz): + assert A_cpu[rows[idx], cols[idx]] == vals[idx] + + # repeat for inverse operation + cscColPtr_v2 = coo2csc(cooColInd, n) + assert_equal(cscColPtr_v2.get(), cscColPtr.get()) + + finally: + cusparseDestroyMatDescr(descr) + + +def test_csrmv(): + A_cpu = np.asarray([[1, 0, 0], [0, 1, 0], [1, 0, 1], [0, 0, 3]]) + # A = gpuarray.to_gpu(A_cpu) + + descrA = cusparseCreateMatDescr() + cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL) + cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO) + + csr_numpy = scipy.sparse.csr_matrix(A_cpu) + indptr = csr_numpy.indptr + indices = csr_numpy.indices + csr_data = csr_numpy.data + + csrRowPtrA = gpuarray.to_gpu(indptr) + csrColIndA = gpuarray.to_gpu(indices) + m, n = csr_numpy.shape + alpha = 2.0 + # loop over all transpose operations and dtypes + try: + for transA in trans_list: + for dtype in cusparse_dtypes: + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + x = gpuarray.to_gpu(np.ones((n, ), dtype=dtype)) + else: + x = gpuarray.to_gpu(np.ones((m, ), dtype=dtype)) + csrValA = gpuarray.to_gpu(csr_data.astype(dtype)) + + # test multiplication without passing in y + beta = 0.0 + y = csrmv(descrA, csrValA, csrRowPtrA, csrColIndA, m, + n, x, transA=transA, alpha=alpha, beta=beta) + y_cpu = y.get() + + # repeat, but pass in the previous y with beta = 1.0 + beta = 1.0 + y = csrmv(descrA, csrValA, csrRowPtrA, csrColIndA, m, + n, x, transA=transA, alpha=alpha, beta=beta, y=y) + y_cpu2 = y.get() + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + assert_almost_equal(y_cpu, [2., 2., 4., 6.]) + assert_almost_equal(y_cpu2, 2 * y_cpu) + else: + assert_almost_equal(y_cpu, [4., 2., 8.]) + assert_almost_equal(y_cpu2, 2 * y_cpu) + finally: + cusparseDestroyMatDescr(descrA) + + +def test_csrmm(): + A_cpu = np.asarray([[1, 0, 0], [0, 1, 0], [1, 0, 1], [0, 0, 3]]) + + descrA = cusparseCreateMatDescr() + cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL) + cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO) + + csr_numpy = scipy.sparse.csr_matrix(A_cpu) + indptr = csr_numpy.indptr + indices = csr_numpy.indices + csr_data = csr_numpy.data + + csrRowPtrA = gpuarray.to_gpu(indptr) + csrColIndA = gpuarray.to_gpu(indices) + n = 5 + alpha = 2.0 + + # loop over all transpose operations and dtypes + try: + for transA in trans_list: + for dtype in cusparse_dtypes: + csrValA = gpuarray.to_gpu(csr_data.astype(dtype)) + + m, k = A_cpu.shape + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + B_cpu = np.ones((k, n), dtype=dtype, order='F') + expected_result = alpha * np.dot(A_cpu, B_cpu) + else: + B_cpu = np.ones((m, n), dtype=dtype, order='F') + if transA == CUSPARSE_OPERATION_TRANSPOSE: + expected_result = alpha * np.dot(A_cpu.T, B_cpu) + else: + expected_result = alpha * np.dot(np.conj(A_cpu).T, + B_cpu) + B = gpuarray.to_gpu(B_cpu) + # test multiplication without passing in C + beta = 0.0 + C = csrmm(m, n, k, descrA, csrValA, csrRowPtrA, csrColIndA, B, + transA=transA, alpha=alpha, beta=beta) + C_cpu = C.get() + assert_almost_equal(C_cpu, expected_result) + + # repeat, but pass in the previous C with beta = 1.0 + beta = 1.0 + C = csrmm(m, n, k, 
descrA, csrValA, csrRowPtrA, csrColIndA, B, + C=C, transA=transA, alpha=alpha, beta=beta) + C_cpu2 = C.get() + assert_almost_equal(C_cpu2, 2*expected_result) + finally: + cusparseDestroyMatDescr(descrA) + + +@skipIf(toolkit_version < (5, 5, 0), "skip for CUDA < v5.5") +def test_csrmm2(): + A_cpu = np.asarray([[1, 0, 0], [0, 1, 0], [1, 0, 1], [0, 0, 3]]) + + descrA = cusparseCreateMatDescr() + cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL) + cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO) + + csr_numpy = scipy.sparse.csr_matrix(A_cpu) + indptr = csr_numpy.indptr + indices = csr_numpy.indices + csr_data = csr_numpy.data + + csrRowPtrA = gpuarray.to_gpu(indptr) + csrColIndA = gpuarray.to_gpu(indices) + n = 5 + alpha = 2.0 + try: + for transB in trans_list[:-1]: + for transA in trans_list: + for dtype in cusparse_dtypes: + csrValA = gpuarray.to_gpu(csr_data.astype(dtype)) + + m, k = A_cpu.shape + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + m, k = A_cpu.shape + + if transB == CUSPARSE_OPERATION_NON_TRANSPOSE: + B_cpu = np.ones((k, n), dtype=dtype, order='F') + opB = B_cpu + else: + B_cpu = np.ones((n, k), dtype=dtype, order='F') + opB = B_cpu.T + opA = A_cpu + else: + if transB == CUSPARSE_OPERATION_NON_TRANSPOSE: + B_cpu = np.ones((m, n), dtype=dtype, order='F') + opB = B_cpu + else: + # cuSPARSE doesn't implement this case + continue + # B_cpu = np.ones((n, m), dtype=dtype, order='F') + # opB = B_cpu.T + if transA == CUSPARSE_OPERATION_TRANSPOSE: + opA = A_cpu.T + else: + opA = np.conj(A_cpu).T + + expected_result = alpha * np.dot(opA, opB) + B = gpuarray.to_gpu(B_cpu) + + # test multiplication without passing in C + beta = 0.0 + C = csrmm2(m, n, k, descrA, csrValA, csrRowPtrA, + csrColIndA, B, transA=transA, transB=transB, + alpha=alpha, beta=beta) + C_cpu = C.get() + assert_almost_equal(C_cpu, expected_result) + + # repeat, but pass in the previous C with beta = 1.0 + beta = 1.0 + C = csrmm2(m, n, k, descrA, csrValA, csrRowPtrA, + csrColIndA, B, C=C, transA=transA, + transB=transB, alpha=alpha, beta=beta) + C_cpu2 = C.get() + assert_almost_equal(C_cpu2, 2*expected_result) + finally: + cusparseDestroyMatDescr(descrA) + + +@skipIf(toolkit_version < (5, 0, 0), "skip for CUDA < v5.0") +def test_csrgeamNnz(): + A_cpu = np.asarray([[1, 0, 0], [0, 1, 0], [1, 0, 1], [0, 0, 3]]) + A_cpu = scipy.sparse.csr_matrix(A_cpu) + B_cpu = np.asarray([[0, 1, 0], [0, 0, 1], [0, 0, 0], [0, 0, 0]]) + B_cpu = scipy.sparse.csr_matrix(B_cpu) + C_cpu = A_cpu + B_cpu + nnz_expected = C_cpu.getnnz() + + descrA = cusparseCreateMatDescr() + cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL) + cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO) + + descrB = cusparseCreateMatDescr() + cusparseSetMatType(descrB, CUSPARSE_MATRIX_TYPE_GENERAL) + cusparseSetMatIndexBase(descrB, CUSPARSE_INDEX_BASE_ZERO) + + csrRowPtrA = gpuarray.to_gpu(A_cpu.indptr) + csrColIndA = gpuarray.to_gpu(A_cpu.indices) + + csrRowPtrB = gpuarray.to_gpu(B_cpu.indptr) + csrColIndB = gpuarray.to_gpu(B_cpu.indices) + + m, n = A_cpu.shape + + # test alternative case where descrC, csrRowPtrC not preallocated + descrC, csrRowPtrC, nnzC = _csrgeamNnz( + m, n, descrA, csrRowPtrA, csrColIndA, descrB, csrRowPtrB, csrColIndB, + check_inputs=True) + cusparseDestroyMatDescr(descrC) + assert_equal(nnzC, nnz_expected) + + # now test cases with preallocated descrC & csrRowPtrC + descrC = cusparseCreateMatDescr() + try: + cusparseSetMatType(descrC, CUSPARSE_MATRIX_TYPE_GENERAL) + csrRowPtrC = gpuarray.to_gpu(np.zeros((m+1, ), 
dtype=np.int32)) + nnzC = _csrgeamNnz(m, n, descrA, csrRowPtrA, csrColIndA, descrB, + csrRowPtrB, csrColIndB, descrC=descrC, + csrRowPtrC=csrRowPtrC, nnzA=None, nnzB=None, + check_inputs=True) + assert_equal(nnzC, nnz_expected) + finally: + cusparseDestroyMatDescr(descrC) + + +@skipIf(toolkit_version < (5, 0, 0), "skip for CUDA < v5.0") +def test_csrgeam(): + A_cpu = np.asarray([[1, 0, 0], [0, 1, 0], [1, 0, 1], [0, 0, 3]]) + A_cpu = scipy.sparse.csr_matrix(A_cpu) + B_cpu = np.asarray([[0, 1, 0], [0, 0, 1], [0, 0, 0], [0, 0, 0]]) + B_cpu = scipy.sparse.csr_matrix(B_cpu) + + descrA = cusparseCreateMatDescr() + cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL) + cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO) + + descrB = cusparseCreateMatDescr() + cusparseSetMatType(descrB, CUSPARSE_MATRIX_TYPE_GENERAL) + cusparseSetMatIndexBase(descrB, CUSPARSE_INDEX_BASE_ZERO) + + csrRowPtrA = gpuarray.to_gpu(A_cpu.indptr) + csrColIndA = gpuarray.to_gpu(A_cpu.indices) + + csrRowPtrB = gpuarray.to_gpu(B_cpu.indptr) + csrColIndB = gpuarray.to_gpu(B_cpu.indices) + + m, n = A_cpu.shape + + alpha = 0.3 + beta = 5.0 + C_cpu = alpha*A_cpu + beta*B_cpu + + try: + for dtype in cusparse_dtypes: + csrValA = gpuarray.to_gpu(A_cpu.data.astype(dtype)) + csrValB = gpuarray.to_gpu(B_cpu.data.astype(dtype)) + try: + descrC, csrValC, csrRowPtrC, csrColIndC = csrgeam( + m, n, descrA, csrValA, csrRowPtrA, csrColIndA, descrB, + csrValB, csrRowPtrB, csrColIndB, alpha=alpha, beta=beta) + assert_almost_equal(csrValC.get(), C_cpu.data) + finally: + cusparseDestroyMatDescr(descrC) + finally: + cusparseDestroyMatDescr(descrA) + cusparseDestroyMatDescr(descrB) + + +@skipIf(toolkit_version < (5, 0, 0), "skip for CUDA < v5.0") +def test_csrgemmNnz(): + A_cpu = np.asarray([[1, 0, 0], [0, 1, 0], [1, 0, 1], [0, 0, 3]]) + # A = gpuarray.to_gpu(A_cpu) + + descrA = cusparseCreateMatDescr() + cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL) + cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO) + + descrB = cusparseCreateMatDescr() + cusparseSetMatType(descrB, CUSPARSE_MATRIX_TYPE_GENERAL) + cusparseSetMatIndexBase(descrB, CUSPARSE_INDEX_BASE_ZERO) + + csr_numpy = scipy.sparse.csr_matrix(A_cpu) + indptr = csr_numpy.indptr + indices = csr_numpy.indices + + B_cpu = csr_numpy.T.tocsr() + + csrRowPtrA = gpuarray.to_gpu(indptr) + csrColIndA = gpuarray.to_gpu(indices) + + csrRowPtrB = gpuarray.to_gpu(B_cpu.indptr) + csrColIndB = gpuarray.to_gpu(B_cpu.indices) + + m, k = A_cpu.shape + n = B_cpu.shape[1] + # test alternative case where descrC, csrRowPtrC not preallocated + transA = transB = CUSPARSE_OPERATION_NON_TRANSPOSE + descrC, csrRowPtrC, nnzC = _csrgemmNnz( + m, n, k, descrA, csrRowPtrA, csrColIndA, descrB, csrRowPtrB, + csrColIndB, transA=transA, transB=transB, check_inputs=True) + cusparseDestroyMatDescr(descrC) + + # now test cases with preallocated matrix description + descrC = cusparseCreateMatDescr() + cusparseSetMatType(descrC, CUSPARSE_MATRIX_TYPE_GENERAL) + + # loop over all transpose operations and dtypes + try: + for transA in trans_list: + transB = transA + + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + m, k = A_cpu.shape + else: + k, m = A_cpu.shape + + if transB == CUSPARSE_OPERATION_NON_TRANSPOSE: + kB, n = B_cpu.shape + else: + n, kB = B_cpu.shape + + csrRowPtrC = gpuarray.to_gpu(np.zeros((m+1, ), dtype=np.int32)) + nnzC = _csrgemmNnz(m, n, k, descrA, csrRowPtrA, csrColIndA, + descrB, csrRowPtrB, csrColIndB, descrC=descrC, + csrRowPtrC=csrRowPtrC, nnzA=None, nnzB=None, + transA=transA, 
transB=transB, check_inputs=True) + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + assert nnzC == 8 + assert_equal(csrRowPtrC.get(), [0, 2, 3, 6, 8]) + else: + assert nnzC == 5 + assert_equal(csrRowPtrC.get(), [0, 2, 3, 5]) + finally: + cusparseDestroyMatDescr(descrA) + cusparseDestroyMatDescr(descrB) + cusparseDestroyMatDescr(descrC) + + +@skipIf(toolkit_version < (5, 0, 0), "skip for CUDA < v5.0") +def test_csrgemm(): + A_cpu = np.asarray([[1, 0, 0], [0, 1, 0], [1, 0, 1], [0, 0, 3]]) + # A = gpuarray.to_gpu(A_cpu) + + descrA = cusparseCreateMatDescr() + cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL) + cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO) + + descrB = cusparseCreateMatDescr() + cusparseSetMatType(descrB, CUSPARSE_MATRIX_TYPE_GENERAL) + cusparseSetMatIndexBase(descrB, CUSPARSE_INDEX_BASE_ZERO) + + csr_numpy = scipy.sparse.csr_matrix(A_cpu) + indptr = csr_numpy.indptr + indices = csr_numpy.indices + csr_data = csr_numpy.data + + B_cpu = csr_numpy.T.tocsr() + + csrRowPtrA = gpuarray.to_gpu(indptr) + csrColIndA = gpuarray.to_gpu(indices) + + csrRowPtrB = gpuarray.to_gpu(B_cpu.indptr) + csrColIndB = gpuarray.to_gpu(B_cpu.indices) + + m, k = A_cpu.shape + + try: + for transA in trans_list: + transB = transA + + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + m, k = A_cpu.shape + else: + k, m = A_cpu.shape + if transB == CUSPARSE_OPERATION_NON_TRANSPOSE: + kB, n = B_cpu.shape + else: + n, kB = B_cpu.shape + + for dtype in cusparse_dtypes: + csrValA = gpuarray.to_gpu(csr_data.astype(dtype)) + csrValB = gpuarray.to_gpu(B_cpu.data.astype(dtype)) + + descrC, csrValC, csrRowPtrC, csrColIndC = csrgemm( + m, n, k, descrA, csrValA, csrRowPtrA, + csrColIndA, descrB, csrValB, csrRowPtrB, csrColIndB, + transA=transA, transB=transB) + cusparseDestroyMatDescr(descrC) + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + assert_almost_equal(csrValC.get(), + [1, 1, 1, 1, 2, 3, 3, 9]) + else: + assert_almost_equal(csrValC.get(), [2, 1, 1, 1, 10]) + + + finally: + cusparseDestroyMatDescr(descrA) + cusparseDestroyMatDescr(descrB) + + +def test_CSR_construction(): + n = 64 + h = cusparseCreate() + try: + for dtype in cusparse_dtypes: + A = 2*np.eye(n) + + A_scipy_csr = scipy.sparse.csr_matrix(A) + + # generate a CSR matrix from a dense gpuarray + A_CSR = CSR.to_CSR(gpuarray.to_gpu(A), h) + assert_equal(A_CSR.Val.get(), A_scipy_csr.data) + + # generate a CSR matrix from a dense numpy array + A_CSR = CSR.to_CSR(A, h) + assert_equal(A_CSR.Val.get(), A_scipy_csr.data) + + # generate a CSR matrix from a list of lists + A_CSR = CSR.to_CSR(A.tolist(), h) + assert_equal(A_CSR.Val.get(), A_scipy_csr.data) + + # generate a cuSPARSE CSR matrix from a scipy.sparse CSR matrix + A_CSR = CSR.to_CSR(A_scipy_csr, h) + assert_equal(A_CSR.Val.get(), A_scipy_csr.data) + + # generate a cuSPARSE CSR matrix from a scipy.sparse BSR matrix + A_CSR = CSR.to_CSR(scipy.sparse.bsr_matrix(A), h) + assert_equal(A_CSR.Val.get(), A_scipy_csr.data) + + # generate a cuSPARSE CSR matrix from a scipy.sparse COO matrix + A_CSR = CSR.to_CSR(scipy.sparse.coo_matrix(A), h) + assert_equal(A_CSR.Val.get(), A_scipy_csr.data) + + # generate a cuSPARSE CSR matrix from a scipy.sparse CSC matrix + A_CSR = CSR.to_CSR(scipy.sparse.csc_matrix(A), h) + assert_equal(A_CSR.Val.get(), A_scipy_csr.data) + + # generate a cuSPARSE CSR matrix from a scipy.sparse DIA matrix + A_CSR = CSR.to_CSR(scipy.sparse.dia_matrix(A), h) + assert_equal(A_CSR.Val.get(), A_scipy_csr.data) + + # generate a cuSPARSE CSR matrix from a scipy.sparse DOK matrix + 
A_CSR = CSR.to_CSR(scipy.sparse.dok_matrix(A), h) + assert_equal(A_CSR.Val.get(), A_scipy_csr.data) + finally: + cusparseDestroy(h) + + +def test_CSR_properties(): + n = 64 + h = cusparseCreate() + A = 2*np.eye(n) + + # generate a CSR matrix from a dense numpy array + A_CSR = CSR.to_CSR(A, h) + + # test shape + assert A_CSR.shape == A.shape + + # test matrix_type property + assert A_CSR.matrix_type == CUSPARSE_MATRIX_TYPE_GENERAL + A_CSR.matrix_type = CUSPARSE_MATRIX_TYPE_SYMMETRIC + assert A_CSR.matrix_type == CUSPARSE_MATRIX_TYPE_SYMMETRIC + assert cusparseGetMatType(A_CSR.descr) == CUSPARSE_MATRIX_TYPE_SYMMETRIC + + # test index_base property + assert A_CSR.index_base == CUSPARSE_INDEX_BASE_ZERO + A_CSR.index_base = CUSPARSE_INDEX_BASE_ONE + assert A_CSR.index_base == CUSPARSE_INDEX_BASE_ONE + assert cusparseGetMatIndexBase(A_CSR.descr) == CUSPARSE_INDEX_BASE_ONE + + # test diag_type property + assert A_CSR.diag_type == CUSPARSE_DIAG_TYPE_NON_UNIT + A_CSR.diag_type = CUSPARSE_DIAG_TYPE_UNIT + assert A_CSR.diag_type == CUSPARSE_DIAG_TYPE_UNIT + assert cusparseGetMatDiagType(A_CSR.descr) == CUSPARSE_DIAG_TYPE_UNIT + + # test fill_mode property + assert A_CSR.fill_mode == CUSPARSE_FILL_MODE_LOWER + A_CSR.fill_mode = CUSPARSE_FILL_MODE_UPPER + assert A_CSR.fill_mode == CUSPARSE_FILL_MODE_UPPER + assert cusparseGetMatFillMode(A_CSR.descr) == CUSPARSE_FILL_MODE_UPPER + + # verify setting value outside the valid range raises an exception + def set_mat_type(A, t): + A_CSR.matrix_type = t + assert_raises(CUSPARSE_STATUS_INVALID_VALUE, set_mat_type, A_CSR, 100) + + # test get nnz + assert A_CSR.nnz == n + assert A_CSR.getnnz() == n + + # verify that nnz can't be set + def set_nnz(A, nnz): + A_CSR.nnz = nnz + assert_raises(AttributeError, set_nnz, A_CSR, 5) + + +def test_CSR_properties2(): + n = 64 + h = cusparseCreate() + try: + for dtype in cusparse_complex_dtypes: + A = 2*np.eye(n).astype(dtype) + # add a couple of complex values + A[0, 1] = 1j + A[1, 0] = -1j + A_CSR = CSR.to_CSR(A, h) + orig_data = A_CSR.data.get() + orig_indices = A_CSR.indices.get() + orig_indptr = A_CSR.indptr.get() + + assert_equal(abs(A_CSR).data.get(), abs(orig_data)) + assert_equal(-A_CSR.data.get(), -orig_data) + + A_CSR *= 2 + assert_almost_equal(A_CSR.data.get(), 2*orig_data) + + # this test requires from __future__ import division in Python 2.x + A_CSR /= 2 # A_CSR._with_data(A_CSR.data/2) + assert_almost_equal(A_CSR.data.get(), orig_data) + + assert_equal(A_CSR.conj().data.get(), np.conj(orig_data)) + assert_equal(A_CSR.real.data.get(), orig_data.real) + assert_equal(A_CSR.imag.data.get(), orig_data.imag) + assert_equal(A_CSR.A.get(), A) + assert_equal(A_CSR.T.A.get(), A.T) + + # make sure the above didn't modify the original data + assert_equal(A_CSR.data.get(), orig_data) + assert_equal(A_CSR.indices.get(), orig_indices) + assert_equal(A_CSR.indptr.get(), orig_indptr) + + assert_equal(A_CSR.H.A.get(), A.T.conj()) + assert_equal(A_CSR.size, A_CSR.nnz) + finally: + cusparseDestroy(h) + + +def test_CSR_todense(): + n = 64 + h = cusparseCreate() + try: + dtype = np.float32 + A = 2*np.eye(n) + + # generate a CSR matrix from a dense numpy array + A_CSR = CSR.to_CSR(A.astype(dtype), h) + + # convert cusparseCSR back to a dense matrix + A_dense = A_CSR.todense(to_cpu=False) + assert type(A_dense) == gpuarray.GPUArray + assert_equal(A_dense.get(), A) + + A_dense = A_CSR.todense(to_cpu=True) + assert type(A_dense) == np.ndarray + assert_equal(A_dense, A) + finally: + cusparseDestroy(h) + + +def test_CSR_get(): + n = 64 + 
h = cusparseCreate() + try: + dtype = np.float32 + A = 2*np.eye(n) + + # generate a CSR matrix from a dense numpy array + A_CSR = CSR.to_CSR(A.astype(dtype), h) + + # convert the cuSPARSE CSR matrix back to a scipy.sparse CSR matrix + csr_scipy = A_CSR.get() + # assert_equal(A_CSR.data.get(), csr_scipy.data) + # assert_equal(A_CSR.indices.get(), csr_scipy.indices) + # assert_equal(A_CSR.indptr.get(), csr_scipy.indptr) + + assert_equal(csr_scipy.indptr, scipy.sparse.csr_matrix(A).indptr) + assert_equal(csr_scipy.indices, scipy.sparse.csr_matrix(A).indices) + assert_equal(csr_scipy.data, scipy.sparse.csr_matrix(A).data) + finally: + cusparseDestroy(h) + + +def test_CSR_mv(): + n = 64 + h = cusparseCreate() + try: + A = 2*np.eye(n) + + dtype = np.float32 + x_cpu = np.ones((n, ), dtype=dtype) + for dtype in cusparse_dtypes: + + # generate a CSR matrix from a dense numpy array + A_CSR = CSR.to_CSR(A.astype(dtype), h) + x = gpuarray.to_gpu(x_cpu.astype(dtype)) + + # test default operation + y = A_CSR.mv(x) + assert_almost_equal(y.get(), A.diagonal()) + # transpose will be the same for this matrix + y = A_CSR.mv(x, transA=CUSPARSE_OPERATION_TRANSPOSE) + assert_almost_equal(y.get(), A.diagonal()) + + alpha = 5.0 + y = A_CSR.mv(x, alpha=alpha) + assert_almost_equal(y.get(), alpha*A.diagonal()) + + # repeat with non-zero beta and initialize with previous y + beta = 2.5 + y = A_CSR.mv(x, alpha=alpha, beta=beta, y=y) + assert_almost_equal(y.get(), (alpha+alpha*beta)*A.diagonal()) + finally: + cusparseDestroy(h) + + +def test_CSR_mm(): + m = 64 + n = 10 + h = cusparseCreate() + try: + A = 2*np.eye(m) + + dtype = np.float32 + B_cpu = np.ones((m, n), dtype=dtype, order='F') + for dtype in cusparse_dtypes: + A = A.astype(dtype) + + # generate a CSR matrix from a dense numpy array + A_CSR = CSR.to_CSR(A, h) + B = gpuarray.to_gpu(B_cpu.astype(dtype)) + + # test default operation + C = A_CSR.mm(B) + assert_almost_equal(C.get(), np.dot(A, B_cpu)) + # transpose will be the same for this matrix + C = A_CSR.mm(B_cpu.astype(dtype), transA=CUSPARSE_OPERATION_TRANSPOSE) + assert_almost_equal(C.get(), np.dot(A.T, B_cpu)) + + alpha = 5.0 + C = A_CSR.mm(B, alpha=alpha) + assert_almost_equal(C.get(), alpha*np.dot(A, B_cpu)) + + # repeat with non-zero beta and initialize with previous C + beta = 2.5 + C = A_CSR.mm(B, alpha=alpha, beta=beta, C=C) + assert_almost_equal(C.get(), (alpha+alpha*beta)*np.dot(A, B_cpu)) + finally: + cusparseDestroy(h) + + +@skipIf(toolkit_version < (5, 5, 0), "skip for CUDA < v5.5") +def test_CSR_mm2(): + A_cpu = np.asarray([[1, 0, 0], [0, 1, 0], [1, 0, 1], [0, 0, 3]]) + n = 5 + alpha = 2.0 + h = cusparseCreate() + try: + for transB in trans_list[:-1]: + for transA in trans_list: + for dtype in cusparse_dtypes: + m, k = A_cpu.shape + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + if transB == CUSPARSE_OPERATION_NON_TRANSPOSE: + B_cpu = np.ones((k, n), dtype=dtype, order='F') + opB = B_cpu + else: + B_cpu = np.ones((n, k), dtype=dtype, order='F') + opB = B_cpu.T + opA = A_cpu + else: + if transB == CUSPARSE_OPERATION_NON_TRANSPOSE: + B_cpu = np.ones((m, n), dtype=dtype, order='F') + opB = B_cpu + else: + # cuSPARSE doesn't implement this case + continue + if transA == CUSPARSE_OPERATION_TRANSPOSE: + opA = A_cpu.T + else: + opA = np.conj(A_cpu).T + + expected_result = alpha * np.dot(opA, opB) + B = gpuarray.to_gpu(B_cpu) + + A_CSR = CSR.to_CSR(A_cpu.astype(dtype), h) + + # test multiplication without passing in C + beta = 0.0 + C = A_CSR.mm2(B, transA=transA, transB=transB, alpha=alpha, + beta=beta, to_cpu=False) + 
assert_almost_equal(C.get(), expected_result) + + # repeat, but pass in the previous C with beta = 1.0 + beta = 1.0 + C = A_CSR.mm2(B, transA=transA, transB=transB, alpha=alpha, + beta=beta, C=C, to_cpu=True) + assert_almost_equal(C, (1.0+beta)*expected_result) + + finally: + cusparseDestroy(h) + + +@skipIf(toolkit_version < (5, 0, 0), "skip for CUDA < v5.0") +def test_CSR_geam(): + A_cpu = np.asarray([[1, 0, 0], [0, 1, 0], [1, 0, 1], [0, 0, 3]]) + B_cpu = np.asarray([[0, 1, 0], [0, 0, 1], [0, 0, 0], [0, 0, 0]]) + + h = cusparseCreate() + alpha = 0.3 + beta = 5.0 + + try: + for dtype in cusparse_dtypes: + A_CSR = CSR.to_CSR(A_cpu.astype(dtype), h) + B_CSR = CSR.to_CSR(B_cpu.astype(dtype), h) + C_CSR = A_CSR.geam(B_CSR, alpha=alpha, beta=beta) + + # compute on CPU with numpy and compare + C_cpu = alpha*A_cpu + beta*B_cpu + assert_almost_equal(C_CSR.todense(to_cpu=True), C_cpu) + finally: + cusparseDestroy(h) + + +@skipIf(toolkit_version < (5, 0, 0), "skip for CUDA < v5.0") +def test_CSR_gemm(): + A_cpu = np.asarray([[1, 0, 0], [0, 1, 0], [1, 0, 1], [0, 0, 3]]) + + h = cusparseCreate() + B_cpu = A_cpu.T + try: + for transA in trans_list: + transB = transA + + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + opA = A_cpu + opB = B_cpu + else: + opA = A_cpu.T + opB = B_cpu.T + + for dtype in cusparse_dtypes: + A_CSR = CSR.to_CSR(A_cpu.astype(dtype), h) + B_CSR = CSR.to_CSR(B_cpu.astype(dtype), h) + C_CSR = A_CSR.gemm(B_CSR, transA=transA, transB=transB) + + # compute on CPU with numpy and compare + C_cpu = np.dot(opA, opB) + assert_almost_equal(C_CSR.todense(to_cpu=True), C_cpu) + finally: + cusparseDestroy(h)
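For orientation, the tests above exercise both the autogenerated low-level wrappers (csrmv, csrmm, csrgemm, ...) and the high-level CSR class added by this patch. The following is a minimal usage sketch (not part of the diff) based only on calls that appear in these tests; the import path scikits.cuda.cusparse and the use of pycuda.autoinit for context setup are assumptions, not something the patch itself pins down.

# Hypothetical usage sketch; import path and context setup are assumptions.
import numpy as np
import pycuda.autoinit            # assumption: PyCUDA creates the CUDA context
import pycuda.gpuarray as gpuarray
from scikits.cuda.cusparse import CSR, cusparseCreate, cusparseDestroy  # assumed path

handle = cusparseCreate()          # cuSPARSE library handle, as in the tests' setup
try:
    A = 2 * np.eye(4, dtype=np.float32)
    A[2, 0] = 1.0                  # off-diagonal entry so A is not purely diagonal
    A_csr = CSR.to_CSR(A, handle)  # dense ndarray -> CSR matrix on the GPU

    x = gpuarray.to_gpu(np.ones(4, dtype=np.float32))
    y = A_csr.mv(x, alpha=2.0)     # y = 2 * A @ x computed by cuSPARSE
    print(y.get())

    print(A_csr.todense(to_cpu=True))  # round-trip back to a dense numpy array
finally:
    cusparseDestroy(handle)        # release the handle, mirroring the tests' try/finally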