diff --git a/MANIFEST.in b/MANIFEST.in index bbb89e4..1108773 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,6 @@ include AUTHORS CHANGES INSTALL LICENSE README.rst tox.ini recursive-include scikits *.h +recursive-include scikits *.json recursive-include demos *.py recursive-include tests *.py recursive-include docs * diff --git a/scikits/cuda/_cusparse_cffi.py b/scikits/cuda/_cusparse_cffi.py new file mode 100644 index 0000000..afd5117 --- /dev/null +++ b/scikits/cuda/_cusparse_cffi.py @@ -0,0 +1,101 @@ +""" +Autogenerate Python interface to cuSPARSE functions. + +""" +from __future__ import absolute_import, print_function + +import os +import re +import numpy as np + +from os.path import join as pjoin + +from ._cusparse_cffi_autogen import (generate_cffi_cdef, + ffi_init_cusparse, + generate_func_descriptions_json, + generate_cusparse_python_wrappers) + +base_dir = os.path.dirname(__file__) +cffi_file = pjoin(base_dir, '_cusparse.cffi') +python_wrapper_file = pjoin(base_dir, '_cusparse_python.py') +variable_defs_json = pjoin(base_dir, 'cusparse_variable_descriptions.json') +func_defs_json = pjoin(base_dir, 'cusparse_func_descriptions.json') + +if not os.path.exists(cffi_file): + print("first import of cusparse: cffi interface file being created." + "This may take several seconds") + cffi_cdef = generate_cffi_cdef(cffi_out_file=cffi_file) +else: + with open(cffi_file, 'r') as f: + cffi_cdef = f.read() + +ffi, ffi_lib = ffi_init_cusparse(cffi_cdef) + +if not os.path.exists(python_wrapper_file): + print("first import of cusparse: python wrappers being created.") + if not os.path.exists(func_defs_json): + generate_func_descriptions_json(ffi_lib, json_file=func_defs_json) + generate_cusparse_python_wrappers(cffi_cdef, + variable_defs_json=variable_defs_json, + func_defs_json=func_defs_json, + python_wrapper_file=python_wrapper_file) + + +class CUSPARSE_ERROR(Exception): + """CUSPARSE error""" + pass + +# Use CUSPARSE_STATUS* definitions to dynamically create corresponding +# exception classes and populate dictionary used to raise appropriate +# exception in response to the corresponding CUSPARSE error code: +CUSPARSE_STATUS_SUCCESS = ffi_lib.CUSPARSE_STATUS_SUCCESS +CUSPARSE_EXCEPTIONS = {-1: CUSPARSE_ERROR} +for k, v in ffi_lib.__dict__.items(): + # Skip CUSPARSE_STATUS_SUCCESS: + if re.match('CUSPARSE_STATUS.*', k) and v != CUSPARSE_STATUS_SUCCESS: + CUSPARSE_EXCEPTIONS[v] = vars()[k] = type(k, (CUSPARSE_ERROR,), {}) + + +# Import various other enum values into module namespace: +regex = 'CUSPARSE_(?!STATUS).*' +for k, v in ffi_lib.__dict__.items(): + if re.match(regex, k): + # print("k={}, v={}".format(k,v)) + vars()[k] = v + + +def cusparseCheckStatus(status): + """ + Raise CUSPARSE exception + + Raise an exception corresponding to the specified CUSPARSE error + code. + + Parameters + ---------- + status : int + CUSPARSE error code. 
+ + See Also + -------- + CUSPARSE_EXCEPTIONS + """ + + if status != 0: + try: + raise CUSPARSE_EXCEPTIONS[status] + except KeyError: + raise CUSPARSE_ERROR + +# execute the python wrapper code +with open(python_wrapper_file) as f: + code = compile(f.read(), python_wrapper_file, 'exec') + exec(code) + + +__all__ = [k for k, v in ffi_lib.__dict__.items()] +__all__.append('CUSPARSE_ERROR') +__all__.append('CUSPARSE_EXCEPTIONS') +__all__.append('cusparseCheckStatus') +__all__.append('ffi') +__all__.append('ffi_lib') diff --git a/scikits/cuda/_cusparse_cffi_autogen.py b/scikits/cuda/_cusparse_cffi_autogen.py new file mode 100644 index 0000000..1a1d210 --- /dev/null +++ b/scikits/cuda/_cusparse_cffi_autogen.py @@ -0,0 +1,714 @@ +""" +Support functions for autogenerating Python interface to the cuSPARSE library. + +""" + +""" +Developed on linux for CUDA v6.5, but should support any version >3.1 where +cusparse.h or cusparse_v2.h can be found in CUDA_ROOT/include. + +Set the environment variable CUDA_ROOT to the base of your CUDA installation + +Note from the NVIDIA CUSPARSE release notes: + +"The csr2csc() and bsr2bsc() routines contain a bug in the CUDA 6.0 and 6.5 +releases. As a consequence, csrsv(), csrsv2(), csrsm(), bsrsv2(), bsrsm2(), +and csrgemm() may produce incorrect results when working with transpose +(CUSPARSE_OPERATION_TRANSPOSE) or conjugate-transpose +(CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE) operations. These routines work +correctly when the non-transpose (CUSPARSE_OPERATION_NON_TRANSPOSE) operation +is selected. The bug has been fixed in the CUDA 7.0 release." +""" + +import os +import re +import json +import numpy as np + +import cffi + +# TODO: improve autodetection of cuda include folder +CUDA_ROOT = os.environ.get('CUDA_ROOT', None) or '/usr/local/cuda' +cuda_include_path = os.path.join(CUDA_ROOT, 'include') + +# on versions where cusparse_v2.h exists, use it +cusparse_header = os.path.join(cuda_include_path, 'cusparse_v2.h') +if not os.path.exists(cusparse_header): + # on old versions there was only cusparse.h + cusparse_header = os.path.join(cuda_include_path, 'cusparse.h') + if not os.path.exists(cusparse_header): + raise IOError("cusparse header file not found in expected " + "location. Try defining CUDA_ROOT") + + +base_dir = os.path.dirname(__file__) + + +def ffi_init_cusparse(cffi_cdef, cusparse_header=cusparse_header): + ffi = cffi.FFI() + ffi.cdef(cffi_cdef) + + # Get the address in a cdata pointer: + __verify_scr = """ + #include <{}> + #include + """.format(os.path.basename(cusparse_header)) + + ffi_lib = ffi.verify(__verify_scr, libraries=['cusparse'], + include_dirs=['/usr/local/cuda/include'], + library_dirs=['/usr/local/cuda/lib64/']) + return ffi, ffi_lib + + +def generate_cffi_cdef( + cuda_include_path=cuda_include_path, cusparse_header=cusparse_header, + cffi_out_file=None): + """ generate the CUSPARSE FFI definition + + Parameters + ---------- + cuda_include_path : str + CUDA include path + cffi_out_file : str, optional + if provided, write the definition out to a file + + Returns + ------- + cffi_cdef : str + function definitions for use with cffi. e.g. 
input to FFI.verify() + + """ + + with open(cusparse_header, 'r') as f: + cusparse_hdr = f.readlines() + + # in some versions cusparse_v2.h just points to cusparse.h, so read it + # instead + for line in cusparse_hdr: + # if v2 header includes cusparse.h, read that one instead + if line.startswith('#include "cusparse.h"'): + cusparse_header = os.path.join(cuda_include_path, 'cusparse.h') + with open(cusparse_header, 'r') as f: + cusparse_hdr = f.readlines() + + # skip lines leading up to first typedef + for idx, line in enumerate(cusparse_hdr): + if line.startswith('typedef'): + start_line = idx + break + + # skip closing #if defined logic + for idx, line in enumerate(cusparse_hdr[start_line:]): + if line.startswith('#if defined(__cplusplus)') or \ + 'Define the following symbols for the new API' in line: + # second match is to avoid CFFI compilation error due to the final + # define statements in v4.1 through v5.5 + end_line = start_line + idx + break + + # define other data types needed by FFI + # ... will be filled in from cuComplex.h by the C compiler + cffi_cdef = """ + typedef struct CUstream_st *cudaStream_t; + + typedef struct float2 { + ...; + } float2; + typedef float2 cuFloatComplex; + typedef float2 cuComplex; + + typedef struct double2 { + ...; + } double2; + typedef double2 cuDoubleComplex; + + typedef float cufftReal; + typedef double cufftDoubleReal; + + typedef cuComplex cufftComplex; + typedef cuDoubleComplex cufftDoubleComplex; + + /* definitions from cusparse header below this point */ + """ + + cffi_cdef += ''.join(cusparse_hdr[start_line:end_line]) + + """ + don't use the _v2 versions of the function names defined in CUDA v4.1 + through v5.5 + """ + cffi_cdef = cffi_cdef.replace('_v2(', '(') + + if os.name == 'nt': # Win + cffi_cdef = cffi_cdef.replace('CUSPARSEAPI', '__stdcall') + else: # posix, etc + cffi_cdef = cffi_cdef.replace('CUSPARSEAPI', '') + + if cffi_out_file is not None: + # create specified output directory if it doesn't already exist + out_dir = os.path.dirname(cffi_out_file) + if out_dir and not os.path.exists(out_dir): + os.makedirs(out_dir) + with open(cffi_out_file, 'w') as f: + f.write(cffi_cdef) + + return cffi_cdef + +# sourcefilename = ffi.verifier.sourcefilename +# modulefilename = ffi.verifier.modulefilename + + +def reindent(s, numSpaces=4, lstrip=True): + """add indentation to a multiline string. + + Parameters + ---------- + s : str + string to reformat + numSpaces : int, optional + number of spaces to indent each line by + lstrip : bool, optional + if True, lstrip() prior to adding numSpaces + + Returns + ------- + s : str + reformatted str + """ + s = s.split('\n') + if lstrip: + s = [line.lstrip() for line in s] + + for idx, line in enumerate(s): + if line.strip() == '': + # don't indent empty lines + s[idx] = '' + else: + s[idx] = (numSpaces * ' ') + line + s = '\n'.join(s) + return s + + +def _find_breakpoint(line, break_pattern=', ', nmax=80): + """ determine where to break the line """ + locs = [m.start() for m in re.finditer(break_pattern, line)] + if len(locs) > 0: + break_loc = locs[np.where( + np.asarray(locs) < (nmax - len(break_pattern)))[0][-1]] + break_loc += len(break_pattern) + else: + break_loc = None + return locs, break_loc + +
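# Editorial sketch (not part of the autogenerated wrappers): how the formatting
# helpers defined here are typically combined when emitting wrapper source.
# _split_line is defined immediately below; cusparseDcsrmv is only an example name.
#
#     sig = "def cusparseDcsrmv(handle, transA, m, n, nnz, alpha, descrA, csrValA, csrRowPtrA, csrColIndA, x, beta, y):\n"
#     sig = _split_line(sig, break_pattern=', ', nmax=79)
#     # -> broken at the last ', ' before column 79; continuation lines are
#     #    padded out to the column following the first '('
#     body = "status = ffi_lib.cusparseDcsrmv(...)\ncusparseCheckStatus(status)\n"
#     body = reindent(body, numSpaces=4, lstrip=False)
#     # -> every non-empty line gains four leading spaces so it sits inside the
#     #    generated function body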
+def _split_line(line, break_pattern=', ', nmax=80, pad_char='('): + """ split a line (repeatedly) until length < nmax chars. + + the split will occur at the last occurrence of break_pattern occurring before nmax + characters + + subsequent lines will be indented until the first occurrence of pad_char + in the initial line + + Parameters + ---------- + line : str + line to reformat + break_pattern : str, optional + break line only where this pattern occurs + nmax : int, optional + max number of characters to allow + pad_char : str, optional + auto-indent subsequent lines up to the first occurrence of pad_char + + Returns + ------- + new_line : str + reformatted line + """ + if len(line) < nmax: + return line.rstrip() + '\n' + locs, break_loc = _find_breakpoint(line, + break_pattern=break_pattern, + nmax=nmax) + if break_loc is None: + return line.rstrip() + '\n' + if pad_char is not None: + npad = line.find(pad_char) + 1 + else: + npad = 0 + + lines = [] + lines.append(line[:break_loc].rstrip()) + line = ' ' * npad + line[break_loc:] + while (len(line) > nmax - 1) and (break_loc is not None): + locs, break_loc = _find_breakpoint(line, + break_pattern=break_pattern, + nmax=nmax) + lines.append(line[:break_loc].rstrip()) + line = ' ' * npad + line[break_loc:] + lines.append(line.rstrip()) + return '\n'.join(lines) + '\n' + + +def _build_func_sig(func_name, arg_dict, return_type): + """ generate the python wrapper function signature line(s). """ + + if 'Create' in func_name: + # don't pass in any argument to creation functions + return "def %s():\n" % func_name + + if ('Get' in func_name) and (return_type == 'cusparseStatus_t') and \ + len(arg_dict) == 2: + basic_getter = True + else: + basic_getter = False + + sig = "def %s(" % func_name + for k, v in arg_dict.items(): + is_ptr = '*' in v + if is_ptr and basic_getter: + continue + sig += k + ", " + sig = sig[:-2] + "):\n" + # wrap to 2nd line if too long + return _split_line(sig, break_pattern=', ', nmax=79) + + +def _build_doc_str(arg_dict, func_description='', variable_descriptions={}): + """ generate python wrapper docstring """ + docstr = '"""' + func_description + '\n' + docstr += 'Parameters\n----------\n' + for k, v in arg_dict.items(): + docstr += k + " : " + v + "\n" + if k in variable_descriptions: + docstr += reindent(variable_descriptions[k], + numSpaces=4, + lstrip=True) + else: + print("no variable description provided for {}".format(k)) + docstr += '"""\n' + return reindent(docstr, numSpaces=4, lstrip=False) + + +def _build_body(func_name, arg_dict, return_type): + """ generate python_wrapper function body + + Note: this code is highly specific to the particulars of the cuSPARSE + library + + """ + body = "" + arg_list = "" + + # the following are pointers to scalar outputs + # Note: pBufferSize was renamed pBufferSizeInBytes in v6.5 + scalar_ptr_outputs = ['nnzTotalDevHostPtr', + 'pBufferSize', + 'pBufferSizeInBytes', + 'resultDevHostPtr'] + + is_creator = 'cusparseCreate' in func_name + is_getter = 'cusparseGet' in func_name + + if return_type == 'cusparseStatus_t' and not (is_creator or is_getter): + is_return = False + else: + is_return = True + + # else: + return_str = '' + for k, v in arg_dict.items(): + + """ + set some flags based on the name/type of the argument + will use these flags to determine whether and how to call ffi.new or + ffi.cast on each variable + """ + is_ptr = '*' in v + is_cusparse_type = '_t' in v + is_cusparse_ptr = is_ptr and is_cusparse_type + is_output_scalar = k in scalar_ptr_outputs + if k in ['alpha', 'beta']: + is_scalar = True + else: + is_scalar = False + if is_getter: + is_gpu_array = False + else: + is_gpu_array = is_ptr 
and (not is_cusparse_ptr) and (not is_scalar) + if 'Complex' in v: + is_complex = True + else: + is_complex = False + + # convert variable to appropriate type for the FFI + if is_output_scalar: + # for scalar outputs make a new pointer + body += "%s = ffi.cast('%s', %s)\n" % (k, v, k) + elif is_getter and is_ptr and (return_type == 'cusparseStatus_t'): + # any pointers in cusparseGet* are new outputs to be created + body += "%s = ffi.new('%s')\n" % (k, v) + elif is_gpu_array: + # pass pointer to GPU array data (use either .ptr or .gpudata) + body += "%s = ffi.cast('%s', %s.ptr)\n" % (k, v, k) + elif is_cusparse_ptr: + if is_creator: + # generate custom cusparse type + body += "%s = ffi.new('%s')\n" % (k, v) + else: + # cast to the custom cusparse type + body += "%s = ffi.cast('%s', %s)\n" % (k, v, k) + elif is_ptr and is_scalar: + # create new pointer, with value initialized to scalar + if is_complex: + # complex case is a bit tricky. requires ffi.buffer + body += "%sffi = ffi.new('%s')\n" % (k, v) + if 'cusparseC' in func_name: + body += "ffi.buffer(%sffi)[:] = \ + np.complex64(%s).tostring()\n" % (k, k) + elif 'cusparseZ' in func_name: + body += "ffi.buffer(%sffi)[:] = \ + np.complex128(%s).tostring()\n" % (k, k) + else: + body += "%s = ffi.new('%s', %s)\n" % (k, v, k) + elif is_ptr or v == 'cudaStream_t': + # case non-scalar pointer to appropriate type + body += "%s = ffi.cast('%s', %s)\n" % (k, v, k) + else: + # don't need explicit cast for plain int, float, etc + pass + + # build the list of arguments to pass to the API + if is_ptr and is_scalar and is_complex: + # take into account modified argument name for complex scalars + arg_list += "%sffi, " % k + else: + arg_list += "%s, " % k + + # add the function call and optionally return the result + last_key = k + arg_list = arg_list[:-2] # remove trailing ", " + if is_getter and return_type != 'cusparseStatus_t': + body += "return ffi_lib.%s(%s)\n" % (func_name, arg_list) + else: + # check cusparseStatus_t state before returning + call_str = "status = ffi_lib.%s(%s)\n" % (func_name, arg_list) + body += _split_line(call_str, break_pattern=', ', nmax=76) + body += "cusparseCheckStatus(status)\n" + if is_return: + # len(arg_dict) == 2) is to avoid return for cusparseGetLevelInfo + if is_creator or (is_getter and (len(arg_dict) == 2)): + body += "return %s[0]\n" % last_key + else: + body += "#TODO: return the appropriate result" + body += '\n\n' + return reindent(body, numSpaces=4, lstrip=False) + + +def _func_str(func_name, arg_dict, return_type, + variable_descriptions={}, func_description=''): + """ build a single python wrapper """ + fstr = _build_func_sig(func_name, arg_dict, return_type) + fstr += _build_doc_str(arg_dict, func_description=func_description, + variable_descriptions=variable_descriptions) + fstr += _build_body(func_name, arg_dict, return_type) + return fstr + + +def build_python_func(cdef, variable_descriptions={}, func_descriptions={}): + """ wrap a single python function corresponding to the given cdef C + function string. 
+ + Parameters + ---------- + cdef : str + single line string containing a C function definition + variable_descriptions : dict + dictionary of variable descriptions for the docstring + func_descriptions : dict + dictionary of function descriptions for the docstring + + Returns + ------- + str corresponding to the python_wrapper + """ + cdef_regex = "(\w*)\s*(\w*)\s*\((.*)\).*" + p = re.compile(cdef_regex) + match = p.search(cdef) + (return_type, func_name, func_args) = match.group(1, 2, 3) + func_args = func_args.split(', ') + + from collections import OrderedDict + arg_dict = OrderedDict() + for arg in func_args: + substr = arg.split() + if len(substr) == 2: + val = substr[0] + else: + val = substr[-2] + key = substr[-1] + # handle pointer + if key[0] == '*': + val += ' *' + key = key[1:] + # handle pointer to pointer + if key[0] == '*': + val += '*' + key = key[1:] + arg_dict[key] = val + + func_description = func_descriptions.get(func_name, '') + return _func_str(func_name, arg_dict, return_type, + variable_descriptions=variable_descriptions, + func_description=func_description) + + +# with open('cusparse_variable_descriptions.json', 'w') as fid: +# json.dump(variable_descriptions, fid, sort_keys=True, indent=4) + +def get_variable_descriptions(var_def_json): + """ load variable description dictionary from .json file""" + with open(var_def_json, 'r') as fid: + variable_descriptions = json.load(fid) + for k, v in variable_descriptions.items(): + variable_descriptions[k] = _split_line(v, break_pattern=' ', nmax=72, + pad_char=None) + return variable_descriptions + + +def get_function_descriptions(func_def_json): + """ load function description dictionary from .json file""" + with open(func_def_json, 'r') as fid: + func_descriptions = json.load(fid) + for k, v in func_descriptions.items(): + func_descriptions[k] = _split_line(v, break_pattern=' ', nmax=72, + pad_char=None) + return func_descriptions + + +def generate_func_descriptions_json(ffi_lib, json_file): + func_descriptions = {} + for t in ['S', 'D', 'C', 'Z']: + # functions introduced in CUDA 3.2 + func_descriptions['cusparse' + t + 'axpyi'] = 'scalar multiply and add: y = y + alpha * x' + func_descriptions['cusparse' + t + 'coo2csr'] = 'convert sparse matrix formats: COO to CSR' + func_descriptions['cusparse' + t + 'csc2dense'] = 'convert sparse matrix formats: CSC to dense' + func_descriptions['cusparse' + t + 'csr2coo'] = 'convert sparse matrix formats: CSR to COO' + func_descriptions['cusparse' + t + 'csr2csc'] = 'convert sparse matrix formats: CSR to CSC' + func_descriptions['cusparse' + t + 'csr2dense'] = 'convert sparse matrix formats: CSR to dense' + func_descriptions['cusparse' + t + 'csrmv'] = 'sparse CSR matrix vector product: y = alpha * op(A)*x + beta * y' + func_descriptions['cusparse' + t + 'csrmm'] = 'sparse CSR matrix-matrix product: C = alpha * op(A) * B + beta * C' + func_descriptions['cusparse' + t + 'dense2csc'] = 'convert sparse matrix formats: dense to CSC' + func_descriptions['cusparse' + t + 'dense2csr'] = 'convert sparse matrix formats: dense to CSR' + func_descriptions['cusparse' + t + 'doti'] = 'dot product: result = y.T * x' + func_descriptions['cusparse' + t + 'dotci'] = 'complex conjugate dot product: result = y.H * x' + func_descriptions['cusparse' + t + 'gthr'] = 'gather elements of y at locations xInd into data array xVal' + func_descriptions['cusparse' + t + 'gthrz'] = 'gather elements of y at locations xInd into data array xVal. 
Also zeros the gathered elements in y' + func_descriptions['cusparse' + t + 'nnz'] = 'compute number of nonzero elements per row or column and the total number of nonzero elements' + func_descriptions['cusparse' + t + 'roti'] = 'applies Givens rotation matrix to sparse x and dense y' + func_descriptions['cusparse' + t + 'sctr'] = 'scatters elements of vector x into the vector y (at locations xInd)' + # for CUDA 4.0+ below here + func_descriptions['cusparse' + t + 'csrsv_analysis'] = 'perform analysis phase of csrsv' + func_descriptions['cusparse' + t + 'csrsv_solve'] = 'perform solve phase of csrsv' + # for CUDA 4.1+ below here + func_descriptions['cusparse' + t + 'csr2hyb'] = 'convert sparse matrix formats: CSR to HYB' + func_descriptions['cusparse' + t + 'csrsm_analysis'] = 'perform analysis phase of csrsm' + func_descriptions['cusparse' + t + 'csrsm_solve'] = 'perform solve phase of csrsm' + func_descriptions['cusparse' + t + 'dense2hyb'] = 'convert sparse matrix formats: dense to HYB' + func_descriptions['cusparse' + t + 'gtsv'] = 'solve tridiagonal system with multiple right-hand sides: A * Y = alpha * X' + func_descriptions['cusparse' + t + 'gtsvStridedBatch'] = 'solve multiple tridiagonal systems for i = 0, ..., batchCount: A_i * y_i = alpha * x_i' + func_descriptions['cusparse' + t + 'hyb2dense'] = 'convert sparse matrix formats: HYB to dense' + func_descriptions['cusparse' + t + 'hybmv'] = 'sparse HYB matrix vector product: y = alpha * op(A)*x + beta * y' + func_descriptions['cusparse' + t + 'hybsv_analysis'] = 'perform analysis phase of hybsv' + func_descriptions['cusparse' + t + 'hybsv_solve'] = 'perform solve phase of hybsv' + # for CUDA 5.0+ below here + func_descriptions['cusparse' + t + 'bsr2csr'] = 'convert sparse matrix formats: BSR to CSR' + func_descriptions['cusparse' + t + 'bsrmv'] = 'sparse BSR matrix vector product: y = alpha * op(A)*x + beta * y' + func_descriptions['cusparse' + t + 'bsrxmv'] = 'sparse BSRX matrix vector product: y = alpha * op(A)*x + beta * y' + func_descriptions['cusparse' + t + 'csr2bsr'] = 'convert sparse matrix formats: CSR to BSR' + func_descriptions['cusparse' + t + 'csrgeam'] = 'sparse CSR matrix-matrix operation: C = alpha * A + beta * B' + func_descriptions['cusparse' + t + 'csrgemm'] = 'sparse CSR matrix-matrix operation: C = op(A) * op(B)' + func_descriptions['cusparse' + t + 'csric0'] = 'CSR incomplete-Cholesky factorization: op(A) ~= R.T * R' + func_descriptions['cusparse' + t + 'csrilu0'] = 'CSR incomplete-LU factorization: op(A) ~= LU' + func_descriptions['cusparse' + t + 'hyb2csr'] = 'convert sparse matrix formats: HYB to CSR' + # for CUDA 5.5+ below here + func_descriptions['cusparse' + t + 'csc2hyb'] = 'convert sparse matrix formats: CSC to HYB' + func_descriptions['cusparse' + t + 'csrmm2'] = 'sparse CSR matrix-matrix product type 2: C = alpha * op(A) * op(B) + beta * C' + func_descriptions['cusparse' + t + 'gtsv_nopivot'] = 'solve tridiagonal system with multiple right-hand sides: A * Y = alpha * X' + func_descriptions['cusparse' + t + 'hyb2csc'] = 'convert sparse matrix formats: HYB to CSC' + # for CUDA 6.0+ below here + func_descriptions['cusparse' + t + 'bsric02_bufferSize'] = 'return bsric02 (A ~= L * L.H) buffer size' + func_descriptions['cusparse' + t + 'bsric02_analysis'] = 'perform bsric02 (A ~= L * L.H) analysis phase' + func_descriptions['cusparse' + t + 'bsric02'] = 'perform bsric02 (A ~= L * L.H) solve phase' + func_descriptions['cusparse' + t + 'bsrilu02'] = 'perform bsrilu02 (A ~= LU) solve phase' + 
func_descriptions['cusparse' + t + 'bsrilu02_analysis'] = 'perform bsrilu02 (A ~= LU) analysis phase' + func_descriptions['cusparse' + t + 'bsrilu02_bufferSize'] = 'return bsrilu02 (A ~= LU) buffer size' + func_descriptions['cusparse' + t + 'bsrilu02_numericBoost'] = 'use a boost value to replace a numerical value in incomplete LU factorization' + func_descriptions['cusparse' + t + 'bsrsv2_analysis'] = 'perform analysis phase of bsrsv2' + func_descriptions['cusparse' + t + 'bsrsv2_bufferSize'] = 'return size of buffer used in bsrsv2' + func_descriptions['cusparse' + t + 'bsrsv2_solve'] = 'perform solve phase of bsrsv2' + func_descriptions['cusparse' + t + 'csr2gebsr'] = 'convert sparse matrix formats: CSR to GEBSR' + func_descriptions['cusparse' + t + 'csr2gebsr_bufferSize'] = 'return csr2gebsr buffer size' + func_descriptions['cusparse' + t + 'csrsv2_analysis'] = 'perform analysis phase of csrsv2' + func_descriptions['cusparse' + t + 'csrsv2_bufferSize'] = 'return size of buffer used in csrsv2' + func_descriptions['cusparse' + t + 'csrsv2_solve'] = 'perform solve phase of csrsv2' + func_descriptions['cusparse' + t + 'csric02_analysis'] = 'perform csric02 (A ~= L * L.H) analysis phase' + func_descriptions['cusparse' + t + 'csric02_bufferSize'] = 'return csric02 (A ~= L * L.H) buffer size' + func_descriptions['cusparse' + t + 'csric02'] = 'perform csric02 (A ~= L * L.H) solve phase' + func_descriptions['cusparse' + t + 'csrilu02'] = 'perform csrilu02 (A ~= LU) solve phase' + func_descriptions['cusparse' + t + 'csrilu02_analysis'] = 'perform csrilu02 (A ~= LU) analysis phase' + func_descriptions['cusparse' + t + 'csrilu02_bufferSize'] = 'return csrilu02 (A ~= LU) buffer size' + func_descriptions['cusparse' + t + 'csrilu02_numericBoost'] = 'use a boost value to replace a numerical value in incomplete LU factorization' + func_descriptions['cusparse' + t + 'gebsr2csr'] = 'convert sparse matrix formats: GEBSR to CSR' + func_descriptions['cusparse' + t + 'gebsr2gebsc'] = 'convert sparse matrix formats: GEBSR to GEBSC' + func_descriptions['cusparse' + t + 'gebsr2gebsc_bufferSize'] = 'return gebsr2gebsc buffer size' + func_descriptions['cusparse' + t + 'gebsr2gebsr'] = 'convert sparse matrix formats: GEBSR to GEBSR' + func_descriptions['cusparse' + t + 'gebsr2gebsr_bufferSize'] = 'return gebsr2gebsr or gebsr2gebsrNnz buffer size' + # for CUDA 6.5+ below here + func_descriptions['cusparse' + t + 'bsrmm'] = 'sparse BSR matrix-matrix product: C = alpha * op(A) * B + beta * C' + func_descriptions['cusparse' + t + 'bsrsm2_analysis'] = 'perform analysis phase of bsrsm2' + func_descriptions['cusparse' + t + 'bsrsm2_bufferSize'] = 'return size of buffer used in bsrsm2' + func_descriptions['cusparse' + t + 'bsrsm2_solve'] = 'perform solve phase of bsrsm2' + # for CUDA 7.0+ below here + func_descriptions['cusparse' + t + 'coosort_bufferSizeExt'] = 'determine buffer size for coosort' + func_descriptions['cusparse' + t + 'coosortByColumn'] = 'in-place sort of COO format by column' + func_descriptions['cusparse' + t + 'coosortByRow'] = 'in-place sort of COO format by row' + func_descriptions['cusparse' + t + 'cscsort'] = 'in-place sort of CSC' + func_descriptions['cusparse' + t + 'cscsort_bufferSizeExt'] = 'determine buffer size for cscsort' + func_descriptions['cusparse' + t + 'csr2csru'] = 'convert sorted CSR to unsorted CSR' + func_descriptions['cusparse' + t + 'csrcolor'] = 'performs the coloring of the adjacency graph associated with the matrix A stored in CSR format' + func_descriptions['cusparse' + t + 
'csrgemm2'] = 'generalization of the csrgemm. a matrix-matrix operation: C=alpha*A*B + beta*D' + func_descriptions['cusparse' + t + 'csrgemm2_bufferSizeExt'] = 'returns the buffer size required by csrgemm2' + func_descriptions['cusparse' + t + 'csrsort'] = 'in-place sort of CSR' + func_descriptions['cusparse' + t + 'csrsort_bufferSizeExt'] = 'determine buffer size for csrsort' + func_descriptions['cusparse' + t + 'csru2csr_bufferSizeExt'] = 'determine buffer size for csru2csr and csr2csru' + func_descriptions['cusparse' + t + 'csru2csr'] = 'convert unsorted CSR to sorted CSR' + # CUDA 6.0 functions renamed in CUDA 7.0+ below here + func_descriptions['cusparse' + t + 'bsric02_bufferSizeExt'] = 'return bsric02 (A ~= L * L.H) buffer size' + func_descriptions['cusparse' + t + 'bsrilu02_bufferSizeExt'] = 'return bsrilu02 (A ~= LU) buffer size' + func_descriptions['cusparse' + t + 'bsrsm2_bufferSizeExt'] = 'return size of buffer used in bsrsm2' + func_descriptions['cusparse' + t + 'bsrsv2_bufferSizeExt'] = 'return size of buffer used in bsrsv2' + func_descriptions['cusparse' + t + 'csr2gebsr_bufferSizeExt'] = 'return csr2gebsr buffer size' + func_descriptions['cusparse' + t + 'csric02_bufferSizeExt'] = 'return csric02 (A ~= L * L.H) buffer size' + func_descriptions['cusparse' + t + 'csrilu02_bufferSizeExt'] = 'return csrilu02 (A ~= LU) buffer size' + func_descriptions['cusparse' + t + 'csrsv2_bufferSizeExt'] = 'return size of buffer used in csrsv2' + func_descriptions['cusparse' + t + 'gebsr2gebsc_bufferSizeExt'] = 'return gebsr2gebsc buffer size' + func_descriptions['cusparse' + t + 'gebsr2gebsr_bufferSizeExt'] = 'return gebsr2gebsr or gebsr2gebsrNnz buffer size' + + # operations common across all precisions below here + # for CUDA 5.0+ + func_descriptions['cusparseXcsr2bsrNnz'] = 'determine the number of nonzero block columns per block row for csr2bsr' + func_descriptions['cusparseXcsrgemmNnz'] = 'determine csrRowPtrC and the total number of nonzero elements for gemm' + func_descriptions['cusparseXcsrgeamNnz'] = 'determine csrRowPtrC and the total number of nonzero elements for geam' + # for CUDA 6.0+ + func_descriptions['cusparseXbsrsv2_zeroPivot'] = 'return numerical zero location for bsrsv2' + func_descriptions['cusparseXbsric02_zeroPivot'] = 'return numerical zero location for bsric02' + func_descriptions['cusparseXbsrilu02_zeroPivot'] = 'return numerical zero location for bsrilu02' + func_descriptions['cusparseXcsr2gebsrNnz'] = 'determine the number of nonzero block columns per block row for csr2gebsr' + func_descriptions['cusparseXcsric02_zeroPivot'] = 'return numerical zero location for csric02' + func_descriptions['cusparseXcsrilu02_zeroPivot'] = 'return numerical zero location for csrilu02' + func_descriptions['cusparseXcsrsv2_zeroPivot'] = 'return numerical zero location for csrsv2' + func_descriptions['cusparseXgebsr2gebsrNnz'] = 'determine the number of nonzero block columns per block row for gebsr2gebsr' + # for CUDA 6.5+ + func_descriptions['cusparseXbsrsm2_zeroPivot'] = 'return numerical zero location for bsrsm2' + # for CUDA 7.0+ + func_descriptions['cusparseCreateIdentityPermutation'] = 'creates an identity map for use with coosort, csrsort, cscsort, csr2csc_indexOnly' + func_descriptions['cusparseXcsrgemm2Nnz'] = 'determine csrRowPtrC and the number of non-zero elements for csrgemm2' + + create_funcs = [cdef for cdef in ffi_lib.__dict__ if 'Create' in cdef] + for func in create_funcs: + tmp, obj = func.split('Create') + if obj: + func_descriptions[func] = "Create cuSPARSE {} 
structure.".format(obj) + else: + func_descriptions[func] = "Create cuSPARSE context." + destroy_funcs = [cdef for cdef in ffi_lib.__dict__ if 'Destroy' in cdef] + for func in destroy_funcs: + tmp, obj = func.split('Destroy') + if obj: + func_descriptions[func] = "Destroy cuSPARSE {} structure.".format(obj) + else: + func_descriptions[func] = "Destroy cuSPARSE context." + get_funcs = [cdef for cdef in ffi_lib.__dict__ if 'Get' in cdef] + for func in get_funcs: + tmp, obj = func.split('Get') + func_descriptions[func] = "Get cuSPARSE {}.".format(obj) + + set_funcs = [cdef for cdef in ffi_lib.__dict__ if 'Set' in cdef] + for func in set_funcs: + tmp, obj = func.split('Set') + func_descriptions[func] = "Set cuSPARSE {}.".format(obj) + + # prune any of the above that aren't in ffi_lib: + func_descriptions = dict( + (k, v) for k, v in func_descriptions.items( + ) if k in ffi_lib.__dict__) + + with open(json_file, 'w') as fid: + json.dump(func_descriptions, fid, sort_keys=True, indent=4) + + +def generate_cusparse_python_wrappers(cffi_cdef=None, variable_defs_json='', + func_defs_json='', + python_wrapper_file=None): + """ generate python wrappers for all functions within cffi_cdef. + + Parameters + ---------- + cffi_cdef : str + cffi definition string as generated by `generate_cffi_cdef` + variable_defs_json : str, optional + filename of .json file containing dictionary of variable descriptions + func_defs_json : str, optional + filename of .json file containing dictionary of function descriptions + python_wrapper_file : str, optional + file to output the generated python wrappers to + + Returns + ------- + python_wrappers : str + string containing all of the python wrappers + + """ + cffi_cdef_list = cffi_cdef.split('\n') + + # find lines containing a function definition + func_def_lines = [] + for idx, line in enumerate(cffi_cdef_list): + if line.startswith('cusparse'): + func_def_lines.append(idx) + + # reformat each definition into a single line for easier string processing + n_funcs = len(func_def_lines) + cdef_list = [] + for i in range(len(func_def_lines)): + loc1 = func_def_lines[i] + if i < n_funcs - 1: + loc2 = func_def_lines[i + 1] + cdef = ' '.join([l.strip() for l in cffi_cdef_list[loc1:loc2]]) + else: + cdef = ' '.join([l.strip() for l in cffi_cdef_list[loc1:]]) + # strip any remaining comments after the semicolon + cdef = cdef[:cdef.find(';') + 1] + cdef_list.append(cdef) + + # read function and variable definition strings to use when building the + # the Python doc strings + if variable_defs_json: + variable_descriptions = get_function_descriptions(variable_defs_json) + if func_defs_json: + func_descriptions = get_function_descriptions(func_defs_json) + + # build the wrappers + python_wrappers = '' + for cdef in cdef_list: + python_wrappers += build_python_func( + cdef, + variable_descriptions=variable_descriptions, + func_descriptions=func_descriptions) + + if python_wrapper_file is not None: + with open(python_wrapper_file, 'w') as f: + f.write(python_wrappers) + return python_wrappers diff --git a/scikits/cuda/cusparse.py b/scikits/cuda/cusparse.py index 97cf0ff..038a972 100644 --- a/scikits/cuda/cusparse.py +++ b/scikits/cuda/cusparse.py @@ -1,387 +1,1455 @@ #!/usr/bin/env python +from __future__ import division, print_function, absolute_import +import functools +import numpy as np +import pycuda.autoinit +import pycuda.gpuarray as gpuarray +import pycuda.driver as drv -""" -Python interface to CUSPARSE functions. +from . 
import misc -Note: this module does not explicitly depend on PyCUDA. -""" +from .misc import init -import atexit -import ctypes.util -import platform -from string import Template -import sys -import warnings - -import numpy as np +try: + import scipy.sparse + from scipy.sparse.sputils import isscalarlike + has_scipy = True +except ImportError: + has_scipy = False -import cuda - -# Load library: -_version_list = [6.5, 6.0, 5.5, 5.0, 4.0] -if 'linux' in sys.platform: - _libcusparse_libname_list = ['libcusparse.so'] + \ - ['libcusparse.so.%s' % v for v in _version_list] -elif sys.platform == 'darwin': - _libcusparse_libname_list = ['libcusparse.dylib'] -elif sys.platform == 'win32': - if platform.machine().endswith('64'): - _libcusparse_libname_list = ['cusparse.dll'] + \ - ['cusparse64_%s.dll' % int(10*v) for v in _version_list] - else: - _libcusparse_libname_list = ['cusparse.dll'] + \ - ['cusparse32_%s.dll' % int(10*v) for v in _version_list] -else: - raise RuntimeError('unsupported platform') - -# Print understandable error message when library cannot be found: -_libcusparse = None -for _libcusparse_libname in _libcusparse_libname_list: - try: - if sys.platform == 'win32': - _libcusparse = ctypes.windll.LoadLibrary(_libcusparse_libname) - else: - _libcusparse = ctypes.cdll.LoadLibrary(_libcusparse_libname) - except OSError: - pass - else: - break -if _libcusparse == None: - OSError('CUDA sparse library not found') - -class cusparseError(Exception): - """CUSPARSE error""" - pass - -class cusparseStatusNotInitialized(cusparseError): - """CUSPARSE library not initialized""" - pass - -class cusparseStatusAllocFailed(cusparseError): - """CUSPARSE resource allocation failed""" - pass - -class cusparseStatusInvalidValue(cusparseError): - """Unsupported value passed to the function""" - pass - -class cusparseStatusArchMismatch(cusparseError): - """Function requires a feature absent from the device architecture""" - pass - -class cusparseStatusMappingError(cusparseError): - """An access to GPU memory space failed""" - pass - -class cusparseStatusExecutionFailed(cusparseError): - """GPU program failed to execute""" - pass - -class cusparseStatusInternalError(cusparseError): - """An internal CUSPARSE operation failed""" - pass - -class cusparseStatusMatrixTypeNotSupported(cusparseError): - """The matrix type is not supported by this function""" - pass - -cusparseExceptions = { - 1: cusparseStatusNotInitialized, - 2: cusparseStatusAllocFailed, - 3: cusparseStatusInvalidValue, - 4: cusparseStatusArchMismatch, - 5: cusparseStatusMappingError, - 6: cusparseStatusExecutionFailed, - 7: cusparseStatusInternalError, - 8: cusparseStatusMatrixTypeNotSupported, - } - -# Matrix types: -CUSPARSE_MATRIX_TYPE_GENERAL = 0 -CUSPARSE_MATRIX_TYPE_SYMMETRIC = 1 -CUSPARSE_MATRIX_TYPE_HERMITIAN = 2 -CUSPARSE_MATRIX_TYPE_TRIANGULAR = 3 - -CUSPARSE_FILL_MODE_LOWER = 0 -CUSPARSE_FILL_MODE_UPPER = 1 - -# Whether or not a matrix' diagonal entries are unity: -CUSPARSE_DIAG_TYPE_NON_UNIT = 0 -CUSPARSE_DIAG_TYPE_UNIT = 1 - -# Matrix index bases: -CUSPARSE_INDEX_BASE_ZERO = 0 -CUSPARSE_INDEX_BASE_ONE = 1 - -# Operation types: -CUSPARSE_OPERATION_NON_TRANSPOSE = 0 -CUSPARSE_OPERATION_TRANSPOSE = 1 -CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE = 2 - -# Whether or not to parse elements of a dense matrix row or column-wise. 
-CUSPARSE_DIRECTION_ROW = 0 -CUSPARSE_DIRECTION_COLUMN = 1 - -# Helper functions: -class cusparseMatDescr(ctypes.Structure): - _fields_ = [ - ('MatrixType', ctypes.c_int), - ('FillMode', ctypes.c_int), - ('DiagType', ctypes.c_int), - ('IndexBase', ctypes.c_int) - ] - -def cusparseCheckStatus(status): - """ - Raise CUSPARSE exception + # copy of isscalarlike from scipy.sparse.sputils + def isscalarlike(x): + """Is x either a scalar, an array scalar, or a 0-dim array?""" + return np.isscalar(x) or (isdense(x) and x.ndim == 0) - Raise an exception corresponding to the specified CUSPARSE error - code. +toolkit_version = drv.get_version() - Parameters - ---------- - status : int - CUSPARSE error code. +if toolkit_version < (3, 2, 0): + raise ImportError("cuSPARSE not present prior to v3.2 of the CUDA toolkit") - See Also - -------- - cusparseExceptions +""" +Python interface to cuSPARSE functions. +Note: You may need to set the environment variable CUDA_ROOT to the base of +your CUDA installation. +""" +# import low level cuSPARSE python wrappers and constants + +try: + from ._cusparse_cffi import * +except Exception as e: + print(repr(e)) + estr = "autogeneration and import of cuSPARSE wrappers failed\n" + estr += ("Try setting the CUDA_ROOT environment variable to the base of " + "your CUDA installation. The autogeneration script tries to " + "find the CUSPARSE header at CUDA_ROOT/include/cusparse_v2.h or " + "CUDA_ROOT/include/cusparse.h\n") + raise ImportError(estr) + +# define higher level wrappers for common functions +# will check dimensions, autoset some variables and call the appropriate +# function based on the input dtype + +def defineIf(condition): + def decorator(func): + @functools.wraps(func) + def func_wrapper(*args, **kwargs): + if condition: + return func(*args, **kwargs) + else: + raise NotImplementedError("requested cuSPARSE function not " + "available for your CUDA version") + return func_wrapper + return decorator + + +def copyMatDescr(descr): + """ create a new copy of Matrix Descriptor, descr """ + descr_copy = cusparseCreateMatDescr() + cusparseSetMatType(descr_copy, cusparseGetMatType(descr)) + cusparseSetMatIndexBase(descr_copy, cusparseGetMatIndexBase(descr)) + cusparseSetMatDiagType(descr_copy, cusparseGetMatDiagType(descr)) + cusparseSetMatFillMode(descr_copy, cusparseGetMatFillMode(descr)) + return descr_copy + + +def dense_nnz(descrA, A, handle=None, dirA=CUSPARSE_DIRECTION_ROW, lda=None, + nnzPerRowCol=None, nnzTotalDevHostPtr=None): + """ higher level wrapper to cusparsennz routines """ + if not isinstance(A, pycuda.gpuarray.GPUArray): + raise ValueError("A must be a pyCUDA gpuarray") + if len(A.shape) != 2: + raise ValueError("A must be 2D") + if lda is None: + lda = A.shape[0] + + if handle is None: + handle = misc._global_cusparse_handle + + m, n = A.shape + assert lda >= m + dtype = A.dtype + + alloc = misc._global_cusparse_allocator + + if nnzPerRowCol is None: + if dirA == CUSPARSE_DIRECTION_ROW: + nnzPerRowCol = gpuarray.zeros((m, ), dtype=np.int32, + allocator=alloc) + elif dirA == CUSPARSE_DIRECTION_COLUMN: + nnzPerRowCol = gpuarray.zeros((n, ), dtype=np.int32, + allocator=alloc) + else: + raise ValueError("Invalid dirA") + if nnzTotalDevHostPtr is None: + nnzTotalDevHostPtr = ffi.new('int *', 0) + if dtype == np.float32: + fn = cusparseSnnz + elif dtype == np.float64: + fn = cusparseDnnz + elif dtype == np.complex64: + fn = cusparseCnnz + elif dtype == np.complex128: + fn = cusparseZnnz + else: + raise ValueError("unsupported sparse matrix dtype: 
%s" % dtype) + fn(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol, + nnzTotalDevHostPtr) + return nnzPerRowCol, nnzTotalDevHostPtr[0] + + +def dense2csr(A, handle=None, descrA=None, lda=None, check_inputs=True): + """ Convert dense matrix to CSR. """ + if not isinstance(A, pycuda.gpuarray.GPUArray): + # try moving list or numpy array to GPU + A = np.asfortranarray(np.atleast_2d(A)) + A = gpuarray.to_gpu(A) + if check_inputs: + if not isinstance(A, pycuda.gpuarray.GPUArray): + raise ValueError("A must be a pyCUDA gpuarray") + if len(A.shape) != 2: + raise ValueError("A must be 2D") + if descrA is not None: + if cusparseGetMatType(descrA) != CUSPARSE_MATRIX_TYPE_GENERAL: + raise ValueError("Only general matrix type supported") + if not A.flags.f_contiguous: + raise ValueError("Dense matrix A must be in column-major order") + + if lda is None: + lda = A.shape[0] + m, n = A.shape + assert lda >= m + dtype = A.dtype + + if handle is None: + handle = misc._global_cusparse_handle + + if descrA is None: + descrA = cusparseCreateMatDescr() + cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL) + cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO) + + nnzPerRow, nnz = dense_nnz( + descrA, A, handle=handle, dirA=CUSPARSE_DIRECTION_ROW, lda=lda) + + alloc = misc._global_cusparse_allocator + + csrRowPtrA = gpuarray.zeros((m+1, ), dtype=np.int32, allocator=alloc) + csrColIndA = gpuarray.zeros((nnz, ), dtype=np.int32, allocator=alloc) + csrValA = gpuarray.zeros((nnz, ), dtype=dtype, allocator=alloc) + + if dtype == np.float32: + fn = cusparseSdense2csr + elif dtype == np.float64: + fn = cusparseDdense2csr + elif dtype == np.complex64: + fn = cusparseCdense2csr + elif dtype == np.complex128: + fn = cusparseZdense2csr + else: + raise ValueError("unsupported sparse matrix dtype: %s" % dtype) + + fn(handle, m, n, descrA, A, lda, nnzPerRow, csrValA, csrRowPtrA, + csrColIndA) + return (descrA, csrValA, csrRowPtrA, csrColIndA) + + +def csr2dense(m, n, descrA, csrValA, csrRowPtrA, csrColIndA, A=None, + handle=None, lda=None, check_inputs=True): + """ convert CSR matrix to dense """ + if check_inputs: + if A is not None: + if not isinstance(A, pycuda.gpuarray.GPUArray): + raise ValueError("A must be a pyCUDA gpuarray") + if len(A.shape) != 2: + raise ValueError("A must be 2D") + if cusparseGetMatType(descrA) != CUSPARSE_MATRIX_TYPE_GENERAL: + raise ValueError("Only general matrix type supported") + if cusparseGetMatIndexBase(descrA) != CUSPARSE_INDEX_BASE_ZERO: + raise ValueError("Only base 0 matrix supported") + for arr in [csrValA, csrRowPtrA, csrColIndA]: + if not isinstance(arr, pycuda.gpuarray.GPUArray): + raise ValueError("csr* inputs must be a pyCUDA gpuarrays") + if (csrRowPtrA.size != m + 1): + raise ValueError("A: inconsistent size") + + if handle is None: + handle = misc._global_cusparse_handle + + if lda is None: + lda = m + assert lda >= m + dtype = csrValA.dtype + + alloc = misc._global_cusparse_allocator + A = gpuarray.zeros((m, n), dtype=dtype, order='F', allocator=alloc) + + if dtype == np.float32: + fn = cusparseScsr2dense + elif dtype == np.float64: + fn = cusparseDcsr2dense + elif dtype == np.complex64: + fn = cusparseCcsr2dense + elif dtype == np.complex128: + fn = cusparseZcsr2dense + else: + raise ValueError("unsupported sparse matrix dtype: %s" % dtype) + + fn(handle, m, n, descrA, csrValA, csrRowPtrA, csrColIndA, A, lda) + return A + + +def dense2csc(A, handle=None, descrA=None, lda=None, check_inputs=True): + """ Convert dense matrix to CSC. 
""" + if not isinstance(A, pycuda.gpuarray.GPUArray): + # try moving list or numpy array to GPU + A = np.asfortranarray(np.atleast_2d(A)) + A = gpuarray.to_gpu(A) + if check_inputs: + if not isinstance(A, pycuda.gpuarray.GPUArray): + raise ValueError("A must be a pyCUDA gpuarray") + if len(A.shape) != 2: + raise ValueError("A must be 2D") + if descrA is not None: + if cusparseGetMatType(descrA) != CUSPARSE_MATRIX_TYPE_GENERAL: + raise ValueError("Only general matrix type supported") + if not A.flags.f_contiguous: + raise ValueError("Dense matrix A must be in column-major order") + + if lda is None: + lda = A.shape[0] + m, n = A.shape + assert lda >= m + dtype = A.dtype + + if handle is None: + handle = misc._global_cusparse_handle + + if descrA is None: + descrA = cusparseCreateMatDescr() + cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL) + cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO) + + nnzPerCol, nnz = dense_nnz( + descrA, A, handle=handle, dirA=CUSPARSE_DIRECTION_COLUMN, lda=lda) + + alloc = misc._global_cusparse_allocator + cscColPtrA = gpuarray.zeros((n+1, ), dtype=np.int32, allocator=alloc) + cscRowIndA = gpuarray.zeros((nnz, ), dtype=np.int32, allocator=alloc) + cscValA = gpuarray.zeros((nnz, ), dtype=dtype, allocator=alloc) + if dtype == np.float32: + fn = cusparseSdense2csc + elif dtype == np.float64: + fn = cusparseDdense2csc + elif dtype == np.complex64: + fn = cusparseCdense2csc + elif dtype == np.complex128: + fn = cusparseZdense2csc + else: + raise ValueError("unsupported sparse matrix dtype: %s" % dtype) + + fn(handle, m, n, descrA, A, lda, nnzPerCol, cscValA, cscRowIndA, + cscColPtrA) + return (descrA, cscValA, cscColPtrA, cscRowIndA) + + +def csc2dense(m, n, descrA, cscValA, cscColPtrA, cscRowIndA, A=None, + handle=None, lda=None, check_inputs=True): + """ convert CSC matrix to dense """ + if check_inputs: + if A is not None: + if not isinstance(A, pycuda.gpuarray.GPUArray): + raise ValueError("A must be a pyCUDA gpuarray") + if len(A.shape) != 2: + raise ValueError("A must be 2D") + if cusparseGetMatType(descrA) != CUSPARSE_MATRIX_TYPE_GENERAL: + raise ValueError("Only general matrix type supported") + if cusparseGetMatIndexBase(descrA) != CUSPARSE_INDEX_BASE_ZERO: + raise ValueError("Only base 0 matrix supported") + for arr in [cscValA, cscColPtrA, cscRowIndA]: + if not isinstance(arr, pycuda.gpuarray.GPUArray): + raise ValueError("csc* inputs must be a pyCUDA gpuarrays") + if (cscColPtrA.size != n + 1): + raise ValueError("A: inconsistent size") + + if handle is None: + handle = misc._global_cusparse_handle + + if lda is None: + lda = m + assert lda >= m + dtype = cscValA.dtype + alloc = misc._global_cusparse_allocator + A = gpuarray.zeros((m, n), dtype=dtype, order='F', allocator=alloc) + + if dtype == np.float32: + fn = cusparseScsc2dense + elif dtype == np.float64: + fn = cusparseDcsc2dense + elif dtype == np.complex64: + fn = cusparseCcsc2dense + elif dtype == np.complex128: + fn = cusparseZcsc2dense + else: + raise ValueError("unsupported sparse matrix dtype: %s" % dtype) + + fn(handle, m, n, descrA, cscValA, cscRowIndA, cscColPtrA, A, lda) + return A + + +def csr2coo(csrRowPtr, nnz, handle=None, m=None, cooRowInd=None, + idxBase=CUSPARSE_INDEX_BASE_ZERO, check_inputs=True): + """ convert CSR to COO """ + if check_inputs: + if cooRowInd is not None: + if not isinstance(cooRowInd, pycuda.gpuarray.GPUArray): + raise ValueError("cooRowInd must be a pyCUDA gpuarray") + if not isinstance(csrRowPtr, pycuda.gpuarray.GPUArray): + raise 
ValueError("csrRowPtr must be a pyCUDA gpuarraya") + if handle is None: + handle = misc._global_cusparse_handle + if m is None: + m = csrRowPtr.size - 1 + if cooRowInd is None: + alloc = misc._global_cusparse_allocator + cooRowInd = gpuarray.zeros((nnz, ), dtype=np.int32, allocator=alloc) + cusparseXcsr2coo(handle, csrRowPtr, nnz, m, cooRowInd, idxBase) + return cooRowInd + + +# define with alternate naming for convenience +def csc2coo(cscColPtr, nnz, handle=None, m=None, cooColInd=None, + idxBase=CUSPARSE_INDEX_BASE_ZERO, check_inputs=True): + """ convert CSC to COO """ + # if m is None: + # m = cooColPtr.size - 1 + cooColInd = csr2coo(csrRowPtr=cscColPtr, nnz=nnz, handle=handle, m=m, + cooRowInd=cooColInd, idxBase=idxBase, + check_inputs=check_inputs) + return cooColInd + + +def coo2csr(cooRowInd, m, handle=None, nnz=None, csrRowPtr=None, + idxBase=CUSPARSE_INDEX_BASE_ZERO, check_inputs=True): + """ convert COO to CSR """ + if check_inputs: + if csrRowPtr is not None: + if not isinstance(csrRowPtr, pycuda.gpuarray.GPUArray): + raise ValueError("csrRowPtr must be a pyCUDA gpuarray") + if not isinstance(cooRowInd, pycuda.gpuarray.GPUArray): + raise ValueError("cooRowInd must be a pyCUDA gpuarraya") + if handle is None: + handle = misc._global_cusparse_handle + if nnz is None: + nnz = cooRowInd.size + if csrRowPtr is None: + alloc = misc._global_cusparse_allocator + csrRowPtr = gpuarray.zeros((m+1, ), dtype=np.int32, allocator=alloc) + cusparseXcoo2csr(handle, cooRowInd, nnz, m, csrRowPtr, idxBase) + return csrRowPtr + + +# define with alternate naming for convenience +def coo2csc(cooColInd, m, handle=None, nnz=None, cscColPtr=None, + idxBase=CUSPARSE_INDEX_BASE_ZERO, check_inputs=True): + """ convert COO to CSC""" + cscColPtr = coo2csr(cooRowInd=cooColInd, m=m, handle=handle, nnz=nnz, + csrRowPtr=cscColPtr, idxBase=idxBase, + check_inputs=check_inputs) + return cscColPtr + + +def csr2csc(m, n, csrVal, csrRowPtr, csrColInd, handle=None, nnz=None, + cscVal=None, cscColPtr=None, cscRowInd=None, A=None, + copyValues=CUSPARSE_ACTION_NUMERIC, + idxBase=CUSPARSE_INDEX_BASE_ZERO, check_inputs=True): + """ convert CSR to CSC """ + if check_inputs: + if (cscVal is not None) or (cscColPtr is not None) or \ + (cscRowInd is not None): + for arr in [cscVal, cscColPtr, csrRowInd]: + if not isinstance(arr, pycuda.gpuarray.GPUArray): + raise ValueError("csc* inputs must all be pyCUDA gpuarrays" + " or None") + for arr in [csrVal, csrRowPtr, csrColInd]: + if not isinstance(arr, pycuda.gpuarray.GPUArray): + raise ValueError("csr* inputs must be a pyCUDA gpuarrays") + if (csrRowPtr.size != m + 1): + raise ValueError("A: inconsistent size") + if handle is None: + handle = misc._global_cusparse_handle + dtype = csrVal.dtype + nnz = csrVal.size + if cscVal is None: + alloc = misc._global_cusparse_allocator + cscVal = gpuarray.zeros((nnz, ), dtype=dtype, allocator=alloc) + cscColPtr = gpuarray.zeros((n+1, ), dtype=np.int32, allocator=alloc) + cscRowInd = gpuarray.zeros((nnz, ), dtype=np.int32, allocator=alloc) + if dtype == np.float32: + fn = cusparseScsr2csc + elif dtype == np.float64: + fn = cusparseDcsr2csc + elif dtype == np.complex64: + fn = cusparseCcsr2csc + elif dtype == np.complex128: + fn = cusparseZcsr2csc + else: + raise ValueError("unsupported sparse matrix dtype: %s" % dtype) + fn(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscRowInd, + cscColPtr, copyValues, idxBase) + return (cscVal, cscColPtr, cscRowInd) + + +# also define csc2csr as a convenience +def csc2csr(m, n, cscVal, 
+# also define csc2csr as a convenience +def csc2csr(m, n, cscVal, cscColPtr, cscRowInd, handle=None, nnz=None, + csrVal=None, csrRowPtr=None, csrColInd=None, + copyValues=CUSPARSE_ACTION_NUMERIC, + idxBase=CUSPARSE_INDEX_BASE_ZERO, check_inputs=True): + """ convert CSC to CSR """ + csrVal, csrRowPtr, csrColInd = csr2csc( + m, n, cscVal, cscColPtr, cscRowInd, handle=handle, nnz=nnz, + cscVal=csrVal, cscColPtr=csrRowPtr, cscRowInd=csrColInd, + copyValues=copyValues, idxBase=idxBase, check_inputs=check_inputs) + return csrVal, csrRowPtr, csrColInd + + +def csrmv(descrA, csrValA, csrRowPtrA, csrColIndA, m, n, x, handle=None, + nnz=None, transA=CUSPARSE_OPERATION_NON_TRANSPOSE, alpha=1.0, + beta=0.0, y=None, check_inputs=True): + """ multiply a sparse matrix, A, by a dense vector, x: + y = alpha * transA(A)*x + beta*y + + higher level wrapper to cusparsecsrmv routines """ + if check_inputs: + if not isinstance(csrValA, pycuda.gpuarray.GPUArray): + raise ValueError("csrValA must be a pyCUDA gpuarray") + if not isinstance(csrRowPtrA, pycuda.gpuarray.GPUArray): + raise ValueError("csrRowPtrA must be a pyCUDA gpuarray") + if not isinstance(csrColIndA, pycuda.gpuarray.GPUArray): + raise ValueError("csrColIndA must be a pyCUDA gpuarray") + if not isinstance(x, pycuda.gpuarray.GPUArray): + raise ValueError("x must be a pyCUDA gpuarray") + + if handle is None: + handle = misc._global_cusparse_handle + if nnz is None: + nnz = csrValA.size + dtype = csrValA.dtype + if y is None: + alloc = misc._global_cusparse_allocator + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + y = gpuarray.zeros((m, ), dtype=dtype, allocator=alloc) + else: + y = gpuarray.zeros((n, ), dtype=dtype, allocator=alloc) + # perform some basic sanity checks + if check_inputs: + if csrValA.size != nnz: + raise ValueError("length of csrValA array must match nnz") + if (x.dtype != dtype) or (y.dtype != dtype): + raise ValueError("incompatible dtypes") + if csrRowPtrA.size != (m+1): + raise ValueError("length of csrRowPtrA array must be m+1") + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + if x.size != n: + raise ValueError("sizes of x, A incompatible") + if y.size != m: + raise ValueError("sizes of y, A incompatible") + else: + if x.size != m: + raise ValueError("sizes of x, A incompatible") + if y.size != n: + raise ValueError("sizes of y, A incompatible") + if dtype == np.float32: + fn = cusparseScsrmv + elif dtype == np.float64: + fn = cusparseDcsrmv + elif dtype == np.complex64: + fn = cusparseCcsrmv + elif dtype == np.complex128: + fn = cusparseZcsrmv + else: + raise ValueError("unsupported sparse matrix dtype: %s" % dtype) + if toolkit_version >= (4, 1, 0): - if status != 0: - try: - raise cusparseExceptions[status] - except KeyError: - raise cusparseError + fn(handle, transA, m, n, nnz, alpha, descrA, csrValA, csrRowPtrA, + csrColIndA, x, beta, y) else: + fn(handle, transA, m, n, alpha, descrA, csrValA, csrRowPtrA, + csrColIndA, x, beta, y) -_libcusparse.cusparseCreate.restype = int -_libcusparse.cusparseCreate.argtypes = [ctypes.c_void_p] -def cusparseCreate(): - """ - Initialize CUSPARSE. + return y - Initializes CUSPARSE and creates a handle to a structure holding - the CUSPARSE library context. - Returns - ------- - handle : int - CUSPARSE library context. +def csrmm(m, n, k, descrA, csrValA, csrRowPtrA, csrColIndA, B, handle=None, + C=None, nnz=None, transA=CUSPARSE_OPERATION_NON_TRANSPOSE, alpha=1.0, + beta=0.0, ldb=None, ldc=None, check_inputs=True): + """ multiply a sparse matrix, A, by dense matrix B: + C = alpha * transA(A) * B + beta * C. 
+ higher level wrapper to cusparsecsrmm routines """ + if check_inputs: + for item in [csrValA, csrRowPtrA, csrColIndA, B]: + if not isinstance(item, pycuda.gpuarray.GPUArray): + raise ValueError("csr*, B, must be pyCUDA gpuarrays") + if C is not None: + if not isinstance(C, pycuda.gpuarray.GPUArray): + raise ValueError("C must be a pyCUDA gpuarray or None") + # dense matrices must be in column-major order + if not B.flags.f_contiguous: + raise ValueError("Dense matrix B must be in column-major order") + if handle is None: + handle = misc._global_cusparse_handle + + dtype = csrValA.dtype + if C is None: + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + ldc = m + else: + ldc = k + alloc = misc._global_cusparse_allocator + C = gpuarray.zeros((ldc, n), dtype=dtype, order='F', allocator=alloc) + elif not C.flags.f_contiguous: + raise ValueError("Dense matrix C must be in column-major order") + if nnz is None: + nnz = csrValA.size + if ldb is None: + ldb = B.shape[0] + if ldc is None: + ldc = C.shape[0] + + # perform some basic sanity checks + if check_inputs: + if csrValA.size != nnz: + raise ValueError("length of csrValA array must match nnz") + if (B.dtype != dtype) or (C.dtype != dtype): + raise ValueError("A, B, C must share a common dtype") + if ldb < B.shape[0]: + raise ValueError("ldb invalid for matrix B") + if ldc < C.shape[0]: + raise ValueError("ldc invalid for matrix C") + if (C.shape[1] != n) or (B.shape[1] != n): + raise ValueError("bad shape for B or C") + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + if (ldb != k) or (ldc != m): + raise ValueError("size of A incompatible with B or C") + else: + if (ldb != m) or (ldc != k): + raise ValueError("size of A incompatible with B or C") + if csrRowPtrA.size != m+1: + raise ValueError("length of csrRowPtrA invalid") + if dtype == np.float32: + fn = cusparseScsrmm + elif dtype == np.float64: + fn = cusparseDcsrmm + elif dtype == np.complex64: + fn = cusparseCcsrmm + elif dtype == np.complex128: + fn = cusparseZcsrmm + else: + raise ValueError("unsupported sparse matrix dtype: %s" % dtype) + if toolkit_version >= (4, 1, 0): + fn(handle, transA, m, n, k, nnz, alpha, descrA, csrValA, csrRowPtrA, + csrColIndA, B, ldb, beta, C, ldc) + else: + fn(handle, transA, m, n, k, alpha, descrA, csrValA, csrRowPtrA, + csrColIndA, B, ldb, beta, C, ldc) + return C - handle = ctypes.c_int() - status = _libcusparse.cusparseCreate(ctypes.byref(handle)) - cusparseCheckStatus(status) - return handle.value - -_libcusparse.cusparseDestroy.restype = int -_libcusparse.cusparseDestroy.argtypes = [ctypes.c_int] -def cusparseDestroy(handle): - """ - Release CUSPARSE resources. - Releases hardware resources used by CUSPARSE - Parameters - ---------- - handle : int - CUSPARSE library context. +@defineIf(toolkit_version >= (5, 5, 0)) +def csrmm2(m, n, k, descrA, csrValA, csrRowPtrA, csrColIndA, B, handle=None, + C=None, nnz=None, transA=CUSPARSE_OPERATION_NON_TRANSPOSE, + transB=CUSPARSE_OPERATION_NON_TRANSPOSE, alpha=1.0, beta=0.0, + ldb=None, ldc=None, check_inputs=True): + """ multiply two sparse matrices: C = transA(A) * transB(B) + higher level wrapper to cusparsecsrmm2 routines. 
""" - - status = _libcusparse.cusparseDestroy(handle) - cusparseCheckStatus(status) - -_libcusparse.cusparseGetVersion.restype = int -_libcusparse.cusparseGetVersion.argtypes = [ctypes.c_int, - ctypes.c_void_p] -def cusparseGetVersion(handle): + if check_inputs: + for item in [csrValA, csrRowPtrA, csrColIndA, B]: + if not isinstance(item, pycuda.gpuarray.GPUArray): + raise ValueError("csr*, B, must be pyCUDA gpuarrays") + if C is not None: + if not isinstance(C, pycuda.gpuarray.GPUArray): + raise ValueError("C must be a pyCUDA gpuarray or None") + # dense matrices must be in column-major order + if not B.flags.f_contiguous: + raise ValueError("Dense matrix B must be column-major order") + + if transB == CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE: + raise ValueError("Conjugate transpose operation not supported " + "for dense matrix B") + + if (transB == CUSPARSE_OPERATION_TRANSPOSE) and \ + (transA != CUSPARSE_OPERATION_NON_TRANSPOSE): + raise ValueError("if B is transposed, only A non-transpose is " + "supported") + if handle is None: + handle = misc._global_cusparse_handle + + dtype = csrValA.dtype + if C is None: + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + ldc = m + else: + ldc = k + alloc = misc._global_cusparse_allocator + C = gpuarray.zeros((ldc, n), dtype=dtype, order='F', + allocator=alloc) + elif not C.flags.f_contiguous: + raise ValueError("Dense matrix C must be in column-major order") + if nnz is None: + nnz = csrValA.size + if ldb is None: + ldb = B.shape[0] + if ldc is None: + ldc = C.shape[0] + + # perform some basic sanity checks + if check_inputs: + if csrValA.size != nnz: + raise ValueError("length of csrValA array must match nnz") + if (B.dtype != dtype) or (C.dtype != dtype): + raise ValueError("A, B, C must share a common dtype") + if ldb < B.shape[0]: + raise ValueError("ldb invalid for matrix B") + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + ldOpA = m # leading dimension for op(A) + tdOpA = k # trailing dimension for op(A) + else: + ldOpA = k + tdOpA = m + if transB == CUSPARSE_OPERATION_NON_TRANSPOSE: + if B.shape[1] != n: + raise ValueError("B, n incompatible") + if (ldb < tdOpA): + raise ValueError("size of A incompatible with B") + else: + if ldb < n: + raise ValueError("B, n incompatible") + if (B.shape[1] != tdOpA): + raise ValueError("size of A incompatible with B") + if (C.shape[1] != n): + raise ValueError("bad shape for C") + if (ldc != ldOpA): + raise ValueError("size of A incompatible with C") + if csrRowPtrA.size != m+1: + raise ValueError("length of csrRowPtrA invalid") + if dtype == np.float32: + fn = cusparseScsrmm2 + elif dtype == np.float64: + fn = cusparseDcsrmm2 + elif dtype == np.complex64: + fn = cusparseCcsrmm2 + elif dtype == np.complex128: + fn = cusparseZcsrmm2 + else: + raise ValueError("unsupported sparse matrix dtype: %s" % dtype) + transa = transA + transb = transB + try: + fn(handle, transa, transb, m, n, k, nnz, alpha, descrA, csrValA, + csrRowPtrA, csrColIndA, B, ldb, beta, C, ldc) + except CUSPARSE_STATUS_INVALID_VALUE as e: + print("m={}, n={}, k={}, nnz={}, ldb={}, ldc={}".format( + m, n, k, nnz, ldb, ldc)) + raise(e) + return C + + +@defineIf(toolkit_version >= (5, 0, 0)) +def _csrgeamNnz(m, n, descrA, csrRowPtrA, csrColIndA, descrB, csrRowPtrB, + csrColIndB, handle=None, descrC=None, csrRowPtrC=None, + nnzA=None, nnzB=None, check_inputs=True): + """ support routine for csrgeam + + higher level wrapper to cusparseXcsrgeamNnz. """ - Return CUSPARSE library version. 
+ if check_inputs: + for array in [csrRowPtrA, csrColIndA, csrRowPtrB, csrColIndB]: + if not isinstance(array, pycuda.gpuarray.GPUArray): + raise ValueError("all csr* inputs must be a pyCUDA gpuarray") + if cusparseGetMatType(descrA) != CUSPARSE_MATRIX_TYPE_GENERAL: + raise ValueError("Only general matrix type supported") + if cusparseGetMatType(descrB) != CUSPARSE_MATRIX_TYPE_GENERAL: + raise ValueError("Only general matrix type supported") + if descrC is not None: + if not isinstance(csrRowPtrC, pycuda.gpuarray.GPUArray): + raise ValueError("csrRowPtrC must be a gpuarray or None") + if cusparseGetMatType(descrC) != CUSPARSE_MATRIX_TYPE_GENERAL: + raise ValueError("Only general matrix type supported") + + if handle is None: + handle = misc._global_cusparse_handle + if nnzA is None: + nnzA = csrColIndA.size + if nnzB is None: + nnzB = csrColIndB.size + if descrC is None: + return_descrC = True + descrC = cusparseCreateMatDescr() + cusparseSetMatType(descrC, CUSPARSE_MATRIX_TYPE_GENERAL) + else: + return_descrC = False + if csrRowPtrC is None: + alloc = misc._global_cusparse_allocator + csrRowPtrC = gpuarray.zeros((m+1, ), dtype=np.int32, + allocator=alloc) + nnzTotalDevHostPtr = ffi.new('int *', 0) + + # perform some basic sanity checks + if check_inputs: + if csrColIndA.size != nnzA: + raise ValueError("length of csrValA array must match nnzA") + if csrColIndB.size != nnzB: + raise ValueError("length of csrValB array must match nnzB") + if csrRowPtrA.size != m+1: + raise ValueError("length of csrRowPtrA array must be m+1") + if csrRowPtrB.size != m+1: + raise ValueError("length of csrRowPtrB array must be m+1") + + cusparseXcsrgeamNnz(handle, m, n, descrA, nnzA, csrRowPtrA, csrColIndA, + descrB, nnzB, csrRowPtrB, csrColIndB, descrC, + csrRowPtrC, nnzTotalDevHostPtr) + nnzC = nnzTotalDevHostPtr[0] + if return_descrC: + return descrC, csrRowPtrC, nnzC + else: + return nnzC - Returns the version number of the CUSPARSE library. - Parameters - ---------- - handle : int - CUSPARSE library context. +@defineIf(toolkit_version >= (5, 0, 0)) +def csrgeam(m, n, descrA, csrValA, csrRowPtrA, csrColIndA, descrB, csrValB, + csrRowPtrB, csrColIndB, handle=None, alpha=1.0, beta=0.0, + nnzA=None, nnzB=None, check_inputs=True): + """ add two sparse matrices: C = alpha*A + beta*B. + higher level wrapper to cusparsecsrgemm routines. 
+ """ + if check_inputs: + for array in [csrValA, csrRowPtrA, csrColIndA, csrValB, csrRowPtrB, + csrColIndB]: + if not isinstance(array, pycuda.gpuarray.GPUArray): + raise ValueError("all csr* inputs must be a pyCUDA gpuarray") + if cusparseGetMatType(descrA) != CUSPARSE_MATRIX_TYPE_GENERAL: + raise ValueError("Only general matrix type supported") + if cusparseGetMatType(descrB) != CUSPARSE_MATRIX_TYPE_GENERAL: + raise ValueError("Only general matrix type supported") + + if handle is None: + handle = misc._global_cusparse_handle + if nnzA is None: + nnzA = csrValA.size + if nnzB is None: + nnzB = csrValB.size + dtype = csrValA.dtype + + # perform some basic sanity checks + if check_inputs: + if csrValA.size != nnzA: + raise ValueError("length of csrValA array must match nnzA") + if csrValB.size != nnzB: + raise ValueError("length of csrValB array must match nnzB") + if (dtype != csrValB.dtype): + raise ValueError("incompatible dtypes") + if csrRowPtrA.size != m + 1: + raise ValueError("bad csrRowPtrA size") + if csrRowPtrB.size != m + 1: + raise ValueError("bad csrRowPtrB size") + + # allocate output matrix C descr and row pointers + descrC = cusparseCreateMatDescr() + cusparseSetMatType(descrC, CUSPARSE_MATRIX_TYPE_GENERAL) + alloc = misc._global_cusparse_allocator + csrRowPtrC = gpuarray.zeros((m+1, ), dtype=np.int32, allocator=alloc) + + # call csrgemmNnz to determine nnzC and fill in csrRowPtrC + nnzC = _csrgeamNnz(m, n, descrA, csrRowPtrA, csrColIndA, descrB, + csrRowPtrB, csrColIndB, handle=handle, descrC=descrC, + csrRowPtrC=csrRowPtrC, nnzA=nnzA, nnzB=nnzB, + check_inputs=False) + + # allocated rest of C based on nnzC + csrValC = gpuarray.zeros((nnzC, ), dtype=dtype, allocator=alloc) + csrColIndC = gpuarray.zeros((nnzC, ), dtype=np.int32, allocator=alloc) + if dtype == np.float32: + fn = cusparseScsrgeam + elif dtype == np.float64: + fn = cusparseDcsrgeam + elif dtype == np.complex64: + fn = cusparseCcsrgeam + elif dtype == np.complex128: + fn = cusparseZcsrgeam + else: + raise ValueError("unsupported sparse matrix dtype: %s" % dtype) - Returns - ------- - version : int - CUSPARSE library version number. + fn(handle, m, n, alpha, descrA, nnzA, csrValA, csrRowPtrA, csrColIndA, + beta, descrB, nnzB, csrValB, csrRowPtrB, csrColIndB, descrC, + csrValC, csrRowPtrC, csrColIndC) + return (descrC, csrValC, csrRowPtrC, csrColIndC) - """ - version = ctypes.c_int() - status = _libcusparse.cusparseGetVersion(handle, - ctypes.byref(version)) - cusparseCheckStatus(status) - return version.value +@defineIf(toolkit_version >= (5, 0, 0)) +def _csrgemmNnz(m, n, k, descrA, csrRowPtrA, csrColIndA, descrB, csrRowPtrB, + csrColIndB, handle=None, descrC=None, csrRowPtrC=None, + nnzA=None, nnzB=None, transA=CUSPARSE_OPERATION_NON_TRANSPOSE, + transB=CUSPARSE_OPERATION_NON_TRANSPOSE, check_inputs=True): + """ support routine for csrgemm. -_libcusparse.cusparseSetStream.restype = int -_libcusparse.cusparseSetStream.argtypes = [ctypes.c_int, - ctypes.c_int] -def cusparseSetStream(handle, id): + higher level wrapper to cusparseXcsrgemmNnz. """ - Sets the CUSPARSE stream in which kernels will run. 
+ if check_inputs: + for array in [csrRowPtrA, csrColIndA, csrRowPtrB, csrColIndB]: + if not isinstance(array, pycuda.gpuarray.GPUArray): + raise ValueError("all csr* inputs must be a pyCUDA gpuarray") + if cusparseGetMatType(descrA) != CUSPARSE_MATRIX_TYPE_GENERAL: + raise ValueError("Only general matrix type supported") + if cusparseGetMatType(descrB) != CUSPARSE_MATRIX_TYPE_GENERAL: + raise ValueError("Only general matrix type supported") + if descrC is not None: + if not isinstance(csrRowPtrC, pycuda.gpuarray.GPUArray): + raise ValueError("csrRowPtrC must be a gpuarray or None") + if cusparseGetMatType(descrC) != CUSPARSE_MATRIX_TYPE_GENERAL: + raise ValueError("Only general matrix type supported") + if handle is None: + handle = misc._global_cusparse_handle + if nnzA is None: + nnzA = csrColIndA.size + if nnzB is None: + nnzB = csrColIndB.size + if descrC is None: + return_descrC = True + descrC = cusparseCreateMatDescr() + cusparseSetMatType(descrC, CUSPARSE_MATRIX_TYPE_GENERAL) + else: + return_descrC = False + if csrRowPtrC is None: + alloc = misc._global_cusparse_allocator + csrRowPtrC = gpuarray.zeros((m+1, ), dtype=np.int32, + allocator=alloc) + nnzTotalDevHostPtr = ffi.new('int *', 0) + + # perform some basic sanity checks + if check_inputs: + if csrColIndA.size != nnzA: + raise ValueError("length of csrValA array must match nnzA") + if csrColIndB.size != nnzB: + raise ValueError("length of csrValB array must match nnzB") + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + ptrA_size = m + 1 + else: + ptrA_size = k + 1 + if transB == CUSPARSE_OPERATION_NON_TRANSPOSE: + ptrB_size = k + 1 + else: + ptrB_size = n + 1 + if csrRowPtrA.size != ptrA_size: + raise ValueError("length of csrRowPtrA array must be m+1") + if csrRowPtrB.size != ptrB_size: + raise ValueError("length of csrRowPtrB array must be n+1") + + cusparseXcsrgemmNnz(handle, transA, transB, m, n, k, descrA, nnzA, + csrRowPtrA, csrColIndA, descrB, nnzB, csrRowPtrB, + csrColIndB, descrC, csrRowPtrC, nnzTotalDevHostPtr) + nnzC = nnzTotalDevHostPtr[0] + if return_descrC: + return descrC, csrRowPtrC, nnzC + else: + return nnzC - Parameters - ---------- - handle : int - CUSPARSE library context. - id : int - Stream ID. - """ +@defineIf(toolkit_version >= (5, 0, 0)) +def csrgemm(m, n, k, descrA, csrValA, csrRowPtrA, csrColIndA, descrB, csrValB, + csrRowPtrB, csrColIndB, handle=None, nnzA=None, nnzB=None, + transA=CUSPARSE_OPERATION_NON_TRANSPOSE, + transB=CUSPARSE_OPERATION_NON_TRANSPOSE, check_inputs=True): + """ multiply two sparse matrices: C = transA(A) * transB(B) - status = _libcusparse.cusparseSetStream(handle, id) - cusparseCheckStatus(status) + higher level wrapper to cusparsecsrgemm routines. -_libcusparse.cusparseCreateMatDescr.restype = int -_libcusparse.cusparseCreateMatDescr.argtypes = [cusparseMatDescr] -def cusparseCreateMatDescr(): - """ - Initialize a sparse matrix descriptor. + Note + ---- + transA(A) is shape m x k. transB(B) is shape k x n. C is shape m x n - Initializes the `MatrixType` and `IndexBase` fields of the matrix - descriptor to the default values `CUSPARSE_MATRIX_TYPE_GENERAL` - and `CUSPARSE_INDEX_BASE_ZERO`. + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + m, k = A.shape + else: + k, m = A.shape - Returns - ------- - desc : cusparseMatDescr - Matrix descriptor. 
+ if transB == CUSPARSE_OPERATION_NON_TRANSPOSE: + k, n = B.shape + else: + n, k = B.shape """ - desc = cusparseMatrixDesc() - status = _libcusparse.cusparseCreateMatDescr(ctypes.byref(desc)) - cusparseCheckStatus(status) - return desc + if check_inputs: + for array in [csrValA, csrRowPtrA, csrColIndA, csrValB, csrRowPtrB, + csrColIndB]: + if not isinstance(array, pycuda.gpuarray.GPUArray): + raise ValueError("all csr* inputs must be a pyCUDA gpuarray") + if cusparseGetMatType(descrA) != CUSPARSE_MATRIX_TYPE_GENERAL: + raise ValueError("Only general matrix type supported") + if cusparseGetMatType(descrB) != CUSPARSE_MATRIX_TYPE_GENERAL: + raise ValueError("Only general matrix type supported") + + if handle is None: + handle = misc._global_cusparse_handle + if nnzA is None: + nnzA = csrValA.size + if nnzB is None: + nnzB = csrValB.size + dtype = csrValA.dtype + + # perform some basic sanity checks + if check_inputs: + if csrValA.size != nnzA: + raise ValueError("length of csrValA array must match nnzA") + if csrValB.size != nnzB: + raise ValueError("length of csrValB array must match nnzB") + if (dtype != csrValB.dtype): + raise ValueError("incompatible dtypes") + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + ptrA_size = m + 1 + else: + ptrA_size = k + 1 + if transB == CUSPARSE_OPERATION_NON_TRANSPOSE: + ptrB_size = k + 1 + else: + ptrB_size = n + 1 + if csrRowPtrA.size != ptrA_size: + raise ValueError("bad csrRowPtrA size") + if csrRowPtrB.size != ptrB_size: + raise ValueError("bad csrRowPtrB size") + + # allocate output matrix C descr and row pointers + descrC = cusparseCreateMatDescr() + cusparseSetMatType(descrC, CUSPARSE_MATRIX_TYPE_GENERAL) + alloc = misc._global_cusparse_allocator + csrRowPtrC = gpuarray.zeros((m+1, ), dtype=np.int32, allocator=alloc) + + # call csrgemmNnz to determine nnzC and fill in csrRowPtrC + nnzC = _csrgemmNnz(m, n, k, descrA, csrRowPtrA, csrColIndA, descrB, + csrRowPtrB, csrColIndB, handle=handle, descrC=descrC, + csrRowPtrC=csrRowPtrC, nnzA=nnzA, nnzB=nnzB, + transA=transA, transB=transB, check_inputs=False) + + # allocated rest of C based on nnzC + csrValC = gpuarray.zeros((nnzC, ), dtype=dtype, allocator=alloc) + csrColIndC = gpuarray.zeros((nnzC, ), dtype=np.int32, allocator=alloc) + + if dtype == np.float32: + fn = cusparseScsrgemm + elif dtype == np.float64: + fn = cusparseDcsrgemm + elif dtype == np.complex64: + fn = cusparseCcsrgemm + elif dtype == np.complex128: + fn = cusparseZcsrgemm + else: + raise ValueError("unsupported sparse matrix dtype: %s" % dtype) + + fn(handle, transA, transB, m, n, k, descrA, nnzA, csrValA, csrRowPtrA, + csrColIndA, descrB, nnzB, csrValB, csrRowPtrB, csrColIndB, descrC, + csrValC, csrRowPtrC, csrColIndC) + return (descrC, csrValC, csrRowPtrC, csrColIndC) + + +class CSR(object): + """ cuSPARSE CSR (compressed sparse row) matrix object """ + def __init__(self, descr, csrVal, csrRowPtr, csrColInd, shape, + handle=None): + + if csrRowPtr.size != (shape[0] + 1): + raise ValueError("size of RowPtr inconsistent with shape") + if csrVal.size != csrColInd.size: + raise ValueError("size of csrVal and csrColInd inconsistent") + if csrColInd.dtype != np.int32: + raise ValueError("csrColInd must be a 32-bit integer array") + if csrRowPtr.dtype != np.int32: + raise ValueError("csrRowPtr must be a 32-bit integer array") + + # if input arrays are on the host, transfer them to the GPU + self._alloc = misc._global_cusparse_allocator + + if isinstance(csrVal, np.ndarray): + csrVal = gpuarray.to_gpu(csrVal, allocator=self._alloc) + if 
isinstance(csrRowPtr, np.ndarray): + csrRowPtr = gpuarray.to_gpu(csrRowPtr, allocator=self._alloc) + if isinstance(csrColInd, np.ndarray): + csrColInd = gpuarray.to_gpu(csrColInd, allocator=self._alloc) + + if handle is None: + self.handle = misc._global_cusparse_handle + else: + self.handle = handle + self.descr = descr + self.Val = csrVal + self.RowPtr = csrRowPtr + self.ColInd = csrColInd + self.dtype = csrVal.dtype + self.shape = shape + + # also mirror scipy.sparse.csr_matrix property names for convenience + self.data = csrVal + self.indices = csrColInd + self.indptr = csrRowPtr + + # properties + self.__matrix_type = None + self.__index_base = None + self.__diag_type = None + self.__fill_mode = None + + # alternative constructor from dense ndarray, gpuarray or cuSPARSE matrix + @classmethod + def to_CSR(cls, A, handle=None): + """ convert dense numpy or gpuarray matrices as well as any + scipy.sparse matrix formats to cuSPARSE CSR. + """ + alloc = misc._global_cusparse_allocator + if has_scipy and isinstance(A, scipy.sparse.spmatrix): + """Convert scipy.sparse CSR, COO, BSR, etc to cuSPARSE CSR""" + # converting BSR, COO, etc to CSR + if A.dtype.char not in ['f', 'd', 'F', 'D']: + raise ValueError("unsupported numpy dtype {}".format(A.dtype)) + + if not isinstance(A, scipy.sparse.csr_matrix): + A = A.tocsr() + + # avoid .astype() calls if possible for speed + if A.indptr.dtype != np.int32: + csrRowPtr = gpuarray.to_gpu(A.indptr.astype(np.int32), + allocator=alloc) + else: + csrRowPtr = gpuarray.to_gpu(A.indptr, allocator=alloc) + + if A.indices.dtype != np.int32: + csrColInd = gpuarray.to_gpu(A.indices.astype(np.int32), + allocator=alloc) + else: + csrColInd = gpuarray.to_gpu(A.indices, allocator=alloc) + + csrVal = gpuarray.to_gpu(A.data, allocator=alloc) + descr = cusparseCreateMatDescr() + cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL) + cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO) + else: + """Take dense numpy array or pyCUDA gpuarray and convert to CSR """ + if not isinstance(A, pycuda.gpuarray.GPUArray): + A = np.asfortranarray(np.atleast_2d(A)) + A = gpuarray.to_gpu(A, allocator=alloc) + else: + # dense matrix must be column-major + if not A.flags.f_contiguous: + # TODO :an change to Fortran ordering be done directly on + # the gpuarray without going back to numpy? 
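+                    # Current approach: copy back to the host, convert to
+                    # column-major (Fortran) order with numpy, then transfer
+                    # to the GPU again.  This costs two extra host/device
+                    # copies but keeps the code simple; the dense2csr call
+                    # below expects column-major input.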
+ A = A.get() + A = np.asfortranarray(A) + A = gpuarray.to_gpu(A, allocator=alloc) + + (descr, csrVal, csrRowPtr, csrColInd) = dense2csr(A, handle=handle) + return cls(descr, csrVal, csrRowPtr, csrColInd, A.shape) + + @property + def matrix_type(self): + """ matrix type """ + if self.__matrix_type is None: + return cusparseGetMatType(self.descr) + else: + return self.__matrix_type + + @matrix_type.setter + def matrix_type(self, matrix_type): + """ matrix type """ + self.__matrix_type = cusparseSetMatType(self.descr, matrix_type) + + @property + def index_base(self): + """ matrix index base """ + if self.__index_base is None: + return cusparseGetMatIndexBase(self.descr) + else: + return self.__index_base + + @index_base.setter + def index_base(self, index_base): + """ matrix index base """ + self.__index_base = cusparseSetMatIndexBase(self.descr, index_base) + + @property + def diag_type(self): + """ matrix diag type """ + if self.__diag_type is None: + return cusparseGetMatDiagType(self.descr) + else: + return self.__diag_type + + @diag_type.setter + def diag_type(self, diag_type): + """matrix diag type """ + self.__diag_type = cusparseSetMatDiagType(self.descr, diag_type) + + @property + def fill_mode(self): + """matrix fill mode """ + if self.__fill_mode is None: + return cusparseGetMatFillMode(self.descr) + else: + return self.__fill_mode + + @fill_mode.setter + def fill_mode(self, fill_mode): + """matrix fill mode """ + self.__fill_mode = cusparseSetMatFillMode(self.descr, fill_mode) + + @property + def nnz(self): + """ number of non-zeros """ + return self.Val.size + + # mirror the function name from scipy + def getnnz(self): + """ return number of non-zeros""" + return self.nnz + + def get(self): + """ return as scipy csr_matrix in host memory """ + from scipy.sparse import csr_matrix + return csr_matrix((self.data.get(), + self.indices.get(), + self.indptr.get()), + shape=self.shape) + + def todense(self, lda=None, to_cpu=False, handle=None, stream=None, + autosync=True): + """ return dense gpuarray if to_cpu=False, numpy ndarray if to_cpu=True + """ + m, n = self.shape + if lda is None: + lda = m + else: + assert lda >= m + if handle is None: + handle = self.handle + if stream is not None: + cusparseSetStream(handle, stream.handle) + A = csr2dense(m, n, self.descr, self.Val, self.RowPtr, + self.ColInd, handle=handle, lda=lda) + if autosync: + drv.Context.synchronize() + if to_cpu: + return A.get() + else: + return A + + def mv(self, x, transA=CUSPARSE_OPERATION_NON_TRANSPOSE, alpha=1.0, + beta=0.0, y=None, check_inputs=True, to_cpu=False, + autosync=True, handle=None, stream=None): + """ multiplication by dense vector x: y = alpha*transA(A)*x + beta*y. + """ + m, n = self.shape + + # try moving list or numpy array to GPU + if not isinstance(x, pycuda.gpuarray.GPUArray): + x = np.atleast_1d(x) # .astype(self.dtype) + x = gpuarray.to_gpu(x, allocator=self._alloc) + if handle is None: + handle = self.handle + if stream is not None: + cusparseSetStream(handle, stream.handle) + y = csrmv(self.descr, self.Val, self.RowPtr, self.ColInd, m, n, + x, handle=handle, transA=transA, alpha=alpha, beta=beta, y=y, + check_inputs=check_inputs) + if autosync: + drv.Context.synchronize() + if to_cpu: + return y.get() + else: + return y + + def mm(self, B, transA=CUSPARSE_OPERATION_NON_TRANSPOSE, alpha=1.0, + beta=0.0, C=None, ldb=None, ldc=None, check_inputs=True, + to_cpu=False, autosync=True, handle=None, stream=None): + """ multiplication by dense matrix B: C = alpha*transA(A)*B + beta*C. 
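+
+        Example (illustrative; assumes cusparse.init() has been called so
+        that a global handle and allocator exist):
+
+            import numpy as np
+            A_csr = CSR.to_CSR(np.asarray([[1., 0.], [0., 2.]]))
+            B = np.ones((2, 3), order='F')
+            C = A_csr.mm(B, alpha=2.0, to_cpu=True)  # C = 2 * A * B as numpy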
+ """ + m, k = self.shape + # try moving list or numpy array to GPU + if not isinstance(B, pycuda.gpuarray.GPUArray): + B = np.atleast_2d(B) # .astype(self.dtype) + B = gpuarray.to_gpu(B, allocator=self._alloc) + n = B.shape[1] + if handle is None: + handle = self.handle + if stream is not None: + cusparseSetStream(handle, stream.handle) + C = csrmm(m=m, n=n, k=k, descrA=self.descr, csrValA=self.Val, + csrRowPtrA=self.RowPtr, csrColIndA=self.ColInd, B=B, + handle=handle, C=C, transA=transA, alpha=alpha, beta=beta, + ldb=ldb, ldc=ldc, check_inputs=check_inputs) + if autosync: + drv.Context.synchronize() + if to_cpu: + return C.get() + else: + return C + + @defineIf(toolkit_version >= (5, 5, 0)) + def mm2(self, B, transA=CUSPARSE_OPERATION_NON_TRANSPOSE, + transB=CUSPARSE_OPERATION_NON_TRANSPOSE, alpha=1.0, beta=0.0, + C=None, ldb=None, ldc=None, check_inputs=True, to_cpu=False, + autosync=True, handle=None, stream=None): + """ multiplication by dense matrix B: C = alpha*transA(A)*B + beta*C. + version 2 + """ + if toolkit_version < (5, 5, 0): + raise ImportError("mm2 not implemented prior to CUDA v5.5") + m, k = self.shape + # try moving list or numpy array to GPU + if not isinstance(B, pycuda.gpuarray.GPUArray): + B = np.atleast_2d(B) # .astype(self.dtype) + B = gpuarray.to_gpu(B, allocator=self._alloc) + if transB == CUSPARSE_OPERATION_NON_TRANSPOSE: + n = B.shape[1] + else: + n = B.shape[0] + if handle is None: + handle = self.handle + if stream is not None: + cusparseSetStream(handle, stream.handle) + C = csrmm2(handle=handle, m=m, n=n, k=k, descrA=self.descr, + csrValA=self.Val, csrRowPtrA=self.RowPtr, + csrColIndA=self.ColInd, B=B, C=C, transA=transA, + transB=transB, alpha=alpha, beta=beta, ldb=ldb, ldc=ldc, + check_inputs=check_inputs) + if autosync: + drv.Context.synchronize() + if to_cpu: + return C.get() + else: + return C + + @defineIf(toolkit_version >= (5, 0, 0)) + def geam(self, B, alpha=1.0, beta=1.0, check_inputs=True, autosync=True, + handle=None, stream=None): + """ addition of sparse matrix B: C = alpha*A + beta*B """ + if toolkit_version < (5, 0, 0): + raise ImportError("geam not implemented prior to CUDA v5.0") + m, n = self.shape + if not isinstance(B, CSR): + # try converting B to cuSPARSE CSR + B = CSR.to_CSR(B, handle=self.handle) + if self.shape != B.shape: + raise ValueError("Incompatible shapes") + if handle is None: + handle = self.handle + if stream is not None: + cusparseSetStream(handle, stream.handle) + descrC, ValC, RowPtrC, ColIndC = csrgeam( + handle=handle, m=m, n=n, descrA=self.descr, csrValA=self.Val, + csrRowPtrA=self.RowPtr, csrColIndA=self.ColInd, descrB=B.descr, + csrValB=B.Val, csrRowPtrB=B.RowPtr, csrColIndB=B.ColInd, + alpha=alpha, beta=beta, nnzA=self.nnz, nnzB=B.nnz, + check_inputs=True) + C = CSR(descr=descrC, csrVal=ValC, csrRowPtr=RowPtrC, + csrColInd=ColIndC, shape=self.shape, handle=self.handle) + if autosync: + drv.Context.synchronize() + return C + + @defineIf(toolkit_version >= (5, 0, 0)) + def gemm(self, B, transA=CUSPARSE_OPERATION_NON_TRANSPOSE, + transB=CUSPARSE_OPERATION_NON_TRANSPOSE, check_inputs=True, + autosync=True, handle=None, stream=None): + """ multiplication by sparse matrix B: C = transA(A) * transB(B) """ + if toolkit_version < (5, 0, 0): + raise ImportError("gemm not implemented prior to CUDA v5.0") + + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + m, k = self.shape + else: + k, m = self.shape + if not isinstance(B, CSR): + # try converting B to cuSPARSE CSR + B = CSR.to_CSR(B, handle=self.handle) + if transB == 
CUSPARSE_OPERATION_NON_TRANSPOSE:
+            n = B.shape[1]
+        else:
+            n = B.shape[0]
+        if handle is None:
+            handle = self.handle
+        if stream is not None:
+            cusparseSetStream(handle, stream.handle)
+        descrC, ValC, RowPtrC, ColIndC = csrgemm(
+            handle=handle, m=m, n=n, k=k, descrA=self.descr,
+            csrValA=self.Val, csrRowPtrA=self.RowPtr, csrColIndA=self.ColInd,
+            descrB=B.descr, csrValB=B.Val, csrRowPtrB=B.RowPtr,
+            csrColIndB=B.ColInd, nnzA=self.nnz, nnzB=B.nnz, transA=transA,
+            transB=transB, check_inputs=True)
+        if autosync:
+            drv.Context.synchronize()
+        C = CSR(descr=descrC, csrVal=ValC, csrRowPtr=RowPtrC,
+                csrColInd=ColIndC, shape=(m, n), handle=self.handle)
+        return C
-_libcusparse.cusparseDestroyMatDescr.restype = int
-_libcusparse.cusparseDestroyMatDescr.argtypes = [ctypes.c_int]
-def cusparseDestroyMatDescr(desc):
     """
-    Releases the memory allocated for the matrix descriptor.
+    start of: subset of methods in scipy.sparse.compressed._cs_matrix
+    """
+    @property
+    def A(self):
+        "The dense array representation."
+        return self.todense()
+
+    @property
+    def T(self):
+        "The transpose operator."
+        return self.transpose()
+
+    @property
+    def H(self):
+        "The adjoint operator."
+        return self.getH()
+
+    @property
+    def real(self):
+        "The real part."
+        return self._real()
+
+    @property
+    def imag(self):
+        "The imaginary part."
+        return self._imag()
+
+    @property
+    def size(self):
+        "The number of stored values (nnz)."
+        return self.getnnz()
+
+    @property
+    def nbytes(self):
+        """ approximate object size in bytes (size of data,
+        column indices and row pointers only). """
+        nbytes = self.data.nbytes + self.indptr.nbytes + self.indices.nbytes
+        return nbytes
+
+    def transpose(self):
+        m, n = self.shape
+        # use csr2csc to perform the transpose
+        cscVal, cscColPtr, cscRowInd = csr2csc(
+            m, n, self.Val, self.RowPtr, self.ColInd, handle=self.handle,
+            nnz=self.nnz)
+        drv.Context.synchronize()
+        return CSR(copyMatDescr(self.descr), cscVal, cscColPtr,
+                   cscRowInd, (n, m), handle=self.handle)
+
+    def getH(self):
+        return self.transpose().conj()
+
+    def conjugate(self):
+        return self.conj()
+
+    # implement _with_data similar to scipy.sparse.data._data_matrix
+    def _with_data(self, data, copy=True):
+        """Returns a matrix with the same sparsity structure as self,
+        but with different data. By default the structure arrays
+        (i.e. .indptr and .indices) are copied.
+        """
+        if copy:
+            return self.__class__(copyMatDescr(self.descr), data,
+                                  self.RowPtr.copy(), self.ColInd.copy(),
+                                  self.shape, handle=self.handle)
+        else:
+            return self.__class__(self.descr, data,
+                                  self.RowPtr, self.ColInd,
+                                  self.shape, handle=self.handle)
-    Parameters
-    ----------
-    desc : cusparseMatDescr
-        Matrix descriptor.
+    """
+    end of: subset of methods in scipy.sparse.compressed._cs_matrix
+    """
     """
+    start of: subset of methods in scipy.sparse.data._data_matrix
+    """
+    def conj(self):
+        return self._with_data(self.data.conj())
-    status = _libcusparse.cusparseDestroyMatDescr(desc)
-    cusparseCheckStatus(status)
+    def _real(self):
+        return self._with_data(self.data.real)
-_libcusparse.cusparseSetMatType.restype = int
-_libcusparse.cusparseSetMatType.argtypes = [cusparseMatDescr,
-                                            ctypes.c_int]
-def cusparseSetMatType(desc, type):
-    """
-    Sets the matrix type of the specified matrix.
+    def _imag(self):
+        return self._with_data(self.data.imag)
-    Parameters
-    ----------
-    desc : cusparseMatDescr
-        Matrix descriptor.
-    type : int
-        Matrix type.
+    def __abs__(self):
+        return self._with_data(abs(self.data))
-    """
+    def __neg__(self):
+        return self._with_data(-self.data)
-    status = _libcusparse.cusparseSetMatType(desc, type)
-    cusparseCheckStatus(status)
+    def __imul__(self, other):  # self *= other
+        if isscalarlike(other):
+            self.data *= other
+            return self
+        else:
+            return NotImplemented
-_libcusparse.cusparseGetMatType.restype = int
-_libcusparse.cusparseGetMatType.argtypes = [cusparseMatDescr]
-def cusparseGetMatType(desc):
-    """
-    Gets the matrix type of the specified matrix.
+    def __itruediv__(self, other):  # self /= other
+        if isscalarlike(other):
+            recip = 1.0 / other
+            self.data *= recip
+            return self
+        else:
+            return NotImplemented
-    Parameters
-    ----------
-    desc : cusparseMatDescr
-        Matrix descriptor.
+    def astype(self, t):
+        return self._with_data(self.data.astype(t))
-    Returns
-    -------
-    type : int
-        Matrix type.
+    def copy(self):
+        return self._with_data(self.data.copy(), copy=True)
-    """
+    def _mul_scalar(self, other):
+        return self._with_data(self.data * other)
-    return _libcusparse.cusparseGetMatType(desc)
-
-# Format conversion functions:
-_libcusparse.cusparseSnnz.restype = int
-_libcusparse.cusparseSnnz.argtypes = [ctypes.c_int,
-                                      ctypes.c_int,
-                                      ctypes.c_int,
-                                      ctypes.c_int,
-                                      cusparseMatDescr,
-                                      ctypes.c_void_p,
-                                      ctypes.c_int,
-                                      ctypes.c_void_p,
-                                      ctypes.c_void_p]
-def cusparseSnnz(handle, dirA, m, n, descrA, A, lda,
-                 nnzPerRowColumn, nnzTotalDevHostPtr):
     """
-    Compute number of non-zero elements per row, column, or dense matrix.
-
-    Parameters
-    ----------
-    handle : int
-        CUSPARSE library context.
-    dirA : int
-        Data direction of elements.
-    m : int
-        Rows in A.
-    n : int
-        Columns in A.
-    descrA : cusparseMatDescr
-        Matrix descriptor.
-    A : pycuda.gpuarray.GPUArray
-        Dense matrix of dimensions (lda, n).
-    lda : int
-        Leading dimension of A.
-
-    Returns
-    -------
-    nnzPerRowColumn : pycuda.gpuarray.GPUArray
-        Array of length m or n containing the number of
-        non-zero elements per row or column, respectively.
-    nnzTotalDevHostPtr : pycuda.gpuarray.GPUArray
-        Total number of non-zero elements in device or host memory.
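+    # The data methods above mirror scipy.sparse.data._data_matrix, so code
+    # like the following illustrative sketch works on a CSR instance A_csr
+    # (built e.g. with CSR.to_CSR after cusparse.init()):
+    #
+    #     B_csr = A_csr.copy()                    # same sparsity, copied data
+    #     A_csr *= 2.0                            # in-place scalar scaling
+    #     A_dense = A_csr.todense(to_cpu=True)    # dense numpy copy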
- + end of: subset of methods in scipy.sparse.data._data_matrix """ - # Unfinished: - nnzPerRowColumn = gpuarray.empty() - nnzTotalDevHostPtr = gpuarray.empty() - - status = _libcusparse.cusparseSnnz(handle, dirA, m, n, - descrA, int(A), lda, - int(nnzPerRowColumn), int(nnzTotalDevHostPtr)) - cusparseCheckStatus(status) - return nnzPerVector, nnzHost - -_libcusparse.cusparseSdense2csr.restype = int -_libcusparse.cusparseSdense2csr.argtypes = [ctypes.c_int, - ctypes.c_int, - ctypes.c_int, - cusparseMatDescr, - ctypes.c_void_p, - ctypes.c_int, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p] -def cusparseSdense2csr(handle, m, n, descrA, A, lda, - nnzPerRow, csrValA, csrRowPtrA, csrColIndA): - # Unfinished - pass + def __del__(self): + """ cleanup descriptor upon object deletion """ + cusparseDestroyMatDescr(self.descr) + # don't destroy the handle as other objects may be using it + + def __repr__(self): + rstr = "CSR matrix:\n" + rstr += "\tshape = {}\n".format(self.shape) + rstr += "\tdtype = {}\n".format(self.dtype) + rstr += "\tMatrixType = {}\n".format(self.matrix_type) + rstr += "\tIndexBase = {}\n".format(self.index_base) + rstr += "\tDiagType = {}\n".format(self.diag_type) + rstr += "\tFillMode = {}\n".format(self.fill_mode) + rstr += "\tcontext = {}\n\n".format(self.handle) + rstr += "\tnnz = {}\n".format(self.nnz) + rstr += "\tRowPtr = {}\n".format(self.RowPtr) + rstr += "\tVal = {}\n".format(self.Val) + return rstr diff --git a/scikits/cuda/cusparse_variable_descriptions.json b/scikits/cuda/cusparse_variable_descriptions.json new file mode 100644 index 0000000..bd4df60 --- /dev/null +++ b/scikits/cuda/cusparse_variable_descriptions.json @@ -0,0 +1,184 @@ +{ + "A": "array of dimensions (lda, n)", + "B": "array of dimensions (ldb, n)", + "C": "array of dimensions (ldc, n)", + "X": "array of dimensions (ldx, n)", + "Y": "array of dimensions (ldy, n)", + "alpha": "scalar used for multiplication", + "base": "enumerated indexBase type", + "baseIdx": "enumerated indexBase type", + "batchCount": "number of systems to solve", + "batchStride": "stride (number of elements) that separates the vectors of every system (must be at least m)", + "beta": "scalar used for multiplication", + "blockDim": "block dimension of sparse matrix A, larger than zero.", + "blockSize": "block dimension of sparse matrix A, larger than zero.", + "boost_val": "boost value to replace a numerical zero", + "bscColPtr": "integer array of nb+1 elements that contains the start of every block column and the end of the last block column plus one", + "bscRowInd": "integer array of nnzb row indices of the non-zero blocks of matrix A", + "bscVal": "array of nnzb*rowBlockDim*colBlockDim non-zero elements of matrix A. It is only filled-in if copyValues is set to CUSPARSE_ACTION_NUMERIC", + "bscSortedVal": "array of nnzb*rowBlockDim*colBlockDim non-zero elements of matrix A. 
It is only filled-in if copyValues is set to CUSPARSE_ACTION_NUMERIC", + "bsrColInd": "integer array of nnz ( = csrRowPtrA(mb) - csrRowPtrA(0) ) column indices of the nonzero blocks of matrix A", + "bsrColIndA": "integer array of nnz ( = csrRowPtrA(mb) - csrRowPtrA(0) ) column indices of the nonzero blocks of matrix A", + "bsrColIndC": "integer array of nnz ( = csrRowPtrC(mb) - csrRowPtrC(0) ) column indices of the nonzero blocks of matrix C", + "bsrEndPtr": "integer array of mb elements that contains the end of the every block row plus one", + "bsrEndPtrA": "integer array of mb elements that contains the end of the every block row plus one", + "bsrMaskPtr": "integer array of sizeOfMask elements that contains the indices corresponding to updated block rows", + "bsrMaskPtrA": "integer array of sizeOfMask elements that contains the indices corresponding to updated block rows", + "bsrRowPtr": "integer array of mb + 1 elements that contains the start of every block row and the end of the last block row plus one", + "bsrRowPtrA": "integer array of mb + 1 elements that contains the start of every block row and the end of the last block row plus one", + "bsrRowPtrC": "integer array of mb + 1 elements that contains the start of every block row and the end of the last block row plus one", + "bsrVal": "array of nnz ( = csrRowPtrA(mb) - csrRowPtrA(0) ) nonzero blocks of matrix A", + "bsrValA": "array of nnz ( = csrRowPtrA(mb) - csrRowPtrA(0) ) nonzero blocks of matrix A", + "bsrValC": "array of nnz ( = csrRowPtrC(mb) - csrRowPtrC(0) ) nonzero blocks of matrix C", + "bsrSortedColInd": "integer array of nnz ( = csrSortedRowPtrA(mb) - csrSortedRowPtrA(0) ) column indices of the nonzero blocks of matrix A", + "bsrSortedColIndA": "integer array of nnz ( = csrSortedRowPtrA(mb) - csrSortedRowPtrA(0) ) column indices of the nonzero blocks of matrix A", + "bsrSortedColIndC": "integer array of nnz ( = csrSortedRowPtrC(mb) - csrSortedRowPtrC(0) ) column indices of the nonzero blocks of matrix C", + "bsrSortedEndPtr": "integer array of mb elements that contains the end of the every block row plus one", + "bsrSortedEndPtrA": "integer array of mb elements that contains the end of the every block row plus one", + "bsrSortedMaskPtr": "integer array of sizeOfMask elements that contains the indices corresponding to updated block rows", + "bsrSortedMaskPtrA": "integer array of sizeOfMask elements that contains the indices corresponding to updated block rows", + "bsrSortedRowPtr": "integer array of mb + 1 elements that contains the start of every block row and the end of the last block row plus one", + "bsrSortedRowPtrA": "integer array of mb + 1 elements that contains the start of every block row and the end of the last block row plus one", + "bsrSortedRowPtrC": "integer array of mb + 1 elements that contains the start of every block row and the end of the last block row plus one", + "bsrSortedVal": "array of nnz ( = csrSortedRowPtrA(mb) - csrSortedRowPtrA(0) ) nonzero blocks of matrix A", + "bsrSortedValA": "array of nnz ( = csrSortedRowPtrA(mb) - csrSortedRowPtrA(0) ) nonzero blocks of matrix A", + "bsrSortedValC": "array of nnz ( = csrSortedRowPtrC(mb) - csrSortedRowPtrC(0) ) nonzero blocks of matrix C", + "c": "cosine element of the rotation matrix", + "colBlockDim": "number of columns within a block of A", + "colBlockDimA": "number of columns within a block of A", + "colBlockDimC": "number of columns within a block of C", + "cooRowInd": "integer array of nnz uncompressed row indices.", + "copyValues": "enumerated 
CUSPARSE_ACTION", + "cscColPtr": "integer array of n+1 elements that contains the start of every row and the end of the last column plus one", + "cscColPtrA": "integer array of n+1 elements that contains the start of every row and the end of the last column plus one", + "cscRowInd": "integer array of nnz ( = cscColPtrA(m) - cscColPtrA(0) ) row indices of the nonzero elements of matrix A", + "cscRowIndA": "integer array of nnz ( = cscColPtrA(m) - cscColPtrA(0) ) row indices of the nonzero elements of matrix A", + "cscVal": "array of nnz ( = cscColPtrA(m) - cscColPtrA(0) ) nonzero elements of matrix A", + "cscValA": "array of nnz ( = cscColPtrA(m) - cscColPtrA(0) ) nonzero elements of matrix A", + "csrColInd": "integer array of nnz ( = csrRowPtrA(m) - csrRowPtrA(0) ) column indices of the nonzero elements of matrix A", + "csrColIndA": "integer array of nnz ( = csrRowPtrA(m) - csrRowPtrA(0) ) column indices of the nonzero elements of matrix A", + "csrColIndB": "integer array of nnz ( = csrRowPtrB(m) - csrRowPtrB(0) ) column indices of the nonzero elements of matrix B", + "csrColIndC": "integer array of nnz ( = csrRowPtrC(m) - csrRowPtrC(0) ) column indices of the nonzero elements of matrix C", + "csrRowPtr": "integer array of m+1 elements that contains the start of every row and the end of the last row plus one", + "csrRowPtrA": "integer array of m+1 elements that contains the start of every row and the end of the last row plus one", + "csrRowPtrB": "integer array of m+1 elements that contains the start of every row and the end of the last row plus one", + "csrRowPtrC": "integer array of m+1 elements that contains the start of every row and the end of the last row plus one", + "csrVal": " array of nnz ( = csrRowPtrA(m) - csrRowPtrA(0) ) nonzero elements of matrix A", + "csrValA": " array of nnz ( = csrRowPtrA(m) - csrRowPtrA(0) ) nonzero elements of matrix A", + "csrValA_ValM": "array of nnz ( = csrRowPtrA(m) - csrRowPtrA(0) ) nonzero elements of matrix A", + "csrValA_valM": "array of nnz ( = csrRowPtrA(m) - csrRowPtrA(0) ) nonzero elements of matrix A", + "csrValB": " array of nnz ( = csrRowPtrB(m) - csrRowPtrB(0) ) nonzero elements of matrix B", + "csrValC": " array of nnz ( = csrRowPtrC(m) - csrRowPtrC(0) ) nonzero elements of matrix C", + "csrValM": "array of nnz ( = csrRowPtrA(m) - csrRowPtrA(0) ) nonzero elements of matrix A", + "cscSortedColPtr": "integer array of n+1 elements that contains the start of every row and the end of the last column plus one", + "cscSortedColPtrA": "integer array of n+1 elements that contains the start of every row and the end of the last column plus one", + "cscSortedRowInd": "integer array of nnz ( = cscSortedColPtrA(m) - cscSortedColPtrA(0) ) row indices of the nonzero elements of matrix A", + "cscSortedRowIndA": "integer array of nnz ( = cscSortedColPtrA(m) - cscSortedColPtrA(0) ) row indices of the nonzero elements of matrix A", + "cscSortedVal": "array of nnz ( = cscSortedColPtrA(m) - cscSortedColPtrA(0) ) nonzero elements of matrix A", + "cscSortedValA": "array of nnz ( = cscSortedColPtrA(m) - cscSortedColPtrA(0) ) nonzero elements of matrix A", + "csrSortedColInd": "integer array of nnz ( = csrSortedRowPtrA(m) - csrSortedRowPtrA(0) ) column indices of the nonzero elements of matrix A", + "csrSortedColIndA": "integer array of nnz ( = csrSortedRowPtrA(m) - csrSortedRowPtrA(0) ) column indices of the nonzero elements of matrix A", + "csrSortedColIndB": "integer array of nnz ( = csrSortedRowPtrB(m) - csrSortedRowPtrB(0) ) column indices of the nonzero 
elements of matrix B", + "csrSortedColIndC": "integer array of nnz ( = csrSortedRowPtrC(m) - csrSortedRowPtrC(0) ) column indices of the nonzero elements of matrix C", + "csrSortedColIndD": "integer array of nnz ( = csrSortedRowPtrC(m) - csrSortedRowPtrC(0) ) column indices of the nonzero elements of matrix C", + "csrSortedRowPtr": "integer array of m+1 elements that contains the start of every row and the end of the last row plus one", + "csrSortedRowPtrA": "integer array of m+1 elements that contains the start of every row and the end of the last row plus one", + "csrSortedRowPtrB": "integer array of m+1 elements that contains the start of every row and the end of the last row plus one", + "csrSortedRowPtrC": "integer array of m+1 elements that contains the start of every row and the end of the last row plus one", + "csrSortedRowPtrD": "integer array of m+1 elements that contains the start of every row and the end of the last row plus one", + "csrSortedVal": " array of nnz ( = csrSortedRowPtrA(m) - csrSortedRowPtrA(0) ) nonzero elements of matrix A", + "csrSortedValA": " array of nnz ( = csrSortedRowPtrA(m) - csrSortedRowPtrA(0) ) nonzero elements of matrix A", + "csrSortedValD": " array of nnz ( = csrSortedRowPtrD(m) - csrSortedRowPtrD(0) ) nonzero elements of matrix A", + "csrSortedValA_ValM": "array of nnz ( = csrSortedRowPtrA(m) - csrSortedRowPtrA(0) ) nonzero elements of matrix A", + "csrSortedValA_valM": "array of nnz ( = csrSortedRowPtrA(m) - csrSortedRowPtrA(0) ) nonzero elements of matrix A", + "csrSortedValB": " array of nnz ( = csrSortedRowPtrB(m) - csrSortedRowPtrB(0) ) nonzero elements of matrix B", + "csrSortedValC": " array of nnz ( = csrSortedRowPtrC(m) - csrSortedRowPtrC(0) ) nonzero elements of matrix C", + "csrSortedValM": "array of nnz ( = csrSortedRowPtrA(m) - csrSortedRowPtrA(0) ) nonzero elements of matrix A", + "d": "dense array containing the main diagonal of the tri-diagonal linear system", + "descr": "matrix descriptor", + "descrA": "matrix descriptor of sparse matrix A", + "descrB": "matrix descriptor of sparse matrix B", + "descrC": "matrix descriptor of sparse matrix C", + "descrD": "matrix descriptor of sparse matrix D", + "descra": "matrix descriptor of sparse matrix A", + "descrb": "matrix descriptor of sparse matrix B", + "diagType": "enumerated diagType", + "dir": "enumerated storage format direction", + "dirA": "enumerated storage format direction", + "dl": "dense array containing the lower diagonal of the tri-diagonal linear system. The first element of each lower diagonal must be zero", + "du": "dense array containing the upper diagonal of the tri-diagonal linear system. 
The last element of each upper diagonal must be zero", + "enable_boost": "disable boost by enable_boost=0; otherwise, boost is enabled", + "fillMode": "enumerated fill mode", + "handle": "cuSPARSE context handle", + "hybA": "cuSPARSE HYB data structure", + "idxBase": "enumerated indexBase type", + "info": "solve and analysis structure", + "k": "number of columns of A", + "kb": "number of block columns of sparse matrix A", + "lda": "leading dimension of A", + "ldb": "leading dimension of B", + "ldc": "leading dimension of C", + "ldx": "leading dimension of X", + "ldy": "leading dimension of Y", + "levelInd": "integer array of m (number of rows in the matrix) elements that contains the row indices belonging to every level", + "levelPtr": "integer array of nlevels+1 elements that contains the start of every level and the end of the last level plus one", + "m": "number of rows of A", + "mb": "number of block rows of matrix A", + "mode": "enumerated pointer mode", + "n": "number of columns of ", + "nb": "number of block columns of matrix A", + "nlevels": "number of levels", + "nnz": "number of non-zero elements", + "nnzA": "number of non-zero elements of matrix A", + "nnzB": "number of non-zero elements of matrix B", + "nnzD": "number of non-zero elements of matrix D", + "nnzPerCol": "array of size n containing the number of nonzero elements per column", + "nnzPerRow": "array of size n containing the number of nonzero elements per row", + "nnzPerRowCol": "array of size m or n containing the number of nonzero elements per row or column, respectively", + "nnzPerRowColumn": "array of size m or n containing the number of nonzero elements per row or column, respectively", + "nnzTotalDevHostPtr": "total number of nonzero elements in device or host memory", + "nnzTotalHostPtr": "total number of nonzero elements in host memory", + "nnzHostPtr": "total number of nonzero elements in memory", + "nnzb": "number of non-zero blocks of matrix A", + "pBuffer": "buffer allocated by the user", + "pBufferSize": "number of bytes of the buffer used", + "pBufferSizeInBytes": "number of bytes of the buffer used", + "pInputBuffer": "library version number", + "partitionType": "partitioning method to be used in the conversion", + "policy": "enumerated cuSPARSE solve policy", + "position": "if no structural or numerical zero, position is -1; otherwise if A(j,j) is missing or U(j,j) is zero, position=j", + "resultHostPtr": "pointer to result in the host memory", + "resultDevHostPtr": "pointer to result in the device or host memory", + "rowBlockDim": "number of rows within a block of A", + "rowBlockDimA": "number of rows within a block of A", + "rowBlockDimC": "number of rows within a block of C", + "s": "sine element of the rotation matrix", + "sizeOfMask": "number of updated block rows of y ", + "streamId": "stream to be used by the library", + "tol": "tolerance to determine a numerical zero", + "trans": "enumerated operation type, op(A)", + "transA": "enumerated operation type, op(A)", + "transB": "enumerated operation type, op(B)", + "transX": "enumerated operation type, op(X)", + "transXY": "enumerated operation type, op(X) and op(Y)", + "transa": "enumerated operation type, op(A)", + "transb": "enumerated operation type, op(B)", + "type": "enumerated matrix type", + "userEllWidth": "width of the regular (ELL) part of the matrix in HYB format, which should be less than the maximum number of nonzeros per row and is only required if partitionType == CUSPARSE_HYB_PARTITION_USER", + "version": "library version number", + 
"x": "vector", + "f": "vector", + "xInd": "integer vector of nnz indices corresponding to the nonzero values", + "xVal": "vector containing nnz data values", + "y": "vector", + "F": "array of dimensions (ldf, n)", + "ldf": "leading dimension of F", + "fractionToColor": "fraction of nodes to be colored, which should be in the interval [0.0,1.0], for example 0.8 implies that 80 percent of nodes will be colored.", + "ncolors": "The number of distinct colors used (at most the size of the matrix, but likely much smaller).", + "coloring": "The resulting coloring permutation", + "reordering": "The resulting reordering permutation (untouched if NULL)", + "p": "integer array of nnz unsorted map indices. To construct csrVal, the user has to set P=0:1:(nnz-1).", + "P": "integer array of nnz unsorted map indices. To construct cooVal, the user has to set P=0:1:(nnz-1).", + "cooRowsA": "integer array of nnz unsorted row indices of A. ", + "cooColsA": "integer array of nnz unsorted column indices of A. " +} \ No newline at end of file diff --git a/scikits/cuda/misc.py b/scikits/cuda/misc.py index d5a2f2d..ce48e3b 100644 --- a/scikits/cuda/misc.py +++ b/scikits/cuda/misc.py @@ -59,7 +59,7 @@ Result. """ - + def init_device(n=0): """ Initialize a GPU device. @@ -127,6 +127,10 @@ def done_context(ctx): _global_cublas_handle = None global _global_cublas_allocator _global_cublas_allocator = None +global _global_cusparse_handle +_global_cusparse_handle = None +global _global_cusparse_allocator +_global_cusparse_allocator = None def init(allocator=drv.mem_alloc): """ Initialize libraries used by scikits.cuda. @@ -145,8 +149,10 @@ def init(allocator=drv.mem_alloc): and context were initialized in the current host thread. """ - # CUBLAS uses whatever device is being used by the host thread: + # CUBLAS & CUSPARSE use whatever device is being used by the host thread: global _global_cublas_handle, _global_cublas_allocator + global _global_cusparse_handle, _global_cusparse_allocator + if not _global_cublas_handle: from . import cublas # nest to avoid requiring cublas e.g. for FFT _global_cublas_handle = cublas.cublasCreate() @@ -154,6 +160,16 @@ def init(allocator=drv.mem_alloc): if _global_cublas_allocator is None: _global_cublas_allocator = allocator + if not _global_cusparse_handle: + from . import cusparse # nest to avoid requiring cublas e.g. 
for FFT + _global_cusparse_handle = cusparse.cusparseCreate() + # set so that scalar values are passed by reference on the host + cusparse.cusparseSetPointerMode(_global_cusparse_handle, + cusparse.CUSPARSE_POINTER_MODE_HOST) + + if _global_cusparse_allocator is None: + _global_cusparse_allocator = allocator + # culaSelectDevice() need not (and, in fact, cannot) be called # here because the host thread has already been bound to a GPU # device: @@ -321,7 +337,7 @@ def select_block_grid_sizes(dev, data_shape, threads_per_block=None): # Actual number of thread blocks needed: blocks_needed = iceil(N/float(max_threads_per_block)) - + if blocks_needed <= max_grid_dim[0]: return (max_threads_per_block, 1, 1), (blocks_needed, 1, 1) elif blocks_needed > max_grid_dim[0] and \ @@ -331,7 +347,7 @@ def select_block_grid_sizes(dev, data_shape, threads_per_block=None): elif blocks_needed > max_grid_dim[0]*max_grid_dim[1] and \ blocks_needed <= max_grid_dim[0]*max_grid_dim[1]*max_grid_dim[2]: return (max_threads_per_block, 1, 1), \ - (max_grid_dim[0], max_grid_dim[1], + (max_grid_dim[0], max_grid_dim[1], iceil(blocks_needed/float(max_grid_dim[0]*max_grid_dim[1]))) else: raise ValueError('array size too large') diff --git a/tests/test_cusparse.py b/tests/test_cusparse.py new file mode 100644 index 0000000..799b328 --- /dev/null +++ b/tests/test_cusparse.py @@ -0,0 +1,1059 @@ +from __future__ import division + +from scikits.cuda.cusparse import * +from scikits.cuda.cusparse import (_csrgeamNnz, _csrgemmNnz) + +from scikits.cuda import cusparse +cusparse.init() + +import numpy as np +from numpy.testing import assert_raises, assert_equal, assert_almost_equal + +from unittest import skipIf + +import pycuda.autoinit +import pycuda.gpuarray as gpuarray +import pycuda.driver as drv + +import scipy.sparse # TODO: refactor to remove this + +cusparse_real_dtypes = [np.float32, np.float64] +cusparse_complex_dtypes = [np.complex64, np.complex128] +cusparse_dtypes = cusparse_real_dtypes + cusparse_complex_dtypes +trans_list = [CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_TRANSPOSE, + CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE] + + +def test_context_create_destroy(): + handle = cusparseCreate() + cusparseDestroy(handle) + + +def test_get_version(): + handle = cusparseCreate() + try: + version = cusparseGetVersion(handle) + assert type(version) == int + finally: + cusparseDestroy(handle) + +@skipIf(toolkit_version < (4, 1, 0), "HyB format added in CUDA Toolkit v4.1") +def test_create_destroy_hyb(): + HybA = cusparseCreateHybMat() + cusparseDestroyHybMat(HybA) + + +@skipIf(toolkit_version < (4, 0, 0), "Analysis Info added in CUDA v4.0") +def test_create_destroy_AnalysisInfo(): + info = cusparseCreateSolveAnalysisInfo() + cusparseDestroySolveAnalysisInfo(info) + + +@skipIf(toolkit_version < (6, 0, 0), + "additional solve info types introduced in CUDA v6.0") +def test_create_destroy_SolveInfo_v6(): + # CSR cases + info = cusparse.cusparseCreateCsrsv2Info() + cusparse.cusparseDestroyCsrsv2Info(info) + + info = cusparse.cusparseCreateCsrilu02Info() + cusparse.cusparseDestroyCsrilu02Info(info) + + info = cusparse.cusparseCreateCsric02Info() + cusparse.cusparseDestroyCsric02Info(info) + + # BSR cases + info = cusparse.cusparseCreateBsrsv2Info() + cusparse.cusparseDestroyBsrsv2Info(info) + + info = cusparse.cusparseCreateBsrilu02Info() + cusparse.cusparseDestroyBsrilu02Info(info) + + info = cusparse.cusparseCreateBsric02Info() + cusparse.cusparseDestroyBsric02Info(info) + + +@skipIf(toolkit_version < (6, 5, 0), + 
"additional solve info types introduced in CUDA v6.5") +def test_create_destroy_SolveInfo_v65(): + info = cusparse.cusparseCreateBsrsm2Info() + cusparse.cusparseDestroyBsrsm2Info(info) + +@skipIf(toolkit_version < (7, 0, 0), "ColorInfo introduced in CUDA v7.0") +def test_create_destroy_ColorInfo(): + # CSR cases + info = cusparse.cusparseCreateColorInfo() + cusparse.cusparseDestroyColorInfo(info) + + +def test_set_stream(): + handle = cusparseCreate() + stream = drv.Stream() + try: + cusparseSetStream(handle, stream.handle) + finally: + cusparseDestroy(handle) + +@skipIf(toolkit_version < (4, 1, 0), "skip for CUDA < v4.1") +def test_get_set_PointerMode(): + handle = cusparseCreate() + try: + # test default mode + mode = cusparseGetPointerMode(handle) + assert mode == CUSPARSE_POINTER_MODE_HOST + + # test setting/getting new mode + cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_DEVICE) + mode = cusparseGetPointerMode(handle) + assert mode == CUSPARSE_POINTER_MODE_DEVICE + + # can't set outside enumerated range + assert_raises(CUSPARSE_STATUS_INVALID_VALUE, cusparseSetPointerMode, + handle, 2) + finally: + cusparseDestroy(handle) + + +def test_matrix_descriptor_create_get_set_destroy(): + # create matrix description + descrA = cusparseCreateMatDescr() + + try: + # get default values/set + assert cusparseGetMatType(descrA) == CUSPARSE_MATRIX_TYPE_GENERAL + assert cusparseGetMatDiagType(descrA) == CUSPARSE_DIAG_TYPE_NON_UNIT + assert cusparseGetMatIndexBase(descrA) == CUSPARSE_INDEX_BASE_ZERO + assert cusparseGetMatFillMode(descrA) == CUSPARSE_FILL_MODE_LOWER + + # test set/get new values + cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_HERMITIAN) + assert cusparseGetMatType(descrA) == CUSPARSE_MATRIX_TYPE_HERMITIAN + cusparseSetMatDiagType(descrA, CUSPARSE_DIAG_TYPE_UNIT) + assert cusparseGetMatDiagType(descrA) == CUSPARSE_DIAG_TYPE_UNIT + cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE) + assert cusparseGetMatIndexBase(descrA) == CUSPARSE_INDEX_BASE_ONE + cusparseSetMatFillMode(descrA, CUSPARSE_FILL_MODE_UPPER) + assert cusparseGetMatFillMode(descrA) == CUSPARSE_FILL_MODE_UPPER + + # can't set outside enumerated range + assert_raises( + OverflowError, cusparseSetMatType, descrA, -1) + assert_raises( + CUSPARSE_STATUS_INVALID_VALUE, cusparseSetMatType, descrA, 100) + assert_raises( + CUSPARSE_STATUS_INVALID_VALUE, cusparseSetMatDiagType, descrA, 100) + assert_raises( + CUSPARSE_STATUS_INVALID_VALUE, cusparseSetMatIndexBase, descrA, + 100) + assert_raises( + CUSPARSE_STATUS_INVALID_VALUE, cusparseSetMatFillMode, descrA, 100) + + # OLD BEHAVIOR: float input gets cast to int + # cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL + 0.5) + # assert cusparseGetMatType(descrA) == CUSPARSE_MATRIX_TYPE_GENERAL + assert_raises(TypeError, cusparseSetMatType, descrA, + CUSPARSE_MATRIX_TYPE_GENERAL + 0.5) + finally: + cusparseDestroyMatDescr(descrA) + + +def test_dense_nnz(): + A_cpu = np.asarray([[1, 0, 0], [0, 1, 0], [1, 0, 1], [0, 0, 3]]) + A = gpuarray.to_gpu(A_cpu) + + descrA = cusparseCreateMatDescr() + cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL) + cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO) + + # loop over all directions and dtypes + try: + cusparse_dtypes = [np.float32, np.float64, np.complex64, np.complex128] + for dirA in [CUSPARSE_DIRECTION_ROW, CUSPARSE_DIRECTION_COLUMN]: + for dtype in cusparse_dtypes: + nnzRowCol, nnzTotal = dense_nnz( + descrA, A.astype(dtype), dirA=dirA) + assert nnzTotal == 5 + if dirA == CUSPARSE_DIRECTION_ROW: + 
assert_equal(nnzRowCol.get(), [3, 0, 1, 1]) + else: + assert_equal(nnzRowCol.get(), [1, 2, 2]) + finally: + cusparseDestroyMatDescr(descrA) + + +def test_dense2csr_csr2dense(): + A = np.asarray([[1, 0, 0], [0, 1, 0], [1, 0, 1], [0, 0, 3]]) + m, n = A.shape + for dtype in cusparse_dtypes: + A = A.astype(dtype) + A_csr_scipy = scipy.sparse.csr_matrix(A) + (descrA, csrValA, csrRowPtrA, csrColIndA) = dense2csr(A) + try: + assert_equal(csrValA.get(), A_csr_scipy.data) + assert_equal(csrRowPtrA.get(), A_csr_scipy.indptr) + assert_equal(csrColIndA.get(), A_csr_scipy.indices) + + A_dense = csr2dense(m, n, descrA, csrValA, csrRowPtrA, csrColIndA) + assert_equal(A, A_dense.get()) + finally: + # release descrA that was generated within dense2csr + cusparseDestroyMatDescr(descrA) + + +def test_dense2csc_csc2dense(): + # comparison to scipy ColPtr/RowInd currently known to fail + # is this a bug or a different coordinate convention? + A_cpu = np.asarray([[1, 0, 0], [0, 1, 0], [1, 0, 1], [0, 0, 3]], order='F') + m, n = A_cpu.shape + for dtype in cusparse_dtypes: + A_cpu = A_cpu.astype(dtype) + A_csc_scipy = scipy.sparse.csc_matrix(A_cpu) + A = gpuarray.to_gpu(A_cpu) + (descrA, cscValA, cscColPtrA, cscRowIndA) = dense2csc(A) + try: + assert_equal(cscValA.get(), A_csc_scipy.data) + assert_equal(cscColPtrA.get(), A_csc_scipy.indptr) + assert_equal(cscRowIndA.get(), A_csc_scipy.indices) + + A_dense = csc2dense(m, n, descrA, cscValA, cscColPtrA, cscRowIndA) + assert_equal(A_cpu, A_dense.get()) + finally: + # release descrA that was generated within dense2csc + cusparseDestroyMatDescr(descrA) + + +def test_csr2csc_csc2csr(): + A_cpu = np.asarray([[1, 0, 0], [0, 1, 0], [1, 0, 1], [0, 0, 3]], order='F') + m, n = A_cpu.shape + + for dtype in cusparse_dtypes: + A = gpuarray.to_gpu(A_cpu.astype(dtype)) + A_csr_scipy = scipy.sparse.csr_matrix(A_cpu) + A_csc_scipy = A_csr_scipy.tocsc() + (descr, csrVal, csrRowPtr, csrColInd) = dense2csr(A) + + try: + cscVal, cscColPtr, cscRowInd = csr2csc(m, n, csrVal, csrRowPtr, + csrColInd) + # verify match to scipy + assert_equal(cscVal.get(), A_csc_scipy.data) + assert_equal(cscColPtr.get(), A_csc_scipy.indptr) + assert_equal(cscRowInd.get(), A_csc_scipy.indices) + + # repeat for inverse operation + csrVal, csrRowPtr, csrColInd = csc2csr(n, m, cscVal, cscColPtr, + cscRowInd) + # verify match to scipy + assert_equal(csrVal.get(), A_csr_scipy.data) + assert_equal(csrRowPtr.get(), A_csr_scipy.indptr) + assert_equal(csrColInd.get(), A_csr_scipy.indices) + + finally: + cusparseDestroyMatDescr(descr) + + +def test_csr2coo_coo2csr(): + A_cpu = np.asarray([[1, 0, 0], [0, 2, 0], [3, 0, 4], [0, 0, 5]], order='F') + m, n = A_cpu.shape + + for dtype in cusparse_dtypes: + A = gpuarray.to_gpu(A_cpu.astype(dtype)) + (descr, csrVal, csrRowPtr, csrColInd) = dense2csr(A) + try: + nnz = csrVal.size + cscVal, cscColPtr, cscRowInd = csr2csc(m, n, csrVal, csrRowPtr, + csrColInd) + + cooRowInd = csr2coo(csrRowPtr, nnz) + + # couldn't compare to scipy due to different ordering, so check the + # values directly + vals = csrVal.get() + rows = cooRowInd.get() + cols = csrColInd.get() + for idx in range(nnz): + assert A_cpu[rows[idx], cols[idx]] == vals[idx] + + # repeat for inverse operation + csrRowPtr_v2 = coo2csr(cooRowInd, m) + assert_equal(csrRowPtr_v2.get(), csrRowPtr.get()) + + finally: + cusparseDestroyMatDescr(descr) + + +def test_csc2coo_coo2csc(): + A_cpu = np.asarray([[1, 0, 0], [0, 1, 0], [1, 0, 1], [0, 0, 3]], order='F') + m, n = A_cpu.shape + + for dtype in cusparse_dtypes: + A = 
gpuarray.to_gpu(A_cpu.astype(dtype)) + (descr, csrVal, csrRowPtr, csrColInd) = dense2csr(A) + + try: + nnz = csrVal.size + + cscVal, cscColPtr, cscRowInd = csr2csc(m, n, csrVal, csrRowPtr, + csrColInd) + cooColInd = csc2coo(cscColPtr, nnz) + # couldn't compare to scipy due to different ordering, so check the + # values directly + vals = csrVal.get() + rows = cscRowInd.get() + cols = cooColInd.get() + for idx in range(nnz): + assert A_cpu[rows[idx], cols[idx]] == vals[idx] + + # repeat for inverse operation + cscColPtr_v2 = coo2csc(cooColInd, n) + assert_equal(cscColPtr_v2.get(), cscColPtr.get()) + + finally: + cusparseDestroyMatDescr(descr) + + +def test_csrmv(): + A_cpu = np.asarray([[1, 0, 0], [0, 1, 0], [1, 0, 1], [0, 0, 3]]) + # A = gpuarray.to_gpu(A_cpu) + + descrA = cusparseCreateMatDescr() + cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL) + cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO) + + csr_numpy = scipy.sparse.csr_matrix(A_cpu) + indptr = csr_numpy.indptr + indices = csr_numpy.indices + csr_data = csr_numpy.data + + csrRowPtrA = gpuarray.to_gpu(indptr) + csrColIndA = gpuarray.to_gpu(indices) + m, n = csr_numpy.shape + alpha = 2.0 + # loop over all transpose operations and dtypes + try: + for transA in trans_list: + for dtype in cusparse_dtypes: + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + x = gpuarray.to_gpu(np.ones((n, ), dtype=dtype)) + else: + x = gpuarray.to_gpu(np.ones((m, ), dtype=dtype)) + csrValA = gpuarray.to_gpu(csr_data.astype(dtype)) + + # test multiplication without passing in y + beta = 0.0 + y = csrmv(descrA, csrValA, csrRowPtrA, csrColIndA, m, + n, x, transA=transA, alpha=alpha, beta=beta) + y_cpu = y.get() + + # repeat, but pass in the previous y with beta = 1.0 + beta = 1.0 + y = csrmv(descrA, csrValA, csrRowPtrA, csrColIndA, m, + n, x, transA=transA, alpha=alpha, beta=beta, y=y) + y_cpu2 = y.get() + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + assert_almost_equal(y_cpu, [2., 2., 4., 6.]) + assert_almost_equal(y_cpu2, 2 * y_cpu) + else: + assert_almost_equal(y_cpu, [4., 2., 8.]) + assert_almost_equal(y_cpu2, 2 * y_cpu) + finally: + cusparseDestroyMatDescr(descrA) + + +def test_csrmm(): + A_cpu = np.asarray([[1, 0, 0], [0, 1, 0], [1, 0, 1], [0, 0, 3]]) + + descrA = cusparseCreateMatDescr() + cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL) + cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO) + + csr_numpy = scipy.sparse.csr_matrix(A_cpu) + indptr = csr_numpy.indptr + indices = csr_numpy.indices + csr_data = csr_numpy.data + + csrRowPtrA = gpuarray.to_gpu(indptr) + csrColIndA = gpuarray.to_gpu(indices) + n = 5 + alpha = 2.0 + + # loop over all transpose operations and dtypes + try: + for transA in trans_list: + for dtype in cusparse_dtypes: + csrValA = gpuarray.to_gpu(csr_data.astype(dtype)) + + m, k = A_cpu.shape + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + B_cpu = np.ones((k, n), dtype=dtype, order='F') + expected_result = alpha * np.dot(A_cpu, B_cpu) + else: + B_cpu = np.ones((m, n), dtype=dtype, order='F') + if transA == CUSPARSE_OPERATION_TRANSPOSE: + expected_result = alpha * np.dot(A_cpu.T, B_cpu) + else: + expected_result = alpha * np.dot(np.conj(A_cpu).T, + B_cpu) + B = gpuarray.to_gpu(B_cpu) + # test multiplication without passing in C + beta = 0.0 + C = csrmm(m, n, k, descrA, csrValA, csrRowPtrA, csrColIndA, B, + transA=transA, alpha=alpha, beta=beta) + C_cpu = C.get() + assert_almost_equal(C_cpu, expected_result) + + # repeat, but pass in the previous C with beta = 1.0 + beta = 1.0 + C = csrmm(m, n, k, 
descrA, csrValA, csrRowPtrA, csrColIndA, B, + C=C, transA=transA, alpha=alpha, beta=beta) + C_cpu2 = C.get() + assert_almost_equal(C_cpu2, 2*expected_result) + finally: + cusparseDestroyMatDescr(descrA) + + +@skipIf(toolkit_version < (5, 5, 0), "skip for CUDA < v5.5") +def test_csrmm2(): + A_cpu = np.asarray([[1, 0, 0], [0, 1, 0], [1, 0, 1], [0, 0, 3]]) + + descrA = cusparseCreateMatDescr() + cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL) + cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO) + + csr_numpy = scipy.sparse.csr_matrix(A_cpu) + indptr = csr_numpy.indptr + indices = csr_numpy.indices + csr_data = csr_numpy.data + + csrRowPtrA = gpuarray.to_gpu(indptr) + csrColIndA = gpuarray.to_gpu(indices) + n = 5 + alpha = 2.0 + try: + for transB in trans_list[:-1]: + for transA in trans_list: + for dtype in cusparse_dtypes: + csrValA = gpuarray.to_gpu(csr_data.astype(dtype)) + + m, k = A_cpu.shape + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + m, k = A_cpu.shape + + if transB == CUSPARSE_OPERATION_NON_TRANSPOSE: + B_cpu = np.ones((k, n), dtype=dtype, order='F') + opB = B_cpu + else: + B_cpu = np.ones((n, k), dtype=dtype, order='F') + opB = B_cpu.T + opA = A_cpu + else: + if transB == CUSPARSE_OPERATION_NON_TRANSPOSE: + B_cpu = np.ones((m, n), dtype=dtype, order='F') + opB = B_cpu + else: + # cuSPARSE doesn't implement this case + continue + # B_cpu = np.ones((n, m), dtype=dtype, order='F') + # opB = B_cpu.T + if transA == CUSPARSE_OPERATION_TRANSPOSE: + opA = A_cpu.T + else: + opA = np.conj(A_cpu).T + + expected_result = alpha * np.dot(opA, opB) + B = gpuarray.to_gpu(B_cpu) + + # test multiplication without passing in C + beta = 0.0 + C = csrmm2(m, n, k, descrA, csrValA, csrRowPtrA, + csrColIndA, B, transA=transA, transB=transB, + alpha=alpha, beta=beta) + C_cpu = C.get() + assert_almost_equal(C_cpu, expected_result) + + # repeat, but pass in the previous C with beta = 1.0 + beta = 1.0 + C = csrmm2(m, n, k, descrA, csrValA, csrRowPtrA, + csrColIndA, B, C=C, transA=transA, + transB=transB, alpha=alpha, beta=beta) + C_cpu2 = C.get() + assert_almost_equal(C_cpu2, 2*expected_result) + finally: + cusparseDestroyMatDescr(descrA) + + +@skipIf(toolkit_version < (5, 0, 0), "skip for CUDA < v5.0") +def test_csrgeamNnz(): + A_cpu = np.asarray([[1, 0, 0], [0, 1, 0], [1, 0, 1], [0, 0, 3]]) + A_cpu = scipy.sparse.csr_matrix(A_cpu) + B_cpu = np.asarray([[0, 1, 0], [0, 0, 1], [0, 0, 0], [0, 0, 0]]) + B_cpu = scipy.sparse.csr_matrix(B_cpu) + C_cpu = A_cpu + B_cpu + nnz_expected = C_cpu.getnnz() + + descrA = cusparseCreateMatDescr() + cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL) + cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO) + + descrB = cusparseCreateMatDescr() + cusparseSetMatType(descrB, CUSPARSE_MATRIX_TYPE_GENERAL) + cusparseSetMatIndexBase(descrB, CUSPARSE_INDEX_BASE_ZERO) + + csrRowPtrA = gpuarray.to_gpu(A_cpu.indptr) + csrColIndA = gpuarray.to_gpu(A_cpu.indices) + + csrRowPtrB = gpuarray.to_gpu(B_cpu.indptr) + csrColIndB = gpuarray.to_gpu(B_cpu.indices) + + m, n = A_cpu.shape + + # test alternative case where descrC, csrRowPtrC not preallocated + descrC, csrRowPtrC, nnzC = _csrgeamNnz( + m, n, descrA, csrRowPtrA, csrColIndA, descrB, csrRowPtrB, csrColIndB, + check_inputs=True) + cusparseDestroyMatDescr(descrC) + assert_equal(nnzC, nnz_expected) + + # now test cases with preallocated descrC & csrRowPtrC + descrC = cusparseCreateMatDescr() + try: + cusparseSetMatType(descrC, CUSPARSE_MATRIX_TYPE_GENERAL) + csrRowPtrC = gpuarray.to_gpu(np.zeros((m+1, ), 
dtype=np.int32)) + nnzC = _csrgeamNnz(m, n, descrA, csrRowPtrA, csrColIndA, descrB, + csrRowPtrB, csrColIndB, descrC=descrC, + csrRowPtrC=csrRowPtrC, nnzA=None, nnzB=None, + check_inputs=True) + assert_equal(nnzC, nnz_expected) + finally: + cusparseDestroyMatDescr(descrC) + + +@skipIf(toolkit_version < (5, 0, 0), "skip for CUDA < v5.0") +def test_csrgeam(): + A_cpu = np.asarray([[1, 0, 0], [0, 1, 0], [1, 0, 1], [0, 0, 3]]) + A_cpu = scipy.sparse.csr_matrix(A_cpu) + B_cpu = np.asarray([[0, 1, 0], [0, 0, 1], [0, 0, 0], [0, 0, 0]]) + B_cpu = scipy.sparse.csr_matrix(B_cpu) + + descrA = cusparseCreateMatDescr() + cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL) + cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO) + + descrB = cusparseCreateMatDescr() + cusparseSetMatType(descrB, CUSPARSE_MATRIX_TYPE_GENERAL) + cusparseSetMatIndexBase(descrB, CUSPARSE_INDEX_BASE_ZERO) + + csrRowPtrA = gpuarray.to_gpu(A_cpu.indptr) + csrColIndA = gpuarray.to_gpu(A_cpu.indices) + + csrRowPtrB = gpuarray.to_gpu(B_cpu.indptr) + csrColIndB = gpuarray.to_gpu(B_cpu.indices) + + m, n = A_cpu.shape + + alpha = 0.3 + beta = 5.0 + C_cpu = alpha*A_cpu + beta*B_cpu + + try: + for dtype in cusparse_dtypes: + csrValA = gpuarray.to_gpu(A_cpu.data.astype(dtype)) + csrValB = gpuarray.to_gpu(B_cpu.data.astype(dtype)) + try: + descrC, csrValC, csrRowPtrC, csrColIndC = csrgeam( + m, n, descrA, csrValA, csrRowPtrA, csrColIndA, descrB, + csrValB, csrRowPtrB, csrColIndB, alpha=alpha, beta=beta) + assert_almost_equal(csrValC.get(), C_cpu.data) + finally: + cusparseDestroyMatDescr(descrC) + finally: + cusparseDestroyMatDescr(descrA) + cusparseDestroyMatDescr(descrB) + + +@skipIf(toolkit_version < (5, 0, 0), "skip for CUDA < v5.0") +def test_csrgemmNnz(): + A_cpu = np.asarray([[1, 0, 0], [0, 1, 0], [1, 0, 1], [0, 0, 3]]) + # A = gpuarray.to_gpu(A_cpu) + + descrA = cusparseCreateMatDescr() + cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL) + cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO) + + descrB = cusparseCreateMatDescr() + cusparseSetMatType(descrB, CUSPARSE_MATRIX_TYPE_GENERAL) + cusparseSetMatIndexBase(descrB, CUSPARSE_INDEX_BASE_ZERO) + + csr_numpy = scipy.sparse.csr_matrix(A_cpu) + indptr = csr_numpy.indptr + indices = csr_numpy.indices + + B_cpu = csr_numpy.T.tocsr() + + csrRowPtrA = gpuarray.to_gpu(indptr) + csrColIndA = gpuarray.to_gpu(indices) + + csrRowPtrB = gpuarray.to_gpu(B_cpu.indptr) + csrColIndB = gpuarray.to_gpu(B_cpu.indices) + + m, k = A_cpu.shape + n = B_cpu.shape[1] + # test alternative case where descrC, csrRowPtrC not preallocated + transA = transB = CUSPARSE_OPERATION_NON_TRANSPOSE + descrC, csrRowPtrC, nnzC = _csrgemmNnz( + m, n, k, descrA, csrRowPtrA, csrColIndA, descrB, csrRowPtrB, + csrColIndB, transA=transA, transB=transB, check_inputs=True) + cusparseDestroyMatDescr(descrC) + + # now test cases with preallocated matrix description + descrC = cusparseCreateMatDescr() + cusparseSetMatType(descrC, CUSPARSE_MATRIX_TYPE_GENERAL) + + # loop over all transpose operations and dtypes + try: + for transA in trans_list: + transB = transA + + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + m, k = A_cpu.shape + else: + k, m = A_cpu.shape + + if transB == CUSPARSE_OPERATION_NON_TRANSPOSE: + kB, n = B_cpu.shape + else: + n, kB = B_cpu.shape + + csrRowPtrC = gpuarray.to_gpu(np.zeros((m+1, ), dtype=np.int32)) + nnzC = _csrgemmNnz(m, n, k, descrA, csrRowPtrA, csrColIndA, + descrB, csrRowPtrB, csrColIndB, descrC=descrC, + csrRowPtrC=csrRowPtrC, nnzA=None, nnzB=None, + transA=transA, 
transB=transB, check_inputs=True) + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + assert nnzC == 8 + assert_equal(csrRowPtrC.get(), [0, 2, 3, 6, 8]) + else: + assert nnzC == 5 + assert_equal(csrRowPtrC.get(), [0, 2, 3, 5]) + finally: + cusparseDestroyMatDescr(descrA) + cusparseDestroyMatDescr(descrB) + cusparseDestroyMatDescr(descrC) + + +@skipIf(toolkit_version < (5, 0, 0), "skip for CUDA < v5.0") +def test_csrgemm(): + A_cpu = np.asarray([[1, 0, 0], [0, 1, 0], [1, 0, 1], [0, 0, 3]]) + # A = gpuarray.to_gpu(A_cpu) + + descrA = cusparseCreateMatDescr() + cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL) + cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO) + + descrB = cusparseCreateMatDescr() + cusparseSetMatType(descrB, CUSPARSE_MATRIX_TYPE_GENERAL) + cusparseSetMatIndexBase(descrB, CUSPARSE_INDEX_BASE_ZERO) + + csr_numpy = scipy.sparse.csr_matrix(A_cpu) + indptr = csr_numpy.indptr + indices = csr_numpy.indices + csr_data = csr_numpy.data + + B_cpu = csr_numpy.T.tocsr() + + csrRowPtrA = gpuarray.to_gpu(indptr) + csrColIndA = gpuarray.to_gpu(indices) + + csrRowPtrB = gpuarray.to_gpu(B_cpu.indptr) + csrColIndB = gpuarray.to_gpu(B_cpu.indices) + + m, k = A_cpu.shape + + try: + for transA in trans_list: + transB = transA + + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + m, k = A_cpu.shape + else: + k, m = A_cpu.shape + if transB == CUSPARSE_OPERATION_NON_TRANSPOSE: + kB, n = B_cpu.shape + else: + n, kB = B_cpu.shape + + for dtype in cusparse_dtypes: + csrValA = gpuarray.to_gpu(csr_data.astype(dtype)) + csrValB = gpuarray.to_gpu(B_cpu.data.astype(dtype)) + + descrC, csrValC, csrRowPtrC, csrColIndC = csrgemm( + m, n, k, descrA, csrValA, csrRowPtrA, + csrColIndA, descrB, csrValB, csrRowPtrB, csrColIndB, + transA=transA, transB=transB) + cusparseDestroyMatDescr(descrC) + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + assert_almost_equal(csrValC.get(), + [1, 1, 1, 1, 2, 3, 3, 9]) + else: + assert_almost_equal(csrValC.get(), [2, 1, 1, 1, 10]) + + + finally: + cusparseDestroyMatDescr(descrA) + cusparseDestroyMatDescr(descrB) + + +def test_CSR_construction(): + n = 64 + h = cusparseCreate() + try: + for dtype in cusparse_dtypes: + A = 2*np.eye(n) + + A_scipy_csr = scipy.sparse.csr_matrix(A) + + # generate a CSR matrix from a dense gpuarray + A_CSR = CSR.to_CSR(gpuarray.to_gpu(A), h) + assert_equal(A_CSR.Val.get(), A_scipy_csr.data) + + # generate a CSR matrix from a dense numpy array + A_CSR = CSR.to_CSR(A, h) + assert_equal(A_CSR.Val.get(), A_scipy_csr.data) + + # generate a CSR matrix from a list of lists + A_CSR = CSR.to_CSR(A.tolist(), h) + assert_equal(A_CSR.Val.get(), A_scipy_csr.data) + + # generate a cuSPARSE CSR matrix from a scipy.sparse CSR matrix + A_CSR = CSR.to_CSR(A_scipy_csr, h) + assert_equal(A_CSR.Val.get(), A_scipy_csr.data) + + # generate a cuSPARSE CSR matrix from a scipy.sparse BSR matrix + A_CSR = CSR.to_CSR(scipy.sparse.bsr_matrix(A), h) + assert_equal(A_CSR.Val.get(), A_scipy_csr.data) + + # generate a cuSPARSE CSR matrix from a scipy.sparse COO matrix + A_CSR = CSR.to_CSR(scipy.sparse.coo_matrix(A), h) + assert_equal(A_CSR.Val.get(), A_scipy_csr.data) + + # generate a cuSPARSE CSR matrix from a scipy.sparse CSC matrix + A_CSR = CSR.to_CSR(scipy.sparse.csc_matrix(A), h) + assert_equal(A_CSR.Val.get(), A_scipy_csr.data) + + # generate a cuSPARSE CSR matrix from a scipy.sparse DIA matrix + A_CSR = CSR.to_CSR(scipy.sparse.dia_matrix(A), h) + assert_equal(A_CSR.Val.get(), A_scipy_csr.data) + + # generate a cuSPARSE CSR matrix from a scipy.sparse DOK matrix + 
A_CSR = CSR.to_CSR(scipy.sparse.dok_matrix(A), h) + assert_equal(A_CSR.Val.get(), A_scipy_csr.data) + finally: + cusparseDestroy(h) + + +def test_CSR_properties(): + n = 64 + h = cusparseCreate() + A = 2*np.eye(n) + + # generate a CSR matrix from a dense numpy array + A_CSR = CSR.to_CSR(A, h) + + # test shape + assert A_CSR.shape == A.shape + + # test matrix_type property + assert A_CSR.matrix_type == CUSPARSE_MATRIX_TYPE_GENERAL + A_CSR.matrix_type = CUSPARSE_MATRIX_TYPE_SYMMETRIC + assert A_CSR.matrix_type == CUSPARSE_MATRIX_TYPE_SYMMETRIC + assert cusparseGetMatType(A_CSR.descr) == CUSPARSE_MATRIX_TYPE_SYMMETRIC + + # test index_base property + assert A_CSR.index_base == CUSPARSE_INDEX_BASE_ZERO + A_CSR.index_base = CUSPARSE_INDEX_BASE_ONE + assert A_CSR.index_base == CUSPARSE_INDEX_BASE_ONE + assert cusparseGetMatIndexBase(A_CSR.descr) == CUSPARSE_INDEX_BASE_ONE + + # test diag_type property + assert A_CSR.diag_type == CUSPARSE_DIAG_TYPE_NON_UNIT + A_CSR.diag_type = CUSPARSE_DIAG_TYPE_UNIT + assert A_CSR.diag_type == CUSPARSE_DIAG_TYPE_UNIT + assert cusparseGetMatDiagType(A_CSR.descr) == CUSPARSE_DIAG_TYPE_UNIT + + # test fill_mode property + assert A_CSR.fill_mode == CUSPARSE_FILL_MODE_LOWER + A_CSR.fill_mode = CUSPARSE_FILL_MODE_UPPER + assert A_CSR.fill_mode == CUSPARSE_FILL_MODE_UPPER + assert cusparseGetMatFillMode(A_CSR.descr) == CUSPARSE_FILL_MODE_UPPER + + # verify setting value outside the valid range raises an exception + def set_mat_type(A, t): + A_CSR.matrix_type = t + assert_raises(CUSPARSE_STATUS_INVALID_VALUE, set_mat_type, A_CSR, 100) + + # test get nnz + assert A_CSR.nnz == n + assert A_CSR.getnnz() == n + + # verify that nnz can't be set + def set_nnz(A, nnz): + A_CSR.nnz = nnz + assert_raises(AttributeError, set_nnz, A_CSR, 5) + + +def test_CSR_properties2(): + n = 64 + h = cusparseCreate() + try: + for dtype in cusparse_complex_dtypes: + A = 2*np.eye(n).astype(dtype) + # add a couple of complex values + A[0, 1] = 1j + A[1, 0] = -1j + A_CSR = CSR.to_CSR(A, h) + orig_data = A_CSR.data.get() + orig_indices = A_CSR.indices.get() + orig_indptr = A_CSR.indptr.get() + + assert_equal(abs(A_CSR).data.get(), abs(orig_data)) + assert_equal(-A_CSR.data.get(), -orig_data) + + A_CSR *= 2 + assert_almost_equal(A_CSR.data.get(), 2*orig_data) + + # this test requires from __future__ import division in Python 2.x + A_CSR /= 2 # A_CSR._with_data(A_CSR.data/2) + assert_almost_equal(A_CSR.data.get(), orig_data) + + assert_equal(A_CSR.conj().data.get(), np.conj(orig_data)) + assert_equal(A_CSR.real.data.get(), orig_data.real) + assert_equal(A_CSR.imag.data.get(), orig_data.imag) + assert_equal(A_CSR.A.get(), A) + assert_equal(A_CSR.T.A.get(), A.T) + + # make sure the above didn't modify the original data + assert_equal(A_CSR.data.get(), orig_data) + assert_equal(A_CSR.indices.get(), orig_indices) + assert_equal(A_CSR.indptr.get(), orig_indptr) + + assert_equal(A_CSR.H.A.get(), A.T.conj()) + assert_equal(A_CSR.size, A_CSR.nnz) + finally: + cusparseDestroy(h) + + +def test_CSR_todense(): + n = 64 + h = cusparseCreate() + try: + dtype = np.float32 + A = 2*np.eye(n) + + # generate a CSR matrix from a dense numpy array + A_CSR = CSR.to_CSR(A.astype(dtype), h) + + # convert cusparseCSR back to a dense matrix + A_dense = A_CSR.todense(to_cpu=False) + assert type(A_dense) == gpuarray.GPUArray + assert_equal(A_dense.get(), A) + + A_dense = A_CSR.todense(to_cpu=True) + assert type(A_dense) == np.ndarray + assert_equal(A_dense, A) + finally: + cusparseDestroy(h) + + +def test_CSR_get(): + n = 64 + 
h = cusparseCreate() + try: + dtype = np.float32 + A = 2*np.eye(n) + + # generate a CSR matrix from a dense numpy array + A_CSR = CSR.to_CSR(A.astype(dtype), h) + + # convert the cuSPARSE CSR matrix back to a scipy.sparse CSR matrix + csr_scipy = A_CSR.get() + # assert_equal(A_CSR.data.get(), csr_scipy.data) + # assert_equal(A_CSR.indices.get(), csr_scipy.indices) + # assert_equal(A_CSR.indptr.get(), csr_scipy.indptr) + + assert_equal(csr_scipy.indptr, scipy.sparse.csr_matrix(A).indptr) + assert_equal(csr_scipy.indices, scipy.sparse.csr_matrix(A).indices) + assert_equal(csr_scipy.data, scipy.sparse.csr_matrix(A).data) + finally: + cusparseDestroy(h) + + +def test_CSR_mv(): + n = 64 + h = cusparseCreate() + try: + A = 2*np.eye(n) + + dtype = np.float32 + x_cpu = np.ones((n, ), dtype=dtype) + for dtype in cusparse_dtypes: + + # generate a CSR matrix from a dense numpy array + A_CSR = CSR.to_CSR(A.astype(dtype), h) + x = gpuarray.to_gpu(x_cpu.astype(dtype)) + + # test default operation + y = A_CSR.mv(x) + assert_almost_equal(y.get(), A.diagonal()) + # transpose will be the same for this matrix + y = A_CSR.mv(x, transA=CUSPARSE_OPERATION_TRANSPOSE) + assert_almost_equal(y.get(), A.diagonal()) + + alpha = 5.0 + y = A_CSR.mv(x, alpha=alpha) + assert_almost_equal(y.get(), alpha*A.diagonal()) + + # repeat with non-zero beta and initialize with previous y + beta = 2.5 + y = A_CSR.mv(x, alpha=alpha, beta=beta, y=y) + assert_almost_equal(y.get(), (alpha+alpha*beta)*A.diagonal()) + finally: + cusparseDestroy(h) + + +def test_CSR_mm(): + m = 64 + n = 10 + h = cusparseCreate() + try: + A = 2*np.eye(m) + + dtype = np.float32 + B_cpu = np.ones((m, n), dtype=dtype, order='F') + for dtype in cusparse_dtypes: + A = A.astype(dtype) + + # generate a CSR matrix from a dense numpy array + A_CSR = CSR.to_CSR(A, h) + B = gpuarray.to_gpu(B_cpu.astype(dtype)) + + # test default operation + C = A_CSR.mm(B) + assert_almost_equal(C.get(), np.dot(A, B_cpu)) + # transpose will be the same for this matrix + C = A_CSR.mm(B_cpu.astype(dtype), transA=CUSPARSE_OPERATION_TRANSPOSE) + assert_almost_equal(C.get(), np.dot(A.T, B_cpu)) + + alpha = 5.0 + C = A_CSR.mm(B, alpha=alpha) + assert_almost_equal(C.get(), alpha*np.dot(A, B_cpu)) + + # repeat with non-zero beta and initialize with previous C + beta = 2.5 + C = A_CSR.mm(B, alpha=alpha, beta=beta, C=C) + assert_almost_equal(C.get(), (alpha+alpha*beta)*np.dot(A, B_cpu)) + finally: + cusparseDestroy(h) + + +@skipIf(toolkit_version < (5, 5, 0), "skip for CUDA < v5.5") +def test_CSR_mm2(): + A_cpu = np.asarray([[1, 0, 0], [0, 1, 0], [1, 0, 1], [0, 0, 3]]) + n = 5 + alpha = 2.0 + h = cusparseCreate() + try: + for transB in trans_list[:-1]: + for transA in trans_list: + for dtype in cusparse_dtypes: + m, k = A_cpu.shape + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + if transB == CUSPARSE_OPERATION_NON_TRANSPOSE: + B_cpu = np.ones((k, n), dtype=dtype, order='F') + opB = B_cpu + else: + B_cpu = np.ones((n, k), dtype=dtype, order='F') + opB = B_cpu.T + opA = A_cpu + else: + if transB == CUSPARSE_OPERATION_NON_TRANSPOSE: + B_cpu = np.ones((m, n), dtype=dtype, order='F') + opB = B_cpu + else: + # cuSPARSE doesn't implement this case + continue + if transA == CUSPARSE_OPERATION_TRANSPOSE: + opA = A_cpu.T + else: + opA = np.conj(A_cpu).T + + expected_result = alpha * np.dot(opA, opB) + B = gpuarray.to_gpu(B_cpu) + + A_CSR = CSR.to_CSR(A_cpu.astype(dtype), h) + + # test multiplication without passing in C + beta = 0.0 + C = A_CSR.mm2(B, transA=transA, transB=transB, alpha=alpha, + beta=beta, to_cpu=False) + 
assert_almost_equal(C.get(), expected_result) + + # repeat, but pass in the previous C with beta = 1.0 + beta = 1.0 + C = A_CSR.mm2(B, transA=transA, transB=transB, alpha=alpha, + beta=beta, C=C, to_cpu=True) + assert_almost_equal(C, (1.0+beta)*expected_result) + + finally: + cusparseDestroy(h) + + +@skipIf(toolkit_version < (5, 0, 0), "skip for CUDA < v5.0") +def test_CSR_geam(): + A_cpu = np.asarray([[1, 0, 0], [0, 1, 0], [1, 0, 1], [0, 0, 3]]) + B_cpu = np.asarray([[0, 1, 0], [0, 0, 1], [0, 0, 0], [0, 0, 0]]) + + h = cusparseCreate() + alpha = 0.3 + beta = 5.0 + + try: + for dtype in cusparse_dtypes: + A_CSR = CSR.to_CSR(A_cpu.astype(dtype), h) + B_CSR = CSR.to_CSR(B_cpu.astype(dtype), h) + C_CSR = A_CSR.geam(B_CSR, alpha=alpha, beta=beta) + + # compute on CPU with numpy and compare + C_cpu = alpha*A_cpu + beta*B_cpu + assert_almost_equal(C_CSR.todense(to_cpu=True), C_cpu) + finally: + cusparseDestroy(h) + + +@skipIf(toolkit_version < (5, 0, 0), "skip for CUDA < v5.0") +def test_CSR_gemm(): + A_cpu = np.asarray([[1, 0, 0], [0, 1, 0], [1, 0, 1], [0, 0, 3]]) + + h = cusparseCreate() + B_cpu = A_cpu.T + try: + for transA in trans_list: + transB = transA + + if transA == CUSPARSE_OPERATION_NON_TRANSPOSE: + opA = A_cpu + opB = B_cpu + else: + opA = A_cpu.T + opB = B_cpu.T + + for dtype in cusparse_dtypes: + A_CSR = CSR.to_CSR(A_cpu.astype(dtype), h) + B_CSR = CSR.to_CSR(B_cpu.astype(dtype), h) + C_CSR = A_CSR.gemm(B_CSR, transA=transA, transB=transB) + + # compute on CPU with numpy and compare + C_cpu = np.dot(opA, opB) + assert_almost_equal(C_CSR.todense(to_cpu=True), C_cpu) + finally: + cusparseDestroy(h)
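For orientation, the tests above exercise both the autogenerated low-level wrappers (csrmv, csrmm, csrgemm, ...) and the high-level CSR class added by this patch. The following is a minimal usage sketch (not part of the diff) based only on calls that appear in these tests; the import path scikits.cuda.cusparse and the use of pycuda.autoinit for context setup are assumptions, not something the patch itself pins down.

# Hypothetical usage sketch; import path and context setup are assumptions.
import numpy as np
import pycuda.autoinit            # assumption: PyCUDA creates the CUDA context
import pycuda.gpuarray as gpuarray
from scikits.cuda.cusparse import CSR, cusparseCreate, cusparseDestroy  # assumed path

handle = cusparseCreate()          # cuSPARSE library handle, as in the tests' setup
try:
    A = 2 * np.eye(4, dtype=np.float32)
    A[2, 0] = 1.0                  # off-diagonal entry so A is not purely diagonal
    A_csr = CSR.to_CSR(A, handle)  # dense ndarray -> CSR matrix on the GPU

    x = gpuarray.to_gpu(np.ones(4, dtype=np.float32))
    y = A_csr.mv(x, alpha=2.0)     # y = 2 * A @ x computed by cuSPARSE
    print(y.get())

    print(A_csr.todense(to_cpu=True))  # round-trip back to a dense numpy array
finally:
    cusparseDestroy(handle)        # release the handle, mirroring the tests' try/finally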