diff --git a/libNeonPy/src/Neon/py/CudaDriver.cpp b/libNeonPy/src/Neon/py/CudaDriver.cpp index 29f279fa..c7d13285 100644 --- a/libNeonPy/src/Neon/py/CudaDriver.cpp +++ b/libNeonPy/src/Neon/py/CudaDriver.cpp @@ -67,10 +67,10 @@ auto CudaDriver::run_kernel( int const ndevs = backend.getDeviceCount(); // #pragma omp parallel for num_threads(ndevs) for (int setIdx = 0; setIdx < ndevs; setIdx++) { + backend.devSet().setActiveDevContext(setIdx); cudaStream_t const& cuda_stream = streamSet.cudaStream(setIdx); CUstream driverStream = (CUstream)cuda_stream; CUfunction function = static_cast(kernelSet[setIdx]); - backend.devSet().setActiveDevContext(setIdx); auto& launch_info = launch_params[setIdx]; // auto const cudaGrid = launch_info.cudaGrid(); @@ -110,10 +110,10 @@ auto CudaDriver::run_kernel( // } // int block_dim = 256; // int grid_dim = (n + block_dim - 1) / block_dim; - // std::cout << "block_dim " << block_dim << std::endl; - // std::cout << "grid_dim " << grid_dim << std::endl; - // std::cout << "n " << n << std::endl; - // std::cout << "cuLaunchKernel" << std::endl; +// std::cout << "block_dim " << launch_info.toString()<< std::endl; +// std::cout << "grid_dim " << launch_info << std::endl; +// std::cout << "n " << n << std::endl; +// std::cout << "cuLaunchKernel" << std::endl; res = cuLaunchKernel( function, diff --git a/py_neon/__init__.py b/py_neon/__init__.py index 31126aad..04a748b8 100644 --- a/py_neon/__init__.py +++ b/py_neon/__init__.py @@ -6,6 +6,7 @@ from .dataview import DataView from .execution import Execution from .index_3d import Index_3d +from .ngh_idx import Ngh_idx from .dense.__init__ import * from .block.__init__ import * diff --git a/py_neon/ngh_idx.py b/py_neon/ngh_idx.py new file mode 100644 index 00000000..57a5f6a7 --- /dev/null +++ b/py_neon/ngh_idx.py @@ -0,0 +1,43 @@ +import ctypes +import typing + + +class Ngh_idx(ctypes.Structure): + _fields_ = [("x", ctypes.c_int8), + ("y", ctypes.c_int8), + ("z", ctypes.c_int8)] + + def __init__(self, + x: int, + y: int, + z: int): + self.x = x + self.y = y + self.z = z + + def __len__(self): + return 3 + + def __getitem__(self, index): + if index == 0: + return self.x + if index == 1: + return self.y + if index == 2: + return self.z + raise IndexError("Index out of range") + + def to_wp_kernel_dim(self) -> typing.Tuple[int, int, int]: + return (self.x, self.y, self.z) + + def __str__(self): + str = "" % (ctypes.addressof(self)) + str += f"\n\tx: {self.x}" + str += f"\n\ty: {self.y}" + str += f"\n\tz: {self.z}" + return str + + def __eq__(self, other): + if not isinstance(other, Index_3d): + return NotImplemented + return (self.x == other.x and self.y == other.y and self.z == other.z)