From e9b6a9ffedb38fd847e2b6a0d4ca5e7c75aa03ca Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Wed, 14 Feb 2024 03:14:51 -0600 Subject: [PATCH 01/14] writedisk compressor filename change --- qtensor/compression/Compressor.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/qtensor/compression/Compressor.py b/qtensor/compression/Compressor.py index 18126bbf..cc033f63 100644 --- a/qtensor/compression/Compressor.py +++ b/qtensor/compression/Compressor.py @@ -513,12 +513,15 @@ def __init__(self, path): from pathlib import Path Path(path).mkdir(exist_ok=True, parents=True) self.path = path + self.counter = 1 - def _gen_random_filename(self, info): + def _gen_filename(self, info): dtype, shape, isCupy = info k = np.random.randint(0, 100000000) s = hex(k)[2:] - return self.path + f'/qtensor_data_{s}_{str(dtype)}.bin' + c = str(self.counter) + self.counter += 1 + return self.path + f'/qtensor_data_c-{c}_{s}_{str(dtype)}.bin' def compress(self, data): import cupy @@ -526,7 +529,7 @@ def compress(self, data): isCupy=False else: isCupy=True - fname = self._gen_random_filename((data.dtype, data.shape, isCupy)) + fname = self._gen_filename((data.dtype, data.shape, isCupy)) data.tofile(fname) return (fname, data.dtype, data.shape, isCupy) From a3d228be64eebab36862d2e4abc73760a6f2e399 Mon Sep 17 00:00:00 2001 From: Yuri Alexeev Date: Fri, 16 Feb 2024 22:59:38 +0000 Subject: [PATCH 02/14] fix torch slicing: ignore data_dict if data is present --- qtensor/contraction_backends/torch.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/qtensor/contraction_backends/torch.py b/qtensor/contraction_backends/torch.py index 3df1bf16..5398f19e 100644 --- a/qtensor/contraction_backends/torch.py +++ b/qtensor/contraction_backends/torch.py @@ -63,6 +63,7 @@ def slice_torch_tensor(data:np.ndarray, indices_in, indices_out, slice_dict): indices_sliced = [ i for sl, i in zip(slice_bounds, indices_in) if not isinstance(sl, int) ] + print(f'indicies_in {indices_in}, slice_dict {slice_dict}, bounds {slice_bounds}, slicedix {indices_sliced}, sshape {s_data.shape}') indices_sized = [v.copy(size=size) for v, size in zip(indices_sliced, s_data.shape)] indices_out = [v for v in indices_out if not isinstance(slice_dict.get(v, None), int)] assert len(indices_sized) == len(s_data.shape) @@ -181,7 +182,10 @@ def get_sliced_buckets(self, buckets, data_dict, slice_dict): # get data # sort tensor dimensions out_indices = list(sorted(tensor.indices, key=int, reverse=True)) - data = data_dict[tensor.data_key] + if tensor.data is None: + data = data_dict[tensor.data_key] + else: + data = tensor.data # Works for torch tensors just fine if not isinstance(data, torch.Tensor): if self.device == 'gpu' and torch.cuda.is_available(): From de30241d09d0cc265af8f2a5c8c9392f4194c242 Mon Sep 17 00:00:00 2001 From: Yuri Alexeev Date: Fri, 15 Mar 2024 13:25:37 +0000 Subject: [PATCH 03/14] add torch matm backend --- qtensor/contraction_backends/torch.py | 160 ++++++++++++++++++++++++-- qtree | 2 +- 2 files changed, 154 insertions(+), 8 deletions(-) diff --git a/qtensor/contraction_backends/torch.py b/qtensor/contraction_backends/torch.py index 5398f19e..ff7c8d25 100644 --- a/qtensor/contraction_backends/torch.py +++ b/qtensor/contraction_backends/torch.py @@ -1,10 +1,12 @@ from qtensor.tools.lazy_import import torch import qtree import numpy as np +from functools import reduce from qtree import np_framework from qtensor.contraction_backends import ContractionBackend from .common import get_slice_bounds, get_einsum_expr, slice_numpy_tensor import string +from loguru import logger CHARS = string.ascii_lowercase + string.ascii_uppercase def qtree2torch_tensor(tensor, data_dict): @@ -63,7 +65,7 @@ def slice_torch_tensor(data:np.ndarray, indices_in, indices_out, slice_dict): indices_sliced = [ i for sl, i in zip(slice_bounds, indices_in) if not isinstance(sl, int) ] - print(f'indicies_in {indices_in}, slice_dict {slice_dict}, bounds {slice_bounds}, slicedix {indices_sliced}, sshape {s_data.shape}') + #print(f'indicies_in {indices_in}, slice_dict {slice_dict}, bounds {slice_bounds}, slicedix {indices_sliced}, sshape {s_data.shape}') indices_sized = [v.copy(size=size) for v, size in zip(indices_sliced, s_data.shape)] indices_out = [v for v in indices_out if not isinstance(slice_dict.get(v, None), int)] assert len(indices_sized) == len(s_data.shape) @@ -73,8 +75,22 @@ def slice_torch_tensor(data:np.ndarray, indices_in, indices_out, slice_dict): class TorchBackend(ContractionBackend): + def __init__(self, device='cpu'): - self.device = device + # alias of gpu -> cuda + if device=='gpu': + device='cuda' + # Check that CUDA is available if specified + if device=='cuda': + if not torch.cuda.is_available(): + logger.warning("Cuda is not available. Falling back to CPU") + device = 'cpu' + if device=='xpu': + import intel_extension_for_pytorch as ipex + + + self.device = torch.device(device) + logger.debug("Torch backend using device {}", self.device) self.dtype = ['float', 'double', 'complex64', 'complex128'] self.width_dict = [set() for i in range(30)] self.width_bc = [[0,0] for i in range(30)] #(#distinct_bc, #bc) @@ -91,7 +107,9 @@ def process_bucket(self, bucket, no_sum=False): list(map(int, result_indices)), list(map(int, tensor.indices)) ) + logger.trace('Before contract. Expr: {}, inputs: {}, {}', expr, result_data, tensor) result_data = torch.einsum(expr, result_data, tensor.data) + logger.trace("expression {}. Data: {}, -> {}", expr, tensor.data, result_data) # Merge and sort indices and shapes result_indices = tuple(sorted( @@ -114,7 +132,9 @@ def process_bucket(self, bucket, no_sum=False): list(map(int, result_indices)), list(map(int, tensor.indices)) , contract = 1 ) + logger.trace('Before contract. Expr: {}, inputs: {}, {}', expr, result_data, tensor) result_data = torch.einsum(expr, result_data, tensor.data) + logger.trace("expression {}. Data: {}, -> {}", expr, tensor.data, result_data) result_indices = tuple(sorted( set(result_indices + tensor.indices), key=int, reverse=True @@ -188,11 +208,7 @@ def get_sliced_buckets(self, buckets, data_dict, slice_dict): data = tensor.data # Works for torch tensors just fine if not isinstance(data, torch.Tensor): - if self.device == 'gpu' and torch.cuda.is_available(): - cuda = torch.device('cuda') - data = torch.from_numpy(data.astype(np.complex128)).to(cuda) - else: - data = torch.from_numpy(data.astype(np.complex128)) + data = torch.from_numpy(data.astype(np.complex128)).to(self.device) else: data = data.type(torch.complex128) # slice data @@ -206,3 +222,133 @@ def get_sliced_buckets(self, buckets, data_dict, slice_dict): def get_result_data(self, result): return torch.permute(result.data, tuple(reversed(range(result.data.ndim)))) + +class TorchBackendMatm(TorchBackend): + + def _get_index_sizes(self, *ixs): + try: + sizes = [ i.size for i in ixs ] + except AttributeError: + sizes = [2] * len(ixs) + return sizes + + def _get_index_space_size(self, *ixs): + sizes = self._get_index_sizes(*ixs) + return reduce(np.multiply, sizes, 1) + + def pairwise_sum_contract(self, ixa, a, ixb, b, ixout): + out = ixout + common = set(ixa).intersection(set(ixb)) + # -- sum indices that are in one tensor only + all_ix = set(ixa+ixb) + sum_ix = all_ix - set(out) + a_sum = sum_ix.intersection(set(ixa) - common) + b_sum = sum_ix.intersection(set(ixb) - common) + #print('ab', ixa, ixb) + #print('all sum', sum_ix, 'a/b_sum', a_sum, b_sum) + if len(a_sum): + a = a.sum(axis=tuple(ixa.index(x) for x in a_sum)) + ixa = [x for x in ixa if x not in a_sum] + if len(b_sum): + b = b.sum(axis=tuple(ixb.index(x) for x in b_sum)) + ixb = [x for x in ixb if x not in b_sum] + tensors = a, b + # -- + + ixs = ixa, ixb + common = set(ixs[0]).intersection(set(ixs[1])) + + # \sum_k A_{kfm} * B_{kfn} = C_{fmn} + mix = set(ixs[0]) - common + nix = set(ixs[1]) - common + kix = common - set(out) + fix = common - kix + common = list(kix) + list(fix) + a = tensors[0].transpose(*[ + list(ixs[0]).index(x) for x in common + list(mix) + ]) + + b = tensors[1].transpose(*[ + list(ixs[1]).index(x) for x in common + list(nix) + ]) + + k, f, m, n = [self._get_index_space_size(*ix) + for ix in (kix, fix, mix, nix) + ] + a = a.reshape(k, f, m) + b = b.reshape(k, f, n) + c = torch.einsum('kfm, kfn -> fmn', a, b) + if len(out): + #print('out ix', out, 'kfmnix', kix, fix, mix, nix) + c = c.reshape(*self._get_index_sizes(*out)) + #print('outix', out, 'res', c.shape, 'kfmn',kix, fix, mix, nix) + + current_ord_ = list(fix) + list(mix) + list(nix) + c = c.transpose(*[current_ord_.index(i) for i in out]) + return c + + def process_bucket(self, bucket, no_sum=False): + bucket.sort(key = lambda x: len(x.indices)) + result_indices = bucket[0].indices + result_data = bucket[0].data + width = len(set(bucket[0].indices)) + + for tensor in bucket[1:-1]: + + ixr = list(map(int, result_indices)) + ixt = list(map(int, tensor.indices)) + result_indices = tuple(sorted( + set(result_indices + tensor.indices), + key=int, reverse=True + ) + ) + ixout = list(map(int, result_indices)) + + logger.trace('Before contract. expr: {}, {} ->', ixr, ixt, ixout) + result_data_new = self.pairwise_sum_contract(ixr, result_data, ixt, tensor.data, ixout) + #result_data = torch.einsum(expr, result_data, tensor.data) + logger.trace("Data: {}, -> {}", result_data, tensor.data, result_data_new) + result_data = result_data_new + + # Merge and sort indices and shapes + + size = len(set(tensor.indices)) + if size > width: + width = size + + + if len(bucket)>1: + tensor = bucket[-1] + + ixr = list(map(int, result_indices)) + ixt = list(map(int, tensor.indices)) + result_indices = tuple(sorted( + set(result_indices + tensor.indices), + key=int, reverse=True + ))[:-1] + ixout = list(map(int, result_indices)) + + logger.trace('Before contract. expr: {}, {} ->', ixr, ixt, ixout) + result_data_new = self.pairwise_sum_contract(ixr, result_data, ixt, tensor.data, ixout) + #result_data = torch.einsum(expr, result_data, tensor.data) + logger.trace("Data: {}, -> {}", result_data, tensor.data, result_data_new) + result_data = result_data_new + else: + result_data = result_data.sum(axis=-1) + + + + if len(result_indices) > 0: + first_index = result_indices[-1] + result_indices = result_indices[:-1] + tag = first_index.identity + else: + tag = 'f' + result_indices = [] + + # reduce + result = qtree.optimizer.Tensor(f'E{tag}', result_indices, + data=result_data) + return result + + diff --git a/qtree b/qtree index 7b038d5a..16efbba2 160000 --- a/qtree +++ b/qtree @@ -1 +1 @@ -Subproject commit 7b038d5a4cc1f9b5e0ede4b0e5740bff4b22153e +Subproject commit 16efbba2566e65a37bb7927f06a80c9f88ac57ff From 9391657be5bfedcfb56751acf959b46915e91a20 Mon Sep 17 00:00:00 2001 From: Yuri Alexeev Date: Fri, 15 Mar 2024 15:24:30 +0000 Subject: [PATCH 04/14] torch matm backend fixes --- qtensor/contraction_backends/torch.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/qtensor/contraction_backends/torch.py b/qtensor/contraction_backends/torch.py index ff7c8d25..e684bde4 100644 --- a/qtensor/contraction_backends/torch.py +++ b/qtensor/contraction_backends/torch.py @@ -264,11 +264,12 @@ def pairwise_sum_contract(self, ixa, a, ixb, b, ixout): kix = common - set(out) fix = common - kix common = list(kix) + list(fix) - a = tensors[0].transpose(*[ + print(f'{ixa=} {ixb=} {ixout=}; {common=} {mix=} {nix=}') + a = tensors[0].permute(*[ list(ixs[0]).index(x) for x in common + list(mix) ]) - b = tensors[1].transpose(*[ + b = tensors[1].permute(*[ list(ixs[1]).index(x) for x in common + list(nix) ]) @@ -284,7 +285,7 @@ def pairwise_sum_contract(self, ixa, a, ixb, b, ixout): #print('outix', out, 'res', c.shape, 'kfmn',kix, fix, mix, nix) current_ord_ = list(fix) + list(mix) + list(nix) - c = c.transpose(*[current_ord_.index(i) for i in out]) + c = c.permute(*[current_ord_.index(i) for i in out]) return c def process_bucket(self, bucket, no_sum=False): From 6f30c948039c94832ff2ea5dadaf0777055d3a6c Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Tue, 19 Mar 2024 18:43:23 -0500 Subject: [PATCH 05/14] replace lineend characters --- .../compression/cusz/include/cli/analyzer.hh | 556 ++-- .../compression/cusz/include/cli/document.hh | 544 ++-- .../cusz/include/cli/quality_viewer.hh | 326 +-- qtensor/compression/cusz/include/cli/query.hh | 142 +- .../compression/cusz/include/cli/query_dev.hh | 136 +- .../cusz/include/cli/timerecord_viewer.hh | 218 +- .../compression/cusz/include/cli/verify.hh | 174 +- qtensor/compression/cusz/include/common.hh | 36 +- .../cusz/include/common/capsule.hh | 804 +++--- .../cusz/include/common/configs.hh | 708 ++--- .../cusz/include/common/definition.hh | 132 +- .../cusz/include/common/type_traits.hh | 216 +- .../compression/cusz/include/compaction.hh | 36 +- qtensor/compression/cusz/include/component.hh | 36 +- .../cusz/include/component/glue.cuh | 240 +- .../component/pred_boilerplate_deprecated.hh | 420 +-- .../cusz/include/component/prediction.inl | 386 +-- .../cusz/include/component/spcodec.inl | 436 +-- .../compression/cusz/include/compressor.hh | 330 +-- qtensor/compression/cusz/include/context.hh | 502 ++-- qtensor/compression/cusz/include/cusz.h | 120 +- .../compression/cusz/include/cusz/custom.h | 52 +- qtensor/compression/cusz/include/cusz/it.hh | 154 +- qtensor/compression/cusz/include/cusz/nd.h | 28 +- qtensor/compression/cusz/include/cusz/pn.hh | 98 +- .../compression/cusz/include/cusz/record.h | 76 +- qtensor/compression/cusz/include/cusz/type.h | 438 +-- qtensor/compression/cusz/include/framework.hh | 124 +- qtensor/compression/cusz/include/header.h | 222 +- qtensor/compression/cusz/include/hf/hf.hh | 340 +-- .../compression/cusz/include/hf/hf_bookg.hh | 90 +- .../compression/cusz/include/hf/hf_codecg.hh | 164 +- .../compression/cusz/include/hf/hf_struct.h | 106 +- .../cusz/include/kernel/claunch_cuda.h | 98 +- .../cusz/include/kernel/cpplaunch_cuda.hh | 102 +- .../cusz/include/kernel/dryrun.cuh | 92 +- .../cusz/include/kernel/launch_spm.cuh | 696 ++--- .../cusz/include/kernel/lorenzo_all.h | 88 +- .../cusz/include/kernel/lorenzo_all.hh | 192 +- .../compression/cusz/include/kernel/spv_gpu.h | 84 +- .../cusz/include/kernel/spv_gpu.hh | 66 +- .../cusz/include/kernel/v2_lorenzo.hh | 64 +- .../cusz/include/pipeline/compaction_g.inl | 146 +- .../cusz/include/pipeline/v2_compressor.hh | 292 +- .../compression/cusz/include/stat/compare.h | 114 +- .../cusz/include/stat/compare_cpu.hh | 124 +- .../cusz/include/stat/compare_gpu.hh | 66 +- qtensor/compression/cusz/include/stat/stat.h | 58 +- qtensor/compression/cusz/include/stat/stat.hh | 30 +- .../compression/cusz/include/stat/stat_g.hh | 88 +- qtensor/compression/cusz/include/utils.hh | 40 +- .../cusz/include/utils/cuda_err.cuh | 370 +-- .../cusz/include/utils/cuda_mem.cuh | 200 +- .../cusz/include/utils/cusparse_err.cuh | 120 +- .../compression/cusz/include/utils/format.hh | 114 +- qtensor/compression/cusz/include/utils/io.hh | 118 +- .../cusz/include/utils/print_gpu.h | 90 +- .../cusz/include/utils/print_gpu.hh | 42 +- .../cusz/include/utils/strhelper.hh | 288 +- .../compression/cusz/include/utils/timer.h | 184 +- .../compression/cusz/include/utils/timer.hh | 306 +- qtensor/compression/cusz/src/cli/cli.cu | 28 +- qtensor/compression/cusz/src/cli/cli.cuh | 390 +-- .../compression/cusz/src/cli/dryrun_part.cu | 34 +- .../compression/cusz/src/cli/dryrun_part.cuh | 392 +-- qtensor/compression/cusz/src/cli_bin.cu | 54 +- qtensor/compression/cusz/src/compressor.cc | 298 +- qtensor/compression/cusz/src/context.cc | 986 +++---- qtensor/compression/cusz/src/cusz/custom.cc | 68 +- qtensor/compression/cusz/src/cusz_lib.cc | 228 +- .../compression/cusz/src/cusz_version.h.in | 6 +- qtensor/compression/cusz/src/cusz_wrapper.cu | 308 +- qtensor/compression/cusz/src/cusz_wrapper.py | 346 +-- .../cusz/src/detail/compare_cpu.inl | 218 +- .../cusz/src/detail/compare_gpu.inl | 386 +-- .../cusz/src/detail/compressor_impl.cu | 36 +- .../cusz/src/detail/compressor_impl.inl | 958 +++---- qtensor/compression/cusz/src/detail/spmat.cu | 28 +- .../compression/cusz/src/detail/spv_gpu.inl | 154 +- qtensor/compression/cusz/src/detail/spvec.cu | 36 +- .../cusz/src/experimental/Makefile | 14 +- .../src/experimental/dpcpp_demo_lorenzo.cu | 240 +- .../cusz/src/hf/detail/hf_bookg.inl | 1484 +++++----- .../cusz/src/hf/detail/hf_codecg.inl | 592 ++-- .../cusz/src/hf/detail/hf_pimpl.inl | 728 ++--- .../cusz/src/hf/detail/par_merge.inl | 888 +++--- qtensor/compression/cusz/src/hf/hf.cc | 218 +- qtensor/compression/cusz/src/hf/hf_bookg.cu | 66 +- qtensor/compression/cusz/src/hf/hf_codecg.cu | 538 ++-- qtensor/compression/cusz/src/hf/hf_pimpl.cu | 62 +- .../cusz/src/kernel/claunch_cuda.cu | 152 +- .../cusz/src/kernel/detail/hist.inl | 200 +- .../cusz/src/kernel/detail/lorenzo.inl | 1632 +++++------ .../cusz/src/kernel/detail/lorenzo23.inl | 2474 ++++++++--------- .../cusz/src/kernel/detail/lorenzo_proto.inl | 428 +-- .../cusz/src/kernel/detail/lorenzo_serial.inl | 652 ++--- .../cusz/src/kernel/detail/lorenzo_var.inl | 1060 +++---- .../cusz/src/kernel/detail/spline3.inl | 1492 +++++----- .../cusz/src/kernel/detail/subroutine.inl | 2148 +++++++------- .../cusz/src/kernel/detail/subsub.inl | 184 +- .../compression/cusz/src/kernel/lorenzo.cu | 418 +-- .../cusz/src/kernel/lorenzo_proto.cu | 352 +-- .../cusz/src/kernel/lorenzo_serial.cc | 236 +- .../cusz/src/kernel/lorenzo_var.cu | 412 +-- .../cusz/src/kernel/preprocess.cuh | 130 +- qtensor/compression/cusz/src/kernel/rle.cuh | 148 +- .../compression/cusz/src/kernel/spv_gpu.cu | 120 +- .../compression/cusz/src/kernel/v2_lorenzo.cu | 236 +- .../cusz/src/pipeline/v2_compressor.cc | 222 +- .../cusz/src/pipeline/v2_compressor_impl.cu | 28 +- .../cusz/src/pipeline/v2_compressor_impl.inl | 478 ++-- qtensor/compression/cusz/src/stat/cmpg1_1.cu | 60 +- qtensor/compression/cusz/src/stat/cmpg1_2.cu | 58 +- qtensor/compression/cusz/src/stat/cmpg1_3.cu | 58 +- qtensor/compression/cusz/src/stat/cmpg1_4.cu | 58 +- qtensor/compression/cusz/src/stat/cmpg1_5.cu | 58 +- qtensor/compression/cusz/src/stat/cmpg2.cu | 68 +- qtensor/compression/cusz/src/stat/cmpg3.cu | 64 +- qtensor/compression/cusz/src/stat/cmpg4_1.cu | 48 +- qtensor/compression/cusz/src/stat/cmpg4_2.cu | 50 +- qtensor/compression/cusz/src/stat/cmpg4_3.cu | 46 +- qtensor/compression/cusz/src/stat/cmpg4_4.cu | 48 +- .../compression/cusz/src/stat/compare_cpu.cc | 86 +- qtensor/compression/cusz/src/stat/stat_g.cu | 190 +- .../compression/cusz/src/utils/dbg_print.cuh | 262 +- .../compression/cusz/src/utils/print_gpu.cu | 242 +- .../compression/cusz/src/utils/timer_cpu.cc | 60 +- .../compression/cusz/src/utils/timer_gpu.cu | 164 +- .../compression/cusz/src/utils/vis_stat.hh | 274 +- qtensor/compression/newsz/newsz.cu | 496 ++-- qtensor/compression/newsz/newsz.h | 6 +- qtensor/compression/newsz/newsz_wrapper.cu | 42 +- qtensor/compression/newsz/newsz_wrapper.py | 322 +-- qtensor/compression/szp/include/cuSZp.h | 22 +- qtensor/compression/szp/include/cuSZp_entry.h | 22 +- qtensor/compression/szp/include/cuSZp_timer.h | 60 +- .../compression/szp/include/cuSZp_utility.h | 26 +- qtensor/compression/szp/src/cuSZp.cu | 784 +++--- qtensor/compression/szp/src/cuSZp_entry.cu | 294 +- qtensor/compression/szp/src/cuSZp_timer.cu | 62 +- qtensor/compression/szp/src/cuSZp_utility.cu | 984 +++---- qtensor/compression/szp/src/cuSZp_wrapper.cu | 74 +- qtensor/compression/szp/src/cuSZp_wrapper.py | 380 +-- .../compression/torch_quant/torch_quant.py | 348 +-- .../torch_quant/torch_quant_perchannel.py | 406 +-- 145 files changed, 20935 insertions(+), 20935 deletions(-) diff --git a/qtensor/compression/cusz/include/cli/analyzer.hh b/qtensor/compression/cusz/include/cli/analyzer.hh index 7ff4b37d..8c58a71c 100644 --- a/qtensor/compression/cusz/include/cli/analyzer.hh +++ b/qtensor/compression/cusz/include/cli/analyzer.hh @@ -1,278 +1,278 @@ -/** - * @file analyzer.hh - * @author Jiannan Tian - * @brief - * @version 0.2 - * @date 2021-03-26 - * - * (C) 2021 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef ANALYSIS_ANALYZER_HH -#define ANALYSIS_ANALYZER_HH - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include - -#include "../hf/hf_bookg.hh" -#include "../hf/hf_codecg.hh" -#include "../kernel/cpplaunch_cuda.hh" -#include "../utils/timer.hh" - -using std::cout; - -#if __cplusplus >= 201703L -#define CONSTEXPR constexpr -#else -#define CONSTEXPR -#endif - -enum class ExecutionPolicy { host, cuda_device }; -enum class AnalyzerMethod { thrust, cuda_native, stl }; - -class Analyzer { - typedef struct ExtremaResult { - double max_val, min_val, rng; - double seconds; - } extrema_result_t; - - typedef struct Compressibility { - size_t len; - struct { - double entropy; - unsigned int top1_freq; - double top1_prob; - double dropout_equiv_bitlen_2x() const { return 64 * (1 - top1_prob); } - double dropout_equiv_bitlen_1_5x() const { return 48 * (1 - top1_prob); } - } hist; - struct { - double r_lowerbound; - double avgb_lowerbound; - double r_upperbound; - double avgb_upperbound; - } huffman_theory; - struct { - double min_bitlen; - double avgb; - } huffman_stat; - } theory_t; - - theory_t theory; - - public: - Analyzer() = default; - ~Analyzer() = default; - - // TODO execution policy - template - static std::vector percentile100(T* in, size_t len) - { - std::vector res; - auto step = int(ceil(len / 100)); - - if CONSTEXPR (policy == ExecutionPolicy::cuda_device) { - // caveat: no residence check - thrust::sort(thrust::device, in, in + len); - T* htmp; - cudaMallocHost(&htmp, sizeof(T) * len); - cudaMemcpy(htmp, in, sizeof(T) * len, cudaMemcpyDeviceToHost); - for (auto i = 0; i < len; i += step) { // - res.push_back(htmp[i]); - } - res.push_back(htmp[len - 1]); - cudaFreeHost(htmp); - } - else { // fallback - std::sort(in, in + len); - for (auto i = 0; i < len; i += step) { // - res.push_back(in[i]); - } - res.push_back(in[len - 1]); - } - - return res; - } - - template - static extrema_result_t get_maxmin_rng(Data* d_data, size_t len) - { - if CONSTEXPR (policy == ExecutionPolicy::cuda_device and method == AnalyzerMethod::thrust) { - auto t0 = hires::now(); - // ------------------------------------------------------------ - thrust::device_ptr g_ptr = thrust::device_pointer_cast(d_data); - - auto max_el_loc = thrust::max_element(g_ptr, g_ptr + len); // excluding padded - auto min_el_loc = thrust::min_element(g_ptr, g_ptr + len); // excluding padded - - double max_val = *max_el_loc; - double min_val = *min_el_loc; - double rng = max_val - min_val; - // ------------------------------------------------------------ - auto t1 = hires::now(); - - return extrema_result_t{max_val, min_val, rng, static_cast(t1 - t0).count()}; - } - else { - throw std::runtime_error("Analyzer::get_maxmin_rng() Other policy and method not implemented."); - } - } - - template - static void get_histogram(UInt* data, size_t data_len, unsigned int* freq, size_t num_bins) - { - // TODO static check UInt - if CONSTEXPR (policy == ExecutionPolicy::cuda_device and method == AnalyzerMethod::cuda_native) { - float dummy; - launch_histogram(data, data_len, freq, num_bins, dummy); - } - else { - // TODO static check - throw std::runtime_error("Analyzer::get_histogram() using other policy or method not implemented."); - } - } - - Analyzer& estimate_compressibility_from_histogram(unsigned int* h_freq, size_t dict_size) - { - auto len = std::accumulate(h_freq, h_freq + dict_size, 0u); // excluding outlier - auto top1_freq = *std::max_element(h_freq, h_freq + dict_size); - double top1_prob = (1.0 * top1_freq) / (1.0 * len); - double entropy = 0.0; - for (auto i = 0; i < dict_size; i++) { - double p = h_freq[i] / (1.0 * len); - if (p != 0) entropy += -std::log2(p) * p; - } - double r_lowerbound = 1 - (-std::log2(top1_prob) * top1_prob - std::log2(1 - top1_prob) * (1 - top1_prob)); - double r_upperbound = top1_prob + 0.086; // [Gallager 78] - double avgb_lowerbound = entropy + r_lowerbound; - double avgb_upperbound = entropy + r_upperbound; - - // dropout - // auto equiv_bitlen_dropout_2x = 64 * (1 - top1_prob); - // auto equiv_bitlen_dropout_1_5x = 48 * (1 - top1_prob); - - // record - theory.len = len; - theory.hist.entropy = entropy; - theory.hist.top1_freq = top1_freq; - theory.hist.top1_prob = top1_prob; - theory.huffman_theory.r_lowerbound = r_lowerbound; - theory.huffman_theory.r_upperbound = r_upperbound; - theory.huffman_theory.avgb_lowerbound = avgb_lowerbound; - theory.huffman_theory.avgb_upperbound = avgb_upperbound; - - return *this; - }; - - template - Analyzer& - get_stat_from_huffman_book(const unsigned int* h_freq, const Huff* h_codebook, size_t len, size_t num_bins) - { - // real-bitlen, for reference only, not part of workflow - std::vector v_canon_cb(h_codebook, h_codebook + num_bins); - std::vector v_freq(h_freq, h_freq + num_bins); - - // TODO somewhere explicitly state that null codeword is of length 0xff - std::sort(v_canon_cb.begin(), v_canon_cb.end(), [](Huff& a, Huff& b) { - auto a_bits = reinterpret_cast*>(&a)->bits; - auto b_bits = reinterpret_cast*>(&b)->bits; - return a_bits < b_bits; - }); - std::sort(v_freq.begin(), v_freq.end(), std::greater()); - - double real_avgb = 0.0; - for (auto i = 0; i < num_bins; i++) { - if (v_freq[i] != 0) { - auto bits = reinterpret_cast*>(&v_canon_cb[i])->bits; - real_avgb += v_freq[i] * bits; - } - } - real_avgb /= len; - - theory.huffman_stat.avgb = real_avgb; - theory.huffman_stat.min_bitlen = - reinterpret_cast*>(&v_canon_cb.at(0))->bits; - - return *this; - } - - Analyzer& - print_compressibility(bool print_huffman_stat = false, bool print_dropout = false, double equiv_origin_bitlen = 32) - { - cout << "\n\e[31m"; // extra linebreak on start - - cout << "* Derived from histogram:" << '\n'; - cout << " - len (freq sum):\t" << theory.len << '\n'; - cout << " - entropy H(X):\t" << theory.hist.entropy << '\n'; - cout << " - most likely freq:\t" << theory.hist.top1_freq << '\n'; - cout << " - most likely prob (p1):\t" << theory.hist.top1_prob << '\n'; - cout << '\n'; - - if (theory.hist.top1_prob < 0.4) { - cout << "* The probability of the most likely symbol < 0.4, go recoding (Huffman)." << '\n'; - cout << "* Compressibility lower bound is for reference only." << '\n'; - cout << " - est. redundancy upper bound (arbitrary p1):\t" << theory.huffman_theory.r_upperbound << '\n'; - cout << " - est. avg.bitlen upper bound (arbitrary p1):\t" << theory.huffman_theory.avgb_upperbound - << '\n'; - cout << " - est. CR lower bound (arbitrary p1):\t" - << equiv_origin_bitlen / theory.huffman_theory.avgb_upperbound << '\n'; - cout << '\n'; - } - else { - cout << "* Compressibility upper bound is determined by the lower bound of average bitlength." << '\n'; - cout << " - est. redundancy lower bound (p1 > 0.4):\t" << theory.huffman_theory.r_lowerbound << '\n'; - cout << " - est. avg.bitlen lower bound (p1 > 0.4):\t" << theory.huffman_theory.avgb_lowerbound << '\n'; - cout << " - est. CR upper bound (arbitrary p1):\t" - << equiv_origin_bitlen / theory.huffman_theory.avgb_lowerbound << '\n'; - cout << '\n'; - - cout << "* Compressibility lower bound is for reference only." << '\n'; - cout << " - est. redundancy upper bound (arbitrary p1):\t" << theory.huffman_theory.r_upperbound << '\n'; - cout << " - est. avg.bitlen upper bound (arbitrary p1):\t" << theory.huffman_theory.avgb_upperbound - << '\n'; - cout << " - est. CR lower bound (arbitrary p1):\t" - << equiv_origin_bitlen / theory.huffman_theory.avgb_upperbound << '\n'; - cout << '\n'; - - if (print_dropout) { - auto dropout_equiv_bitlen_2x = theory.hist.dropout_equiv_bitlen_2x(); - auto dropout_equiv_bitlen_1_5x = theory.hist.dropout_equiv_bitlen_1_5x(); - // TODO determine path, print log - cout << "* Considering dropout:" << '\n'; - cout << " - dropout at 1.0x metadata overhead" << '\n'; - cout << " | equiv.bitlen:\t" << dropout_equiv_bitlen_2x << '\n'; - cout << " | reduction rate:\t" << (equiv_origin_bitlen / dropout_equiv_bitlen_2x) << '\n'; - cout << " | bitlen_dropout <= bitlen_enc?\t" - << (dropout_equiv_bitlen_2x <= theory.huffman_theory.avgb_lowerbound) << '\n'; - cout << " - dropout at 0.5x metadata overhead" << '\n'; - cout << " | equiv.bitlen:\t" << dropout_equiv_bitlen_1_5x << '\n'; - cout << " | reduction rate (fp32):\t" << (equiv_origin_bitlen / dropout_equiv_bitlen_1_5x) << '\n'; - cout << " | bitlen_dropout <= bitlen_enc?\t" - << (dropout_equiv_bitlen_1_5x <= theory.huffman_theory.avgb_lowerbound) << '\n'; - cout << '\n'; - } - } - - if (print_huffman_stat) { - cout << "* From Huffman codebook:" << '\n'; - cout << " - avg. bitlen:\t" << theory.huffman_stat.avgb << '\n'; - cout << " - shortest bitlen:\t" << theory.huffman_stat.min_bitlen << '\n'; - cout << '\n'; - } - cout << "\e[0m"; - - return *this; - } -}; - -#endif +/** + * @file analyzer.hh + * @author Jiannan Tian + * @brief + * @version 0.2 + * @date 2021-03-26 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef ANALYSIS_ANALYZER_HH +#define ANALYSIS_ANALYZER_HH + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include "../hf/hf_bookg.hh" +#include "../hf/hf_codecg.hh" +#include "../kernel/cpplaunch_cuda.hh" +#include "../utils/timer.hh" + +using std::cout; + +#if __cplusplus >= 201703L +#define CONSTEXPR constexpr +#else +#define CONSTEXPR +#endif + +enum class ExecutionPolicy { host, cuda_device }; +enum class AnalyzerMethod { thrust, cuda_native, stl }; + +class Analyzer { + typedef struct ExtremaResult { + double max_val, min_val, rng; + double seconds; + } extrema_result_t; + + typedef struct Compressibility { + size_t len; + struct { + double entropy; + unsigned int top1_freq; + double top1_prob; + double dropout_equiv_bitlen_2x() const { return 64 * (1 - top1_prob); } + double dropout_equiv_bitlen_1_5x() const { return 48 * (1 - top1_prob); } + } hist; + struct { + double r_lowerbound; + double avgb_lowerbound; + double r_upperbound; + double avgb_upperbound; + } huffman_theory; + struct { + double min_bitlen; + double avgb; + } huffman_stat; + } theory_t; + + theory_t theory; + + public: + Analyzer() = default; + ~Analyzer() = default; + + // TODO execution policy + template + static std::vector percentile100(T* in, size_t len) + { + std::vector res; + auto step = int(ceil(len / 100)); + + if CONSTEXPR (policy == ExecutionPolicy::cuda_device) { + // caveat: no residence check + thrust::sort(thrust::device, in, in + len); + T* htmp; + cudaMallocHost(&htmp, sizeof(T) * len); + cudaMemcpy(htmp, in, sizeof(T) * len, cudaMemcpyDeviceToHost); + for (auto i = 0; i < len; i += step) { // + res.push_back(htmp[i]); + } + res.push_back(htmp[len - 1]); + cudaFreeHost(htmp); + } + else { // fallback + std::sort(in, in + len); + for (auto i = 0; i < len; i += step) { // + res.push_back(in[i]); + } + res.push_back(in[len - 1]); + } + + return res; + } + + template + static extrema_result_t get_maxmin_rng(Data* d_data, size_t len) + { + if CONSTEXPR (policy == ExecutionPolicy::cuda_device and method == AnalyzerMethod::thrust) { + auto t0 = hires::now(); + // ------------------------------------------------------------ + thrust::device_ptr g_ptr = thrust::device_pointer_cast(d_data); + + auto max_el_loc = thrust::max_element(g_ptr, g_ptr + len); // excluding padded + auto min_el_loc = thrust::min_element(g_ptr, g_ptr + len); // excluding padded + + double max_val = *max_el_loc; + double min_val = *min_el_loc; + double rng = max_val - min_val; + // ------------------------------------------------------------ + auto t1 = hires::now(); + + return extrema_result_t{max_val, min_val, rng, static_cast(t1 - t0).count()}; + } + else { + throw std::runtime_error("Analyzer::get_maxmin_rng() Other policy and method not implemented."); + } + } + + template + static void get_histogram(UInt* data, size_t data_len, unsigned int* freq, size_t num_bins) + { + // TODO static check UInt + if CONSTEXPR (policy == ExecutionPolicy::cuda_device and method == AnalyzerMethod::cuda_native) { + float dummy; + launch_histogram(data, data_len, freq, num_bins, dummy); + } + else { + // TODO static check + throw std::runtime_error("Analyzer::get_histogram() using other policy or method not implemented."); + } + } + + Analyzer& estimate_compressibility_from_histogram(unsigned int* h_freq, size_t dict_size) + { + auto len = std::accumulate(h_freq, h_freq + dict_size, 0u); // excluding outlier + auto top1_freq = *std::max_element(h_freq, h_freq + dict_size); + double top1_prob = (1.0 * top1_freq) / (1.0 * len); + double entropy = 0.0; + for (auto i = 0; i < dict_size; i++) { + double p = h_freq[i] / (1.0 * len); + if (p != 0) entropy += -std::log2(p) * p; + } + double r_lowerbound = 1 - (-std::log2(top1_prob) * top1_prob - std::log2(1 - top1_prob) * (1 - top1_prob)); + double r_upperbound = top1_prob + 0.086; // [Gallager 78] + double avgb_lowerbound = entropy + r_lowerbound; + double avgb_upperbound = entropy + r_upperbound; + + // dropout + // auto equiv_bitlen_dropout_2x = 64 * (1 - top1_prob); + // auto equiv_bitlen_dropout_1_5x = 48 * (1 - top1_prob); + + // record + theory.len = len; + theory.hist.entropy = entropy; + theory.hist.top1_freq = top1_freq; + theory.hist.top1_prob = top1_prob; + theory.huffman_theory.r_lowerbound = r_lowerbound; + theory.huffman_theory.r_upperbound = r_upperbound; + theory.huffman_theory.avgb_lowerbound = avgb_lowerbound; + theory.huffman_theory.avgb_upperbound = avgb_upperbound; + + return *this; + }; + + template + Analyzer& + get_stat_from_huffman_book(const unsigned int* h_freq, const Huff* h_codebook, size_t len, size_t num_bins) + { + // real-bitlen, for reference only, not part of workflow + std::vector v_canon_cb(h_codebook, h_codebook + num_bins); + std::vector v_freq(h_freq, h_freq + num_bins); + + // TODO somewhere explicitly state that null codeword is of length 0xff + std::sort(v_canon_cb.begin(), v_canon_cb.end(), [](Huff& a, Huff& b) { + auto a_bits = reinterpret_cast*>(&a)->bits; + auto b_bits = reinterpret_cast*>(&b)->bits; + return a_bits < b_bits; + }); + std::sort(v_freq.begin(), v_freq.end(), std::greater()); + + double real_avgb = 0.0; + for (auto i = 0; i < num_bins; i++) { + if (v_freq[i] != 0) { + auto bits = reinterpret_cast*>(&v_canon_cb[i])->bits; + real_avgb += v_freq[i] * bits; + } + } + real_avgb /= len; + + theory.huffman_stat.avgb = real_avgb; + theory.huffman_stat.min_bitlen = + reinterpret_cast*>(&v_canon_cb.at(0))->bits; + + return *this; + } + + Analyzer& + print_compressibility(bool print_huffman_stat = false, bool print_dropout = false, double equiv_origin_bitlen = 32) + { + cout << "\n\e[31m"; // extra linebreak on start + + cout << "* Derived from histogram:" << '\n'; + cout << " - len (freq sum):\t" << theory.len << '\n'; + cout << " - entropy H(X):\t" << theory.hist.entropy << '\n'; + cout << " - most likely freq:\t" << theory.hist.top1_freq << '\n'; + cout << " - most likely prob (p1):\t" << theory.hist.top1_prob << '\n'; + cout << '\n'; + + if (theory.hist.top1_prob < 0.4) { + cout << "* The probability of the most likely symbol < 0.4, go recoding (Huffman)." << '\n'; + cout << "* Compressibility lower bound is for reference only." << '\n'; + cout << " - est. redundancy upper bound (arbitrary p1):\t" << theory.huffman_theory.r_upperbound << '\n'; + cout << " - est. avg.bitlen upper bound (arbitrary p1):\t" << theory.huffman_theory.avgb_upperbound + << '\n'; + cout << " - est. CR lower bound (arbitrary p1):\t" + << equiv_origin_bitlen / theory.huffman_theory.avgb_upperbound << '\n'; + cout << '\n'; + } + else { + cout << "* Compressibility upper bound is determined by the lower bound of average bitlength." << '\n'; + cout << " - est. redundancy lower bound (p1 > 0.4):\t" << theory.huffman_theory.r_lowerbound << '\n'; + cout << " - est. avg.bitlen lower bound (p1 > 0.4):\t" << theory.huffman_theory.avgb_lowerbound << '\n'; + cout << " - est. CR upper bound (arbitrary p1):\t" + << equiv_origin_bitlen / theory.huffman_theory.avgb_lowerbound << '\n'; + cout << '\n'; + + cout << "* Compressibility lower bound is for reference only." << '\n'; + cout << " - est. redundancy upper bound (arbitrary p1):\t" << theory.huffman_theory.r_upperbound << '\n'; + cout << " - est. avg.bitlen upper bound (arbitrary p1):\t" << theory.huffman_theory.avgb_upperbound + << '\n'; + cout << " - est. CR lower bound (arbitrary p1):\t" + << equiv_origin_bitlen / theory.huffman_theory.avgb_upperbound << '\n'; + cout << '\n'; + + if (print_dropout) { + auto dropout_equiv_bitlen_2x = theory.hist.dropout_equiv_bitlen_2x(); + auto dropout_equiv_bitlen_1_5x = theory.hist.dropout_equiv_bitlen_1_5x(); + // TODO determine path, print log + cout << "* Considering dropout:" << '\n'; + cout << " - dropout at 1.0x metadata overhead" << '\n'; + cout << " | equiv.bitlen:\t" << dropout_equiv_bitlen_2x << '\n'; + cout << " | reduction rate:\t" << (equiv_origin_bitlen / dropout_equiv_bitlen_2x) << '\n'; + cout << " | bitlen_dropout <= bitlen_enc?\t" + << (dropout_equiv_bitlen_2x <= theory.huffman_theory.avgb_lowerbound) << '\n'; + cout << " - dropout at 0.5x metadata overhead" << '\n'; + cout << " | equiv.bitlen:\t" << dropout_equiv_bitlen_1_5x << '\n'; + cout << " | reduction rate (fp32):\t" << (equiv_origin_bitlen / dropout_equiv_bitlen_1_5x) << '\n'; + cout << " | bitlen_dropout <= bitlen_enc?\t" + << (dropout_equiv_bitlen_1_5x <= theory.huffman_theory.avgb_lowerbound) << '\n'; + cout << '\n'; + } + } + + if (print_huffman_stat) { + cout << "* From Huffman codebook:" << '\n'; + cout << " - avg. bitlen:\t" << theory.huffman_stat.avgb << '\n'; + cout << " - shortest bitlen:\t" << theory.huffman_stat.min_bitlen << '\n'; + cout << '\n'; + } + cout << "\e[0m"; + + return *this; + } +}; + +#endif diff --git a/qtensor/compression/cusz/include/cli/document.hh b/qtensor/compression/cusz/include/cli/document.hh index 240de036..ed68bdf5 100644 --- a/qtensor/compression/cusz/include/cli/document.hh +++ b/qtensor/compression/cusz/include/cli/document.hh @@ -1,272 +1,272 @@ -/** - * @file document.hh - * @author Jiannan Tian - * @brief - * @version 0.1.1 - * @date 2020-09-22 - * - * @copyright (C) 2020 by Washington State University, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#ifndef ARGUMENT_PARSER_DOCUMENT_HH -#define ARGUMENT_PARSER_DOCUMENT_HH - -#include -#include - - -const std::string fmt_b("\e[1m"); -const std::string fmt_0("\e[0m"); - -const std::regex bful("@(.*?)@"); -const std::string bful_text("\e[1m\e[4m$1\e[0m"); -const std::regex bf("\\*(.*?)\\*"); -const std::string bf_text("\e[1m$1\e[0m"); -const std::regex ul(R"(_((\w|-|\d|\.)+?)_)"); -const std::string ul_text("\e[4m$1\e[0m"); -const std::regex red(R"(\^\^(.*?)\^\^)"); -const std::string red_text("\e[31m$1\e[0m"); - -std::string // -Format(const std::string& s) -{ - auto a = std::regex_replace(s, bful, bful_text); - auto b = std::regex_replace(a, bf, bf_text); - auto c = std::regex_replace(b, ul, ul_text); - auto d = std::regex_replace(c, red, red_text); - return d; -} - -static const char cusz_short_doc[] = - // "cusz, version [placeholder]\n" - "\n" - "usage: cusz [-zxrh] [-i file] [-t dtype] [-m mode] [-e eb] [-l x,y,z] " - "...\n" - "\n" - " z : zip/compress\n" - " x : unzip/decompress\n" - " r : dryrun\n" - " h : print full-length help document\n" - "\n" - " i file : path to input datum\n" - " t dtype : f32 or fp4 (to be updated)\n" - " m mode : compression mode; abs, r2r\n" - " e eb : error bound; default 1e-4\n" - " l size : \"-l x\" for 1D; \"-l [X]x[Y]\" for 2D; \"-l [X]x[Y]x[Z]\" for 3D\n" - // " p pred : select predictor from \"lorenzo\" and \"spline3d\"\n" - "\n" - " config list:\n" - " syntax: opt=v, \"kw1=val1,kw1=val2[,...]\"\n" - " + eb error bound\n" - " + radius The number of quant-codes is 2x radius.\n" - " + demo load predefined lengths for demo datasets\n" - " - skipping \"-l x[,y[,z]]\"\n" - " - (1D) hacc hacc1b (2D) cesm exafel\n" - " - (3D) hurricane nyx-s nyx-m qmc qmcpre rtm parihaka\n" - " + anchor (on|off)\n" - // " + pipeline auto, binary, radius\n" - " example: \"--config demo=cesm,radius=512\"\n" - " report list: \n" - " syntax: opt[=v], \"kw1[=(on|off)],kw2[=(on|off)]\n" - " keyworkds: time, quality\n" - " example: \"--report time\", \"--report time=off\"\n" - "\n" - "example:\n" - " CESM=./data/cesm-CLDHGH-3600x1800\n" - " cusz -t f32 -m r2r -e 1e-4 -i ${CESM} -l 3600x1800 -z --report time\n" - " cusz -i ${CESM}.cusza -x --report time --compare ${CESM}\n" - "\n" - "\"cusz -h\" for details.\n"; - -static const char cusz_full_doc[] = - "*NAME*\n" - " cuSZ: CUDA-Based Error-Bounded Lossy Compressor for Scientific Data\n" - " Lowercased \"*cusz*\" is the command." - "\n" - "*SYNOPSIS*\n" - " The basic use is listed below,\n" - " *cusz* *-t* f32 *-m* r2r *-e* 1.0e-4.0 *-i* ./data/cesm-CLDHGH-3600x1800 *-l* 3600,1800 *-z* *--report* " - "time\n" - // cusz -t f32 -m r2r -e 1.0e-4.0 -i ./data/cesm-CLDHGH-3600x1800 -l 3600x1800 -z --report time\n - " ^^------ ------ ----------- ------------------------------- ------------ | ^^\n" - " ^^ dtype mode error bound input file low-to-high zip ^^\n" - "\n" - " *cusz* *-i* ./data/cesm-CLDHGH-3600x1800.cusza *-x* *--compare* ./data/cesm-CLDHGH-3600x1800 *--report* " - "time\n" - // cusz -i ./data/cesm-CLDHGH-3600x1800.cusza -x --compare ./data/cesm-CLDHGH-3600x1800 --report - // time\n" - " ^^------------------------------------- | ^^\n" - " ^^ compressed file unzip ^^\n" - "\n" - " *cusz* *-t* f32|64 *-m* [eb mode] *-e* [eb] *-i* [datum file] *-l* [x[,y[,z]]] *-z*\n" - " *cusz* *-i* [basename].cusza *-x*\n" - "\n" - "*OPTIONS*\n" - " *Mandatory* (zip and dryrun)\n" - " *-z* or *--compress* or *--*@z@*ip*\n" - " *-r* or *--dry-*@r@*un*\n" - " No lossless Huffman codec. Only to get data quality summary.\n" - " In addition, quant. rep. and dict. size are retained\n" - "\n" - " *-m* or *--*@m@*ode* \n" - " Specify error-controlling mode. Supported modes include:\n" - " _abs_: absolute mode, eb = input eb\n" - " _r2r_: relative-to-value-range mode, eb = input eb x value range\n" - "\n" - " *-e* or *--eb* or *--error-bound* [num]\n" - " Specify error bound. e.g., _1.23_, _1e-4_, _1.23e-4.56_\n" - "\n" - " *-i* or *--*@i@*nput* [file]\n" - "\n" - " *-d* or *--dict-size* [256|512|1024|...]\n" - " Specify dictionary size/quantization bin number.\n" - " Should be a power-of-2.\n" - "\n" - " *-l* [x[,y[,z]]] Specify (1|2|3)D data size, with dimensions from low to high.\n" - "\n" - " *Mandatory* (unzip)\n" - " *-x* or *--e*@x@*tract* or *--decompress* or *--unzip*\n" - "\n" - " *-i* or *--*@i@*nput* [corresponding datum basename (w/o extension)]\n" - "\n" - " *Additional*\n" - " *-p* or *--*@p@*redictor*\n" - " Select predictor from \"lorenzo\" (default) or \"spline3d\" (3D only).\n" - " *--origin* or *--compare* /path/to/origin-datum\n" - " For verification & get data quality evaluation.\n" - " *--opath* /path/to\n" - " Specify alternative output path.\n" - "\n" - " *Modules*\n" - " *--skip* _module-1_,_module-2_,...,_module-n_,\n" - " Disable functionality modules. Supported module(s) include:\n" - " _huffman_ Huffman codec after prediction+quantization (p+q) and before reversed p+q.\n" - " _write2disk_ Skip write decompression data.\n" - // "\n" - // " *-p* or *--pre* _method-1_,_method-2_,...,_method-n_\n" - // " Enable preprocessing. Supported preprocessing method(s) include:\n" - // " _binning_ Downsampling datum by 2x2 to 1.\n" - "\n" - " *Print Report to stdout*\n" - " *--report* (option=on/off)-list\n" - " Syntax: opt[=v], \"kw1[=(on|off)],kw2=[=(on|off)]\n" - " Keyworkds: time quality compressibility\n" - " Example: \"--report time\", \"--report time=off\"\n" - "\n" - " *Demonstration*\n" - " *-h* or *--help*\n" - " Get help documentation.\n" - "\n" - // " *-V* or *--verbose*\n" - // " Print host and device information for diagnostics.\n" - // "\n" - // " *-M* or *--meta*\n" - // " Get archive metadata. (TODO)\n" - "\n" - " *Advanced Runtime Configuration*\n" - " *--demo* [demo-dataset]\n" - " Use demo dataset, will omit given dimension(s). Supported datasets include:\n" - " 1D: _hacc_ _hacc1b_ 2D: _cesm_ _exafel_\n" - " 3D: _hurricane_ _nyx-s_ _nyx-m_ _qmc_ _qmcpre_ _rtm_ _parihaka_\n" - "\n" - " *-c* or *--config* (option=value)-list\n" - " Syntax: opt=v, \"kw1=val1,kw1=val2[,...]\"\n" - " + *eb*= error bound\n" - " + *cap*= capacity, number of quant-codes\n" - " + *demo*= skip length input (\"-l x[,y[,z]]\"), alternative to \"--demo dataset\"\n" - "\n" - " Other internal parameters:\n" - " + *quantbyte*=<1|2>\n" - " Specify quantization code representation.\n" - " Options _1_, _2_ are for *1-* and *2-*byte, respectively. (default: 2)\n" - " ^^Manually specifying this may not result in optimal memory footprint.^^\n" - " + *huffbyte*=<4|8>\n" - " Specify Huffman codeword representation.\n" - " Options _4_, _8_ are for *4-* and *8-*byte, respectively. (default: 4)\n" - " ^^Manually specifying this may not result in optimal memory footprint.^^\n" - " + *huffchunk*=[256|512|1024|...]\n" - " Manually specify chunk size for Huffman codec, overriding autotuning.\n" - " Should be a power-of-2 that is sufficiently large.\n" - " ^^This affects Huffman decoding performance significantly.^^\n" - "\n" - "*EXAMPLES*\n" - " *Demo Datasets*\n" - " Set a *shell variable*:\n" - " export PATH=$(pwd)/bin:$PATH\n" - " CESM=./data/cesm-CLDHGH-3600x1800\n" - " HURR=./data/hurr-CLOUDf48-500x500x100\n" - "\n" - " *CESM* example:\n" - " cusz -t f32 -m r2r -e 1e-4 -i ${CESM} -l 3600x1800 -z --report time\n" - " cusz -t f32 -m r2r -e 1e-4 -i ${CESM} -l 3600x1800 -r\n" - " cusz -i ${CESM}.cusza -x --report time --compare ${CESM} --skip write2disk\n" - "\n" - " *CESM* example with specified output path:\n" - " mkdir data2 data3\n" - " ^^# zip, output to `data2`^^\n" - " cusz -t f32 -m r2r -e 1e-4 -i ${CESM} -l 3600x1800 -z --opath data2\n" - " ^^# unzip, in situ^^\n" - " cusz -i ${CESM}.cusza -x && ls data2\n" - " ^^# unzip, output to `data3`^^\n" - " cusz -i ${CESM}.cusza -x --opath data3 && ls data3\n" - " ^^# unzip, output to `data3`, compare to the original datum^^\n" - " cusz -i ${CESM}.cusza -x --opath data3 --compare ${CESM} && ls data3\n" - "\n" - " *Hurricane Isabel* example:\n" - " cusz -t f32 -m r2r -e 1e-4 -i ${HURR} -l 500x500x100 -z\n" - " cusz -t f32 -m r2r -e 1e-4 -i ${HURR} -l 500x500x100 -r\n" - " cusz -i ${HURR}.cusza -x\n" - "\n"; - -// TODO -// " *EXAFEL* example:\n" -// " cusz -t f32 -m r2r -e 1e-4 -i ./data/exafel-59200x388 --demo exafeldemo -z -x --pre binning\n" -// " cusz -t f32 -m r2r -e 1e-4 -i ./data/exafel-59200x388 --demo exafeldemo -z -x --pre binning " -// "--skip huffman\n" -// " cusz -i ./data/exafel-59200x388.BN.cusza -x\n"; - -static const char huff_re_short_doc[] = - "\n" - "OVERVIEW: Huffman submodule as standalone program\n" // TODO from this line on - "\n" - "USAGE:\n" - " The basic use with demo datum is listed below,\n" - " ./huff --encode --decode --verify --input ./baryon_density.dat.b16 \\\n" - " -3 512 512 512 --input-rep 16 --huffman-rep 32 --huffman-chunk 2048 --dict-size 1024\n" - " or shorter\n" - " ./huff -e -d -V -i ./baryon_density.dat.b16 -3 512 512 512 -R 16 -H 32 -C 2048 -c 1024\n" - " ^ ^ ^ --------------------------- -------------- ----- ----- ------- -------\n" - " | | | input datum file dimension input Huff. Huff. codebook\n" - " enc dec verify rep. rep. chunk size\n" - "\n" - "EXAMPLES\n" - " Essential:\n" - " ./bin/huff -e -d -i ./baryon_density.dat.b16 -3 512 512 512 -R 16 -c 1024\n" - " have to input dimension, and higher dimension for a multiplication of each dim.,\n" - " as default values input-rep=16 (bits), huff-rep=32 (bits), codebook-size=1024 (symbols)\n" - "\n"; - -static const char doc_dim_order[] = - "\n" - " Input dimension follows low-to-high (e.g., x-y-z) order.\n" - " Taking 2D CESM-ATM as an example, \n" - "\n" - " |<------------------------- x 3600 --------------------------->| \n" - " +--------------------------------------------------------------+ - \n" - " | | ^ \n" - " | | | \n" - " | CESM-ATM: 1800x3600 (y-x order) | | \n" - " | datum name: _1800_3600 | y \n" - " | | 1800 \n" - " | input: -l 3600,1800 | | \n" - " | input order: -l [x,y] | | \n" - " | | | \n" - " | | v \n" - " +--------------------------------------------------------------+ - \n" - "\n" - " Taking 3D Hurricane as another example, whose dimensions are\n" - " 100x500x500, the input is \"-l 500,500,100\".\n"; - -#endif +/** + * @file document.hh + * @author Jiannan Tian + * @brief + * @version 0.1.1 + * @date 2020-09-22 + * + * @copyright (C) 2020 by Washington State University, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef ARGUMENT_PARSER_DOCUMENT_HH +#define ARGUMENT_PARSER_DOCUMENT_HH + +#include +#include + + +const std::string fmt_b("\e[1m"); +const std::string fmt_0("\e[0m"); + +const std::regex bful("@(.*?)@"); +const std::string bful_text("\e[1m\e[4m$1\e[0m"); +const std::regex bf("\\*(.*?)\\*"); +const std::string bf_text("\e[1m$1\e[0m"); +const std::regex ul(R"(_((\w|-|\d|\.)+?)_)"); +const std::string ul_text("\e[4m$1\e[0m"); +const std::regex red(R"(\^\^(.*?)\^\^)"); +const std::string red_text("\e[31m$1\e[0m"); + +std::string // +Format(const std::string& s) +{ + auto a = std::regex_replace(s, bful, bful_text); + auto b = std::regex_replace(a, bf, bf_text); + auto c = std::regex_replace(b, ul, ul_text); + auto d = std::regex_replace(c, red, red_text); + return d; +} + +static const char cusz_short_doc[] = + // "cusz, version [placeholder]\n" + "\n" + "usage: cusz [-zxrh] [-i file] [-t dtype] [-m mode] [-e eb] [-l x,y,z] " + "...\n" + "\n" + " z : zip/compress\n" + " x : unzip/decompress\n" + " r : dryrun\n" + " h : print full-length help document\n" + "\n" + " i file : path to input datum\n" + " t dtype : f32 or fp4 (to be updated)\n" + " m mode : compression mode; abs, r2r\n" + " e eb : error bound; default 1e-4\n" + " l size : \"-l x\" for 1D; \"-l [X]x[Y]\" for 2D; \"-l [X]x[Y]x[Z]\" for 3D\n" + // " p pred : select predictor from \"lorenzo\" and \"spline3d\"\n" + "\n" + " config list:\n" + " syntax: opt=v, \"kw1=val1,kw1=val2[,...]\"\n" + " + eb error bound\n" + " + radius The number of quant-codes is 2x radius.\n" + " + demo load predefined lengths for demo datasets\n" + " - skipping \"-l x[,y[,z]]\"\n" + " - (1D) hacc hacc1b (2D) cesm exafel\n" + " - (3D) hurricane nyx-s nyx-m qmc qmcpre rtm parihaka\n" + " + anchor (on|off)\n" + // " + pipeline auto, binary, radius\n" + " example: \"--config demo=cesm,radius=512\"\n" + " report list: \n" + " syntax: opt[=v], \"kw1[=(on|off)],kw2[=(on|off)]\n" + " keyworkds: time, quality\n" + " example: \"--report time\", \"--report time=off\"\n" + "\n" + "example:\n" + " CESM=./data/cesm-CLDHGH-3600x1800\n" + " cusz -t f32 -m r2r -e 1e-4 -i ${CESM} -l 3600x1800 -z --report time\n" + " cusz -i ${CESM}.cusza -x --report time --compare ${CESM}\n" + "\n" + "\"cusz -h\" for details.\n"; + +static const char cusz_full_doc[] = + "*NAME*\n" + " cuSZ: CUDA-Based Error-Bounded Lossy Compressor for Scientific Data\n" + " Lowercased \"*cusz*\" is the command." + "\n" + "*SYNOPSIS*\n" + " The basic use is listed below,\n" + " *cusz* *-t* f32 *-m* r2r *-e* 1.0e-4.0 *-i* ./data/cesm-CLDHGH-3600x1800 *-l* 3600,1800 *-z* *--report* " + "time\n" + // cusz -t f32 -m r2r -e 1.0e-4.0 -i ./data/cesm-CLDHGH-3600x1800 -l 3600x1800 -z --report time\n + " ^^------ ------ ----------- ------------------------------- ------------ | ^^\n" + " ^^ dtype mode error bound input file low-to-high zip ^^\n" + "\n" + " *cusz* *-i* ./data/cesm-CLDHGH-3600x1800.cusza *-x* *--compare* ./data/cesm-CLDHGH-3600x1800 *--report* " + "time\n" + // cusz -i ./data/cesm-CLDHGH-3600x1800.cusza -x --compare ./data/cesm-CLDHGH-3600x1800 --report + // time\n" + " ^^------------------------------------- | ^^\n" + " ^^ compressed file unzip ^^\n" + "\n" + " *cusz* *-t* f32|64 *-m* [eb mode] *-e* [eb] *-i* [datum file] *-l* [x[,y[,z]]] *-z*\n" + " *cusz* *-i* [basename].cusza *-x*\n" + "\n" + "*OPTIONS*\n" + " *Mandatory* (zip and dryrun)\n" + " *-z* or *--compress* or *--*@z@*ip*\n" + " *-r* or *--dry-*@r@*un*\n" + " No lossless Huffman codec. Only to get data quality summary.\n" + " In addition, quant. rep. and dict. size are retained\n" + "\n" + " *-m* or *--*@m@*ode* \n" + " Specify error-controlling mode. Supported modes include:\n" + " _abs_: absolute mode, eb = input eb\n" + " _r2r_: relative-to-value-range mode, eb = input eb x value range\n" + "\n" + " *-e* or *--eb* or *--error-bound* [num]\n" + " Specify error bound. e.g., _1.23_, _1e-4_, _1.23e-4.56_\n" + "\n" + " *-i* or *--*@i@*nput* [file]\n" + "\n" + " *-d* or *--dict-size* [256|512|1024|...]\n" + " Specify dictionary size/quantization bin number.\n" + " Should be a power-of-2.\n" + "\n" + " *-l* [x[,y[,z]]] Specify (1|2|3)D data size, with dimensions from low to high.\n" + "\n" + " *Mandatory* (unzip)\n" + " *-x* or *--e*@x@*tract* or *--decompress* or *--unzip*\n" + "\n" + " *-i* or *--*@i@*nput* [corresponding datum basename (w/o extension)]\n" + "\n" + " *Additional*\n" + " *-p* or *--*@p@*redictor*\n" + " Select predictor from \"lorenzo\" (default) or \"spline3d\" (3D only).\n" + " *--origin* or *--compare* /path/to/origin-datum\n" + " For verification & get data quality evaluation.\n" + " *--opath* /path/to\n" + " Specify alternative output path.\n" + "\n" + " *Modules*\n" + " *--skip* _module-1_,_module-2_,...,_module-n_,\n" + " Disable functionality modules. Supported module(s) include:\n" + " _huffman_ Huffman codec after prediction+quantization (p+q) and before reversed p+q.\n" + " _write2disk_ Skip write decompression data.\n" + // "\n" + // " *-p* or *--pre* _method-1_,_method-2_,...,_method-n_\n" + // " Enable preprocessing. Supported preprocessing method(s) include:\n" + // " _binning_ Downsampling datum by 2x2 to 1.\n" + "\n" + " *Print Report to stdout*\n" + " *--report* (option=on/off)-list\n" + " Syntax: opt[=v], \"kw1[=(on|off)],kw2=[=(on|off)]\n" + " Keyworkds: time quality compressibility\n" + " Example: \"--report time\", \"--report time=off\"\n" + "\n" + " *Demonstration*\n" + " *-h* or *--help*\n" + " Get help documentation.\n" + "\n" + // " *-V* or *--verbose*\n" + // " Print host and device information for diagnostics.\n" + // "\n" + // " *-M* or *--meta*\n" + // " Get archive metadata. (TODO)\n" + "\n" + " *Advanced Runtime Configuration*\n" + " *--demo* [demo-dataset]\n" + " Use demo dataset, will omit given dimension(s). Supported datasets include:\n" + " 1D: _hacc_ _hacc1b_ 2D: _cesm_ _exafel_\n" + " 3D: _hurricane_ _nyx-s_ _nyx-m_ _qmc_ _qmcpre_ _rtm_ _parihaka_\n" + "\n" + " *-c* or *--config* (option=value)-list\n" + " Syntax: opt=v, \"kw1=val1,kw1=val2[,...]\"\n" + " + *eb*= error bound\n" + " + *cap*= capacity, number of quant-codes\n" + " + *demo*= skip length input (\"-l x[,y[,z]]\"), alternative to \"--demo dataset\"\n" + "\n" + " Other internal parameters:\n" + " + *quantbyte*=<1|2>\n" + " Specify quantization code representation.\n" + " Options _1_, _2_ are for *1-* and *2-*byte, respectively. (default: 2)\n" + " ^^Manually specifying this may not result in optimal memory footprint.^^\n" + " + *huffbyte*=<4|8>\n" + " Specify Huffman codeword representation.\n" + " Options _4_, _8_ are for *4-* and *8-*byte, respectively. (default: 4)\n" + " ^^Manually specifying this may not result in optimal memory footprint.^^\n" + " + *huffchunk*=[256|512|1024|...]\n" + " Manually specify chunk size for Huffman codec, overriding autotuning.\n" + " Should be a power-of-2 that is sufficiently large.\n" + " ^^This affects Huffman decoding performance significantly.^^\n" + "\n" + "*EXAMPLES*\n" + " *Demo Datasets*\n" + " Set a *shell variable*:\n" + " export PATH=$(pwd)/bin:$PATH\n" + " CESM=./data/cesm-CLDHGH-3600x1800\n" + " HURR=./data/hurr-CLOUDf48-500x500x100\n" + "\n" + " *CESM* example:\n" + " cusz -t f32 -m r2r -e 1e-4 -i ${CESM} -l 3600x1800 -z --report time\n" + " cusz -t f32 -m r2r -e 1e-4 -i ${CESM} -l 3600x1800 -r\n" + " cusz -i ${CESM}.cusza -x --report time --compare ${CESM} --skip write2disk\n" + "\n" + " *CESM* example with specified output path:\n" + " mkdir data2 data3\n" + " ^^# zip, output to `data2`^^\n" + " cusz -t f32 -m r2r -e 1e-4 -i ${CESM} -l 3600x1800 -z --opath data2\n" + " ^^# unzip, in situ^^\n" + " cusz -i ${CESM}.cusza -x && ls data2\n" + " ^^# unzip, output to `data3`^^\n" + " cusz -i ${CESM}.cusza -x --opath data3 && ls data3\n" + " ^^# unzip, output to `data3`, compare to the original datum^^\n" + " cusz -i ${CESM}.cusza -x --opath data3 --compare ${CESM} && ls data3\n" + "\n" + " *Hurricane Isabel* example:\n" + " cusz -t f32 -m r2r -e 1e-4 -i ${HURR} -l 500x500x100 -z\n" + " cusz -t f32 -m r2r -e 1e-4 -i ${HURR} -l 500x500x100 -r\n" + " cusz -i ${HURR}.cusza -x\n" + "\n"; + +// TODO +// " *EXAFEL* example:\n" +// " cusz -t f32 -m r2r -e 1e-4 -i ./data/exafel-59200x388 --demo exafeldemo -z -x --pre binning\n" +// " cusz -t f32 -m r2r -e 1e-4 -i ./data/exafel-59200x388 --demo exafeldemo -z -x --pre binning " +// "--skip huffman\n" +// " cusz -i ./data/exafel-59200x388.BN.cusza -x\n"; + +static const char huff_re_short_doc[] = + "\n" + "OVERVIEW: Huffman submodule as standalone program\n" // TODO from this line on + "\n" + "USAGE:\n" + " The basic use with demo datum is listed below,\n" + " ./huff --encode --decode --verify --input ./baryon_density.dat.b16 \\\n" + " -3 512 512 512 --input-rep 16 --huffman-rep 32 --huffman-chunk 2048 --dict-size 1024\n" + " or shorter\n" + " ./huff -e -d -V -i ./baryon_density.dat.b16 -3 512 512 512 -R 16 -H 32 -C 2048 -c 1024\n" + " ^ ^ ^ --------------------------- -------------- ----- ----- ------- -------\n" + " | | | input datum file dimension input Huff. Huff. codebook\n" + " enc dec verify rep. rep. chunk size\n" + "\n" + "EXAMPLES\n" + " Essential:\n" + " ./bin/huff -e -d -i ./baryon_density.dat.b16 -3 512 512 512 -R 16 -c 1024\n" + " have to input dimension, and higher dimension for a multiplication of each dim.,\n" + " as default values input-rep=16 (bits), huff-rep=32 (bits), codebook-size=1024 (symbols)\n" + "\n"; + +static const char doc_dim_order[] = + "\n" + " Input dimension follows low-to-high (e.g., x-y-z) order.\n" + " Taking 2D CESM-ATM as an example, \n" + "\n" + " |<------------------------- x 3600 --------------------------->| \n" + " +--------------------------------------------------------------+ - \n" + " | | ^ \n" + " | | | \n" + " | CESM-ATM: 1800x3600 (y-x order) | | \n" + " | datum name: _1800_3600 | y \n" + " | | 1800 \n" + " | input: -l 3600,1800 | | \n" + " | input order: -l [x,y] | | \n" + " | | | \n" + " | | v \n" + " +--------------------------------------------------------------+ - \n" + "\n" + " Taking 3D Hurricane as another example, whose dimensions are\n" + " 100x500x500, the input is \"-l 500,500,100\".\n"; + +#endif diff --git a/qtensor/compression/cusz/include/cli/quality_viewer.hh b/qtensor/compression/cusz/include/cli/quality_viewer.hh index 0a5e9eed..eb8a27c2 100644 --- a/qtensor/compression/cusz/include/cli/quality_viewer.hh +++ b/qtensor/compression/cusz/include/cli/quality_viewer.hh @@ -1,163 +1,163 @@ -/** - * @file quality_viewer.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-04-09 - * @deprecated 0.3.2 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef QUALITY_VIEWER_HH -#define QUALITY_VIEWER_HH - -// 22-11-20 would fail in cxxapi.cu if deleted -#include - -#include "../common/capsule.hh" -#include "../common/definition.hh" -#include "../header.h" -#include "../stat/compare_gpu.hh" -#include "verify.hh" - -namespace cusz { - -const static auto HOST = cusz::LOC::HOST; -const static auto DEVICE = cusz::LOC::DEVICE; -const static auto HOST_DEVICE = cusz::LOC::HOST_DEVICE; - -struct QualityViewer { - template - static void print_metrics_cross(cusz_stats* s, size_t compressed_bytes = 0, bool gpu_checker = false) - { - auto checker = (not gpu_checker) ? string("(using CPU checker)") : string("(using GPU checker)"); - auto bytes = (s->len * sizeof(Data) * 1.0); - - auto println = [](const char* s, double n1, double n2, double n3, double n4) { - printf(" %-10s %16.8g %16.8g %16.8g %16.8g\n", s, n1, n2, n3, n4); - }; - auto printhead = [](const char* s1, const char* s2, const char* s3, const char* s4, const char* s5) { - printf(" \e[1m\e[31m%-10s %16s %16s %16s %16s\e[0m\n", s1, s2, s3, s4, s5); - }; - - auto is_fp = std::is_same::value or std::is_same::value ? const_cast("yes") - : const_cast("no"); - printf("\nquality metrics %s:\n", checker.c_str()); - - printhead("", "data-len", "data-byte", "fp-type?", ""); - printf(" %-10s %16zu %16lu %16s\n", "", s->len, sizeof(Data), is_fp); - - printhead("", "min", "max", "rng", "std"); - println("origin", s->odata.min, s->odata.max, s->odata.rng, s->odata.std); - println("eb-lossy", s->xdata.min, s->xdata.max, s->xdata.rng, s->xdata.std); - - printhead("", "abs-val", "abs-idx", "pw-rel", "VS-RNG"); - println("max-error", s->max_err.abs, s->max_err.idx, s->max_err.pwrrel, s->max_err.rel); - - printhead("", "CR", "NRMSE", "cross-cor", "PSNR"); - println("metrics", bytes / compressed_bytes, s->reduced.NRMSE, s->reduced.coeff, s->reduced.PSNR); - - // printf("\n"); - }; - - static void print_metrics_auto(double* lag1_cor, double* lag2_cor) - { - auto printhead = [](const char* s1, const char* s2, const char* s3, const char* s4, const char* s5) { - printf(" \e[1m\e[31m%-10s %16s %16s %16s %16s\e[0m\n", s1, s2, s3, s4, s5); - }; - - printhead("", "lag1-cor", "lag2-cor", "", ""); - printf(" %-10s %16lf %16lf\n", "auto", *lag1_cor, *lag2_cor); - printf("\n"); - }; - - template - static void echo_metric_gpu(T* reconstructed, T* origin, size_t len, size_t compressed_bytes = 0) - { - // cross - auto stat_x = new cusz_stats; - psz::thrustgpu_assess_quality(stat_x, reconstructed, origin, len); - print_metrics_cross(stat_x, compressed_bytes, true); - - auto stat_auto_lag1 = new cusz_stats; - psz::thrustgpu_assess_quality(stat_auto_lag1, origin, origin + 1, len - 1); - auto stat_auto_lag2 = new cusz_stats; - psz::thrustgpu_assess_quality(stat_auto_lag2, origin, origin + 2, len - 2); - - print_metrics_auto(&stat_auto_lag1->reduced.coeff, &stat_auto_lag2->reduced.coeff); - } - - template - static void echo_metric_cpu(T* _d1, T* _d2, size_t len, size_t compressed_bytes = 0, bool from_device = true) - { - auto stat = new cusz_stats; - T* reconstructed; - T* origin; - if (not from_device) { - reconstructed = _d1; - origin = _d2; - } - else { - printf("allocating tmp space for CPU verification\n"); - auto bytes = sizeof(T) * len; - cudaMallocHost(&reconstructed, bytes); - cudaMallocHost(&origin, bytes); - cudaMemcpy(reconstructed, _d1, bytes, cudaMemcpyDeviceToHost); - cudaMemcpy(origin, _d2, bytes, cudaMemcpyDeviceToHost); - } - cusz::verify_data(stat, reconstructed, origin, len); - print_metrics_cross(stat, compressed_bytes, false); - - auto stat_auto_lag1 = new cusz_stats; - verify_data(stat_auto_lag1, origin, origin + 1, len - 1); - auto stat_auto_lag2 = new cusz_stats; - verify_data(stat_auto_lag2, origin, origin + 2, len - 2); - - print_metrics_auto(&stat_auto_lag1->reduced.coeff, &stat_auto_lag2->reduced.coeff); - - if (from_device) { - if (reconstructed) cudaFreeHost(reconstructed); - if (origin) cudaFreeHost(origin); - } - } - - template - static void load_origin(string const& fname, Capsule& origin) - { - origin.mallochost().malloc().fromfile(fname); - } - - template - static void view(header_t header, Capsule& xdata, Capsule& cmp, string const& compare) - { - auto len = ConfigHelper::get_uncompressed_len(header); - auto compressd_bytes = ConfigHelper::get_filesize(header); - - auto compare_on_gpu = [&]() { - cmp.mallochost().malloc().fromfile(compare).host2device(); - echo_metric_gpu(xdata.dptr(), cmp.dptr(), len, compressd_bytes); - cmp.freehost().free(); - }; - - auto compare_on_cpu = [&]() { - cmp.mallochost().fromfile(compare); - xdata.device2host(); - echo_metric_cpu(xdata.hptr(), cmp.hptr(), len, compressd_bytes); - cmp.freehost(); - }; - - if (compare != "") { - auto gb = 1.0 * sizeof(T) * len / 1e9; - if (gb < 0.8) - compare_on_gpu(); - else - compare_on_cpu(); - } - } -}; - -} // namespace cusz - -#endif +/** + * @file quality_viewer.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-09 + * @deprecated 0.3.2 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef QUALITY_VIEWER_HH +#define QUALITY_VIEWER_HH + +// 22-11-20 would fail in cxxapi.cu if deleted +#include + +#include "../common/capsule.hh" +#include "../common/definition.hh" +#include "../header.h" +#include "../stat/compare_gpu.hh" +#include "verify.hh" + +namespace cusz { + +const static auto HOST = cusz::LOC::HOST; +const static auto DEVICE = cusz::LOC::DEVICE; +const static auto HOST_DEVICE = cusz::LOC::HOST_DEVICE; + +struct QualityViewer { + template + static void print_metrics_cross(cusz_stats* s, size_t compressed_bytes = 0, bool gpu_checker = false) + { + auto checker = (not gpu_checker) ? string("(using CPU checker)") : string("(using GPU checker)"); + auto bytes = (s->len * sizeof(Data) * 1.0); + + auto println = [](const char* s, double n1, double n2, double n3, double n4) { + printf(" %-10s %16.8g %16.8g %16.8g %16.8g\n", s, n1, n2, n3, n4); + }; + auto printhead = [](const char* s1, const char* s2, const char* s3, const char* s4, const char* s5) { + printf(" \e[1m\e[31m%-10s %16s %16s %16s %16s\e[0m\n", s1, s2, s3, s4, s5); + }; + + auto is_fp = std::is_same::value or std::is_same::value ? const_cast("yes") + : const_cast("no"); + printf("\nquality metrics %s:\n", checker.c_str()); + + printhead("", "data-len", "data-byte", "fp-type?", ""); + printf(" %-10s %16zu %16lu %16s\n", "", s->len, sizeof(Data), is_fp); + + printhead("", "min", "max", "rng", "std"); + println("origin", s->odata.min, s->odata.max, s->odata.rng, s->odata.std); + println("eb-lossy", s->xdata.min, s->xdata.max, s->xdata.rng, s->xdata.std); + + printhead("", "abs-val", "abs-idx", "pw-rel", "VS-RNG"); + println("max-error", s->max_err.abs, s->max_err.idx, s->max_err.pwrrel, s->max_err.rel); + + printhead("", "CR", "NRMSE", "cross-cor", "PSNR"); + println("metrics", bytes / compressed_bytes, s->reduced.NRMSE, s->reduced.coeff, s->reduced.PSNR); + + // printf("\n"); + }; + + static void print_metrics_auto(double* lag1_cor, double* lag2_cor) + { + auto printhead = [](const char* s1, const char* s2, const char* s3, const char* s4, const char* s5) { + printf(" \e[1m\e[31m%-10s %16s %16s %16s %16s\e[0m\n", s1, s2, s3, s4, s5); + }; + + printhead("", "lag1-cor", "lag2-cor", "", ""); + printf(" %-10s %16lf %16lf\n", "auto", *lag1_cor, *lag2_cor); + printf("\n"); + }; + + template + static void echo_metric_gpu(T* reconstructed, T* origin, size_t len, size_t compressed_bytes = 0) + { + // cross + auto stat_x = new cusz_stats; + psz::thrustgpu_assess_quality(stat_x, reconstructed, origin, len); + print_metrics_cross(stat_x, compressed_bytes, true); + + auto stat_auto_lag1 = new cusz_stats; + psz::thrustgpu_assess_quality(stat_auto_lag1, origin, origin + 1, len - 1); + auto stat_auto_lag2 = new cusz_stats; + psz::thrustgpu_assess_quality(stat_auto_lag2, origin, origin + 2, len - 2); + + print_metrics_auto(&stat_auto_lag1->reduced.coeff, &stat_auto_lag2->reduced.coeff); + } + + template + static void echo_metric_cpu(T* _d1, T* _d2, size_t len, size_t compressed_bytes = 0, bool from_device = true) + { + auto stat = new cusz_stats; + T* reconstructed; + T* origin; + if (not from_device) { + reconstructed = _d1; + origin = _d2; + } + else { + printf("allocating tmp space for CPU verification\n"); + auto bytes = sizeof(T) * len; + cudaMallocHost(&reconstructed, bytes); + cudaMallocHost(&origin, bytes); + cudaMemcpy(reconstructed, _d1, bytes, cudaMemcpyDeviceToHost); + cudaMemcpy(origin, _d2, bytes, cudaMemcpyDeviceToHost); + } + cusz::verify_data(stat, reconstructed, origin, len); + print_metrics_cross(stat, compressed_bytes, false); + + auto stat_auto_lag1 = new cusz_stats; + verify_data(stat_auto_lag1, origin, origin + 1, len - 1); + auto stat_auto_lag2 = new cusz_stats; + verify_data(stat_auto_lag2, origin, origin + 2, len - 2); + + print_metrics_auto(&stat_auto_lag1->reduced.coeff, &stat_auto_lag2->reduced.coeff); + + if (from_device) { + if (reconstructed) cudaFreeHost(reconstructed); + if (origin) cudaFreeHost(origin); + } + } + + template + static void load_origin(string const& fname, Capsule& origin) + { + origin.mallochost().malloc().fromfile(fname); + } + + template + static void view(header_t header, Capsule& xdata, Capsule& cmp, string const& compare) + { + auto len = ConfigHelper::get_uncompressed_len(header); + auto compressd_bytes = ConfigHelper::get_filesize(header); + + auto compare_on_gpu = [&]() { + cmp.mallochost().malloc().fromfile(compare).host2device(); + echo_metric_gpu(xdata.dptr(), cmp.dptr(), len, compressd_bytes); + cmp.freehost().free(); + }; + + auto compare_on_cpu = [&]() { + cmp.mallochost().fromfile(compare); + xdata.device2host(); + echo_metric_cpu(xdata.hptr(), cmp.hptr(), len, compressd_bytes); + cmp.freehost(); + }; + + if (compare != "") { + auto gb = 1.0 * sizeof(T) * len / 1e9; + if (gb < 0.8) + compare_on_gpu(); + else + compare_on_cpu(); + } + } +}; + +} // namespace cusz + +#endif diff --git a/qtensor/compression/cusz/include/cli/query.hh b/qtensor/compression/cusz/include/cli/query.hh index 91fcf65d..c09326c8 100644 --- a/qtensor/compression/cusz/include/cli/query.hh +++ b/qtensor/compression/cusz/include/cli/query.hh @@ -1,71 +1,71 @@ -/** - * @file query.hh - * @author Jiannan Tian - * @brief query machine information - * @version 0.1.3 - * @date 2020-10-05 - * - * @copyright (C) 2020 by Washington State University, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#ifndef QUERY_HH -#define QUERY_HH - -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "query_dev.hh" - -struct Diagnostics { - static std::string ExecShellCommand(const char* cmd) - { - std::array buffer; - std::string result; - std::unique_ptr pipe(popen(cmd, "r"), pclose); - if (!pipe) { throw std::runtime_error("popen() failed!"); } - while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) { result += buffer.data(); } - return result; - } - - static void GetMachineProperties() - { - std::vector v; - std::cout << "host information: " << std::endl; - - auto cpuinfo = ExecShellCommand( // - std::string("cat /proc/cpuinfo " - "| grep \"model name\" " - "| head -n 1 " - "| awk -F': ' '{print $NF}'") - .c_str()); - std::cout << " cpu model\t" << cpuinfo; - - auto meminfo = ExecShellCommand( // - std::string("cat /proc/meminfo" - "| grep \"MemTotal\" " - "| awk -F' ' '{print $2\" \"$3}'") - .c_str()); - - std::cout << " memory size\t" << meminfo; - - auto endianness = ExecShellCommand( // - std::string("lscpu " - "| grep Endian " - "| awk -F' ' '{print $NF}'") - .c_str()); - - std::cout << " byte order\t" << endianness; - printf("\n"); - } -}; - -#endif +/** + * @file query.hh + * @author Jiannan Tian + * @brief query machine information + * @version 0.1.3 + * @date 2020-10-05 + * + * @copyright (C) 2020 by Washington State University, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef QUERY_HH +#define QUERY_HH + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "query_dev.hh" + +struct Diagnostics { + static std::string ExecShellCommand(const char* cmd) + { + std::array buffer; + std::string result; + std::unique_ptr pipe(popen(cmd, "r"), pclose); + if (!pipe) { throw std::runtime_error("popen() failed!"); } + while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) { result += buffer.data(); } + return result; + } + + static void GetMachineProperties() + { + std::vector v; + std::cout << "host information: " << std::endl; + + auto cpuinfo = ExecShellCommand( // + std::string("cat /proc/cpuinfo " + "| grep \"model name\" " + "| head -n 1 " + "| awk -F': ' '{print $NF}'") + .c_str()); + std::cout << " cpu model\t" << cpuinfo; + + auto meminfo = ExecShellCommand( // + std::string("cat /proc/meminfo" + "| grep \"MemTotal\" " + "| awk -F' ' '{print $2\" \"$3}'") + .c_str()); + + std::cout << " memory size\t" << meminfo; + + auto endianness = ExecShellCommand( // + std::string("lscpu " + "| grep Endian " + "| awk -F' ' '{print $NF}'") + .c_str()); + + std::cout << " byte order\t" << endianness; + printf("\n"); + } +}; + +#endif diff --git a/qtensor/compression/cusz/include/cli/query_dev.hh b/qtensor/compression/cusz/include/cli/query_dev.hh index c2eb37aa..34a429ea 100644 --- a/qtensor/compression/cusz/include/cli/query_dev.hh +++ b/qtensor/compression/cusz/include/cli/query_dev.hh @@ -1,69 +1,69 @@ -/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of NVIDIA CORPORATION nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* This sample queries the properties of the CUDA devices present in the system - * via CUDA Runtime API. */ - -/** - * @brief Get the Device Property object - * modified from `cuda-samples/Samples/deviceQuery/deviceQuery.cpp` - */ - -struct GpuDiagnostics { - static void GetDeviceProperty() - { - int num_dev = 0; - cudaError_t error_id = cudaGetDeviceCount(&num_dev); - - if (error_id != cudaSuccess) { - printf("cudaGetDeviceCount returned %d\n-> %s\n", static_cast(error_id), cudaGetErrorString(error_id)); - exit(EXIT_FAILURE); - } - if (num_dev == 0) { printf("NO CUDA device detected.\n"); } - int dev, driver_ver = 0, runtime_ver = 0; - - for (dev = 0; dev < num_dev; ++dev) { - cudaSetDevice(dev); - cudaDeviceProp dev_prop; - cudaGetDeviceProperties(&dev_prop, dev); - printf("device #%d, %s: \n", dev, dev_prop.name); - - cudaDriverGetVersion(&driver_ver); - cudaRuntimeGetVersion(&runtime_ver); - printf( - " driver/runtime\t%d.%d/%d.%d\n", driver_ver / 1000, (driver_ver % 100) / 10, runtime_ver / 1000, - (runtime_ver % 100) / 10); - printf(" compute capability:\t%d.%d\n", dev_prop.major, dev_prop.minor); - printf(" global memory:\t%.0f MiB\n", static_cast(dev_prop.totalGlobalMem / 1048576.0f)); - printf(" constant memory:\t%zu bytes\n", dev_prop.totalConstMem); - printf(" shared mem per block:\t%zu bytes\n", dev_prop.sharedMemPerBlock); - printf(" shared mem per SM:\t%zu bytes\n", dev_prop.sharedMemPerMultiprocessor); - printf(" registers per block:\t%d\n", dev_prop.regsPerBlock); - } - printf("\n"); - } +/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* This sample queries the properties of the CUDA devices present in the system + * via CUDA Runtime API. */ + +/** + * @brief Get the Device Property object + * modified from `cuda-samples/Samples/deviceQuery/deviceQuery.cpp` + */ + +struct GpuDiagnostics { + static void GetDeviceProperty() + { + int num_dev = 0; + cudaError_t error_id = cudaGetDeviceCount(&num_dev); + + if (error_id != cudaSuccess) { + printf("cudaGetDeviceCount returned %d\n-> %s\n", static_cast(error_id), cudaGetErrorString(error_id)); + exit(EXIT_FAILURE); + } + if (num_dev == 0) { printf("NO CUDA device detected.\n"); } + int dev, driver_ver = 0, runtime_ver = 0; + + for (dev = 0; dev < num_dev; ++dev) { + cudaSetDevice(dev); + cudaDeviceProp dev_prop; + cudaGetDeviceProperties(&dev_prop, dev); + printf("device #%d, %s: \n", dev, dev_prop.name); + + cudaDriverGetVersion(&driver_ver); + cudaRuntimeGetVersion(&runtime_ver); + printf( + " driver/runtime\t%d.%d/%d.%d\n", driver_ver / 1000, (driver_ver % 100) / 10, runtime_ver / 1000, + (runtime_ver % 100) / 10); + printf(" compute capability:\t%d.%d\n", dev_prop.major, dev_prop.minor); + printf(" global memory:\t%.0f MiB\n", static_cast(dev_prop.totalGlobalMem / 1048576.0f)); + printf(" constant memory:\t%zu bytes\n", dev_prop.totalConstMem); + printf(" shared mem per block:\t%zu bytes\n", dev_prop.sharedMemPerBlock); + printf(" shared mem per SM:\t%zu bytes\n", dev_prop.sharedMemPerMultiprocessor); + printf(" registers per block:\t%d\n", dev_prop.regsPerBlock); + } + printf("\n"); + } }; \ No newline at end of file diff --git a/qtensor/compression/cusz/include/cli/timerecord_viewer.hh b/qtensor/compression/cusz/include/cli/timerecord_viewer.hh index 9e245073..52baac95 100644 --- a/qtensor/compression/cusz/include/cli/timerecord_viewer.hh +++ b/qtensor/compression/cusz/include/cli/timerecord_viewer.hh @@ -1,109 +1,109 @@ -/** - * @file timerecord_viewer.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-04-09 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef CLI_TIMERECORD_VIEWER_HH -#define CLI_TIMERECORD_VIEWER_HH - -#include -#include "../common/definition.hh" - -namespace cusz { - -struct TimeRecordViewer { - static float get_throughput(float milliseconds, size_t bytes) - { - auto GiB = 1.0 * 1024 * 1024 * 1024; - auto seconds = milliseconds * 1e-3; - return bytes / GiB / seconds; - } - - static void println_throughput(const char* s, float timer, size_t bytes) - { - if (timer == 0.0) return; - - auto t = get_throughput(timer, bytes); - printf(" %-12s %'12f %'10.2f\n", s, timer, t); - }; - - static void println_throughput_tablehead() - { - printf( - "\n \e[1m\e[31m%-12s %12s %10s\e[0m\n", // - const_cast("kernel"), // - const_cast("time, ms"), // - const_cast("GiB/s") // - ); - } - - static double get_total_time(timerecord_t r) - { - double total = 0.0; - std::for_each(r->begin(), r->end(), [&](TimeRecordTuple t) { return total += std::get<1>(t); }); - return total; - } - static void view_compression(timerecord_t r, size_t bytes, size_t compressed_bytes = 0) - { - auto report_cr = [&]() { - auto cr = 1.0 * bytes / compressed_bytes; - if (compressed_bytes != 0) printf(" %-*s %.2f\n", 20, "compression ratio", cr); - }; - - TimeRecord reflow; - - { // reflow - TimeRecordTuple book_tuple; - - auto total_time = get_total_time(r); - auto subtotal_time = total_time; - - for (auto& i : *r) { - auto item = std::string(std::get<0>(i)); - if (item == "book") { - book_tuple = i; - subtotal_time -= std::get<1>(i); - } - else { - reflow.push_back(i); - } - } - reflow.push_back({const_cast("(subtotal)"), subtotal_time}); - printf("\e[2m"); - reflow.push_back(book_tuple); - reflow.push_back({const_cast("(total)"), total_time}); - printf("\e[0m"); - } - - printf("\n(c) COMPRESSION REPORT\n"); - report_cr(); - - ReportHelper::println_throughput_tablehead(); - for (auto& i : reflow) ReportHelper::println_throughput(std::get<0>(i), std::get<1>(i), bytes); - - printf("\n"); - } - - static void view_decompression(timerecord_t r, size_t bytes) - { - printf("\n(d) deCOMPRESSION REPORT\n"); - - auto total_time = get_total_time(r); - (*r).push_back({const_cast("(total)"), total_time}); - - ReportHelper::println_throughput_tablehead(); - for (auto& i : *r) ReportHelper::println_throughput(std::get<0>(i), std::get<1>(i), bytes); - - printf("\n"); - } -}; - -} // namespace cusz - -#endif +/** + * @file timerecord_viewer.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-09 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CLI_TIMERECORD_VIEWER_HH +#define CLI_TIMERECORD_VIEWER_HH + +#include +#include "../common/definition.hh" + +namespace cusz { + +struct TimeRecordViewer { + static float get_throughput(float milliseconds, size_t bytes) + { + auto GiB = 1.0 * 1024 * 1024 * 1024; + auto seconds = milliseconds * 1e-3; + return bytes / GiB / seconds; + } + + static void println_throughput(const char* s, float timer, size_t bytes) + { + if (timer == 0.0) return; + + auto t = get_throughput(timer, bytes); + printf(" %-12s %'12f %'10.2f\n", s, timer, t); + }; + + static void println_throughput_tablehead() + { + printf( + "\n \e[1m\e[31m%-12s %12s %10s\e[0m\n", // + const_cast("kernel"), // + const_cast("time, ms"), // + const_cast("GiB/s") // + ); + } + + static double get_total_time(timerecord_t r) + { + double total = 0.0; + std::for_each(r->begin(), r->end(), [&](TimeRecordTuple t) { return total += std::get<1>(t); }); + return total; + } + static void view_compression(timerecord_t r, size_t bytes, size_t compressed_bytes = 0) + { + auto report_cr = [&]() { + auto cr = 1.0 * bytes / compressed_bytes; + if (compressed_bytes != 0) printf(" %-*s %.2f\n", 20, "compression ratio", cr); + }; + + TimeRecord reflow; + + { // reflow + TimeRecordTuple book_tuple; + + auto total_time = get_total_time(r); + auto subtotal_time = total_time; + + for (auto& i : *r) { + auto item = std::string(std::get<0>(i)); + if (item == "book") { + book_tuple = i; + subtotal_time -= std::get<1>(i); + } + else { + reflow.push_back(i); + } + } + reflow.push_back({const_cast("(subtotal)"), subtotal_time}); + printf("\e[2m"); + reflow.push_back(book_tuple); + reflow.push_back({const_cast("(total)"), total_time}); + printf("\e[0m"); + } + + printf("\n(c) COMPRESSION REPORT\n"); + report_cr(); + + ReportHelper::println_throughput_tablehead(); + for (auto& i : reflow) ReportHelper::println_throughput(std::get<0>(i), std::get<1>(i), bytes); + + printf("\n"); + } + + static void view_decompression(timerecord_t r, size_t bytes) + { + printf("\n(d) deCOMPRESSION REPORT\n"); + + auto total_time = get_total_time(r); + (*r).push_back({const_cast("(total)"), total_time}); + + ReportHelper::println_throughput_tablehead(); + for (auto& i : *r) ReportHelper::println_throughput(std::get<0>(i), std::get<1>(i), bytes); + + printf("\n"); + } +}; + +} // namespace cusz + +#endif diff --git a/qtensor/compression/cusz/include/cli/verify.hh b/qtensor/compression/cusz/include/cli/verify.hh index 1e856021..621a0077 100644 --- a/qtensor/compression/cusz/include/cli/verify.hh +++ b/qtensor/compression/cusz/include/cli/verify.hh @@ -1,87 +1,87 @@ -#ifndef ANALYSIS_VERIFY_HH -#define ANALYSIS_VERIFY_HH - -/** - * @file verify.hh - * @author Jiannan Tian - * @brief Verification of decompressed data. - * @version 0.2 - * @date 2020-09-20 - * Created on: 2019-09-30 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#include "../common.hh" -#include "../cusz/type.h" - -using namespace std; - -namespace cusz { - -template -void verify_data(cusz_stats* s, T* xdata, T* odata, size_t len) -{ - double max_odata = odata[0], min_odata = odata[0]; - double max_xdata = xdata[0], min_xdata = xdata[0]; - double max_abserr = max_abserr = fabs(xdata[0] - odata[0]); - - double sum_0 = 0, sum_x = 0; - for (size_t i = 0; i < len; i++) sum_0 += odata[i], sum_x += xdata[i]; - - double mean_odata = sum_0 / len, mean_xdata = sum_x / len; - double sum_var_odata = 0, sum_var_xdata = 0, sum_err2 = 0, sum_corr = 0, rel_abserr = 0; - - double max_pwrrel_abserr = 0; - size_t max_abserr_index = 0; - for (size_t i = 0; i < len; i++) { - max_odata = max_odata < odata[i] ? odata[i] : max_odata; - min_odata = min_odata > odata[i] ? odata[i] : min_odata; - - max_xdata = max_xdata < odata[i] ? odata[i] : max_xdata; - min_xdata = min_xdata > xdata[i] ? xdata[i] : min_xdata; - - float abserr = fabs(xdata[i] - odata[i]); - if (odata[i] != 0) { - rel_abserr = abserr / fabs(odata[i]); - max_pwrrel_abserr = max_pwrrel_abserr < rel_abserr ? rel_abserr : max_pwrrel_abserr; - } - max_abserr_index = max_abserr < abserr ? i : max_abserr_index; - max_abserr = max_abserr < abserr ? abserr : max_abserr; - sum_corr += (odata[i] - mean_odata) * (xdata[i] - mean_xdata); - sum_var_odata += (odata[i] - mean_odata) * (odata[i] - mean_odata); - sum_var_xdata += (xdata[i] - mean_xdata) * (xdata[i] - mean_xdata); - sum_err2 += abserr * abserr; - } - double std_odata = sqrt(sum_var_odata / len); - double std_xdata = sqrt(sum_var_xdata / len); - double ee = sum_corr / len; - - s->len = len; - - s->odata.max = max_odata; - s->odata.min = min_odata; - s->odata.rng = max_odata - min_odata; - s->odata.std = std_odata; - - s->xdata.max = max_xdata; - s->xdata.min = min_xdata; - s->xdata.rng = max_xdata - min_xdata; - s->xdata.std = std_xdata; - - s->max_err.idx = max_abserr_index; - s->max_err.abs = max_abserr; - s->max_err.rel = max_abserr / s->odata.rng; - s->max_err.pwrrel = max_pwrrel_abserr; - - s->reduced.coeff = ee / std_odata / std_xdata; - s->reduced.MSE = sum_err2 / len; - s->reduced.NRMSE = sqrt(s->reduced.MSE) / s->odata.rng; - s->reduced.PSNR = 20 * log10(s->odata.rng) - 10 * log10(s->reduced.MSE); -} - -} // namespace cusz - -#endif +#ifndef ANALYSIS_VERIFY_HH +#define ANALYSIS_VERIFY_HH + +/** + * @file verify.hh + * @author Jiannan Tian + * @brief Verification of decompressed data. + * @version 0.2 + * @date 2020-09-20 + * Created on: 2019-09-30 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include "../common.hh" +#include "../cusz/type.h" + +using namespace std; + +namespace cusz { + +template +void verify_data(cusz_stats* s, T* xdata, T* odata, size_t len) +{ + double max_odata = odata[0], min_odata = odata[0]; + double max_xdata = xdata[0], min_xdata = xdata[0]; + double max_abserr = max_abserr = fabs(xdata[0] - odata[0]); + + double sum_0 = 0, sum_x = 0; + for (size_t i = 0; i < len; i++) sum_0 += odata[i], sum_x += xdata[i]; + + double mean_odata = sum_0 / len, mean_xdata = sum_x / len; + double sum_var_odata = 0, sum_var_xdata = 0, sum_err2 = 0, sum_corr = 0, rel_abserr = 0; + + double max_pwrrel_abserr = 0; + size_t max_abserr_index = 0; + for (size_t i = 0; i < len; i++) { + max_odata = max_odata < odata[i] ? odata[i] : max_odata; + min_odata = min_odata > odata[i] ? odata[i] : min_odata; + + max_xdata = max_xdata < odata[i] ? odata[i] : max_xdata; + min_xdata = min_xdata > xdata[i] ? xdata[i] : min_xdata; + + float abserr = fabs(xdata[i] - odata[i]); + if (odata[i] != 0) { + rel_abserr = abserr / fabs(odata[i]); + max_pwrrel_abserr = max_pwrrel_abserr < rel_abserr ? rel_abserr : max_pwrrel_abserr; + } + max_abserr_index = max_abserr < abserr ? i : max_abserr_index; + max_abserr = max_abserr < abserr ? abserr : max_abserr; + sum_corr += (odata[i] - mean_odata) * (xdata[i] - mean_xdata); + sum_var_odata += (odata[i] - mean_odata) * (odata[i] - mean_odata); + sum_var_xdata += (xdata[i] - mean_xdata) * (xdata[i] - mean_xdata); + sum_err2 += abserr * abserr; + } + double std_odata = sqrt(sum_var_odata / len); + double std_xdata = sqrt(sum_var_xdata / len); + double ee = sum_corr / len; + + s->len = len; + + s->odata.max = max_odata; + s->odata.min = min_odata; + s->odata.rng = max_odata - min_odata; + s->odata.std = std_odata; + + s->xdata.max = max_xdata; + s->xdata.min = min_xdata; + s->xdata.rng = max_xdata - min_xdata; + s->xdata.std = std_xdata; + + s->max_err.idx = max_abserr_index; + s->max_err.abs = max_abserr; + s->max_err.rel = max_abserr / s->odata.rng; + s->max_err.pwrrel = max_pwrrel_abserr; + + s->reduced.coeff = ee / std_odata / std_xdata; + s->reduced.MSE = sum_err2 / len; + s->reduced.NRMSE = sqrt(s->reduced.MSE) / s->odata.rng; + s->reduced.PSNR = 20 * log10(s->odata.rng) - 10 * log10(s->reduced.MSE); +} + +} // namespace cusz + +#endif diff --git a/qtensor/compression/cusz/include/common.hh b/qtensor/compression/cusz/include/common.hh index 5d2bf33e..b2741954 100644 --- a/qtensor/compression/cusz/include/common.hh +++ b/qtensor/compression/cusz/include/common.hh @@ -1,19 +1,19 @@ -/** - * @file common.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2021-09-26 - * - * (C) 2021 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef CUSZ_COMMON_HH -#define CUSZ_COMMON_HH - -#include "common/configs.hh" -#include "common/definition.hh" -#include "common/type_traits.hh" - +/** + * @file common.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-09-26 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_COMMON_HH +#define CUSZ_COMMON_HH + +#include "common/configs.hh" +#include "common/definition.hh" +#include "common/type_traits.hh" + #endif \ No newline at end of file diff --git a/qtensor/compression/cusz/include/common/capsule.hh b/qtensor/compression/cusz/include/common/capsule.hh index 05d8ebf6..be1f1f1b 100644 --- a/qtensor/compression/cusz/include/common/capsule.hh +++ b/qtensor/compression/cusz/include/common/capsule.hh @@ -1,402 +1,402 @@ -/** - * @file capsule.hh - * @author Jiannan Tian - * @brief Simple data analysis (header) - * @version 0.2.3 - * @date 2020-11-03 - * (create) 2020-11-03 (rev1) 2021-03-24 (rev2) 2021-09-08 - * @deprecated 0.3.2 - * - * @copyright (C) 2020 by Washington State University, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#ifndef CAPSULE_HH -#define CAPSULE_HH - -#if __cplusplus >= 201703L -#define CONSTEXPR constexpr -#else -#define CONSTEXPR -#endif - -#include -#include - -#include -#include -#include -#include -#include - -#include "../stat/compare_gpu.hh" -// #include "../utils/io.hh" -#include "../utils/timer.hh" -#include "definition.hh" - -template -class Capsule { - private: - // variables - struct { - bool hptr{false}, dptr{false}, uniptr{false}; - } alloc_status; - - T *_dptr{nullptr}, *_hptr{nullptr}, *_uniptr{nullptr}; - - uint32_t _len{0}; - dim3 _len3{1, 1, 1}, _stride3{1, 1, 1}; - - std::string name; - - // logging setup; standalone - const std::string LOG_NULL = " "; - const std::string LOG_INFO = " :: "; - const std::string LOG_ERR = " ERR "; - const std::string LOG_WARN = "WARN "; - const std::string LOG_DBG = " dbg "; - const std::string LOG_EXCEPTION = " !! "; - - // https://stackoverflow.com/a/26080768/8740097 CC BY-SA 3.0 - template - void build_string(std::ostream& o, S t) - { - o << t << " "; - } - - template - void build_string(std::ostream& o, S t, Args... args) // recursive variadic function - { - build_string(o, t); - build_string(o, args...); - } - - template - void LOGGING(const std::string& log_head, Args... args) - { - std::ostringstream oss; - oss << log_head; - build_string(oss, args...); - - oss.seekp(0, std::ios::end); - std::stringstream::pos_type offset = oss.tellp(); - if (log_head == LOG_DBG) { std::cout << "\e[2m"; } // dbg - std::cout << oss.str() << std::endl; // print content - if (log_head == LOG_DBG) std::cout << "\e[0m"; // finish printing dbg - } - - // IO - int fs2mem(const char* fname, void* array, size_t num_els) - { - auto bytes = sizeof(T) * num_els; - - std::ifstream ifs(fname, std::ios::binary | std::ios::in); - if (not ifs.is_open()) { - std::cerr << "fail to open " << fname << std::endl; - return -1; - } - ifs.read(reinterpret_cast(array), std::streamsize(bytes)); - ifs.close(); - - return 0; - } - - int mem2fs(const char* fname, void* array, size_t num_els) - { - auto bytes = sizeof(type) * num_els; - - std::ofstream ofs(fname, std::ios::binary | std::ios::out); - if (not ofs.is_open()) { - std::cerr << "fail to open " << fname << std::endl; - return -1; - } - - ofs.write(reinterpret_cast(array), std::streamsize(bytes)); - ofs.close(); - - return 0; - } - - std::string ERRSTR_BUILDER(std::string func, std::string msg) - { - return "[Capsule(\"" + name + "\")::" + func + "] " + msg; - } - - void check_len(std::string funcname) - { - if (_len == 0) throw std::runtime_error("[Capsule(\"" + name + "\")::" + funcname + "] " + "len == 0"); - } - - std::string ERROR_UNDEFINED_BEHAVIOR(std::string func, std::string msg = "undefined behavior") - { // - return ERRSTR_BUILDER(func, "undefined behavior"); - } - - public: - using type = T; - - // TODO rule of n - // constructor - Capsule() = default; - Capsule(const std::string _str) : name(_str){}; - Capsule(uint32_t len, const std::string _str = std::string("")) : _len(len), name(_str) {} - Capsule(uint32_t x, uint32_t y, uint32_t z, const std::string _str = std::string("")) : name(_str) - { - _len3 = dim3(x, y, z); - _len = x * y * z; - } - - ~Capsule() - { - // Becasue _hptr can be obtained externally, and could be non-pinned, cudaFreeHost may not work properly. - // if (alloc_status.hptr) cudaFreeHost(_hptr); - - if (alloc_status.dptr) cudaFree(_dptr); - if (alloc_status.uniptr) cudaFree(_uniptr); - } - - // getter start -------------------- - T*& dptr() { return _dptr; } - T*& hptr() { return _hptr; } - T*& uniptr() { return _uniptr; } - - uint32_t len() const { return _len; } - dim3 len3() const { return _len3; } - dim3 stride3() const { return _stride3; } - // 1D - T& dptr(uint32_t i) { return _dptr[i]; } - T& hptr(uint32_t i) { return _hptr[i]; } - T& uniptr(uint32_t i) { return _uniptr[i]; } - // 2D - T& dptr(uint32_t x, uint32_t y) { return _dptr[x + y * _stride3.y]; } - T& hptr(uint32_t x, uint32_t y) { return _hptr[x + y * _stride3.y]; } - T& uniptr(uint32_t x, uint32_t y) { return _uniptr[x + y * _stride3.y]; } - // 3D - T& dptr(uint32_t x, uint32_t y, uint32_t z) { return _dptr[x + y * _stride3.y + z * _stride3.z]; } - T& hptr(uint32_t x, uint32_t y, uint32_t z) { return _hptr[x + y * _stride3.y + z * _stride3.z]; } - T& uniptr(uint32_t x, uint32_t y, uint32_t z) { return _uniptr[x + y * _stride3.y + z * _stride3.z]; } - // getter end ----------------------- - - // setter start --------------------- - Capsule& set_hptr(T* ptr) - { - _hptr = ptr, alloc_status.hptr = true; - return *this; - } - Capsule& set_dptr(T* ptr) - { - _dptr = ptr, alloc_status.dptr = true; - return *this; - } - Capsule& set_uniptr(T* ptr) - { - _uniptr = ptr, alloc_status.uniptr = true; - return *this; - } - - // variable len - Capsule& set_len(uint32_t len) - { - if (len <= 0) throw std::runtime_error("length must be greater than 0"); - _len = len; - return *this; - } - - Capsule& set_len3(uint32_t x, uint32_t y = 1, uint32_t z = 1) - { - if (x == 1) throw std::runtime_error("x must be > 1."); - if (x * y * z == 0) throw std::runtime_error("x, y, z must be non-zero."); - - _len3 = dim3(x, y, z); - _stride3 = dim3(1, x, x * y); - _len = x * y * z; - - return *this; - } - // setter end ---------------------- - - // debug - void debug() - { - printf("Capsule debugging information\n"); - printf(" name : %s\n", name.c_str()); - printf(" len : %u\n", len()); - printf(" hptr : %s\n", alloc_status.hptr ? "set" : "not set"); - printf(" dptr : %s\n", alloc_status.dptr ? "set" : "not set"); - printf(" uniptr : %s\n", alloc_status.uniptr ? "set" : "not set"); - } - - // for debugging - Capsule& set_name(std::string _str) - { - name = _str; - return *this; - } - - // IO - Capsule& fromfile(std::string fname, double* time = nullptr) - { - if (not _hptr) throw std::runtime_error(ERRSTR_BUILDER("fromfile", "_hptr not set")); - if (_len == 0) throw std::runtime_error(ERRSTR_BUILDER("fromfile", "len == 0")); - - auto a = hires::now(); - fs2mem(fname.c_str(), _hptr, _len); - auto z = hires::now(); - - if (time) *time = static_cast(z - a).count(); - - return *this; - } - - Capsule& tofile(std::string fname, double* time = nullptr) - { - if (not _hptr) { throw std::runtime_error(ERRSTR_BUILDER("tofile", "_hptr not set")); } - if (_len == 0) throw std::runtime_error(ERRSTR_BUILDER("tofile", "len == 0")); - - auto a = hires::now(); - mem2fs(fname.c_str(), _hptr, _len); - auto z = hires::now(); - - if (time) *time = static_cast(z - a).count(); - - return *this; - } - - uint32_t nbyte() const { return _len * sizeof(T); } - - // memcpy h2d, synchronous - Capsule& host2device() - { - check_len("host2device"); - - cudaMemcpy(_dptr, _hptr, nbyte(), cudaMemcpyHostToDevice); - return *this; - } - // memcpy d2h, synchronous - Capsule& device2host() - { - check_len("device2host"); - - cudaMemcpy(_hptr, _dptr, nbyte(), cudaMemcpyDeviceToHost); - return *this; - } - // memcpy h2d, asynchronous - Capsule& host2device_async(cudaStream_t stream) - { - check_len("host2device_async"); - - cudaMemcpyAsync(_dptr, _hptr, nbyte(), cudaMemcpyHostToDevice, stream); - return *this; - } - // memcpy d2h, asynchronous - Capsule& device2host_async(cudaStream_t stream) - { - check_len("device2host_async"); - - cudaMemcpyAsync(_hptr, _dptr, nbyte(), cudaMemcpyDeviceToHost, stream); - return *this; - } - // shorthand - Capsule& h2d() { return host2device(); } - Capsule& d2h() { return device2host(); } - Capsule& async_h2d(cudaStream_t stream) { return host2device_async(stream); } - Capsule& async_d2h(cudaStream_t stream) { return device2host_async(stream); } - - // cudaMalloc wrapper - Capsule& malloc(bool do_memset = true, uint8_t memset_val = 0) - { - check_len("malloc"); - - if (alloc_status.dptr) - LOGGING(LOG_WARN, "already allocated on device"); - else { - cudaMalloc(&_dptr, nbyte()); - cudaMemset(_dptr, memset_val, nbyte()); - alloc_status.dptr = true; - } - return *this; - } - // cudaMallocHost wrapper, pinned - Capsule& mallochost(bool do_memset = true, uint8_t memset_val = 0) - { - check_len("mallochost"); - - if (alloc_status.hptr) - LOGGING(LOG_WARN, "already allocated on host"); - else { - cudaMallocHost(&_hptr, nbyte()); - memset(_hptr, memset_val, nbyte()); - alloc_status.hptr = true; - } - return *this; - } - // cudaMallocManaged wrapper - Capsule& mallocmanaged(bool do_memset = true, uint8_t memset_val = 0) - { - check_len("mallocmanaged"); - - if (alloc_status.uniptr) - LOGGING(LOG_WARN, "already allocated as unified"); - else { - cudaMallocManaged(&_uniptr, nbyte()); - cudaMemset(_uniptr, memset_val, nbyte()); - alloc_status.uniptr = true; - } - return *this; - } - // cudaFree wrapper - Capsule& free() - { - if (not _dptr) throw std::runtime_error(ERRSTR_BUILDER("free", "_dptr is null")); - cudaFree(_dptr); - alloc_status.dptr = false; - return *this; - } - // cudaFreeHost wrapper - Capsule& freehost() - { - if (not _hptr) throw std::runtime_error(ERRSTR_BUILDER("free", "_hptr is null")); - cudaFreeHost(_hptr); - alloc_status.hptr = false; - return *this; - } - // cudaFree wrapper, but for unified memory - Capsule& freemanaged() - { - if (not _uniptr) throw std::runtime_error(ERRSTR_BUILDER("free", "_uniptr is null")); - cudaFree(_uniptr); - alloc_status.uniptr = false; - return *this; - } - - private: - double maxval, minval, rng; - - public: - double get_maxval() { return maxval; } - double get_minval() { return minval; } - double get_rng() { return rng; } - - // data scan - Capsule& prescan(double& max_value, double& min_value, double& rng) - { - // may not work for _uniptr - T result[4]; - psz::thrustgpu_get_extrema_rawptr(_dptr, _len, result); - - min_value = result[0]; - max_value = result[1]; - rng = max_value - min_value; - - return *this; - } - // data scan - Capsule& prescan() - { - prescan(maxval, minval, rng); - return *this; - } -}; - -#endif +/** + * @file capsule.hh + * @author Jiannan Tian + * @brief Simple data analysis (header) + * @version 0.2.3 + * @date 2020-11-03 + * (create) 2020-11-03 (rev1) 2021-03-24 (rev2) 2021-09-08 + * @deprecated 0.3.2 + * + * @copyright (C) 2020 by Washington State University, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef CAPSULE_HH +#define CAPSULE_HH + +#if __cplusplus >= 201703L +#define CONSTEXPR constexpr +#else +#define CONSTEXPR +#endif + +#include +#include + +#include +#include +#include +#include +#include + +#include "../stat/compare_gpu.hh" +// #include "../utils/io.hh" +#include "../utils/timer.hh" +#include "definition.hh" + +template +class Capsule { + private: + // variables + struct { + bool hptr{false}, dptr{false}, uniptr{false}; + } alloc_status; + + T *_dptr{nullptr}, *_hptr{nullptr}, *_uniptr{nullptr}; + + uint32_t _len{0}; + dim3 _len3{1, 1, 1}, _stride3{1, 1, 1}; + + std::string name; + + // logging setup; standalone + const std::string LOG_NULL = " "; + const std::string LOG_INFO = " :: "; + const std::string LOG_ERR = " ERR "; + const std::string LOG_WARN = "WARN "; + const std::string LOG_DBG = " dbg "; + const std::string LOG_EXCEPTION = " !! "; + + // https://stackoverflow.com/a/26080768/8740097 CC BY-SA 3.0 + template + void build_string(std::ostream& o, S t) + { + o << t << " "; + } + + template + void build_string(std::ostream& o, S t, Args... args) // recursive variadic function + { + build_string(o, t); + build_string(o, args...); + } + + template + void LOGGING(const std::string& log_head, Args... args) + { + std::ostringstream oss; + oss << log_head; + build_string(oss, args...); + + oss.seekp(0, std::ios::end); + std::stringstream::pos_type offset = oss.tellp(); + if (log_head == LOG_DBG) { std::cout << "\e[2m"; } // dbg + std::cout << oss.str() << std::endl; // print content + if (log_head == LOG_DBG) std::cout << "\e[0m"; // finish printing dbg + } + + // IO + int fs2mem(const char* fname, void* array, size_t num_els) + { + auto bytes = sizeof(T) * num_els; + + std::ifstream ifs(fname, std::ios::binary | std::ios::in); + if (not ifs.is_open()) { + std::cerr << "fail to open " << fname << std::endl; + return -1; + } + ifs.read(reinterpret_cast(array), std::streamsize(bytes)); + ifs.close(); + + return 0; + } + + int mem2fs(const char* fname, void* array, size_t num_els) + { + auto bytes = sizeof(type) * num_els; + + std::ofstream ofs(fname, std::ios::binary | std::ios::out); + if (not ofs.is_open()) { + std::cerr << "fail to open " << fname << std::endl; + return -1; + } + + ofs.write(reinterpret_cast(array), std::streamsize(bytes)); + ofs.close(); + + return 0; + } + + std::string ERRSTR_BUILDER(std::string func, std::string msg) + { + return "[Capsule(\"" + name + "\")::" + func + "] " + msg; + } + + void check_len(std::string funcname) + { + if (_len == 0) throw std::runtime_error("[Capsule(\"" + name + "\")::" + funcname + "] " + "len == 0"); + } + + std::string ERROR_UNDEFINED_BEHAVIOR(std::string func, std::string msg = "undefined behavior") + { // + return ERRSTR_BUILDER(func, "undefined behavior"); + } + + public: + using type = T; + + // TODO rule of n + // constructor + Capsule() = default; + Capsule(const std::string _str) : name(_str){}; + Capsule(uint32_t len, const std::string _str = std::string("")) : _len(len), name(_str) {} + Capsule(uint32_t x, uint32_t y, uint32_t z, const std::string _str = std::string("")) : name(_str) + { + _len3 = dim3(x, y, z); + _len = x * y * z; + } + + ~Capsule() + { + // Becasue _hptr can be obtained externally, and could be non-pinned, cudaFreeHost may not work properly. + // if (alloc_status.hptr) cudaFreeHost(_hptr); + + if (alloc_status.dptr) cudaFree(_dptr); + if (alloc_status.uniptr) cudaFree(_uniptr); + } + + // getter start -------------------- + T*& dptr() { return _dptr; } + T*& hptr() { return _hptr; } + T*& uniptr() { return _uniptr; } + + uint32_t len() const { return _len; } + dim3 len3() const { return _len3; } + dim3 stride3() const { return _stride3; } + // 1D + T& dptr(uint32_t i) { return _dptr[i]; } + T& hptr(uint32_t i) { return _hptr[i]; } + T& uniptr(uint32_t i) { return _uniptr[i]; } + // 2D + T& dptr(uint32_t x, uint32_t y) { return _dptr[x + y * _stride3.y]; } + T& hptr(uint32_t x, uint32_t y) { return _hptr[x + y * _stride3.y]; } + T& uniptr(uint32_t x, uint32_t y) { return _uniptr[x + y * _stride3.y]; } + // 3D + T& dptr(uint32_t x, uint32_t y, uint32_t z) { return _dptr[x + y * _stride3.y + z * _stride3.z]; } + T& hptr(uint32_t x, uint32_t y, uint32_t z) { return _hptr[x + y * _stride3.y + z * _stride3.z]; } + T& uniptr(uint32_t x, uint32_t y, uint32_t z) { return _uniptr[x + y * _stride3.y + z * _stride3.z]; } + // getter end ----------------------- + + // setter start --------------------- + Capsule& set_hptr(T* ptr) + { + _hptr = ptr, alloc_status.hptr = true; + return *this; + } + Capsule& set_dptr(T* ptr) + { + _dptr = ptr, alloc_status.dptr = true; + return *this; + } + Capsule& set_uniptr(T* ptr) + { + _uniptr = ptr, alloc_status.uniptr = true; + return *this; + } + + // variable len + Capsule& set_len(uint32_t len) + { + if (len <= 0) throw std::runtime_error("length must be greater than 0"); + _len = len; + return *this; + } + + Capsule& set_len3(uint32_t x, uint32_t y = 1, uint32_t z = 1) + { + if (x == 1) throw std::runtime_error("x must be > 1."); + if (x * y * z == 0) throw std::runtime_error("x, y, z must be non-zero."); + + _len3 = dim3(x, y, z); + _stride3 = dim3(1, x, x * y); + _len = x * y * z; + + return *this; + } + // setter end ---------------------- + + // debug + void debug() + { + printf("Capsule debugging information\n"); + printf(" name : %s\n", name.c_str()); + printf(" len : %u\n", len()); + printf(" hptr : %s\n", alloc_status.hptr ? "set" : "not set"); + printf(" dptr : %s\n", alloc_status.dptr ? "set" : "not set"); + printf(" uniptr : %s\n", alloc_status.uniptr ? "set" : "not set"); + } + + // for debugging + Capsule& set_name(std::string _str) + { + name = _str; + return *this; + } + + // IO + Capsule& fromfile(std::string fname, double* time = nullptr) + { + if (not _hptr) throw std::runtime_error(ERRSTR_BUILDER("fromfile", "_hptr not set")); + if (_len == 0) throw std::runtime_error(ERRSTR_BUILDER("fromfile", "len == 0")); + + auto a = hires::now(); + fs2mem(fname.c_str(), _hptr, _len); + auto z = hires::now(); + + if (time) *time = static_cast(z - a).count(); + + return *this; + } + + Capsule& tofile(std::string fname, double* time = nullptr) + { + if (not _hptr) { throw std::runtime_error(ERRSTR_BUILDER("tofile", "_hptr not set")); } + if (_len == 0) throw std::runtime_error(ERRSTR_BUILDER("tofile", "len == 0")); + + auto a = hires::now(); + mem2fs(fname.c_str(), _hptr, _len); + auto z = hires::now(); + + if (time) *time = static_cast(z - a).count(); + + return *this; + } + + uint32_t nbyte() const { return _len * sizeof(T); } + + // memcpy h2d, synchronous + Capsule& host2device() + { + check_len("host2device"); + + cudaMemcpy(_dptr, _hptr, nbyte(), cudaMemcpyHostToDevice); + return *this; + } + // memcpy d2h, synchronous + Capsule& device2host() + { + check_len("device2host"); + + cudaMemcpy(_hptr, _dptr, nbyte(), cudaMemcpyDeviceToHost); + return *this; + } + // memcpy h2d, asynchronous + Capsule& host2device_async(cudaStream_t stream) + { + check_len("host2device_async"); + + cudaMemcpyAsync(_dptr, _hptr, nbyte(), cudaMemcpyHostToDevice, stream); + return *this; + } + // memcpy d2h, asynchronous + Capsule& device2host_async(cudaStream_t stream) + { + check_len("device2host_async"); + + cudaMemcpyAsync(_hptr, _dptr, nbyte(), cudaMemcpyDeviceToHost, stream); + return *this; + } + // shorthand + Capsule& h2d() { return host2device(); } + Capsule& d2h() { return device2host(); } + Capsule& async_h2d(cudaStream_t stream) { return host2device_async(stream); } + Capsule& async_d2h(cudaStream_t stream) { return device2host_async(stream); } + + // cudaMalloc wrapper + Capsule& malloc(bool do_memset = true, uint8_t memset_val = 0) + { + check_len("malloc"); + + if (alloc_status.dptr) + LOGGING(LOG_WARN, "already allocated on device"); + else { + cudaMalloc(&_dptr, nbyte()); + cudaMemset(_dptr, memset_val, nbyte()); + alloc_status.dptr = true; + } + return *this; + } + // cudaMallocHost wrapper, pinned + Capsule& mallochost(bool do_memset = true, uint8_t memset_val = 0) + { + check_len("mallochost"); + + if (alloc_status.hptr) + LOGGING(LOG_WARN, "already allocated on host"); + else { + cudaMallocHost(&_hptr, nbyte()); + memset(_hptr, memset_val, nbyte()); + alloc_status.hptr = true; + } + return *this; + } + // cudaMallocManaged wrapper + Capsule& mallocmanaged(bool do_memset = true, uint8_t memset_val = 0) + { + check_len("mallocmanaged"); + + if (alloc_status.uniptr) + LOGGING(LOG_WARN, "already allocated as unified"); + else { + cudaMallocManaged(&_uniptr, nbyte()); + cudaMemset(_uniptr, memset_val, nbyte()); + alloc_status.uniptr = true; + } + return *this; + } + // cudaFree wrapper + Capsule& free() + { + if (not _dptr) throw std::runtime_error(ERRSTR_BUILDER("free", "_dptr is null")); + cudaFree(_dptr); + alloc_status.dptr = false; + return *this; + } + // cudaFreeHost wrapper + Capsule& freehost() + { + if (not _hptr) throw std::runtime_error(ERRSTR_BUILDER("free", "_hptr is null")); + cudaFreeHost(_hptr); + alloc_status.hptr = false; + return *this; + } + // cudaFree wrapper, but for unified memory + Capsule& freemanaged() + { + if (not _uniptr) throw std::runtime_error(ERRSTR_BUILDER("free", "_uniptr is null")); + cudaFree(_uniptr); + alloc_status.uniptr = false; + return *this; + } + + private: + double maxval, minval, rng; + + public: + double get_maxval() { return maxval; } + double get_minval() { return minval; } + double get_rng() { return rng; } + + // data scan + Capsule& prescan(double& max_value, double& min_value, double& rng) + { + // may not work for _uniptr + T result[4]; + psz::thrustgpu_get_extrema_rawptr(_dptr, _len, result); + + min_value = result[0]; + max_value = result[1]; + rng = max_value - min_value; + + return *this; + } + // data scan + Capsule& prescan() + { + prescan(maxval, minval, rng); + return *this; + } +}; + +#endif diff --git a/qtensor/compression/cusz/include/common/configs.hh b/qtensor/compression/cusz/include/common/configs.hh index 7c1e0654..d9a0bd39 100644 --- a/qtensor/compression/cusz/include/common/configs.hh +++ b/qtensor/compression/cusz/include/common/configs.hh @@ -1,354 +1,354 @@ -/** - * @file configs.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2021-09-26 - * - * (C) 2021 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef CUSZ_COMMON_CONFIGS_HH -#define CUSZ_COMMON_CONFIGS_HH - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../header.h" -#include "definition.hh" - -#if __cplusplus >= 201703L -#define CONSTEXPR constexpr -#else -#define CONSTEXPR -#endif - -struct Reinterpret1DTo2D { - template - static T get_square_size(T len) - { - return static_cast(ceil(sqrt(len))); - } -}; - -struct Align { - template - static size_t get_aligned_datalen(size_t len) - { - if CONSTEXPR (ad == cusz::ALIGNDATA::NONE) return len; - if CONSTEXPR (ad == cusz::ALIGNDATA::SQUARE_MATRIX) { - auto m = Reinterpret1DTo2D::get_square_size(len); - return m * m; - } - } - - static const int DEFAULT_ALIGN_NBYTE = 128; - - template - static inline bool is_aligned_at(const void* ptr) - { // - return reinterpret_cast(ptr) % NUM == 0; - }; - - template - static size_t get_aligned_nbyte(size_t len) - { - return ((sizeof(T) * len - 1) / NUM + 1) * NUM; - } -}; - -// sparsity rate is less that 5% -struct SparseMethodSetup { - // "Density" denotes the degree of non-zeros (nz). - static constexpr float default_density = 0.25; // ratio of nonzeros (R_nz) - static constexpr float default_sparsity = 1 - default_density; // ratio of zeros, 1 - R_nz - - static constexpr int default_density_factor = 4; // ratio of nonzeros (R_nz) - - template - static uint32_t get_csr_nbyte(uint32_t len, uint32_t nnz) - { - auto m = Reinterpret1DTo2D::get_square_size(len); - auto nbyte = sizeof(M) * (m + 1) + sizeof(M) * nnz + sizeof(T) * nnz; - return nbyte; - } -}; - -struct HuffmanHelper { - // deprecated - // template - // static uint32_t get_revbook_nbyte(int dict_size) - // { - // constexpr auto TYPE_BITCOUNT = sizeof(BOOK) * 8; - // return sizeof(BOOK) * (2 * TYPE_BITCOUNT) + sizeof(SYM) * dict_size; - // } - - static const int BLOCK_DIM_ENCODE = 256; - static const int BLOCK_DIM_DEFLATE = 256; - - static const int ENC_SEQUENTIALITY = 4; // empirical - static const int DEFLATE_CONSTANT = 4; // TODO -> deflate_chunk_constant -}; - -struct StringHelper { - static std::string nnz_percentage(uint32_t nnz, uint32_t data_len) - { - return "(" + std::to_string(nnz / 1.0 / data_len * 100) + "%)"; - } -}; - -struct ConfigHelper { - static uint32_t predictor_lookup(std::string name) - { - const std::unordered_map lut = { - {"lorenzo", 0}, {"lorenzoii", 1}, {"spline3", 2} // - }; - if (lut.find(name) != lut.end()) throw std::runtime_error("no such predictor as " + name); - return lut.at(name); - } - - static uint32_t codec_lookup(std::string name) - { - const std::unordered_map lut = { - {"huffman-coarse", 0} // - }; - if (lut.find(name) != lut.end()) throw std::runtime_error("no such codec as " + name); - return lut.at(name); - } - - static uint32_t spcodec_lookup(std::string name) - { - const std::unordered_map lut = { - {"spmat", 0}, {"spvec", 1} // - }; - if (lut.find(name) != lut.end()) throw std::runtime_error("no such codec as " + name); - return lut.at(name); - } - - static std::string get_default_predictor() { return "lorenzo"; } - static std::string get_default_spcodec() { return "csr11"; } - static std::string get_default_codec() { return "huffman-coarse"; } - static std::string get_default_cuszmode() { return "r2r"; } - static std::string get_default_dtype() { return "f32"; } - - static bool check_predictor(const std::string& val, bool fatal = false) - { - auto legal = (val == "lorenzo") or (val == "spline3"); - if (not legal) { - if (fatal) - throw std::runtime_error("`predictor` must be \"lorenzo\" or \"spline3\"."); - else - printf("fallback to the default \"%s\".", get_default_predictor().c_str()); - } - return legal; - } - - static bool check_codec(const std::string& val, bool fatal = false) - { - auto legal = (val == "huffman-coarse"); - if (not legal) { - if (fatal) - throw std::runtime_error("`codec` must be \"huffman-coarse\"."); - else - printf("fallback to the default \"%s\".", get_default_codec().c_str()); - } - return legal; - } - - static bool check_spcodec(const std::string& val, bool fatal = false) - { - auto legal = (val == "csr11") or (val == "rle"); - if (not legal) { - if (fatal) - throw std::runtime_error("`codec` must be \"csr11\" or \"rle\"."); - else - printf("fallback to the default \"%s\".", get_default_codec().c_str()); - } - return legal; - } - - static bool check_cuszmode(const std::string& val, bool fatal = false) - { - auto legal = (val == "r2r") or (val == "abs"); - if (not legal) { - if (fatal) - throw std::runtime_error("`mode` must be \"r2r\" or \"abs\"."); - else - printf("fallback to the default \"%s\".", get_default_cuszmode().c_str()); - } - return legal; - } - - static bool check_dtype(const std::string& val, bool fatal = false) - { - auto legal = (val == "f32"); - // auto legal = (val == "f32") or (val == "f64"); - if (not legal) { - if (fatal) - throw std::runtime_error("`dtype` must be \"f32\"."); - else - printf("fallback to the default \"%s\".", get_default_dtype().c_str()); - } - return legal; - } - - static bool check_opt_in_list(std::string const& opt, std::vector vs) - { - for (auto& i : vs) { - if (opt == i) return true; - } - return false; - } - - static void parse_length_literal(const char* str, std::vector& dims) - { - std::stringstream data_len_ss(str); - auto data_len_literal = data_len_ss.str(); - char delimiter = 'x'; - - while (data_len_ss.good()) { - std::string substr; - std::getline(data_len_ss, substr, delimiter); - dims.push_back(substr); - } - } - - static size_t get_filesize(std::string fname) - { - std::ifstream in(fname.c_str(), std::ifstream::ate | std::ifstream::binary); - return in.tellg(); - } - - static size_t get_filesize(cusz_header* h) - { - auto END = sizeof(h->entry) / sizeof(h->entry[0]); - return h->entry[END - 1]; - } - - static size_t get_uncompressed_len(cusz_header* h) { return h->x * h->y * h->z; } - - template - static size_t get_npart(T1 size, T2 subsize) - { - static_assert( - std::numeric_limits::is_integer and std::numeric_limits::is_integer, - "[get_npart] must be plain interger types."); - - return (size + subsize - 1) / subsize; - } - - // #ifdef __CUDACC__ - static int get_ndim(dim3 len3) - { - auto ndim = 3; - if (len3.z == 1) ndim = 2; - if (len3.z == 1 and len3.y == 1) ndim = 1; - return ndim; - } - - static dim3 get_pardeg3(dim3 len3, dim3 sublen3) - { - return dim3( - get_npart(len3.x, sublen3.x), // - get_npart(len3.y, sublen3.y), // - get_npart(len3.z, sublen3.z)); - } - - template - static dim3 get_pardeg3(dim3 len3, T sublen3[3]) - { - return dim3( - get_npart(len3.x, sublen3[0]), // - get_npart(len3.y, sublen3[1]), // - get_npart(len3.z, sublen3[2])); - } - - template - static dim3 multiply_dim3(dim3 a, T b[3]) - { - return dim3(a.x * b[0], a.y * b[1], a.z * b[2]); - } - - static dim3 multiply_dim3(dim3 a, dim3 b) - { // - return dim3(a.x * b.x, a.y * b.y, a.z * b.z); - } - - static size_t get_serialized_len(dim3 a) { return a.x * a.y * a.z; } - - static dim3 get_leap(dim3 len3) { return dim3(1, len3.x, len3.x * len3.y); } - - // #endif - - template - static size_t get_serialized_len(T a[3]) - { // - return a[0] * a[1] * a[2]; - } -}; - -struct CompareHelper { - template - static bool eq(TRIO a, TRIO b) - { - return (a.x == b.x) and (a.y == b.y) and (a.z == b.z); - }; -}; - -struct ReportHelper { - static float get_throughput(float milliseconds, size_t nbyte) - { - auto GiB = 1.0 * 1024 * 1024 * 1024; - auto seconds = milliseconds * 1e-3; - return nbyte / GiB / seconds; - } - - static void println_throughput(const char* s, float timer, size_t _nbyte) - { - if (timer == 0.0) return; - auto t = get_throughput(timer, _nbyte); - printf(" %-12s %'12f %'10.2f\n", s, timer, t); - }; - - static void println_throughput_tablehead() - { - printf( - "\n \e[1m\e[31m%-12s %12s %10s\e[0m\n", // - const_cast("kernel"), // - const_cast("time, ms"), // - const_cast("GiB/s") // - ); - } - - static void print_datasegment_tablehead() - { - printf( - "\ndata segments:\n \e[1m\e[31m%-18s\t%12s\t%15s\t%15s\e[0m\n", // - const_cast("name"), // - const_cast("nbyte"), // - const_cast("start"), // - const_cast("end")); - } - - static std::string demangle(const char* name) - { - int status = -4; - char* res = abi::__cxa_demangle(name, nullptr, nullptr, &status); - - const char* const demangled_name = (status == 0) ? res : name; - std::string ret_val(demangled_name); - free(res); - return ret_val; - }; -}; - -#endif +/** + * @file configs.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-09-26 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_COMMON_CONFIGS_HH +#define CUSZ_COMMON_CONFIGS_HH + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../header.h" +#include "definition.hh" + +#if __cplusplus >= 201703L +#define CONSTEXPR constexpr +#else +#define CONSTEXPR +#endif + +struct Reinterpret1DTo2D { + template + static T get_square_size(T len) + { + return static_cast(ceil(sqrt(len))); + } +}; + +struct Align { + template + static size_t get_aligned_datalen(size_t len) + { + if CONSTEXPR (ad == cusz::ALIGNDATA::NONE) return len; + if CONSTEXPR (ad == cusz::ALIGNDATA::SQUARE_MATRIX) { + auto m = Reinterpret1DTo2D::get_square_size(len); + return m * m; + } + } + + static const int DEFAULT_ALIGN_NBYTE = 128; + + template + static inline bool is_aligned_at(const void* ptr) + { // + return reinterpret_cast(ptr) % NUM == 0; + }; + + template + static size_t get_aligned_nbyte(size_t len) + { + return ((sizeof(T) * len - 1) / NUM + 1) * NUM; + } +}; + +// sparsity rate is less that 5% +struct SparseMethodSetup { + // "Density" denotes the degree of non-zeros (nz). + static constexpr float default_density = 0.25; // ratio of nonzeros (R_nz) + static constexpr float default_sparsity = 1 - default_density; // ratio of zeros, 1 - R_nz + + static constexpr int default_density_factor = 4; // ratio of nonzeros (R_nz) + + template + static uint32_t get_csr_nbyte(uint32_t len, uint32_t nnz) + { + auto m = Reinterpret1DTo2D::get_square_size(len); + auto nbyte = sizeof(M) * (m + 1) + sizeof(M) * nnz + sizeof(T) * nnz; + return nbyte; + } +}; + +struct HuffmanHelper { + // deprecated + // template + // static uint32_t get_revbook_nbyte(int dict_size) + // { + // constexpr auto TYPE_BITCOUNT = sizeof(BOOK) * 8; + // return sizeof(BOOK) * (2 * TYPE_BITCOUNT) + sizeof(SYM) * dict_size; + // } + + static const int BLOCK_DIM_ENCODE = 256; + static const int BLOCK_DIM_DEFLATE = 256; + + static const int ENC_SEQUENTIALITY = 4; // empirical + static const int DEFLATE_CONSTANT = 4; // TODO -> deflate_chunk_constant +}; + +struct StringHelper { + static std::string nnz_percentage(uint32_t nnz, uint32_t data_len) + { + return "(" + std::to_string(nnz / 1.0 / data_len * 100) + "%)"; + } +}; + +struct ConfigHelper { + static uint32_t predictor_lookup(std::string name) + { + const std::unordered_map lut = { + {"lorenzo", 0}, {"lorenzoii", 1}, {"spline3", 2} // + }; + if (lut.find(name) != lut.end()) throw std::runtime_error("no such predictor as " + name); + return lut.at(name); + } + + static uint32_t codec_lookup(std::string name) + { + const std::unordered_map lut = { + {"huffman-coarse", 0} // + }; + if (lut.find(name) != lut.end()) throw std::runtime_error("no such codec as " + name); + return lut.at(name); + } + + static uint32_t spcodec_lookup(std::string name) + { + const std::unordered_map lut = { + {"spmat", 0}, {"spvec", 1} // + }; + if (lut.find(name) != lut.end()) throw std::runtime_error("no such codec as " + name); + return lut.at(name); + } + + static std::string get_default_predictor() { return "lorenzo"; } + static std::string get_default_spcodec() { return "csr11"; } + static std::string get_default_codec() { return "huffman-coarse"; } + static std::string get_default_cuszmode() { return "r2r"; } + static std::string get_default_dtype() { return "f32"; } + + static bool check_predictor(const std::string& val, bool fatal = false) + { + auto legal = (val == "lorenzo") or (val == "spline3"); + if (not legal) { + if (fatal) + throw std::runtime_error("`predictor` must be \"lorenzo\" or \"spline3\"."); + else + printf("fallback to the default \"%s\".", get_default_predictor().c_str()); + } + return legal; + } + + static bool check_codec(const std::string& val, bool fatal = false) + { + auto legal = (val == "huffman-coarse"); + if (not legal) { + if (fatal) + throw std::runtime_error("`codec` must be \"huffman-coarse\"."); + else + printf("fallback to the default \"%s\".", get_default_codec().c_str()); + } + return legal; + } + + static bool check_spcodec(const std::string& val, bool fatal = false) + { + auto legal = (val == "csr11") or (val == "rle"); + if (not legal) { + if (fatal) + throw std::runtime_error("`codec` must be \"csr11\" or \"rle\"."); + else + printf("fallback to the default \"%s\".", get_default_codec().c_str()); + } + return legal; + } + + static bool check_cuszmode(const std::string& val, bool fatal = false) + { + auto legal = (val == "r2r") or (val == "abs"); + if (not legal) { + if (fatal) + throw std::runtime_error("`mode` must be \"r2r\" or \"abs\"."); + else + printf("fallback to the default \"%s\".", get_default_cuszmode().c_str()); + } + return legal; + } + + static bool check_dtype(const std::string& val, bool fatal = false) + { + auto legal = (val == "f32"); + // auto legal = (val == "f32") or (val == "f64"); + if (not legal) { + if (fatal) + throw std::runtime_error("`dtype` must be \"f32\"."); + else + printf("fallback to the default \"%s\".", get_default_dtype().c_str()); + } + return legal; + } + + static bool check_opt_in_list(std::string const& opt, std::vector vs) + { + for (auto& i : vs) { + if (opt == i) return true; + } + return false; + } + + static void parse_length_literal(const char* str, std::vector& dims) + { + std::stringstream data_len_ss(str); + auto data_len_literal = data_len_ss.str(); + char delimiter = 'x'; + + while (data_len_ss.good()) { + std::string substr; + std::getline(data_len_ss, substr, delimiter); + dims.push_back(substr); + } + } + + static size_t get_filesize(std::string fname) + { + std::ifstream in(fname.c_str(), std::ifstream::ate | std::ifstream::binary); + return in.tellg(); + } + + static size_t get_filesize(cusz_header* h) + { + auto END = sizeof(h->entry) / sizeof(h->entry[0]); + return h->entry[END - 1]; + } + + static size_t get_uncompressed_len(cusz_header* h) { return h->x * h->y * h->z; } + + template + static size_t get_npart(T1 size, T2 subsize) + { + static_assert( + std::numeric_limits::is_integer and std::numeric_limits::is_integer, + "[get_npart] must be plain interger types."); + + return (size + subsize - 1) / subsize; + } + + // #ifdef __CUDACC__ + static int get_ndim(dim3 len3) + { + auto ndim = 3; + if (len3.z == 1) ndim = 2; + if (len3.z == 1 and len3.y == 1) ndim = 1; + return ndim; + } + + static dim3 get_pardeg3(dim3 len3, dim3 sublen3) + { + return dim3( + get_npart(len3.x, sublen3.x), // + get_npart(len3.y, sublen3.y), // + get_npart(len3.z, sublen3.z)); + } + + template + static dim3 get_pardeg3(dim3 len3, T sublen3[3]) + { + return dim3( + get_npart(len3.x, sublen3[0]), // + get_npart(len3.y, sublen3[1]), // + get_npart(len3.z, sublen3[2])); + } + + template + static dim3 multiply_dim3(dim3 a, T b[3]) + { + return dim3(a.x * b[0], a.y * b[1], a.z * b[2]); + } + + static dim3 multiply_dim3(dim3 a, dim3 b) + { // + return dim3(a.x * b.x, a.y * b.y, a.z * b.z); + } + + static size_t get_serialized_len(dim3 a) { return a.x * a.y * a.z; } + + static dim3 get_leap(dim3 len3) { return dim3(1, len3.x, len3.x * len3.y); } + + // #endif + + template + static size_t get_serialized_len(T a[3]) + { // + return a[0] * a[1] * a[2]; + } +}; + +struct CompareHelper { + template + static bool eq(TRIO a, TRIO b) + { + return (a.x == b.x) and (a.y == b.y) and (a.z == b.z); + }; +}; + +struct ReportHelper { + static float get_throughput(float milliseconds, size_t nbyte) + { + auto GiB = 1.0 * 1024 * 1024 * 1024; + auto seconds = milliseconds * 1e-3; + return nbyte / GiB / seconds; + } + + static void println_throughput(const char* s, float timer, size_t _nbyte) + { + if (timer == 0.0) return; + auto t = get_throughput(timer, _nbyte); + printf(" %-12s %'12f %'10.2f\n", s, timer, t); + }; + + static void println_throughput_tablehead() + { + printf( + "\n \e[1m\e[31m%-12s %12s %10s\e[0m\n", // + const_cast("kernel"), // + const_cast("time, ms"), // + const_cast("GiB/s") // + ); + } + + static void print_datasegment_tablehead() + { + printf( + "\ndata segments:\n \e[1m\e[31m%-18s\t%12s\t%15s\t%15s\e[0m\n", // + const_cast("name"), // + const_cast("nbyte"), // + const_cast("start"), // + const_cast("end")); + } + + static std::string demangle(const char* name) + { + int status = -4; + char* res = abi::__cxa_demangle(name, nullptr, nullptr, &status); + + const char* const demangled_name = (status == 0) ? res : name; + std::string ret_val(demangled_name); + free(res); + return ret_val; + }; +}; + +#endif diff --git a/qtensor/compression/cusz/include/common/definition.hh b/qtensor/compression/cusz/include/common/definition.hh index c7c328ef..af30239b 100644 --- a/qtensor/compression/cusz/include/common/definition.hh +++ b/qtensor/compression/cusz/include/common/definition.hh @@ -1,66 +1,66 @@ -/** - * @file definition.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2021-09-20 - * - * (C) 2021 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef CUSZ_COMMON_DEFINITION_HH -#define CUSZ_COMMON_DEFINITION_HH - -#include -#include -#include - -namespace cusz { - -enum class TASK { COMPRESS, DECOMPRESS, EXPERIMENT, COMPRESS_DRYRUN }; -enum class DEV { TEST, DEV, RELEASE }; -enum class LOC { HOST, DEVICE, HOST_DEVICE, UNIFIED, FS, NONE, __BUFFER }; -enum class WHEN { COMPRESS, DECOMPRESS, EXPERIMENT, COMPRESS_DRYRUN }; -enum class ALIGNDATA { NONE, SQUARE_MATRIX, POWEROF2, NEXT_EVEN }; -enum class ALIGNMEM { NONE, WARP32B, WARP64B, WARP128B }; - -// TODO when to use ADDR8? -// TODO change to `enum class` -enum class SEG { HEADER, BOOK, QUANT, REVBOOK, ANCHOR, SPFMT, HUFF_META, HUFF_DATA }; - -enum class execution { cuda, serial }; -enum class method { native, thrust }; - -struct OK { - template - static void ALLOC() - { - static_assert( - m == cusz::DEV::TEST or m == cusz::DEV::DEV, // - "muse be cusz::DEV::TEST or cusz::DEV::DEV; use with caution"); - } - - template - static void FREE() - { - static_assert( - m == cusz::DEV::TEST or m == cusz::DEV::DEV, // - "muse be cusz::DEV::TEST or cusz::DEV::DEV; use with caution"); - } -}; - -using ADDR4 = uint32_t; -using ADDR8 = size_t; - -using FREQ = uint32_t; - -using TimeRecordTuple = std::tuple; -using TimeRecord = std::vector; -using timerecord_t = TimeRecord*; - -using BYTE = uint8_t; - -}; // namespace cusz - -#endif +/** + * @file definition.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-09-20 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_COMMON_DEFINITION_HH +#define CUSZ_COMMON_DEFINITION_HH + +#include +#include +#include + +namespace cusz { + +enum class TASK { COMPRESS, DECOMPRESS, EXPERIMENT, COMPRESS_DRYRUN }; +enum class DEV { TEST, DEV, RELEASE }; +enum class LOC { HOST, DEVICE, HOST_DEVICE, UNIFIED, FS, NONE, __BUFFER }; +enum class WHEN { COMPRESS, DECOMPRESS, EXPERIMENT, COMPRESS_DRYRUN }; +enum class ALIGNDATA { NONE, SQUARE_MATRIX, POWEROF2, NEXT_EVEN }; +enum class ALIGNMEM { NONE, WARP32B, WARP64B, WARP128B }; + +// TODO when to use ADDR8? +// TODO change to `enum class` +enum class SEG { HEADER, BOOK, QUANT, REVBOOK, ANCHOR, SPFMT, HUFF_META, HUFF_DATA }; + +enum class execution { cuda, serial }; +enum class method { native, thrust }; + +struct OK { + template + static void ALLOC() + { + static_assert( + m == cusz::DEV::TEST or m == cusz::DEV::DEV, // + "muse be cusz::DEV::TEST or cusz::DEV::DEV; use with caution"); + } + + template + static void FREE() + { + static_assert( + m == cusz::DEV::TEST or m == cusz::DEV::DEV, // + "muse be cusz::DEV::TEST or cusz::DEV::DEV; use with caution"); + } +}; + +using ADDR4 = uint32_t; +using ADDR8 = size_t; + +using FREQ = uint32_t; + +using TimeRecordTuple = std::tuple; +using TimeRecord = std::vector; +using timerecord_t = TimeRecord*; + +using BYTE = uint8_t; + +}; // namespace cusz + +#endif diff --git a/qtensor/compression/cusz/include/common/type_traits.hh b/qtensor/compression/cusz/include/common/type_traits.hh index a77c2738..3d623beb 100644 --- a/qtensor/compression/cusz/include/common/type_traits.hh +++ b/qtensor/compression/cusz/include/common/type_traits.hh @@ -1,108 +1,108 @@ -/** - * @file type_traits.hh - * @author Jiannan Tian - * @brief - * @version 0.1.1 - * @date 2020-09-23 - * (create) 2020-09-23, (rev) 2021-09-17 - * - * @copyright (C) 2020 by Washington State University, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#ifndef TYPE_TRAITS_HH -#define TYPE_TRAITS_HH - -#include -#include - -#include "cusz/type.h" -#include "definition.hh" - -template -cusz_datatype cusz_typeof() -{ - if (std::is_same::value) - return FP32; - else if (std::is_same::value) - return FP64; - else - throw std::runtime_error("Type not supported."); -} - -// clang-format off - -/** - * @brief CUDA API does not accept uint64_t (understandable by literal), but instead, - * `unsigned long long`, which is ambiguous anyway. - */ -template struct cuszCOMPAT; -template <> struct cuszCOMPAT { using type = uint32_t; }; -template <> struct cuszCOMPAT { using type = unsigned long long; }; - -template struct DataTrait; -template <> struct DataTrait<4, true> { typedef float type; }; -template <> struct DataTrait<8, true> { typedef double type; }; -template <> struct DataTrait<1, false> { typedef int8_t type; }; // future use -template <> struct DataTrait<2, false> { typedef int16_t type; }; // future use -template <> struct DataTrait<4, false> { typedef int32_t type; }; // future use -template <> struct DataTrait<8, false> { typedef int64_t type; }; // future use - -template struct ChunkingTrait; -template <> struct ChunkingTrait<1> { static const int BLOCK = 256; static const int SEQ = 8; }; -template <> struct ChunkingTrait<0x101> { static const int BLOCK = 128; }; -template <> struct ChunkingTrait<0x201> { static const int BLOCK = 64; }; -template <> struct ChunkingTrait<2> { static const int BLOCK = 16; static const int YSEQ = 8; }; -template <> struct ChunkingTrait<3> { static const int BLOCK = 8; static const int YSEQ = 8; }; - -// template struct QuantTrait; -// template <> struct QuantTrait<1> { typedef uint8_t type; }; -// template <> struct QuantTrait<2> { typedef uint16_t type; }; -// template <> struct QuantTrait<4> { typedef uint32_t type; }; - -template struct ErrCtrlTrait; -template <> struct ErrCtrlTrait<1, false> { typedef uint8_t type; }; -template <> struct ErrCtrlTrait<2, false> { typedef uint16_t type; }; -template <> struct ErrCtrlTrait<4, false> { typedef uint32_t type; }; -template <> struct ErrCtrlTrait<4, true> { typedef float type; }; -template <> struct ErrCtrlTrait<8, true> { typedef double type; }; - -template struct HuffTrait; -template <> struct HuffTrait<4> { typedef cuszCOMPAT::type type; }; -template <> struct HuffTrait<8> { typedef cuszCOMPAT::type type; }; - -template struct ReducerTrait; -template <> struct ReducerTrait<4> { typedef uint32_t type; }; -template <> struct ReducerTrait<8> { typedef uint64_t type; }; - -template struct MetadataTrait; -template <> struct MetadataTrait<4> { typedef uint32_t type; }; -template <> struct MetadataTrait<8> { typedef uint64_t type; }; // size_t is problematic; do not use - -template struct LargeInputTrait; -template <> struct LargeInputTrait { using type = MetadataTrait<4>::type; }; -template <> struct LargeInputTrait { using type = MetadataTrait<8>::type; }; - -template struct FastLowPrecisionTrait; -template <> struct FastLowPrecisionTrait { typedef float type; }; -template <> struct FastLowPrecisionTrait { typedef double type; }; - -// template struct cuszCUSPARSE; -// template <> struct cuszCUSPARSE { const static cudaDataType type = CUDA_R_32F; }; -// template <> struct cuszCUSPARSE { const static cudaDataType type = CUDA_R_64F; }; - -#ifdef __CUDACC__ -#include - -template struct CopyDirection; -template <> struct CopyDirection { static const cudaMemcpyKind direction = cudaMemcpyHostToHost; }; -template <> struct CopyDirection { static const cudaMemcpyKind direction = cudaMemcpyHostToDevice; }; -template <> struct CopyDirection { static const cudaMemcpyKind direction = cudaMemcpyDeviceToHost; }; -template <> struct CopyDirection { static const cudaMemcpyKind direction = cudaMemcpyDeviceToDevice; }; - -#endif - -// clang-format on - -#endif +/** + * @file type_traits.hh + * @author Jiannan Tian + * @brief + * @version 0.1.1 + * @date 2020-09-23 + * (create) 2020-09-23, (rev) 2021-09-17 + * + * @copyright (C) 2020 by Washington State University, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef TYPE_TRAITS_HH +#define TYPE_TRAITS_HH + +#include +#include + +#include "cusz/type.h" +#include "definition.hh" + +template +cusz_datatype cusz_typeof() +{ + if (std::is_same::value) + return FP32; + else if (std::is_same::value) + return FP64; + else + throw std::runtime_error("Type not supported."); +} + +// clang-format off + +/** + * @brief CUDA API does not accept uint64_t (understandable by literal), but instead, + * `unsigned long long`, which is ambiguous anyway. + */ +template struct cuszCOMPAT; +template <> struct cuszCOMPAT { using type = uint32_t; }; +template <> struct cuszCOMPAT { using type = unsigned long long; }; + +template struct DataTrait; +template <> struct DataTrait<4, true> { typedef float type; }; +template <> struct DataTrait<8, true> { typedef double type; }; +template <> struct DataTrait<1, false> { typedef int8_t type; }; // future use +template <> struct DataTrait<2, false> { typedef int16_t type; }; // future use +template <> struct DataTrait<4, false> { typedef int32_t type; }; // future use +template <> struct DataTrait<8, false> { typedef int64_t type; }; // future use + +template struct ChunkingTrait; +template <> struct ChunkingTrait<1> { static const int BLOCK = 256; static const int SEQ = 8; }; +template <> struct ChunkingTrait<0x101> { static const int BLOCK = 128; }; +template <> struct ChunkingTrait<0x201> { static const int BLOCK = 64; }; +template <> struct ChunkingTrait<2> { static const int BLOCK = 16; static const int YSEQ = 8; }; +template <> struct ChunkingTrait<3> { static const int BLOCK = 8; static const int YSEQ = 8; }; + +// template struct QuantTrait; +// template <> struct QuantTrait<1> { typedef uint8_t type; }; +// template <> struct QuantTrait<2> { typedef uint16_t type; }; +// template <> struct QuantTrait<4> { typedef uint32_t type; }; + +template struct ErrCtrlTrait; +template <> struct ErrCtrlTrait<1, false> { typedef uint8_t type; }; +template <> struct ErrCtrlTrait<2, false> { typedef uint16_t type; }; +template <> struct ErrCtrlTrait<4, false> { typedef uint32_t type; }; +template <> struct ErrCtrlTrait<4, true> { typedef float type; }; +template <> struct ErrCtrlTrait<8, true> { typedef double type; }; + +template struct HuffTrait; +template <> struct HuffTrait<4> { typedef cuszCOMPAT::type type; }; +template <> struct HuffTrait<8> { typedef cuszCOMPAT::type type; }; + +template struct ReducerTrait; +template <> struct ReducerTrait<4> { typedef uint32_t type; }; +template <> struct ReducerTrait<8> { typedef uint64_t type; }; + +template struct MetadataTrait; +template <> struct MetadataTrait<4> { typedef uint32_t type; }; +template <> struct MetadataTrait<8> { typedef uint64_t type; }; // size_t is problematic; do not use + +template struct LargeInputTrait; +template <> struct LargeInputTrait { using type = MetadataTrait<4>::type; }; +template <> struct LargeInputTrait { using type = MetadataTrait<8>::type; }; + +template struct FastLowPrecisionTrait; +template <> struct FastLowPrecisionTrait { typedef float type; }; +template <> struct FastLowPrecisionTrait { typedef double type; }; + +// template struct cuszCUSPARSE; +// template <> struct cuszCUSPARSE { const static cudaDataType type = CUDA_R_32F; }; +// template <> struct cuszCUSPARSE { const static cudaDataType type = CUDA_R_64F; }; + +#ifdef __CUDACC__ +#include + +template struct CopyDirection; +template <> struct CopyDirection { static const cudaMemcpyKind direction = cudaMemcpyHostToHost; }; +template <> struct CopyDirection { static const cudaMemcpyKind direction = cudaMemcpyHostToDevice; }; +template <> struct CopyDirection { static const cudaMemcpyKind direction = cudaMemcpyDeviceToHost; }; +template <> struct CopyDirection { static const cudaMemcpyKind direction = cudaMemcpyDeviceToDevice; }; + +#endif + +// clang-format on + +#endif diff --git a/qtensor/compression/cusz/include/compaction.hh b/qtensor/compression/cusz/include/compaction.hh index bd2a27eb..4a21f571 100644 --- a/qtensor/compression/cusz/include/compaction.hh +++ b/qtensor/compression/cusz/include/compaction.hh @@ -1,18 +1,18 @@ -/** - * @file compaction.hh - * @author Jiannan Tian - * @brief - * @version 0.4 - * @date 2023-01-23 - * - * (C) 2023 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef DAB40B13_9236_42A9_8047_49CD896671C9 -#define DAB40B13_9236_42A9_8047_49CD896671C9 - -template -struct CompactionDRAM; - -#endif /* DAB40B13_9236_42A9_8047_49CD896671C9 */ +/** + * @file compaction.hh + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-01-23 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef DAB40B13_9236_42A9_8047_49CD896671C9 +#define DAB40B13_9236_42A9_8047_49CD896671C9 + +template +struct CompactionDRAM; + +#endif /* DAB40B13_9236_42A9_8047_49CD896671C9 */ diff --git a/qtensor/compression/cusz/include/component.hh b/qtensor/compression/cusz/include/component.hh index ec5c08a6..34fb8e00 100644 --- a/qtensor/compression/cusz/include/component.hh +++ b/qtensor/compression/cusz/include/component.hh @@ -1,19 +1,19 @@ -/** - * @file componment.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2021-10-06 - * - * (C) 2021 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef CUSZ_COMPONENT_HH -#define CUSZ_COMPONENT_HH - -#include "component/prediction.inl" -#include "component/spcodec.inl" -#include "hf/hf.hh" - +/** + * @file componment.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-10-06 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_COMPONENT_HH +#define CUSZ_COMPONENT_HH + +#include "component/prediction.inl" +#include "component/spcodec.inl" +#include "hf/hf.hh" + #endif \ No newline at end of file diff --git a/qtensor/compression/cusz/include/component/glue.cuh b/qtensor/compression/cusz/include/component/glue.cuh index c4d69141..cdcc8ff0 100644 --- a/qtensor/compression/cusz/include/component/glue.cuh +++ b/qtensor/compression/cusz/include/component/glue.cuh @@ -1,120 +1,120 @@ -/** - * @file glue.cuh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-03-01 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef WRAPPER_GLUE_CUH -#define WRAPPER_GLUE_CUH - -#include -#include -#include -#include "spcodec.hh" - -// when using nvcc, functors must be defined outside a (__host__) function -template -struct cleanup : public thrust::unary_function { - int radius; - cleanup(int radius) : radius(radius) {} - __host__ __device__ E operator()(const E e) const { return e; } -}; - -template -void split_by_radius( - E* in_errctrl, - size_t in_len, - int const radius, - IDX* out_idx, - E* out_val, - int& out_nnz, - cudaStream_t stream = nullptr, - Policy policy = thrust::device) -{ - using thrust::placeholders::_1; - - thrust::cuda::par.on(stream); - thrust::counting_iterator zero(0); - - // find out the indices - out_nnz = thrust::copy_if(policy, zero, zero + in_len, in_errctrl, out_idx, _1 >= 2 * radius or _1 <= 0) - out_idx; - - // fetch corresponding values - thrust::copy( - policy, thrust::make_permutation_iterator(in_errctrl, out_idx), - thrust::make_permutation_iterator(in_errctrl + out_nnz, out_idx + out_nnz), out_val); - - // clear up - cleanup functor(radius); - thrust::transform( - policy, // - thrust::make_permutation_iterator(in_errctrl, out_idx), // - thrust::make_permutation_iterator(in_errctrl + out_nnz, out_idx + out_nnz), // - thrust::make_permutation_iterator(in_errctrl, out_idx), // - functor); -} - -template -void split_by_binary_twopass( - E* in_errctrl, - size_t in_len, - int const radius, - IDX* out_idx, - E* out_val, - int& out_nnz, - cudaStream_t stream = nullptr, - Policy policy = thrust::device) -{ - using thrust::placeholders::_1; - - thrust::cuda::par.on(stream); - thrust::counting_iterator zero(0); - - // find out the indices - out_nnz = thrust::copy_if(policy, zero, zero + in_len, in_errctrl, out_idx, _1 != radius) - out_idx; - - // fetch corresponding values - thrust::copy( - policy, thrust::make_permutation_iterator(in_errctrl, out_idx), - thrust::make_permutation_iterator(in_errctrl + out_nnz, out_idx + out_nnz), out_val); -} - -// when using nvcc, functors must be defined outside a (__host__) function -template -struct is_outlier { - int radius; - is_outlier(int radius) : radius(radius) {} - __host__ __device__ bool operator()(const Tuple t) const { return thrust::get<1>(t) != radius; } -}; - -template -void split_by_binary_onepass( - E* in_errctrl, - size_t in_len, - int const radius, - IDX* out_idx, - E* out_val, - int& out_nnz, - cudaStream_t stream = nullptr, - Policy policy = thrust::device) -{ - thrust::cuda::par.on(stream); - using Tuple = thrust::tuple; - thrust::counting_iterator zero(0); - - auto in = thrust::make_zip_iterator(thrust::make_tuple(zero, in_errctrl)); - auto in_last = thrust::make_zip_iterator(thrust::make_tuple(zero + in_len, in_errctrl + in_len)); - auto out = thrust::make_zip_iterator(thrust::make_tuple(out_idx, out_val)); - - is_outlier functor(radius); - out_nnz = thrust::copy_if(policy, in, in_last, out, functor) - out; -} - -enum class GlueMethod { SPLIT_BY_RADIUS, SPLIT_01_ONEPASS, SPLIT_01_TWOPASS }; - -#endif +/** + * @file glue.cuh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-03-01 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef WRAPPER_GLUE_CUH +#define WRAPPER_GLUE_CUH + +#include +#include +#include +#include "spcodec.hh" + +// when using nvcc, functors must be defined outside a (__host__) function +template +struct cleanup : public thrust::unary_function { + int radius; + cleanup(int radius) : radius(radius) {} + __host__ __device__ E operator()(const E e) const { return e; } +}; + +template +void split_by_radius( + E* in_errctrl, + size_t in_len, + int const radius, + IDX* out_idx, + E* out_val, + int& out_nnz, + cudaStream_t stream = nullptr, + Policy policy = thrust::device) +{ + using thrust::placeholders::_1; + + thrust::cuda::par.on(stream); + thrust::counting_iterator zero(0); + + // find out the indices + out_nnz = thrust::copy_if(policy, zero, zero + in_len, in_errctrl, out_idx, _1 >= 2 * radius or _1 <= 0) - out_idx; + + // fetch corresponding values + thrust::copy( + policy, thrust::make_permutation_iterator(in_errctrl, out_idx), + thrust::make_permutation_iterator(in_errctrl + out_nnz, out_idx + out_nnz), out_val); + + // clear up + cleanup functor(radius); + thrust::transform( + policy, // + thrust::make_permutation_iterator(in_errctrl, out_idx), // + thrust::make_permutation_iterator(in_errctrl + out_nnz, out_idx + out_nnz), // + thrust::make_permutation_iterator(in_errctrl, out_idx), // + functor); +} + +template +void split_by_binary_twopass( + E* in_errctrl, + size_t in_len, + int const radius, + IDX* out_idx, + E* out_val, + int& out_nnz, + cudaStream_t stream = nullptr, + Policy policy = thrust::device) +{ + using thrust::placeholders::_1; + + thrust::cuda::par.on(stream); + thrust::counting_iterator zero(0); + + // find out the indices + out_nnz = thrust::copy_if(policy, zero, zero + in_len, in_errctrl, out_idx, _1 != radius) - out_idx; + + // fetch corresponding values + thrust::copy( + policy, thrust::make_permutation_iterator(in_errctrl, out_idx), + thrust::make_permutation_iterator(in_errctrl + out_nnz, out_idx + out_nnz), out_val); +} + +// when using nvcc, functors must be defined outside a (__host__) function +template +struct is_outlier { + int radius; + is_outlier(int radius) : radius(radius) {} + __host__ __device__ bool operator()(const Tuple t) const { return thrust::get<1>(t) != radius; } +}; + +template +void split_by_binary_onepass( + E* in_errctrl, + size_t in_len, + int const radius, + IDX* out_idx, + E* out_val, + int& out_nnz, + cudaStream_t stream = nullptr, + Policy policy = thrust::device) +{ + thrust::cuda::par.on(stream); + using Tuple = thrust::tuple; + thrust::counting_iterator zero(0); + + auto in = thrust::make_zip_iterator(thrust::make_tuple(zero, in_errctrl)); + auto in_last = thrust::make_zip_iterator(thrust::make_tuple(zero + in_len, in_errctrl + in_len)); + auto out = thrust::make_zip_iterator(thrust::make_tuple(out_idx, out_val)); + + is_outlier functor(radius); + out_nnz = thrust::copy_if(policy, in, in_last, out, functor) - out; +} + +enum class GlueMethod { SPLIT_BY_RADIUS, SPLIT_01_ONEPASS, SPLIT_01_TWOPASS }; + +#endif diff --git a/qtensor/compression/cusz/include/component/pred_boilerplate_deprecated.hh b/qtensor/compression/cusz/include/component/pred_boilerplate_deprecated.hh index bb7a0584..f83c25cd 100644 --- a/qtensor/compression/cusz/include/component/pred_boilerplate_deprecated.hh +++ b/qtensor/compression/cusz/include/component/pred_boilerplate_deprecated.hh @@ -1,210 +1,210 @@ -/** - * @file predictor_boilerplate.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2021-09-15 - * - * (C) 2021 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef CUSZ_INCLUDE_PREDICTOR_HH -#define CUSZ_INCLUDE_PREDICTOR_HH - -#include -#include -#include - -#include "../common/configs.hh" -#include "../cusz/type.h" - -namespace cusz { - -class PredictorBoilerplate { - protected: - struct DerivedLengths { - struct Interpretion3D { - dim3 len3, leap; - size_t serialized; - - void set_leap() { leap = ConfigHelper::get_leap(len3); } - void set_serialized() { serialized = ConfigHelper::get_serialized_len(len3); } - }; - - struct Interpretion3D base, anchor, aligned; - - dim3 nblock; - int ndim; - - struct { - size_t data, quant, outlier, anchor; - } assigned; - - dim3 get_len3() const { return base.len3; } - dim3 get_leap() const { return base.leap; } - }; - - template - void __derive_len(dim3 base, DERIVED& derived) - { - int sublen[3] = {1, 1, 1}; - int anchor_step[3] = {1, 1, 1}; - __derive_len(base, derived, sublen, anchor_step, false); - } - - template - void - __derive_len(dim3 base, DERIVED& derived, int const sublen3[3], int const anchor_step3[3], bool use_anchor = false) - { - derived.base.len3 = base; - derived.base.set_leap(); - derived.base.set_serialized(); - derived.ndim = ConfigHelper::get_ndim(base); - - if (not use_anchor) { - derived.assigned.data = derived.base.serialized; - derived.assigned.quant = derived.base.serialized; - derived.assigned.outlier = derived.base.serialized; - derived.assigned.anchor = 0; - } - else { - derived.nblock = ConfigHelper::get_pardeg3(base, sublen3); - - derived.aligned.len3 = ConfigHelper::multiply_dim3(derived.nblock, sublen3); - derived.aligned.set_leap(); - derived.aligned.set_serialized(); - - derived.anchor.len3 = ConfigHelper::get_pardeg3(base, anchor_step3); - derived.anchor.set_leap(); - derived.anchor.set_serialized(); - - derived.assigned.data = derived.base.serialized; - derived.assigned.quant = derived.aligned.serialized; - derived.assigned.outlier = std::max(derived.base.serialized, derived.aligned.serialized); // TODO - derived.assigned.anchor = derived.anchor.serialized; - } - } - - template - void __debug_list_derived(DERIVED const& derived, bool use_anchor = false) - { - auto base = derived.base; - auto aligned = derived.aligned; - auto anchor = derived.anchor; - auto nblock = derived.nblock; - - printf("%-*s: (%u, %u, %u)\n", 16, "sizeof.{T,E,FP}", (int)sizeof(T), (int)sizeof(E), (int)sizeof(FP)); - printf("%-*s: (%u, %u, %u)\n", 16, "base.len3", base.len3.x, base.len3.y, base.len3.z); - printf("%-*s: (%u, %u, %u)\n", 16, "base.leap", base.leap.x, base.leap.y, base.leap.z); - printf("%-*s: %'zu\n", 16, "base.serial", base.serialized); - - if (use_anchor) { - printf("%-*s: (%u, %u, %u)\n", 16, "nblock", nblock.x, nblock.y, nblock.z); - - printf("%-*s: (%u, %u, %u)\n", 16, "aligned.len3", aligned.len3.x, aligned.len3.y, aligned.len3.z); - printf("%-*s: (%u, %u, %u)\n", 16, "aligned.leap", aligned.leap.x, aligned.leap.y, aligned.leap.z); - printf("%-*s: %'zu\n", 16, "aligned.serial", aligned.serialized); - - printf("%-*s: (%u, %u, %u)\n", 16, "anchor.len3", anchor.len3.x, anchor.len3.y, anchor.len3.z); - printf("%-*s: (%u, %u, %u)\n", 16, "anchor.leap", anchor.leap.x, anchor.leap.y, anchor.leap.z); - printf("%-*s: %'zu\n", 16, "anchor.serial", anchor.serialized); - } - - printf("%-*s: %'zu\n", 16, "len.data", derived.assigned.data); - printf("%-*s: %'zu\n", 16, "len.quant", derived.assigned.quant); - printf("%-*s: %'zu\n", 16, "len.outlier", derived.assigned.outlier); - printf("%-*s: %'zu\n", 16, "len.anchor", derived.assigned.anchor); - } - - void check_rtlen() - { - auto rtlen3 = rtlen.get_len3(); - auto alloclen3 = alloclen.get_len3(); - - if (rtlen3.x > alloclen3.x or rtlen3.y > alloclen3.y or rtlen3.z > alloclen3.z or - rtlen.base.serialized > alloclen.base.serialized) - throw std::runtime_error("Predictor: the runtime lengths cannot be greater than the allocation lengths."); - } - - template - void debug_list_alloclen(bool use_anchor = false) - { - printf("\ndebugging, listing allocation lengths:\n"); - __debug_list_derived(alloclen, use_anchor); - } - - template - void debug_list_rtlen(bool use_anchor = false) - { - printf("\ndebugging, listing runtime lengths:\n"); - __debug_list_derived(rtlen, use_anchor); - } - - protected: - struct DerivedLengths alloclen, rtlen; - - float time_elapsed; - - // ----------------------------------------------------------------------------- - // accessor - // ----------------------------------------------------------------------------- - public: - // helper - size_t get_alloclen_data() const { return alloclen.assigned.data; } - size_t get_alloclen_anchor() const { return alloclen.assigned.anchor; } - size_t get_alloclen_quant() const { return alloclen.assigned.quant; } - size_t get_alloclen_outlier() const { return alloclen.assigned.outlier; } - - dim3 get_len3() const { return rtlen.base.len3; } - dim3 get_leap3() const { return rtlen.base.leap; } - size_t get_len_data() const { return rtlen.assigned.data; } - size_t get_len_anchor() const { return rtlen.assigned.anchor; } - size_t get_len_quant() const { return rtlen.assigned.quant; } - size_t get_len_outlier() const { return rtlen.assigned.outlier; } - - float get_time_elapsed() const { return time_elapsed; } - - size_t get_x() const { return this->rtlen.get_len3().x; } - size_t get_y() const { return this->rtlen.get_len3().y; } - size_t get_z() const { return this->rtlen.get_len3().z; } - - dim3 get_leap() const { return this->rtlen.get_leap(); } - int get_ndim() const { return this->rtlen.ndim; } - - void derive_alloclen(cusz_predictortype predictor, dim3 base) - { - if (predictor == LorenzoI) { - // normal - this->__derive_len(base, this->alloclen); - } - - else if (predictor == Spline3) { - // maximum possible - int sublen[3] = {32, 8, 8}; - int anchor_step[3] = {8, 8, 8}; - this->__derive_len(base, this->alloclen, sublen, anchor_step, true); - } - } - - void derive_rtlen(cusz_predictortype predictor, dim3 base) - { - if (predictor == LorenzoI) { - // normal - this->__derive_len(base, this->rtlen); - } - else if (predictor == Spline3) { - // maximum possible - int sublen[3] = {32, 8, 8}; - int anchor_step[3] = {8, 8, 8}; - this->__derive_len(base, this->rtlen, sublen, anchor_step, true); - } - } - - // "real" methods - virtual ~PredictorBoilerplate() = default; -}; - -} // namespace cusz - -#endif +/** + * @file predictor_boilerplate.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-09-15 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_INCLUDE_PREDICTOR_HH +#define CUSZ_INCLUDE_PREDICTOR_HH + +#include +#include +#include + +#include "../common/configs.hh" +#include "../cusz/type.h" + +namespace cusz { + +class PredictorBoilerplate { + protected: + struct DerivedLengths { + struct Interpretion3D { + dim3 len3, leap; + size_t serialized; + + void set_leap() { leap = ConfigHelper::get_leap(len3); } + void set_serialized() { serialized = ConfigHelper::get_serialized_len(len3); } + }; + + struct Interpretion3D base, anchor, aligned; + + dim3 nblock; + int ndim; + + struct { + size_t data, quant, outlier, anchor; + } assigned; + + dim3 get_len3() const { return base.len3; } + dim3 get_leap() const { return base.leap; } + }; + + template + void __derive_len(dim3 base, DERIVED& derived) + { + int sublen[3] = {1, 1, 1}; + int anchor_step[3] = {1, 1, 1}; + __derive_len(base, derived, sublen, anchor_step, false); + } + + template + void + __derive_len(dim3 base, DERIVED& derived, int const sublen3[3], int const anchor_step3[3], bool use_anchor = false) + { + derived.base.len3 = base; + derived.base.set_leap(); + derived.base.set_serialized(); + derived.ndim = ConfigHelper::get_ndim(base); + + if (not use_anchor) { + derived.assigned.data = derived.base.serialized; + derived.assigned.quant = derived.base.serialized; + derived.assigned.outlier = derived.base.serialized; + derived.assigned.anchor = 0; + } + else { + derived.nblock = ConfigHelper::get_pardeg3(base, sublen3); + + derived.aligned.len3 = ConfigHelper::multiply_dim3(derived.nblock, sublen3); + derived.aligned.set_leap(); + derived.aligned.set_serialized(); + + derived.anchor.len3 = ConfigHelper::get_pardeg3(base, anchor_step3); + derived.anchor.set_leap(); + derived.anchor.set_serialized(); + + derived.assigned.data = derived.base.serialized; + derived.assigned.quant = derived.aligned.serialized; + derived.assigned.outlier = std::max(derived.base.serialized, derived.aligned.serialized); // TODO + derived.assigned.anchor = derived.anchor.serialized; + } + } + + template + void __debug_list_derived(DERIVED const& derived, bool use_anchor = false) + { + auto base = derived.base; + auto aligned = derived.aligned; + auto anchor = derived.anchor; + auto nblock = derived.nblock; + + printf("%-*s: (%u, %u, %u)\n", 16, "sizeof.{T,E,FP}", (int)sizeof(T), (int)sizeof(E), (int)sizeof(FP)); + printf("%-*s: (%u, %u, %u)\n", 16, "base.len3", base.len3.x, base.len3.y, base.len3.z); + printf("%-*s: (%u, %u, %u)\n", 16, "base.leap", base.leap.x, base.leap.y, base.leap.z); + printf("%-*s: %'zu\n", 16, "base.serial", base.serialized); + + if (use_anchor) { + printf("%-*s: (%u, %u, %u)\n", 16, "nblock", nblock.x, nblock.y, nblock.z); + + printf("%-*s: (%u, %u, %u)\n", 16, "aligned.len3", aligned.len3.x, aligned.len3.y, aligned.len3.z); + printf("%-*s: (%u, %u, %u)\n", 16, "aligned.leap", aligned.leap.x, aligned.leap.y, aligned.leap.z); + printf("%-*s: %'zu\n", 16, "aligned.serial", aligned.serialized); + + printf("%-*s: (%u, %u, %u)\n", 16, "anchor.len3", anchor.len3.x, anchor.len3.y, anchor.len3.z); + printf("%-*s: (%u, %u, %u)\n", 16, "anchor.leap", anchor.leap.x, anchor.leap.y, anchor.leap.z); + printf("%-*s: %'zu\n", 16, "anchor.serial", anchor.serialized); + } + + printf("%-*s: %'zu\n", 16, "len.data", derived.assigned.data); + printf("%-*s: %'zu\n", 16, "len.quant", derived.assigned.quant); + printf("%-*s: %'zu\n", 16, "len.outlier", derived.assigned.outlier); + printf("%-*s: %'zu\n", 16, "len.anchor", derived.assigned.anchor); + } + + void check_rtlen() + { + auto rtlen3 = rtlen.get_len3(); + auto alloclen3 = alloclen.get_len3(); + + if (rtlen3.x > alloclen3.x or rtlen3.y > alloclen3.y or rtlen3.z > alloclen3.z or + rtlen.base.serialized > alloclen.base.serialized) + throw std::runtime_error("Predictor: the runtime lengths cannot be greater than the allocation lengths."); + } + + template + void debug_list_alloclen(bool use_anchor = false) + { + printf("\ndebugging, listing allocation lengths:\n"); + __debug_list_derived(alloclen, use_anchor); + } + + template + void debug_list_rtlen(bool use_anchor = false) + { + printf("\ndebugging, listing runtime lengths:\n"); + __debug_list_derived(rtlen, use_anchor); + } + + protected: + struct DerivedLengths alloclen, rtlen; + + float time_elapsed; + + // ----------------------------------------------------------------------------- + // accessor + // ----------------------------------------------------------------------------- + public: + // helper + size_t get_alloclen_data() const { return alloclen.assigned.data; } + size_t get_alloclen_anchor() const { return alloclen.assigned.anchor; } + size_t get_alloclen_quant() const { return alloclen.assigned.quant; } + size_t get_alloclen_outlier() const { return alloclen.assigned.outlier; } + + dim3 get_len3() const { return rtlen.base.len3; } + dim3 get_leap3() const { return rtlen.base.leap; } + size_t get_len_data() const { return rtlen.assigned.data; } + size_t get_len_anchor() const { return rtlen.assigned.anchor; } + size_t get_len_quant() const { return rtlen.assigned.quant; } + size_t get_len_outlier() const { return rtlen.assigned.outlier; } + + float get_time_elapsed() const { return time_elapsed; } + + size_t get_x() const { return this->rtlen.get_len3().x; } + size_t get_y() const { return this->rtlen.get_len3().y; } + size_t get_z() const { return this->rtlen.get_len3().z; } + + dim3 get_leap() const { return this->rtlen.get_leap(); } + int get_ndim() const { return this->rtlen.ndim; } + + void derive_alloclen(cusz_predictortype predictor, dim3 base) + { + if (predictor == LorenzoI) { + // normal + this->__derive_len(base, this->alloclen); + } + + else if (predictor == Spline3) { + // maximum possible + int sublen[3] = {32, 8, 8}; + int anchor_step[3] = {8, 8, 8}; + this->__derive_len(base, this->alloclen, sublen, anchor_step, true); + } + } + + void derive_rtlen(cusz_predictortype predictor, dim3 base) + { + if (predictor == LorenzoI) { + // normal + this->__derive_len(base, this->rtlen); + } + else if (predictor == Spline3) { + // maximum possible + int sublen[3] = {32, 8, 8}; + int anchor_step[3] = {8, 8, 8}; + this->__derive_len(base, this->rtlen, sublen, anchor_step, true); + } + } + + // "real" methods + virtual ~PredictorBoilerplate() = default; +}; + +} // namespace cusz + +#endif diff --git a/qtensor/compression/cusz/include/component/prediction.inl b/qtensor/compression/cusz/include/component/prediction.inl index 941f2592..50091ae1 100644 --- a/qtensor/compression/cusz/include/component/prediction.inl +++ b/qtensor/compression/cusz/include/component/prediction.inl @@ -1,193 +1,193 @@ -/** - * @file prediction.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-04-23 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef FB315D3E_6B96_4F5D_9975_F35702205BC1 -#define FB315D3E_6B96_4F5D_9975_F35702205BC1 - -#include -#include -#include -#include "../common.hh" -#include "../kernel/cpplaunch_cuda.hh" -#include "../kernel/lorenzo_all.hh" -#include "../utils.hh" - -#include "cusz/type.h" -#include "pred_boilerplate_deprecated.hh" - -#define DEFINE_ARRAY(VAR, TYPE) TYPE* d_##VAR{nullptr}; - -#define ALLOCDEV(VAR, SYM, NBYTE) \ - if (NBYTE != 0) { \ - CHECK_CUDA(cudaMalloc(&d_##VAR, NBYTE)); \ - CHECK_CUDA(cudaMemset(d_##VAR, 0x0, NBYTE)); \ - } - -#define ALLOCDEV2(VAR, TYPE, LEN) \ - if (LEN != 0) { \ - CHECK_CUDA(cudaMalloc(&d_##VAR, sizeof(TYPE) * LEN)); \ - CHECK_CUDA(cudaMemset(d_##VAR, 0x0, sizeof(TYPE) * LEN)); \ - } - -#define FREE_DEV_ARRAY(VAR) \ - if (d_##VAR) { \ - CHECK_CUDA(cudaFree(d_##VAR)); \ - d_##VAR = nullptr; \ - } - -namespace cusz { - -template -class PredictionUnified : public PredictorBoilerplate { - public: - using Origin = T; - using Anchor = T; - using ErrCtrl = E; - using Precision = FP; - - public: - ~PredictionUnified() - { // dtor - FREE_DEV_ARRAY(anchor); - FREE_DEV_ARRAY(errctrl); - FREE_DEV_ARRAY(outlier); - } - PredictionUnified() {} // ctor - PredictionUnified(const PredictionUnified&); // copy ctor - PredictionUnified& operator=(const PredictionUnified&); // copy assign - PredictionUnified(PredictionUnified&&); // move ctor - PredictionUnified& operator=(PredictionUnified&&); // move assign - - void init(cusz_predictortype predictor, size_t x, size_t y, size_t z, bool dbg_print = false) - { - auto len3 = dim3(x, y, z); - init(predictor, len3, dbg_print); - } - void init(cusz_predictortype predictor, dim3 xyz, bool dbg_print = false) - { - this->derive_alloclen(predictor, xyz); - - // allocate - ALLOCDEV2(anchor, T, this->alloclen.assigned.anchor); - ALLOCDEV2(errctrl, E, this->alloclen.assigned.quant); - ALLOCDEV2(outlier, T, this->alloclen.assigned.outlier); - - if (dbg_print) this->debug_list_alloclen(); - } - - void construct( - cusz_predictortype predictor, - dim3 const len3, - T* data, - T** ptr_anchor, - E** ptr_errctrl, - T** ptr_outlier, - double const eb, - int const radius, - cudaStream_t stream) - { - *ptr_anchor = d_anchor; - *ptr_errctrl = d_errctrl; - *ptr_outlier = d_outlier; - - if (predictor == LorenzoI) { - derive_rtlen(LorenzoI, len3); - this->check_rtlen(); - - // ad hoc placeholder - // auto anchor_len3 = dim3(0, 0, 0); - // auto errctrl_len3 = dim3(0, 0, 0); - uint32_t* outlier_idx = nullptr; - - compress_predict_lorenzo_i( - data, len3, eb, radius, // - d_errctrl, d_outlier, outlier_idx, nullptr, // - &time_elapsed, stream); - } - else if (predictor == Spline3) { - this->derive_rtlen(Spline3, len3); - this->check_rtlen(); - - cusz::cpplaunch_construct_Spline3( - true, // - data, len3, d_anchor, this->rtlen.anchor.len3, d_errctrl, this->rtlen.aligned.len3, eb, radius, - &time_elapsed, stream); - } - } - - void reconstruct( - cusz_predictortype predictor, - dim3 len3, - T* outlier_xdata, - T* anchor, - E* errctrl, - double const eb, - int const radius, - cudaStream_t stream) - { - if (predictor == LorenzoI) { - this->derive_rtlen(LorenzoI, len3); - this->check_rtlen(); - - // ad hoc placeholder - // auto anchor_len3 = dim3(0, 0, 0); - // auto errctrl_len3 = dim3(0, 0, 0); - auto xdata = outlier_xdata; - auto outlier = outlier_xdata; - uint32_t* outlier_idx = nullptr; - - auto xdata_len3 = len3; - - decompress_predict_lorenzo_i( - errctrl, xdata_len3, outlier, outlier_idx, 0, eb, radius, // - xdata, // - &time_elapsed, stream); - } - else if (predictor == Spline3) { - this->derive_rtlen(Spline3, len3); - this->check_rtlen(); - // this->debug_list_rtlen(true); - - // launch_reconstruct_Spline3( - cusz::cpplaunch_reconstruct_Spline3( - outlier_xdata, len3, anchor, this->rtlen.anchor.len3, errctrl, this->rtlen.aligned.len3, eb, radius, - &time_elapsed, stream); - } - } - - void clear_buffer() { cudaMemset(d_errctrl, 0x0, sizeof(E) * this->rtlen.assigned.quant); } - - float get_time_elapsed() const { return time_elapsed; } - // size_t get_alloclen_data() const; - // size_t get_alloclen_quant() const; - // size_t get_len_data() const; - // size_t get_len_quant() const; - // size_t get_len_anchor() const; - - E* expose_quant() const { return d_errctrl; } - E* expose_errctrl() const { return d_errctrl; } - T* expose_anchor() const { return d_anchor; } - T* expose_outlier() const { return d_outlier; } - - public: - // data - DEFINE_ARRAY(anchor, T); - DEFINE_ARRAY(errctrl, E); - DEFINE_ARRAY(outlier, T); -}; - -} // namespace cusz - -#undef ALLOCDEV -#undef FREE_DEV_ARRAY -#undef DEFINE_ARRAY - -#endif /* FB315D3E_6B96_4F5D_9975_F35702205BC1 */ +/** + * @file prediction.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-23 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef FB315D3E_6B96_4F5D_9975_F35702205BC1 +#define FB315D3E_6B96_4F5D_9975_F35702205BC1 + +#include +#include +#include +#include "../common.hh" +#include "../kernel/cpplaunch_cuda.hh" +#include "../kernel/lorenzo_all.hh" +#include "../utils.hh" + +#include "cusz/type.h" +#include "pred_boilerplate_deprecated.hh" + +#define DEFINE_ARRAY(VAR, TYPE) TYPE* d_##VAR{nullptr}; + +#define ALLOCDEV(VAR, SYM, NBYTE) \ + if (NBYTE != 0) { \ + CHECK_CUDA(cudaMalloc(&d_##VAR, NBYTE)); \ + CHECK_CUDA(cudaMemset(d_##VAR, 0x0, NBYTE)); \ + } + +#define ALLOCDEV2(VAR, TYPE, LEN) \ + if (LEN != 0) { \ + CHECK_CUDA(cudaMalloc(&d_##VAR, sizeof(TYPE) * LEN)); \ + CHECK_CUDA(cudaMemset(d_##VAR, 0x0, sizeof(TYPE) * LEN)); \ + } + +#define FREE_DEV_ARRAY(VAR) \ + if (d_##VAR) { \ + CHECK_CUDA(cudaFree(d_##VAR)); \ + d_##VAR = nullptr; \ + } + +namespace cusz { + +template +class PredictionUnified : public PredictorBoilerplate { + public: + using Origin = T; + using Anchor = T; + using ErrCtrl = E; + using Precision = FP; + + public: + ~PredictionUnified() + { // dtor + FREE_DEV_ARRAY(anchor); + FREE_DEV_ARRAY(errctrl); + FREE_DEV_ARRAY(outlier); + } + PredictionUnified() {} // ctor + PredictionUnified(const PredictionUnified&); // copy ctor + PredictionUnified& operator=(const PredictionUnified&); // copy assign + PredictionUnified(PredictionUnified&&); // move ctor + PredictionUnified& operator=(PredictionUnified&&); // move assign + + void init(cusz_predictortype predictor, size_t x, size_t y, size_t z, bool dbg_print = false) + { + auto len3 = dim3(x, y, z); + init(predictor, len3, dbg_print); + } + void init(cusz_predictortype predictor, dim3 xyz, bool dbg_print = false) + { + this->derive_alloclen(predictor, xyz); + + // allocate + ALLOCDEV2(anchor, T, this->alloclen.assigned.anchor); + ALLOCDEV2(errctrl, E, this->alloclen.assigned.quant); + ALLOCDEV2(outlier, T, this->alloclen.assigned.outlier); + + if (dbg_print) this->debug_list_alloclen(); + } + + void construct( + cusz_predictortype predictor, + dim3 const len3, + T* data, + T** ptr_anchor, + E** ptr_errctrl, + T** ptr_outlier, + double const eb, + int const radius, + cudaStream_t stream) + { + *ptr_anchor = d_anchor; + *ptr_errctrl = d_errctrl; + *ptr_outlier = d_outlier; + + if (predictor == LorenzoI) { + derive_rtlen(LorenzoI, len3); + this->check_rtlen(); + + // ad hoc placeholder + // auto anchor_len3 = dim3(0, 0, 0); + // auto errctrl_len3 = dim3(0, 0, 0); + uint32_t* outlier_idx = nullptr; + + compress_predict_lorenzo_i( + data, len3, eb, radius, // + d_errctrl, d_outlier, outlier_idx, nullptr, // + &time_elapsed, stream); + } + else if (predictor == Spline3) { + this->derive_rtlen(Spline3, len3); + this->check_rtlen(); + + cusz::cpplaunch_construct_Spline3( + true, // + data, len3, d_anchor, this->rtlen.anchor.len3, d_errctrl, this->rtlen.aligned.len3, eb, radius, + &time_elapsed, stream); + } + } + + void reconstruct( + cusz_predictortype predictor, + dim3 len3, + T* outlier_xdata, + T* anchor, + E* errctrl, + double const eb, + int const radius, + cudaStream_t stream) + { + if (predictor == LorenzoI) { + this->derive_rtlen(LorenzoI, len3); + this->check_rtlen(); + + // ad hoc placeholder + // auto anchor_len3 = dim3(0, 0, 0); + // auto errctrl_len3 = dim3(0, 0, 0); + auto xdata = outlier_xdata; + auto outlier = outlier_xdata; + uint32_t* outlier_idx = nullptr; + + auto xdata_len3 = len3; + + decompress_predict_lorenzo_i( + errctrl, xdata_len3, outlier, outlier_idx, 0, eb, radius, // + xdata, // + &time_elapsed, stream); + } + else if (predictor == Spline3) { + this->derive_rtlen(Spline3, len3); + this->check_rtlen(); + // this->debug_list_rtlen(true); + + // launch_reconstruct_Spline3( + cusz::cpplaunch_reconstruct_Spline3( + outlier_xdata, len3, anchor, this->rtlen.anchor.len3, errctrl, this->rtlen.aligned.len3, eb, radius, + &time_elapsed, stream); + } + } + + void clear_buffer() { cudaMemset(d_errctrl, 0x0, sizeof(E) * this->rtlen.assigned.quant); } + + float get_time_elapsed() const { return time_elapsed; } + // size_t get_alloclen_data() const; + // size_t get_alloclen_quant() const; + // size_t get_len_data() const; + // size_t get_len_quant() const; + // size_t get_len_anchor() const; + + E* expose_quant() const { return d_errctrl; } + E* expose_errctrl() const { return d_errctrl; } + T* expose_anchor() const { return d_anchor; } + T* expose_outlier() const { return d_outlier; } + + public: + // data + DEFINE_ARRAY(anchor, T); + DEFINE_ARRAY(errctrl, E); + DEFINE_ARRAY(outlier, T); +}; + +} // namespace cusz + +#undef ALLOCDEV +#undef FREE_DEV_ARRAY +#undef DEFINE_ARRAY + +#endif /* FB315D3E_6B96_4F5D_9975_F35702205BC1 */ diff --git a/qtensor/compression/cusz/include/component/spcodec.inl b/qtensor/compression/cusz/include/component/spcodec.inl index 32c91ab0..2a57f2f1 100644 --- a/qtensor/compression/cusz/include/component/spcodec.inl +++ b/qtensor/compression/cusz/include/component/spcodec.inl @@ -1,218 +1,218 @@ -/** - * @file spcodec_vec.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-08-22 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef CF358238_3946_4FFC_B5E6_45C12F0C0B44 -#define CF358238_3946_4FFC_B5E6_45C12F0C0B44 - -#include -#include -#include - -#include -#include -#include - -#include "../common.hh" -#include "../kernel/spv_gpu.hh" -#include "utils/cuda_err.cuh" - -#define DEFINE_ARRAY(VAR, TYPE) TYPE* d_##VAR{nullptr}; - -#define SPVEC_ALLOCDEV(VAR, SYM) \ - CHECK_CUDA(cudaMalloc(&d_##VAR, rte.nbyte[RTE::SYM])); \ - CHECK_CUDA(cudaMemset(d_##VAR, 0x0, rte.nbyte[RTE::SYM])); - -#define SPVEC_FREEDEV(VAR) \ - if (d_##VAR) { \ - CHECK_CUDA(cudaFree(d_##VAR)); \ - d_##VAR = nullptr; \ - } - -#define SPVEC_D2DCPY(VAR, FIELD) \ - { \ - auto dst = d_spfmt + header.entry[Header::FIELD]; \ - auto src = reinterpret_cast(d_##VAR); \ - CHECK_CUDA(cudaMemcpyAsync(dst, src, nbyte[Header::FIELD], cudaMemcpyDeviceToDevice, stream)); \ - } - -namespace cusz { - -/******************************************************************************* - * sparsity-aware coder/decoder, vector - *******************************************************************************/ - -template -class SpcodecVec { - public: - using Origin = T; - using BYTE = uint8_t; - using MetadataT = M; - - struct alignas(128) Header { - static const int HEADER = 0; - static const int IDX = 1; - static const int VAL = 2; - static const int END = 3; - - int self_bytes : 16; - size_t uncompressed_len; - int nnz; - MetadataT entry[END + 1]; - - MetadataT subfile_size() const { return entry[END]; } - }; - - struct runtime_encode_helper { - static const int SPFMT = 0; - static const int IDX = 1; - static const int VAL = 2; - static const int END = 3; - - uint32_t nbyte[END]; - int nnz{0}; - }; - - private: - DEFINE_ARRAY(spfmt, BYTE); - DEFINE_ARRAY(idx, M); - DEFINE_ARRAY(val, T); - - using RTE = runtime_encode_helper; - - float milliseconds{0.0}; - - RTE rte; - - public: - ~SpcodecVec() - { - SPVEC_FREEDEV(spfmt); - SPVEC_FREEDEV(idx); - SPVEC_FREEDEV(val); - } // dtor - SpcodecVec() {} // ctor - SpcodecVec(const SpcodecVec&); // copy ctor - SpcodecVec& operator=(const SpcodecVec&); // copy assign - SpcodecVec(SpcodecVec&&); // move ctor - SpcodecVec& operator=(SpcodecVec&&); // move assign - - void init(size_t const len, int density_factor = 4, bool dbg_print = false) - { - auto max_bytes = [&]() { return len / density_factor * sizeof(T); }; - auto init_nnz = [&]() { return len / density_factor; }; - - memset(rte.nbyte, 0, sizeof(uint32_t) * RTE::END); - rte.nnz = init_nnz(); - - rte.nbyte[RTE::SPFMT] = max_bytes(); - rte.nbyte[RTE::IDX] = rte.nnz * sizeof(int); - rte.nbyte[RTE::VAL] = rte.nnz * sizeof(T); - - SPVEC_ALLOCDEV(spfmt, SPFMT); - SPVEC_ALLOCDEV(idx, IDX); - SPVEC_ALLOCDEV(val, VAL); - - // if (dbg_print) debug(); - } - - void encode( - T* in, - size_t const in_len, - BYTE*& out, - size_t& out_len, - cudaStream_t stream = nullptr, - bool dbg_print = false) - { - Header header; - - psz::spv_gather(in, in_len, this->d_val, this->d_idx, &rte.nnz, &milliseconds, stream); - - subfile_collect(header, in_len, stream, dbg_print); - out = d_spfmt; - out_len = header.subfile_size(); - } - - void decode(BYTE* coded, T* decoded, cudaStream_t stream = nullptr) - { - Header header; - CHECK_CUDA(cudaMemcpyAsync(&header, coded, sizeof(header), cudaMemcpyDeviceToHost, stream)); - -#define ACCESSOR(SYM, TYPE) reinterpret_cast(coded + header.entry[Header::SYM]) - auto d_idx = ACCESSOR(IDX, uint32_t); - auto d_val = ACCESSOR(VAL, T); -#undef ACCESSOR - - psz::spv_scatter(d_val, d_idx, header.nnz, decoded, &milliseconds, stream); - } - - void clear_buffer() - { - cudaMemset(d_spfmt, 0x0, rte.nbyte[RTE::SPFMT]); - cudaMemset(d_idx, 0x0, rte.nbyte[RTE::IDX]); - cudaMemset(d_val, 0x0, rte.nbyte[RTE::VAL]); - } - - float get_time_elapsed() const { return milliseconds; } - - void subfile_collect(Header& header, size_t len, cudaStream_t stream, bool dbg_print) - { - header.self_bytes = sizeof(Header); - header.uncompressed_len = len; - header.nnz = rte.nnz; - - // update (redundant here) - rte.nbyte[RTE::IDX] = sizeof(int) * rte.nnz; - rte.nbyte[RTE::VAL] = sizeof(T) * rte.nnz; - - MetadataT nbyte[Header::END]; - nbyte[Header::HEADER] = 128; - nbyte[Header::IDX] = rte.nbyte[RTE::IDX]; - nbyte[Header::VAL] = rte.nbyte[RTE::VAL]; - - header.entry[0] = 0; - // *.END + 1; need to knwo the ending position - for (auto i = 1; i < Header::END + 1; i++) { header.entry[i] = nbyte[i - 1]; } - for (auto i = 1; i < Header::END + 1; i++) { header.entry[i] += header.entry[i - 1]; } - - auto debug_header_entry = [&]() { - printf("\nCSR11::subfile_collect() debugging:\n"); - printf("%-*s: %'10ld\n", 16, "final.nnz", rte.nnz); - printf(" ENTRIES\n"); - -#define PRINT_ENTRY(VAR) printf("%d %-*s: %'10u\n", (int)Header::VAR, 14, #VAR, header.entry[Header::VAR]); - PRINT_ENTRY(HEADER); - PRINT_ENTRY(IDX); - PRINT_ENTRY(VAL); - PRINT_ENTRY(END); - printf("\n"); -#undef PRINT_ENTRY - }; - if (dbg_print) debug_header_entry(); - - CHECK_CUDA(cudaMemcpyAsync(d_spfmt, &header, sizeof(header), cudaMemcpyHostToDevice, stream)); - - /* debug */ CHECK_CUDA(cudaStreamSynchronize(stream)); - - SPVEC_D2DCPY(idx, IDX) - SPVEC_D2DCPY(val, VAL) - - /* debug */ CHECK_CUDA(cudaStreamSynchronize(stream)); - } -}; - -} // namespace cusz - -#undef DEFINE_ARRAY -#undef SPVEC_ALLOCDEV -#undef SPVEC_FREEDEV -#undef SPVEC_D2DCPY - -#endif /* CF358238_3946_4FFC_B5E6_45C12F0C0B44 */ +/** + * @file spcodec_vec.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-08-22 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef CF358238_3946_4FFC_B5E6_45C12F0C0B44 +#define CF358238_3946_4FFC_B5E6_45C12F0C0B44 + +#include +#include +#include + +#include +#include +#include + +#include "../common.hh" +#include "../kernel/spv_gpu.hh" +#include "utils/cuda_err.cuh" + +#define DEFINE_ARRAY(VAR, TYPE) TYPE* d_##VAR{nullptr}; + +#define SPVEC_ALLOCDEV(VAR, SYM) \ + CHECK_CUDA(cudaMalloc(&d_##VAR, rte.nbyte[RTE::SYM])); \ + CHECK_CUDA(cudaMemset(d_##VAR, 0x0, rte.nbyte[RTE::SYM])); + +#define SPVEC_FREEDEV(VAR) \ + if (d_##VAR) { \ + CHECK_CUDA(cudaFree(d_##VAR)); \ + d_##VAR = nullptr; \ + } + +#define SPVEC_D2DCPY(VAR, FIELD) \ + { \ + auto dst = d_spfmt + header.entry[Header::FIELD]; \ + auto src = reinterpret_cast(d_##VAR); \ + CHECK_CUDA(cudaMemcpyAsync(dst, src, nbyte[Header::FIELD], cudaMemcpyDeviceToDevice, stream)); \ + } + +namespace cusz { + +/******************************************************************************* + * sparsity-aware coder/decoder, vector + *******************************************************************************/ + +template +class SpcodecVec { + public: + using Origin = T; + using BYTE = uint8_t; + using MetadataT = M; + + struct alignas(128) Header { + static const int HEADER = 0; + static const int IDX = 1; + static const int VAL = 2; + static const int END = 3; + + int self_bytes : 16; + size_t uncompressed_len; + int nnz; + MetadataT entry[END + 1]; + + MetadataT subfile_size() const { return entry[END]; } + }; + + struct runtime_encode_helper { + static const int SPFMT = 0; + static const int IDX = 1; + static const int VAL = 2; + static const int END = 3; + + uint32_t nbyte[END]; + int nnz{0}; + }; + + private: + DEFINE_ARRAY(spfmt, BYTE); + DEFINE_ARRAY(idx, M); + DEFINE_ARRAY(val, T); + + using RTE = runtime_encode_helper; + + float milliseconds{0.0}; + + RTE rte; + + public: + ~SpcodecVec() + { + SPVEC_FREEDEV(spfmt); + SPVEC_FREEDEV(idx); + SPVEC_FREEDEV(val); + } // dtor + SpcodecVec() {} // ctor + SpcodecVec(const SpcodecVec&); // copy ctor + SpcodecVec& operator=(const SpcodecVec&); // copy assign + SpcodecVec(SpcodecVec&&); // move ctor + SpcodecVec& operator=(SpcodecVec&&); // move assign + + void init(size_t const len, int density_factor = 4, bool dbg_print = false) + { + auto max_bytes = [&]() { return len / density_factor * sizeof(T); }; + auto init_nnz = [&]() { return len / density_factor; }; + + memset(rte.nbyte, 0, sizeof(uint32_t) * RTE::END); + rte.nnz = init_nnz(); + + rte.nbyte[RTE::SPFMT] = max_bytes(); + rte.nbyte[RTE::IDX] = rte.nnz * sizeof(int); + rte.nbyte[RTE::VAL] = rte.nnz * sizeof(T); + + SPVEC_ALLOCDEV(spfmt, SPFMT); + SPVEC_ALLOCDEV(idx, IDX); + SPVEC_ALLOCDEV(val, VAL); + + // if (dbg_print) debug(); + } + + void encode( + T* in, + size_t const in_len, + BYTE*& out, + size_t& out_len, + cudaStream_t stream = nullptr, + bool dbg_print = false) + { + Header header; + + psz::spv_gather(in, in_len, this->d_val, this->d_idx, &rte.nnz, &milliseconds, stream); + + subfile_collect(header, in_len, stream, dbg_print); + out = d_spfmt; + out_len = header.subfile_size(); + } + + void decode(BYTE* coded, T* decoded, cudaStream_t stream = nullptr) + { + Header header; + CHECK_CUDA(cudaMemcpyAsync(&header, coded, sizeof(header), cudaMemcpyDeviceToHost, stream)); + +#define ACCESSOR(SYM, TYPE) reinterpret_cast(coded + header.entry[Header::SYM]) + auto d_idx = ACCESSOR(IDX, uint32_t); + auto d_val = ACCESSOR(VAL, T); +#undef ACCESSOR + + psz::spv_scatter(d_val, d_idx, header.nnz, decoded, &milliseconds, stream); + } + + void clear_buffer() + { + cudaMemset(d_spfmt, 0x0, rte.nbyte[RTE::SPFMT]); + cudaMemset(d_idx, 0x0, rte.nbyte[RTE::IDX]); + cudaMemset(d_val, 0x0, rte.nbyte[RTE::VAL]); + } + + float get_time_elapsed() const { return milliseconds; } + + void subfile_collect(Header& header, size_t len, cudaStream_t stream, bool dbg_print) + { + header.self_bytes = sizeof(Header); + header.uncompressed_len = len; + header.nnz = rte.nnz; + + // update (redundant here) + rte.nbyte[RTE::IDX] = sizeof(int) * rte.nnz; + rte.nbyte[RTE::VAL] = sizeof(T) * rte.nnz; + + MetadataT nbyte[Header::END]; + nbyte[Header::HEADER] = 128; + nbyte[Header::IDX] = rte.nbyte[RTE::IDX]; + nbyte[Header::VAL] = rte.nbyte[RTE::VAL]; + + header.entry[0] = 0; + // *.END + 1; need to knwo the ending position + for (auto i = 1; i < Header::END + 1; i++) { header.entry[i] = nbyte[i - 1]; } + for (auto i = 1; i < Header::END + 1; i++) { header.entry[i] += header.entry[i - 1]; } + + auto debug_header_entry = [&]() { + printf("\nCSR11::subfile_collect() debugging:\n"); + printf("%-*s: %'10ld\n", 16, "final.nnz", rte.nnz); + printf(" ENTRIES\n"); + +#define PRINT_ENTRY(VAR) printf("%d %-*s: %'10u\n", (int)Header::VAR, 14, #VAR, header.entry[Header::VAR]); + PRINT_ENTRY(HEADER); + PRINT_ENTRY(IDX); + PRINT_ENTRY(VAL); + PRINT_ENTRY(END); + printf("\n"); +#undef PRINT_ENTRY + }; + if (dbg_print) debug_header_entry(); + + CHECK_CUDA(cudaMemcpyAsync(d_spfmt, &header, sizeof(header), cudaMemcpyHostToDevice, stream)); + + /* debug */ CHECK_CUDA(cudaStreamSynchronize(stream)); + + SPVEC_D2DCPY(idx, IDX) + SPVEC_D2DCPY(val, VAL) + + /* debug */ CHECK_CUDA(cudaStreamSynchronize(stream)); + } +}; + +} // namespace cusz + +#undef DEFINE_ARRAY +#undef SPVEC_ALLOCDEV +#undef SPVEC_FREEDEV +#undef SPVEC_D2DCPY + +#endif /* CF358238_3946_4FFC_B5E6_45C12F0C0B44 */ diff --git a/qtensor/compression/cusz/include/compressor.hh b/qtensor/compression/cusz/include/compressor.hh index 7ea8c0ab..adea8f57 100644 --- a/qtensor/compression/cusz/include/compressor.hh +++ b/qtensor/compression/cusz/include/compressor.hh @@ -1,165 +1,165 @@ -/** - * @file compressor.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-04-23 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef CUSZ_COMPRESSOR_HH -#define CUSZ_COMPRESSOR_HH - -#include -#include - -#include "common/type_traits.hh" -#include "compaction.hh" -#include "component.hh" -#include "context.hh" -#include "header.h" - -#define PUBLIC_TYPES \ - using Predictor = typename BINDING::Predictor; \ - using Spcodec = typename BINDING::Spcodec; \ - using Codec = typename BINDING::Codec; \ - using FallbackCodec = typename BINDING::FallbackCodec; \ - using BYTE = uint8_t; \ - \ - using T = typename BINDING::DATA; \ - using FP = typename BINDING::FP; \ - using E = typename BINDING::ERRCTRL; \ - using H = typename Codec::Encoded; \ - using M = typename Codec::MetadataT; \ - using H_FB = typename FallbackCodec::Encoded; \ - \ - using TimeRecord = std::vector>; \ - using timerecord_t = TimeRecord*; - -namespace cusz { - -// extra helper -struct CompressorHelper { - static int autotune_coarse_parvle(Context* ctx); -}; - -template -class Compressor { - public: - using Predictor = typename BINDING::Predictor; - using Spcodec = typename BINDING::Spcodec; - using Codec = typename BINDING::Codec; - using FallbackCodec = typename BINDING::FallbackCodec; - using BYTE = uint8_t; - - using T = typename Predictor::Origin; - using FP = typename Predictor::Precision; - using E = typename Predictor::ErrCtrl; - using H = typename Codec::Encoded; - using M = typename Codec::MetadataT; - using H_FB = typename FallbackCodec::Encoded; - - using TimeRecord = std::vector>; - using timerecord_t = TimeRecord*; - - private: - class impl; - std::unique_ptr pimpl; - - public: - ~Compressor(); - Compressor(); - Compressor(const Compressor&); - Compressor& operator=(const Compressor&); - Compressor(Compressor&&); - Compressor& operator=(Compressor&&); - - // methods - void init(Context*, bool dbg_print = false); - void init(Header*, bool dbg_print = false); - void destroy(); - void compress(Context*, T*, BYTE*&, size_t&, cudaStream_t = nullptr, bool = false); - void decompress(Header*, BYTE*, T*, cudaStream_t = nullptr, bool = true); - void clear_buffer(); - // getter - void export_header(Header&); - void export_header(Header*); - void export_timerecord(TimeRecord*); -}; - -template -class Compressor::impl { - public: - using Predictor = typename BINDING::Predictor; - using Spcodec = typename BINDING::Spcodec; - using Codec = typename BINDING::Codec; - using FallbackCodec = typename BINDING::FallbackCodec; - using BYTE = uint8_t; - - using T = typename Predictor::Origin; - using FP = typename Predictor::Precision; - using E = typename Predictor::ErrCtrl; - using H = typename Codec::Encoded; - using M = typename Codec::MetadataT; - using H_FB = typename FallbackCodec::Encoded; - - using TimeRecord = std::vector>; - using timerecord_t = TimeRecord*; - - private: - // state - bool use_fallback_codec{false}; - bool fallback_codec_allocated{false}; - BYTE* d_reserved_compressed{nullptr}; - // profiling - TimeRecord timerecord; - // header - Header header; - // components - - Predictor* predictor; - Spcodec* spcodec; - Codec* codec; - FallbackCodec* fb_codec; - // variables - uint32_t* d_freq; - float time_hist; - dim3 data_len3; - - public: - ~impl(); - impl(); - - // public methods - void init(Context* config, bool dbg_print = false); - void init(Header* config, bool dbg_print = false); - void compress(Context*, T*, BYTE*&, size_t&, cudaStream_t = nullptr, bool = false); - void decompress(Header*, BYTE*, T*, cudaStream_t = nullptr, bool = true); - void clear_buffer(); - - // getter - void export_header(Header&); - void export_header(Header*); - void export_timerecord(TimeRecord*); - uint32_t get_len_data(); - - private: - // helper - template - void init_detail(CONFIG*, bool); - void init_codec(size_t, unsigned int, int, int, bool); - void collect_compress_timerecord(); - void collect_decompress_timerecord(); - void encode_with_exception(E*, size_t, uint32_t*, int, int, int, bool, BYTE*&, size_t&, cudaStream_t, bool); - void subfile_collect(T*, size_t, BYTE*, size_t, BYTE*, size_t, cudaStream_t, bool); - void destroy(); - // getter -}; - -} // namespace cusz - -#undef PUBLIC_TYPES - -#endif +/** + * @file compressor.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-23 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_COMPRESSOR_HH +#define CUSZ_COMPRESSOR_HH + +#include +#include + +#include "common/type_traits.hh" +#include "compaction.hh" +#include "component.hh" +#include "context.hh" +#include "header.h" + +#define PUBLIC_TYPES \ + using Predictor = typename BINDING::Predictor; \ + using Spcodec = typename BINDING::Spcodec; \ + using Codec = typename BINDING::Codec; \ + using FallbackCodec = typename BINDING::FallbackCodec; \ + using BYTE = uint8_t; \ + \ + using T = typename BINDING::DATA; \ + using FP = typename BINDING::FP; \ + using E = typename BINDING::ERRCTRL; \ + using H = typename Codec::Encoded; \ + using M = typename Codec::MetadataT; \ + using H_FB = typename FallbackCodec::Encoded; \ + \ + using TimeRecord = std::vector>; \ + using timerecord_t = TimeRecord*; + +namespace cusz { + +// extra helper +struct CompressorHelper { + static int autotune_coarse_parvle(Context* ctx); +}; + +template +class Compressor { + public: + using Predictor = typename BINDING::Predictor; + using Spcodec = typename BINDING::Spcodec; + using Codec = typename BINDING::Codec; + using FallbackCodec = typename BINDING::FallbackCodec; + using BYTE = uint8_t; + + using T = typename Predictor::Origin; + using FP = typename Predictor::Precision; + using E = typename Predictor::ErrCtrl; + using H = typename Codec::Encoded; + using M = typename Codec::MetadataT; + using H_FB = typename FallbackCodec::Encoded; + + using TimeRecord = std::vector>; + using timerecord_t = TimeRecord*; + + private: + class impl; + std::unique_ptr pimpl; + + public: + ~Compressor(); + Compressor(); + Compressor(const Compressor&); + Compressor& operator=(const Compressor&); + Compressor(Compressor&&); + Compressor& operator=(Compressor&&); + + // methods + void init(Context*, bool dbg_print = false); + void init(Header*, bool dbg_print = false); + void destroy(); + void compress(Context*, T*, BYTE*&, size_t&, cudaStream_t = nullptr, bool = false); + void decompress(Header*, BYTE*, T*, cudaStream_t = nullptr, bool = true); + void clear_buffer(); + // getter + void export_header(Header&); + void export_header(Header*); + void export_timerecord(TimeRecord*); +}; + +template +class Compressor::impl { + public: + using Predictor = typename BINDING::Predictor; + using Spcodec = typename BINDING::Spcodec; + using Codec = typename BINDING::Codec; + using FallbackCodec = typename BINDING::FallbackCodec; + using BYTE = uint8_t; + + using T = typename Predictor::Origin; + using FP = typename Predictor::Precision; + using E = typename Predictor::ErrCtrl; + using H = typename Codec::Encoded; + using M = typename Codec::MetadataT; + using H_FB = typename FallbackCodec::Encoded; + + using TimeRecord = std::vector>; + using timerecord_t = TimeRecord*; + + private: + // state + bool use_fallback_codec{false}; + bool fallback_codec_allocated{false}; + BYTE* d_reserved_compressed{nullptr}; + // profiling + TimeRecord timerecord; + // header + Header header; + // components + + Predictor* predictor; + Spcodec* spcodec; + Codec* codec; + FallbackCodec* fb_codec; + // variables + uint32_t* d_freq; + float time_hist; + dim3 data_len3; + + public: + ~impl(); + impl(); + + // public methods + void init(Context* config, bool dbg_print = false); + void init(Header* config, bool dbg_print = false); + void compress(Context*, T*, BYTE*&, size_t&, cudaStream_t = nullptr, bool = false); + void decompress(Header*, BYTE*, T*, cudaStream_t = nullptr, bool = true); + void clear_buffer(); + + // getter + void export_header(Header&); + void export_header(Header*); + void export_timerecord(TimeRecord*); + uint32_t get_len_data(); + + private: + // helper + template + void init_detail(CONFIG*, bool); + void init_codec(size_t, unsigned int, int, int, bool); + void collect_compress_timerecord(); + void collect_decompress_timerecord(); + void encode_with_exception(E*, size_t, uint32_t*, int, int, int, bool, BYTE*&, size_t&, cudaStream_t, bool); + void subfile_collect(T*, size_t, BYTE*, size_t, BYTE*, size_t, cudaStream_t, bool); + void destroy(); + // getter +}; + +} // namespace cusz + +#undef PUBLIC_TYPES + +#endif diff --git a/qtensor/compression/cusz/include/context.hh b/qtensor/compression/cusz/include/context.hh index 36cbae57..d177fb8f 100644 --- a/qtensor/compression/cusz/include/context.hh +++ b/qtensor/compression/cusz/include/context.hh @@ -1,251 +1,251 @@ -#ifndef ARGPARSE_HH -#define ARGPARSE_HH - -/** - * @file argparse.hh - * @author Jiannan Tian - * @brief Argument parser (header). - * @version 0.1 - * @date 2020-09-20 - * Created on: 20-04-24 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#include -#include -#include -#include - -#include "common/configs.hh" -#include "common/definition.hh" -#include "utils/format.hh" -#include "utils/strhelper.hh" - -namespace cusz { - -extern const char* VERSION_TEXT; -extern const int version; -extern const int compatibility; - -} // namespace cusz - -struct cuszCTX { - public: - // on-off's - struct { - bool construct{false}, reconstruct{false}, dryrun{false}; - bool experiment{false}; - bool gtest{false}; - } cli_task; - - struct { - bool binning{false}, logtransform{false}, prescan{false}; - } preprocess; - struct { - bool gpu_nvcomp_cascade{false}, cpu_gzip{false}; - } postcompress; - - struct { - bool predefined_demo{false}, release_input{false}; - bool anchor{false}, autotune_vle_pardeg{true}, gpu_verify{false}; - } use; - - struct { - bool book{false}, quant{false}; - } export_raw; - - struct { - bool write2disk{false}, huffman{false}; - } skip; - struct { - bool time{false}, cr{false}, compressibility{false}; - } report; - - // filenames - struct { - std::string fname, origin_cmp, path_basename, basename, compress_output; - } fname; - - bool verbose{false}; - - // Stat stat; - - int read_args_status{0}; - - std::string opath; - - std::string demo_dataset; - std::string dtype = ConfigHelper::get_default_dtype(); // "f32" - std::string mode = ConfigHelper::get_default_cuszmode(); // "r2r" - std::string predictor = ConfigHelper::get_default_predictor(); // "lorenzo" - std::string codec = ConfigHelper::get_default_codec(); // "huffman-coarse" - std::string spcodec = ConfigHelper::get_default_spcodec(); // "cusparse-csr" - std::string pipeline = "auto"; - - // sparsity related: init_nnz when setting up Spcodec - float nz_density{SparseMethodSetup::default_density}; - float nz_density_factor{SparseMethodSetup::default_density_factor}; - - uint32_t codecs_in_use{0b01}; - - uint32_t quant_bytewidth{2}, huff_bytewidth{4}; - - bool codec_force_fallback() const { return huff_bytewidth == 8; } - - size_t huffman_num_uints, huffman_num_bits; - int vle_sublen{512}, vle_pardeg{-1}; - - unsigned int x{1}, y{1}, z{1}, w{1}; - - struct { - // size_t x, y, z, w; - size_t len; - } alloclen; - - size_t data_len{1}, quant_len{1}, anchor_len{1}; - int ndim{-1}; - - size_t get_len() const { return data_len; } - - double eb{0.0}; - int dict_size{1024}, radius{512}; - - void load_demo_sizes(); - - /******************************************************************************* - * another configuration method, alternative to - *******************************************************************************/ - public: - // for configuration - cuszCTX& set_eb(double _) - { - eb = _; - return *this; - } - - cuszCTX& set_radius(int _) - { - radius = _; - dict_size = radius * 2; - return *this; - } - - cuszCTX& set_huffbyte(int _) - { - huff_bytewidth = _; - codecs_in_use = codec_force_fallback() ? 0b11 /*use both*/ : 0b01 /*use 4-byte*/; - return *this; - } - - cuszCTX& set_huffchunk(int _) - { - vle_sublen = _; - use.autotune_vle_pardeg = false; - return *this; - } - - cuszCTX& set_spcodec_densityfactor(int _) - { - if (_ <= 1) - throw std::runtime_error( - "Density factor for Spcodec must be >1. For example, setting the factor as 4 indicates the density " - "(the portion of nonzeros) is 25% in an array."); - nz_density_factor = _; - nz_density = 1.0 / _; - return *this; - } - - cuszCTX& enable_anchor(bool _) - { - use.anchor = true; - return *this; - } - cuszCTX& enable_input_nondestructive(bool _) - { - // placeholder - return *this; - } - - cuszCTX& enable_failfast(bool _) - { - // placeholder - return *this; - } - - cuszCTX& set_alloclen(size_t _) - { - alloclen.len = _; - return *this; - } - - cuszCTX& set_control_string(const char* in_str); - - cuszCTX& use_anchor(size_t _) - { - use.anchor = true; - return *this; - } - - // set x, y, z, w, ndim, data_len - cuszCTX& set_len(size_t _x, size_t _y = 1, size_t _z = 1, size_t _w = 1) - { - x = _x, y = _y, z = _z, w = _w; - - ndim = 4; - if (w == 1) ndim = 3; - if (z == 1) ndim = 2; - if (y == 1) ndim = 1; - - data_len = x * y * z * w; - - if (data_len == 1) throw std::runtime_error("Input data length cannot be 1 (in 1-D view)."); - if (data_len == 0) throw std::runtime_error("Input data length cannot be 0 (in 1-D view)."); - - return *this; - } - - private: - void derive_fnames(); - - void validate(); - - public: - void trap(int _status); - - static void print_doc(bool full = false); - - public: - static void parse_input_length(const char* lenstr, cuszCTX* ctx) - { - std::vector dims; - ConfigHelper::parse_length_literal(lenstr, dims); - ctx->ndim = dims.size(); - ctx->y = ctx->z = ctx->w = 1; - ctx->x = StrHelper::str2int(dims[0]); - if (ctx->ndim >= 2) ctx->y = StrHelper::str2int(dims[1]); - if (ctx->ndim >= 3) ctx->z = StrHelper::str2int(dims[2]); - if (ctx->ndim >= 4) ctx->w = StrHelper::str2int(dims[3]); - ctx->data_len = ctx->x * ctx->y * ctx->z * ctx->w; - } - - public: - cuszCTX() = default; - - cuszCTX(int argc, char** argv); - - cuszCTX(const char*, bool dbg_print = false); -}; - -typedef struct cuszCTX cusz_context; - -namespace cusz { - -using Context = cusz_context; -using context_t = cusz_context*; - -} // namespace cusz - -#endif // ARGPARSE_HH +#ifndef ARGPARSE_HH +#define ARGPARSE_HH + +/** + * @file argparse.hh + * @author Jiannan Tian + * @brief Argument parser (header). + * @version 0.1 + * @date 2020-09-20 + * Created on: 20-04-24 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include +#include +#include +#include + +#include "common/configs.hh" +#include "common/definition.hh" +#include "utils/format.hh" +#include "utils/strhelper.hh" + +namespace cusz { + +extern const char* VERSION_TEXT; +extern const int version; +extern const int compatibility; + +} // namespace cusz + +struct cuszCTX { + public: + // on-off's + struct { + bool construct{false}, reconstruct{false}, dryrun{false}; + bool experiment{false}; + bool gtest{false}; + } cli_task; + + struct { + bool binning{false}, logtransform{false}, prescan{false}; + } preprocess; + struct { + bool gpu_nvcomp_cascade{false}, cpu_gzip{false}; + } postcompress; + + struct { + bool predefined_demo{false}, release_input{false}; + bool anchor{false}, autotune_vle_pardeg{true}, gpu_verify{false}; + } use; + + struct { + bool book{false}, quant{false}; + } export_raw; + + struct { + bool write2disk{false}, huffman{false}; + } skip; + struct { + bool time{false}, cr{false}, compressibility{false}; + } report; + + // filenames + struct { + std::string fname, origin_cmp, path_basename, basename, compress_output; + } fname; + + bool verbose{false}; + + // Stat stat; + + int read_args_status{0}; + + std::string opath; + + std::string demo_dataset; + std::string dtype = ConfigHelper::get_default_dtype(); // "f32" + std::string mode = ConfigHelper::get_default_cuszmode(); // "r2r" + std::string predictor = ConfigHelper::get_default_predictor(); // "lorenzo" + std::string codec = ConfigHelper::get_default_codec(); // "huffman-coarse" + std::string spcodec = ConfigHelper::get_default_spcodec(); // "cusparse-csr" + std::string pipeline = "auto"; + + // sparsity related: init_nnz when setting up Spcodec + float nz_density{SparseMethodSetup::default_density}; + float nz_density_factor{SparseMethodSetup::default_density_factor}; + + uint32_t codecs_in_use{0b01}; + + uint32_t quant_bytewidth{2}, huff_bytewidth{4}; + + bool codec_force_fallback() const { return huff_bytewidth == 8; } + + size_t huffman_num_uints, huffman_num_bits; + int vle_sublen{512}, vle_pardeg{-1}; + + unsigned int x{1}, y{1}, z{1}, w{1}; + + struct { + // size_t x, y, z, w; + size_t len; + } alloclen; + + size_t data_len{1}, quant_len{1}, anchor_len{1}; + int ndim{-1}; + + size_t get_len() const { return data_len; } + + double eb{0.0}; + int dict_size{1024}, radius{512}; + + void load_demo_sizes(); + + /******************************************************************************* + * another configuration method, alternative to + *******************************************************************************/ + public: + // for configuration + cuszCTX& set_eb(double _) + { + eb = _; + return *this; + } + + cuszCTX& set_radius(int _) + { + radius = _; + dict_size = radius * 2; + return *this; + } + + cuszCTX& set_huffbyte(int _) + { + huff_bytewidth = _; + codecs_in_use = codec_force_fallback() ? 0b11 /*use both*/ : 0b01 /*use 4-byte*/; + return *this; + } + + cuszCTX& set_huffchunk(int _) + { + vle_sublen = _; + use.autotune_vle_pardeg = false; + return *this; + } + + cuszCTX& set_spcodec_densityfactor(int _) + { + if (_ <= 1) + throw std::runtime_error( + "Density factor for Spcodec must be >1. For example, setting the factor as 4 indicates the density " + "(the portion of nonzeros) is 25% in an array."); + nz_density_factor = _; + nz_density = 1.0 / _; + return *this; + } + + cuszCTX& enable_anchor(bool _) + { + use.anchor = true; + return *this; + } + cuszCTX& enable_input_nondestructive(bool _) + { + // placeholder + return *this; + } + + cuszCTX& enable_failfast(bool _) + { + // placeholder + return *this; + } + + cuszCTX& set_alloclen(size_t _) + { + alloclen.len = _; + return *this; + } + + cuszCTX& set_control_string(const char* in_str); + + cuszCTX& use_anchor(size_t _) + { + use.anchor = true; + return *this; + } + + // set x, y, z, w, ndim, data_len + cuszCTX& set_len(size_t _x, size_t _y = 1, size_t _z = 1, size_t _w = 1) + { + x = _x, y = _y, z = _z, w = _w; + + ndim = 4; + if (w == 1) ndim = 3; + if (z == 1) ndim = 2; + if (y == 1) ndim = 1; + + data_len = x * y * z * w; + + if (data_len == 1) throw std::runtime_error("Input data length cannot be 1 (in 1-D view)."); + if (data_len == 0) throw std::runtime_error("Input data length cannot be 0 (in 1-D view)."); + + return *this; + } + + private: + void derive_fnames(); + + void validate(); + + public: + void trap(int _status); + + static void print_doc(bool full = false); + + public: + static void parse_input_length(const char* lenstr, cuszCTX* ctx) + { + std::vector dims; + ConfigHelper::parse_length_literal(lenstr, dims); + ctx->ndim = dims.size(); + ctx->y = ctx->z = ctx->w = 1; + ctx->x = StrHelper::str2int(dims[0]); + if (ctx->ndim >= 2) ctx->y = StrHelper::str2int(dims[1]); + if (ctx->ndim >= 3) ctx->z = StrHelper::str2int(dims[2]); + if (ctx->ndim >= 4) ctx->w = StrHelper::str2int(dims[3]); + ctx->data_len = ctx->x * ctx->y * ctx->z * ctx->w; + } + + public: + cuszCTX() = default; + + cuszCTX(int argc, char** argv); + + cuszCTX(const char*, bool dbg_print = false); +}; + +typedef struct cuszCTX cusz_context; + +namespace cusz { + +using Context = cusz_context; +using context_t = cusz_context*; + +} // namespace cusz + +#endif // ARGPARSE_HH diff --git a/qtensor/compression/cusz/include/cusz.h b/qtensor/compression/cusz/include/cusz.h index 694d315c..420999cc 100644 --- a/qtensor/compression/cusz/include/cusz.h +++ b/qtensor/compression/cusz/include/cusz.h @@ -1,60 +1,60 @@ -/** - * @file cusz.h - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-04-29 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#include -//#define __cplusplus -//#ifdef __cplusplus -extern "C" { -//#endif - -#ifndef CUSZ_H -#define CUSZ_H - -#include - -#include "cusz/custom.h" -#include "cusz/record.h" -#include "cusz/type.h" -#include "header.h" - -#pragma link C++ all function -#pragma link C++ all class - -cusz_compressor* cusz_create(cusz_framework* framework, cusz_datatype const type); - -cusz_error_status cusz_release(cusz_compressor* comp); - -cusz_error_status cusz_compress( - cusz_compressor* comp, - cusz_config* config, - void* uncompressed, - cusz_len const uncomp_len, - uint8_t** compressed, - size_t* comp_bytes, - cusz_header* header, - void* record, - cudaStream_t stream); - -cusz_error_status cusz_decompress( - cusz_compressor* comp, - cusz_header* header, - uint8_t* compressed, - size_t const comp_len, - void* decompressed, - cusz_len const decomp_len, - void* record, - cudaStream_t stream); - -#endif - -//#ifdef __cplusplus -} -//#endif +/** + * @file cusz.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-29 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#include +//#define __cplusplus +//#ifdef __cplusplus +extern "C" { +//#endif + +#ifndef CUSZ_H +#define CUSZ_H + +#include + +#include "cusz/custom.h" +#include "cusz/record.h" +#include "cusz/type.h" +#include "header.h" + +#pragma link C++ all function +#pragma link C++ all class + +cusz_compressor* cusz_create(cusz_framework* framework, cusz_datatype const type); + +cusz_error_status cusz_release(cusz_compressor* comp); + +cusz_error_status cusz_compress( + cusz_compressor* comp, + cusz_config* config, + void* uncompressed, + cusz_len const uncomp_len, + uint8_t** compressed, + size_t* comp_bytes, + cusz_header* header, + void* record, + cudaStream_t stream); + +cusz_error_status cusz_decompress( + cusz_compressor* comp, + cusz_header* header, + uint8_t* compressed, + size_t const comp_len, + void* decompressed, + cusz_len const decomp_len, + void* record, + cudaStream_t stream); + +#endif + +//#ifdef __cplusplus +} +//#endif diff --git a/qtensor/compression/cusz/include/cusz/custom.h b/qtensor/compression/cusz/include/cusz/custom.h index c44682be..2ab7706d 100644 --- a/qtensor/compression/cusz/include/cusz/custom.h +++ b/qtensor/compression/cusz/include/cusz/custom.h @@ -1,26 +1,26 @@ -/** - * @file compress.h - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-04-30 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#ifdef __cplusplus -extern "C" { -#endif - -#include "type.h" - -cusz_custom_predictor cusz_default_predictor(); -cusz_custom_codec cusz_default_codec(); -cusz_custom_huffman_codec cusz_default_huffman_codec(); -cusz_custom_spcodec cusz_default_spcodec(); -cusz_custom_framework* cusz_default_framework(); - -#ifdef __cplusplus -} -#endif +/** + * @file compress.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-30 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "type.h" + +cusz_custom_predictor cusz_default_predictor(); +cusz_custom_codec cusz_default_codec(); +cusz_custom_huffman_codec cusz_default_huffman_codec(); +cusz_custom_spcodec cusz_default_spcodec(); +cusz_custom_framework* cusz_default_framework(); + +#ifdef __cplusplus +} +#endif diff --git a/qtensor/compression/cusz/include/cusz/it.hh b/qtensor/compression/cusz/include/cusz/it.hh index 1e8daa34..5334acde 100644 --- a/qtensor/compression/cusz/include/cusz/it.hh +++ b/qtensor/compression/cusz/include/cusz/it.hh @@ -1,78 +1,78 @@ -/** - * @file it.hh - * @author Jiannan Tian - * @brief - * @version 0.4 - * @date 2023-03-13 - * - * (C) 2023 by Indiana University, Argonne National Laboratory - * - */ - -#include -#include -#include -#include -#include - -template -struct psz_buf { - private: - T* _buf; - size_t _len{1}; - static const int stridey{BLOCK}; - static const int stridez{BLOCK * BLOCK}; - - public: - psz_buf(bool do_memset = true) - { - if (DIM == 1) _len = BLOCK; - if (DIM == 2) _len = BLOCK * BLOCK; - if (DIM == 3) _len = BLOCK * BLOCK * BLOCK; - _buf = new T[_len]; - if (do_memset) memset(_buf, 0x0, sizeof(T) * _len); - } - - ~psz_buf() { delete[] _buf; } - - T*& buf() { return _buf; } - - T& operator()(int x) { return _buf[x]; } - T& operator()(int x, int y) { return _buf[x + y * stridey]; } - T& operator()(int x, int y, int z) { return _buf[x + y * stridey + z * stridez]; } -}; - -template -struct psz_outlier_serial { - private: - T* _data; - IDX* _idx; - uint32_t _count{0}; - uint32_t _cap; - - public: - psz_outlier_serial(size_t cap) : _cap(cap) - { - _data = new T[cap + 1]; - _idx = new IDX[cap + 1]; - memset(_data, 0x0, sizeof(T) * cap); - } - - ~psz_outlier_serial() - { - delete[] _data; - delete[] _idx; - } - - T*& val() { return _data; } - IDX*& idx() { return _idx; } - uint32_t const count() { return _count; } - - void record(T data, IDX idx) - { - if (_count > _cap) throw std::runtime_error("Outlier overflows."); - _data[_count] = data; - _idx[_count] = idx; - ++_count; - } +/** + * @file it.hh + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-03-13 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#include +#include +#include +#include +#include + +template +struct psz_buf { + private: + T* _buf; + size_t _len{1}; + static const int stridey{BLOCK}; + static const int stridez{BLOCK * BLOCK}; + + public: + psz_buf(bool do_memset = true) + { + if (DIM == 1) _len = BLOCK; + if (DIM == 2) _len = BLOCK * BLOCK; + if (DIM == 3) _len = BLOCK * BLOCK * BLOCK; + _buf = new T[_len]; + if (do_memset) memset(_buf, 0x0, sizeof(T) * _len); + } + + ~psz_buf() { delete[] _buf; } + + T*& buf() { return _buf; } + + T& operator()(int x) { return _buf[x]; } + T& operator()(int x, int y) { return _buf[x + y * stridey]; } + T& operator()(int x, int y, int z) { return _buf[x + y * stridey + z * stridez]; } +}; + +template +struct psz_outlier_serial { + private: + T* _data; + IDX* _idx; + uint32_t _count{0}; + uint32_t _cap; + + public: + psz_outlier_serial(size_t cap) : _cap(cap) + { + _data = new T[cap + 1]; + _idx = new IDX[cap + 1]; + memset(_data, 0x0, sizeof(T) * cap); + } + + ~psz_outlier_serial() + { + delete[] _data; + delete[] _idx; + } + + T*& val() { return _data; } + IDX*& idx() { return _idx; } + uint32_t const count() { return _count; } + + void record(T data, IDX idx) + { + if (_count > _cap) throw std::runtime_error("Outlier overflows."); + _data[_count] = data; + _idx[_count] = idx; + ++_count; + } }; \ No newline at end of file diff --git a/qtensor/compression/cusz/include/cusz/nd.h b/qtensor/compression/cusz/include/cusz/nd.h index 007dfd7d..2c4443bc 100644 --- a/qtensor/compression/cusz/include/cusz/nd.h +++ b/qtensor/compression/cusz/include/cusz/nd.h @@ -1,15 +1,15 @@ - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include - -typedef struct psz_dim3 { - uint32_t x, y, z; -} psz_dim3; - -#ifdef __cplusplus -} + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +typedef struct psz_dim3 { + uint32_t x, y, z; +} psz_dim3; + +#ifdef __cplusplus +} #endif \ No newline at end of file diff --git a/qtensor/compression/cusz/include/cusz/pn.hh b/qtensor/compression/cusz/include/cusz/pn.hh index 1c1bb472..9c0f78bf 100644 --- a/qtensor/compression/cusz/include/cusz/pn.hh +++ b/qtensor/compression/cusz/include/cusz/pn.hh @@ -1,49 +1,49 @@ -/** - * @file pn.hh - * @author Jiannan Tian - * @brief - * @version 0.4 - * @date 2023-01-05 - * - * (C) 2023 by Indiana University, Argonne National Laboratory - * - */ - -#include -#include - -// TODO typing should be more applicable - -namespace psz { -namespace typing { - -// clang-format off -template struct Int; -template <> struct Int<1> { typedef int8_t T; }; -template <> struct Int<2> { typedef int16_t T; }; -template <> struct Int<4> { typedef int32_t T; }; -template <> struct Int<8> { typedef int64_t T; }; - -template struct UInt; -template <> struct UInt<1> { typedef uint8_t T; }; -template <> struct UInt<2> { typedef uint16_t T; }; -template <> struct UInt<4> { typedef uint32_t T; }; -template <> struct UInt<8> { typedef uint64_t T; }; -// clang-format on - -} // namespace typing -} // namespace psz - -// TODO forward definition in another file -template -struct PN { - using UI = typename psz::typing::UInt::T; - using I = typename psz::typing::Int::T; - - // reference: https://lemire.me/blog/2022/11/25/making-all-your-integers-positive-with-zigzag-encoding/ - - static UI encode(I* x) { return (2 * (*x)) ^ ((*x) >> (BYTEWIDTH * 8 - 1)); } - static UI encode(I x) { return (2 * x) ^ (x >> (BYTEWIDTH * 8 - 1)); } - static I decode(UI* x) { return ((*x) >> 1) ^ (-((*x) & 1)); } - static I decode(UI x) { return (x >> 1) ^ (-(x & 1)); } -}; +/** + * @file pn.hh + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-01-05 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#include +#include + +// TODO typing should be more applicable + +namespace psz { +namespace typing { + +// clang-format off +template struct Int; +template <> struct Int<1> { typedef int8_t T; }; +template <> struct Int<2> { typedef int16_t T; }; +template <> struct Int<4> { typedef int32_t T; }; +template <> struct Int<8> { typedef int64_t T; }; + +template struct UInt; +template <> struct UInt<1> { typedef uint8_t T; }; +template <> struct UInt<2> { typedef uint16_t T; }; +template <> struct UInt<4> { typedef uint32_t T; }; +template <> struct UInt<8> { typedef uint64_t T; }; +// clang-format on + +} // namespace typing +} // namespace psz + +// TODO forward definition in another file +template +struct PN { + using UI = typename psz::typing::UInt::T; + using I = typename psz::typing::Int::T; + + // reference: https://lemire.me/blog/2022/11/25/making-all-your-integers-positive-with-zigzag-encoding/ + + static UI encode(I* x) { return (2 * (*x)) ^ ((*x) >> (BYTEWIDTH * 8 - 1)); } + static UI encode(I x) { return (2 * x) ^ (x >> (BYTEWIDTH * 8 - 1)); } + static I decode(UI* x) { return ((*x) >> 1) ^ (-((*x) & 1)); } + static I decode(UI x) { return (x >> 1) ^ (-(x & 1)); } +}; diff --git a/qtensor/compression/cusz/include/cusz/record.h b/qtensor/compression/cusz/include/cusz/record.h index d285f1b1..3c9be515 100644 --- a/qtensor/compression/cusz/include/cusz/record.h +++ b/qtensor/compression/cusz/include/cusz/record.h @@ -1,38 +1,38 @@ -/** - * @file record.h - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-04-30 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef CUSZ_RECORD_H -#define CUSZ_RECORD_H - -#ifdef __cplusplus -extern "C" { -#endif - -struct cusz_record_entry; - -struct cusz_record_entry { - const char* name; - double time; - - struct cusz_record_entry* next; -}; - -typedef struct cusz_record { - int n; - - struct cusz_record_entry* head; -} cusz_record; - -#ifdef __cplusplus -} -#endif - -#endif +/** + * @file record.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-30 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_RECORD_H +#define CUSZ_RECORD_H + +#ifdef __cplusplus +extern "C" { +#endif + +struct cusz_record_entry; + +struct cusz_record_entry { + const char* name; + double time; + + struct cusz_record_entry* next; +}; + +typedef struct cusz_record { + int n; + + struct cusz_record_entry* head; +} cusz_record; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/qtensor/compression/cusz/include/cusz/type.h b/qtensor/compression/cusz/include/cusz/type.h index b5f2d750..73e66086 100644 --- a/qtensor/compression/cusz/include/cusz/type.h +++ b/qtensor/compression/cusz/include/cusz/type.h @@ -1,219 +1,219 @@ -/** - * @file type.h - * @author Jiannan Tian - * @brief C-complient type definitions; no methods in this header. - * @version 0.3 - * @date 2022-04-29 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#ifdef __cplusplus -extern "C" { -#endif - -#ifndef CUSZ_TYPE_H -#define CUSZ_TYPE_H - -#include "stddef.h" - -enum cusz_execution_policy { CPU, CUDA }; -typedef enum cusz_execution_policy cusz_execution_policy; -typedef enum cusz_execution_policy cusz_policy; -typedef enum cusz_execution_policy asz_policy; - -//////// state enumeration - -typedef enum cusz_error_status { // - CUSZ_SUCCESS = 0x00, - CUSZ_FAIL_ONDISK_FILE_ERROR = 0x01, - CUSZ_FAIL_DATA_NOT_READY = 0x02, - // specify error when calling CUDA API - CUSZ_FAIL_GPU_MALLOC, - CUSZ_FAIL_GPU_MEMCPY, - CUSZ_FAIL_GPU_ILLEGAL_ACCESS, - // specify error related to our own memory manager - CUSZ_FAIL_GPU_OUT_OF_MEMORY, - // when compression is useless - CUSZ_FAIL_INCOMPRESSIABLE, - // TODO component related error - CUSZ_FAIL_UNSUPPORTED_DATATYPE, - CUSZ_FAIL_UNSUPPORTED_QUANTTYPE, - CUSZ_FAIL_UNSUPPORTED_PRECISION, - CUSZ_FAIL_UNSUPPORTED_PIPELINE, - // not-implemented error - CUSZ_NOT_IMPLEMENTED = 0x0100, -} cusz_error_status; - -typedef struct cusz_fixedlen_internal { /* all nullable */ - void* encoding; -} cusz_fixedlen_internal; -typedef struct cusz_varlen_internal { /* all nullable */ - void* huffman; - void* outlier; -} cusz_varlen_internal; - -typedef enum cusz_datatype // -{ FP32 = 0, - FP64 = 1, - UINT8 = 10, - UINT16 = 11, - UINT32 = 12, - UINT64 = 13 } cusz_datatype; - -typedef enum cusz_executiontype // -{ Device = 0, - Host = 1, - None = 2 } cusz_executiontype; - -typedef enum cusz_mode // -{ Abs = 0, - Rel = 1 } cusz_mode; - -typedef enum cusz_pipelinetype // -{ Auto = 0, - Dense = 1, - Sparse = 2 } cusz_pipelinetype; - -typedef enum cusz_predictortype // -{ Lorenzo0 = 0, - LorenzoI = 1, - LorenzoII = 2, - Spline3 = 3 } cusz_predictortype; - -typedef enum cusz_preprocessingtype // -{ FP64toFP32 = 0, - LogTransform, - ShiftedLogTransform, - Binning2x2, - Binning2x1, - Binning1x2, -} cusz_preprocessingtype; - -typedef enum cusz_codectype // -{ Huffman = 0, - RunLength, - NvcompCascade, - NvcompLz4, - NvcompSnappy, -} cusz_codectype; - -typedef enum cusz_spcodectype // -{ SparseMat = 0, - SparseVec = 1 } cusz_spcodectype; - -typedef enum cusz_huffman_booktype // -{ Tree = 0, - Canonical = 1 } cusz_huffman_booktype; - -typedef enum cusz_huffman_codingtype // -{ Coarse = 0, - Fine = 1 } cusz_huffman_codingtype; - -//////// configuration template -typedef struct cusz_custom_len { - // clang-format off - union { size_t x0, x; }; - union { size_t x1, y; }; - union { size_t x2, z; }; - union { size_t x3, w; }; - // double factor; - // clang-format on -} cusz_custom_len; -typedef cusz_custom_len cusz_len; - -typedef struct cusz_custom_preprocessing { - cusz_custom_len before; - cusz_custom_len after; - cusz_preprocessingtype* list; - int nstep; - -} cusz_custom_preprocessing; - -typedef struct cusz_custom_predictor { - cusz_predictortype type; - - bool anchor; - bool nondestructive; -} cusz_custom_predictor; - -typedef struct cusz_custom_quantization { - int radius; - bool delayed; -} cusz_custom_quantization; - -typedef struct cusz_custom_codec { - cusz_codectype type; - - bool variable_length; - float presumed_density; -} cusz_custom_codec; - -typedef struct cusz_custom_huffman_codec { - cusz_huffman_booktype book; - cusz_executiontype book_policy; - cusz_huffman_codingtype coding; - - int booklen; - int coarse_pardeg; -} cusz_custom_huffman_codec; - -typedef struct cusz_custom_spcodec { - cusz_spcodectype type; - float presumed_density; -} cusz_custom_spcodec; - -////// wrap-up - -/** - * @deprecated The framework could be simplifed & unified. - */ -typedef struct cusz_custom_framework { - cusz_datatype datatype; - cusz_pipelinetype pipeline; - - cusz_custom_predictor predictor; - cusz_custom_quantization quantization; - cusz_custom_codec codec; - // cusz_custom_spcodec spcodec; - - cusz_custom_huffman_codec huffman; -} cusz_custom_framework; - -typedef cusz_custom_framework cusz_framework; - -typedef struct cusz_compressor_redundancy_compat_purpose { - void* compressor; - cusz_framework* framework; - cusz_datatype type; -} cusz_compressor_compat; - -typedef cusz_compressor_compat cusz_compressor; - -typedef struct cusz_runtime_config { - double eb; - cusz_mode mode; -} cusz_runtime_config; -typedef cusz_runtime_config cusz_config; - -typedef struct Res { - double min, max, rng, std; -} Res; - -typedef struct cusz_stats { - // clang-format off - Res odata, xdata; - struct { double PSNR, MSE, NRMSE, coeff; } reduced; - struct { double abs, rel, pwrrel; size_t idx; } max_err; - struct { double lag_one, lag_two; } autocor; - double user_eb; - size_t len; - // clang-format on -} cusz_stats; - -#endif - -#ifdef __cplusplus -} -#endif +/** + * @file type.h + * @author Jiannan Tian + * @brief C-complient type definitions; no methods in this header. + * @version 0.3 + * @date 2022-04-29 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef CUSZ_TYPE_H +#define CUSZ_TYPE_H + +#include "stddef.h" + +enum cusz_execution_policy { CPU, CUDA }; +typedef enum cusz_execution_policy cusz_execution_policy; +typedef enum cusz_execution_policy cusz_policy; +typedef enum cusz_execution_policy asz_policy; + +//////// state enumeration + +typedef enum cusz_error_status { // + CUSZ_SUCCESS = 0x00, + CUSZ_FAIL_ONDISK_FILE_ERROR = 0x01, + CUSZ_FAIL_DATA_NOT_READY = 0x02, + // specify error when calling CUDA API + CUSZ_FAIL_GPU_MALLOC, + CUSZ_FAIL_GPU_MEMCPY, + CUSZ_FAIL_GPU_ILLEGAL_ACCESS, + // specify error related to our own memory manager + CUSZ_FAIL_GPU_OUT_OF_MEMORY, + // when compression is useless + CUSZ_FAIL_INCOMPRESSIABLE, + // TODO component related error + CUSZ_FAIL_UNSUPPORTED_DATATYPE, + CUSZ_FAIL_UNSUPPORTED_QUANTTYPE, + CUSZ_FAIL_UNSUPPORTED_PRECISION, + CUSZ_FAIL_UNSUPPORTED_PIPELINE, + // not-implemented error + CUSZ_NOT_IMPLEMENTED = 0x0100, +} cusz_error_status; + +typedef struct cusz_fixedlen_internal { /* all nullable */ + void* encoding; +} cusz_fixedlen_internal; +typedef struct cusz_varlen_internal { /* all nullable */ + void* huffman; + void* outlier; +} cusz_varlen_internal; + +typedef enum cusz_datatype // +{ FP32 = 0, + FP64 = 1, + UINT8 = 10, + UINT16 = 11, + UINT32 = 12, + UINT64 = 13 } cusz_datatype; + +typedef enum cusz_executiontype // +{ Device = 0, + Host = 1, + None = 2 } cusz_executiontype; + +typedef enum cusz_mode // +{ Abs = 0, + Rel = 1 } cusz_mode; + +typedef enum cusz_pipelinetype // +{ Auto = 0, + Dense = 1, + Sparse = 2 } cusz_pipelinetype; + +typedef enum cusz_predictortype // +{ Lorenzo0 = 0, + LorenzoI = 1, + LorenzoII = 2, + Spline3 = 3 } cusz_predictortype; + +typedef enum cusz_preprocessingtype // +{ FP64toFP32 = 0, + LogTransform, + ShiftedLogTransform, + Binning2x2, + Binning2x1, + Binning1x2, +} cusz_preprocessingtype; + +typedef enum cusz_codectype // +{ Huffman = 0, + RunLength, + NvcompCascade, + NvcompLz4, + NvcompSnappy, +} cusz_codectype; + +typedef enum cusz_spcodectype // +{ SparseMat = 0, + SparseVec = 1 } cusz_spcodectype; + +typedef enum cusz_huffman_booktype // +{ Tree = 0, + Canonical = 1 } cusz_huffman_booktype; + +typedef enum cusz_huffman_codingtype // +{ Coarse = 0, + Fine = 1 } cusz_huffman_codingtype; + +//////// configuration template +typedef struct cusz_custom_len { + // clang-format off + union { size_t x0, x; }; + union { size_t x1, y; }; + union { size_t x2, z; }; + union { size_t x3, w; }; + // double factor; + // clang-format on +} cusz_custom_len; +typedef cusz_custom_len cusz_len; + +typedef struct cusz_custom_preprocessing { + cusz_custom_len before; + cusz_custom_len after; + cusz_preprocessingtype* list; + int nstep; + +} cusz_custom_preprocessing; + +typedef struct cusz_custom_predictor { + cusz_predictortype type; + + bool anchor; + bool nondestructive; +} cusz_custom_predictor; + +typedef struct cusz_custom_quantization { + int radius; + bool delayed; +} cusz_custom_quantization; + +typedef struct cusz_custom_codec { + cusz_codectype type; + + bool variable_length; + float presumed_density; +} cusz_custom_codec; + +typedef struct cusz_custom_huffman_codec { + cusz_huffman_booktype book; + cusz_executiontype book_policy; + cusz_huffman_codingtype coding; + + int booklen; + int coarse_pardeg; +} cusz_custom_huffman_codec; + +typedef struct cusz_custom_spcodec { + cusz_spcodectype type; + float presumed_density; +} cusz_custom_spcodec; + +////// wrap-up + +/** + * @deprecated The framework could be simplifed & unified. + */ +typedef struct cusz_custom_framework { + cusz_datatype datatype; + cusz_pipelinetype pipeline; + + cusz_custom_predictor predictor; + cusz_custom_quantization quantization; + cusz_custom_codec codec; + // cusz_custom_spcodec spcodec; + + cusz_custom_huffman_codec huffman; +} cusz_custom_framework; + +typedef cusz_custom_framework cusz_framework; + +typedef struct cusz_compressor_redundancy_compat_purpose { + void* compressor; + cusz_framework* framework; + cusz_datatype type; +} cusz_compressor_compat; + +typedef cusz_compressor_compat cusz_compressor; + +typedef struct cusz_runtime_config { + double eb; + cusz_mode mode; +} cusz_runtime_config; +typedef cusz_runtime_config cusz_config; + +typedef struct Res { + double min, max, rng, std; +} Res; + +typedef struct cusz_stats { + // clang-format off + Res odata, xdata; + struct { double PSNR, MSE, NRMSE, coeff; } reduced; + struct { double abs, rel, pwrrel; size_t idx; } max_err; + struct { double lag_one, lag_two; } autocor; + double user_eb; + size_t len; + // clang-format on +} cusz_stats; + +#endif + +#ifdef __cplusplus +} +#endif diff --git a/qtensor/compression/cusz/include/framework.hh b/qtensor/compression/cusz/include/framework.hh index b0e99960..9655fe25 100644 --- a/qtensor/compression/cusz/include/framework.hh +++ b/qtensor/compression/cusz/include/framework.hh @@ -1,62 +1,62 @@ -/** - * @file framework.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-04-23 - * (create) 2021-10-06 (rev) 2022-04-23 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef CUSZ_FRAMEWORK -#define CUSZ_FRAMEWORK - -#include "component.hh" -#include "compressor.hh" - -namespace cusz { - -template -struct Framework { - public: - /** - * - * Predictor - * | | ^ - * v | | - * Spcodec | +---- default "fast-lowlowprecision" - * v - * Encoder - */ - - using DATA = InputDataType; - using ERRCTRL = ErrCtrlTrait<4, false>::type; // predefined for mem. overlapping - using FP = typename FastLowPrecisionTrait::type; - using Huff4 = HuffTrait<4>::type; - using Huff8 = HuffTrait<8>::type; - using Meta4 = MetadataTrait<4>::type; - - template - struct CompressorTemplate; - - /* Predictor */ - using CompatPurposePredictor = typename cusz::PredictionUnified; - using Predictor = CompatPurposePredictor; - - using CompatPurposeSpcodec = typename cusz::SpcodecVec; - using Spcodec = CompatPurposeSpcodec; - - /* Lossless Codec*/ - using CodecHuffman32 = cusz::LosslessCodec; - using CodecHuffman64 = cusz::LosslessCodec; - using Codec = CodecHuffman32; - using FallbackCodec = CodecHuffman64; -}; - -using CompressorFP32 = cusz::Compressor>; - -} // namespace cusz - -#endif +/** + * @file framework.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-23 + * (create) 2021-10-06 (rev) 2022-04-23 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_FRAMEWORK +#define CUSZ_FRAMEWORK + +#include "component.hh" +#include "compressor.hh" + +namespace cusz { + +template +struct Framework { + public: + /** + * + * Predictor + * | | ^ + * v | | + * Spcodec | +---- default "fast-lowlowprecision" + * v + * Encoder + */ + + using DATA = InputDataType; + using ERRCTRL = ErrCtrlTrait<4, false>::type; // predefined for mem. overlapping + using FP = typename FastLowPrecisionTrait::type; + using Huff4 = HuffTrait<4>::type; + using Huff8 = HuffTrait<8>::type; + using Meta4 = MetadataTrait<4>::type; + + template + struct CompressorTemplate; + + /* Predictor */ + using CompatPurposePredictor = typename cusz::PredictionUnified; + using Predictor = CompatPurposePredictor; + + using CompatPurposeSpcodec = typename cusz::SpcodecVec; + using Spcodec = CompatPurposeSpcodec; + + /* Lossless Codec*/ + using CodecHuffman32 = cusz::LosslessCodec; + using CodecHuffman64 = cusz::LosslessCodec; + using Codec = CodecHuffman32; + using FallbackCodec = CodecHuffman64; +}; + +using CompressorFP32 = cusz::Compressor>; + +} // namespace cusz + +#endif diff --git a/qtensor/compression/cusz/include/header.h b/qtensor/compression/cusz/include/header.h index c0fd67d8..05287edc 100644 --- a/qtensor/compression/cusz/include/header.h +++ b/qtensor/compression/cusz/include/header.h @@ -1,111 +1,111 @@ -#ifndef CUSZ_HEADER_H -#define CUSZ_HEADER_H - -/** - * @file header.h - * @author Jiannan Tian - * @brief - * @version 0.2 - * @date 2021-01-22 - * (created) 2020-09-25, (rev.1) 2021-01-22 (rev.2) 2021-09-08 (rev.3) 2022-02-26 - * - * @copyright (C) 2020 by Washington State University, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include -#include - -typedef struct alignas(128) cusz_header { - static const int HEADER = 0; - static const int ANCHOR = 1; - static const int VLE = 2; - static const int SPFMT = 3; - - static const int END = 4; - - uint32_t self_bytes : 16; - uint32_t fp : 1; - uint32_t byte_vle : 4; // 4, 8 - uint32_t nz_density_factor : 8; - uint32_t codecs_in_use : 2; - uint32_t vle_pardeg; - uint32_t x, y, z, w; - double eb; - uint32_t radius : 16; - - uint32_t entry[END + 1]; - - // uint32_t byte_uncompressed : 4; // T; 1, 2, 4, 8 - // uint32_t byte_errctrl : 3; // 1, 2, 4 - // uint32_t byte_meta : 4; // 4, 8 - // uint32_t ndim : 3; // 1,2,3,4 - // size_t data_len; - // size_t errctrl_len; - -} cusz_header; - -typedef cusz_header cuszHEADER; - -typedef struct alignas(128) v2_cusz_header { - // data segments - static const int HEADER = 0; - static const int ANCHOR = 1; - static const int SP_IDX = 2; - static const int SP_VAL = 3; - static const int HF = 4; - static const int END = 5; - uint32_t entry[END + 1]; - - struct { - uint32_t precision : 1; - } data; - - uint32_t x, y, z, w; - - // struct { - // uint32_t codecs_in_use : 2; - double eb; - uint32_t radius : 16; - // } config; - - struct { - uint32_t factor : 8; // density = 1/factor - uint32_t count; - } sp; - - struct { - uint32_t rep_bytes : 4; // 4, 8 - uint32_t sublen : 28; - uint32_t pardeg; - } hf; - - // TODO replace the following with hf.VAR - uint32_t vle_pardeg; - -} psz_header; - -#ifdef __cplusplus -} -#endif - -namespace cusz { - -using Header = cusz_header; -using header_t = cusz_header*; - -} // namespace cusz - -namespace psz { - -using v2_header = v2_cusz_header; - -} - -#endif +#ifndef CUSZ_HEADER_H +#define CUSZ_HEADER_H + +/** + * @file header.h + * @author Jiannan Tian + * @brief + * @version 0.2 + * @date 2021-01-22 + * (created) 2020-09-25, (rev.1) 2021-01-22 (rev.2) 2021-09-08 (rev.3) 2022-02-26 + * + * @copyright (C) 2020 by Washington State University, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +typedef struct alignas(128) cusz_header { + static const int HEADER = 0; + static const int ANCHOR = 1; + static const int VLE = 2; + static const int SPFMT = 3; + + static const int END = 4; + + uint32_t self_bytes : 16; + uint32_t fp : 1; + uint32_t byte_vle : 4; // 4, 8 + uint32_t nz_density_factor : 8; + uint32_t codecs_in_use : 2; + uint32_t vle_pardeg; + uint32_t x, y, z, w; + double eb; + uint32_t radius : 16; + + uint32_t entry[END + 1]; + + // uint32_t byte_uncompressed : 4; // T; 1, 2, 4, 8 + // uint32_t byte_errctrl : 3; // 1, 2, 4 + // uint32_t byte_meta : 4; // 4, 8 + // uint32_t ndim : 3; // 1,2,3,4 + // size_t data_len; + // size_t errctrl_len; + +} cusz_header; + +typedef cusz_header cuszHEADER; + +typedef struct alignas(128) v2_cusz_header { + // data segments + static const int HEADER = 0; + static const int ANCHOR = 1; + static const int SP_IDX = 2; + static const int SP_VAL = 3; + static const int HF = 4; + static const int END = 5; + uint32_t entry[END + 1]; + + struct { + uint32_t precision : 1; + } data; + + uint32_t x, y, z, w; + + // struct { + // uint32_t codecs_in_use : 2; + double eb; + uint32_t radius : 16; + // } config; + + struct { + uint32_t factor : 8; // density = 1/factor + uint32_t count; + } sp; + + struct { + uint32_t rep_bytes : 4; // 4, 8 + uint32_t sublen : 28; + uint32_t pardeg; + } hf; + + // TODO replace the following with hf.VAR + uint32_t vle_pardeg; + +} psz_header; + +#ifdef __cplusplus +} +#endif + +namespace cusz { + +using Header = cusz_header; +using header_t = cusz_header*; + +} // namespace cusz + +namespace psz { + +using v2_header = v2_cusz_header; + +} + +#endif diff --git a/qtensor/compression/cusz/include/hf/hf.hh b/qtensor/compression/cusz/include/hf/hf.hh index 692d0ea0..37438abb 100644 --- a/qtensor/compression/cusz/include/hf/hf.hh +++ b/qtensor/compression/cusz/include/hf/hf.hh @@ -1,170 +1,170 @@ -/** - * @file codec.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-04-23 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef CUSZ_COMPONENT_CODECS_HH -#define CUSZ_COMPONENT_CODECS_HH - -#include -#include -#include - -#include "hf/hf_struct.h" - -#define DEFINE_ARRAY(VAR, TYPE) \ - TYPE* d_##VAR{nullptr}; \ - TYPE* h_##VAR{nullptr}; - -namespace cusz { - -template -class LosslessCodec -// : CodecInterface -{ - public: - using Origin = T; - using Encoded = H; - using MetadataT = M; - using FreqT = uint32_t; - using BYTE = uint8_t; - - private: - class impl; - std::unique_ptr pimpl; - - public: - ~LosslessCodec(); // dtor - LosslessCodec(); // ctor - LosslessCodec(const LosslessCodec&); // copy ctor - LosslessCodec& operator=(const LosslessCodec&); // copy assign - LosslessCodec(LosslessCodec&&); // move ctor - LosslessCodec& operator=(LosslessCodec&&); // move assign - - void init(size_t const, int const, int const, bool dbg_print = false); - void build_codebook(uint32_t*, int const, cudaStream_t = nullptr); - void encode(T*, size_t const, BYTE*&, size_t&, cudaStream_t = nullptr); - void decode(BYTE*, T*, cudaStream_t = nullptr, bool = true); - void clear_buffer(); - - float get_time_elapsed() const; - float get_time_book() const; - float get_time_lossless() const; -}; - -template -class LosslessCodec::impl { - public: - using Origin = T; - using Encoded = H; - using MetadataT = M; - using FreqT = uint32_t; - using BYTE = uint8_t; - - private: - using BOOK = H; - using SYM = T; - - // TODO shared header - struct alignas(128) Header { - static const int HEADER = 0; - static const int REVBOOK = 1; - static const int PAR_NBIT = 2; - static const int PAR_ENTRY = 3; - static const int BITSTREAM = 4; - static const int END = 5; - - int self_bytes : 16; - int booklen : 16; - int sublen; - int pardeg; - size_t uncompressed_len; - size_t total_nbit; - size_t total_ncell; // TODO change to uint32_t - MetadataT entry[END + 1]; - - MetadataT subfile_size() const { return entry[END]; } - }; - - struct runtime_encode_helper { - static const int TMP = 0; - static const int FREQ = 1; - static const int BOOK = 2; - static const int REVBOOK = 3; - static const int PAR_NBIT = 4; - static const int PAR_NCELL = 5; - static const int PAR_ENTRY = 6; - static const int BITSTREAM = 7; - static const int END = 8; - - uint32_t nbyte[END]; - }; - - using RTE = runtime_encode_helper; - using Header = struct Header; - - private: - // array - DEFINE_ARRAY(tmp, H); - DEFINE_ARRAY(compressed, BYTE); // alias in address - DEFINE_ARRAY(book, H); - DEFINE_ARRAY(revbook, BYTE); - - DEFINE_ARRAY(par_metadata, M); - DEFINE_ARRAY(par_nbit, M); - DEFINE_ARRAY(par_ncell, M); - DEFINE_ARRAY(par_entry, M); - - DEFINE_ARRAY(bitstream, H); - // helper - RTE rte; - // memory - static const int CELL_BITWIDTH = sizeof(H) * 8; - // timer - float milliseconds{0.0}; - float time_hist{0.0}, time_book{0.0}, time_lossless{0.0}; - - hf_book* book_desc; - hf_chunk* chunk_desc_d; - hf_chunk* chunk_desc_h; - hf_bitstream* bitstream_desc; - - public: - ~impl(); // dtor - impl(); // ctor - - // getter - float get_time_elapsed() const; - float get_time_book() const; - float get_time_lossless() const; - size_t get_workspace_nbyte(size_t) const; - size_t get_max_output_nbyte(size_t len) const; - static size_t get_revbook_nbyte(int); - // getter for internal array - H* expose_book() const; - BYTE* expose_revbook() const; - // compile-time - constexpr bool can_overlap_input_and_firstphase_encode(); - // public methods - void init(size_t const, int const, int const, bool dbg_print = false); - void build_codebook(uint32_t*, int const, cudaStream_t = nullptr); - void encode(T*, size_t const, BYTE*&, size_t&, cudaStream_t = nullptr); - void decode(BYTE*, T*, cudaStream_t = nullptr, bool = true); - void clear_buffer(); - - private: - void subfile_collect(Header&, size_t const, int const, int const, int const, cudaStream_t stream = nullptr); - void dbg_println(const std::string, void*, int); -}; - -} // namespace cusz - -#undef DEFINE_ARRAY - -#endif +/** + * @file codec.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-23 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_COMPONENT_CODECS_HH +#define CUSZ_COMPONENT_CODECS_HH + +#include +#include +#include + +#include "hf/hf_struct.h" + +#define DEFINE_ARRAY(VAR, TYPE) \ + TYPE* d_##VAR{nullptr}; \ + TYPE* h_##VAR{nullptr}; + +namespace cusz { + +template +class LosslessCodec +// : CodecInterface +{ + public: + using Origin = T; + using Encoded = H; + using MetadataT = M; + using FreqT = uint32_t; + using BYTE = uint8_t; + + private: + class impl; + std::unique_ptr pimpl; + + public: + ~LosslessCodec(); // dtor + LosslessCodec(); // ctor + LosslessCodec(const LosslessCodec&); // copy ctor + LosslessCodec& operator=(const LosslessCodec&); // copy assign + LosslessCodec(LosslessCodec&&); // move ctor + LosslessCodec& operator=(LosslessCodec&&); // move assign + + void init(size_t const, int const, int const, bool dbg_print = false); + void build_codebook(uint32_t*, int const, cudaStream_t = nullptr); + void encode(T*, size_t const, BYTE*&, size_t&, cudaStream_t = nullptr); + void decode(BYTE*, T*, cudaStream_t = nullptr, bool = true); + void clear_buffer(); + + float get_time_elapsed() const; + float get_time_book() const; + float get_time_lossless() const; +}; + +template +class LosslessCodec::impl { + public: + using Origin = T; + using Encoded = H; + using MetadataT = M; + using FreqT = uint32_t; + using BYTE = uint8_t; + + private: + using BOOK = H; + using SYM = T; + + // TODO shared header + struct alignas(128) Header { + static const int HEADER = 0; + static const int REVBOOK = 1; + static const int PAR_NBIT = 2; + static const int PAR_ENTRY = 3; + static const int BITSTREAM = 4; + static const int END = 5; + + int self_bytes : 16; + int booklen : 16; + int sublen; + int pardeg; + size_t uncompressed_len; + size_t total_nbit; + size_t total_ncell; // TODO change to uint32_t + MetadataT entry[END + 1]; + + MetadataT subfile_size() const { return entry[END]; } + }; + + struct runtime_encode_helper { + static const int TMP = 0; + static const int FREQ = 1; + static const int BOOK = 2; + static const int REVBOOK = 3; + static const int PAR_NBIT = 4; + static const int PAR_NCELL = 5; + static const int PAR_ENTRY = 6; + static const int BITSTREAM = 7; + static const int END = 8; + + uint32_t nbyte[END]; + }; + + using RTE = runtime_encode_helper; + using Header = struct Header; + + private: + // array + DEFINE_ARRAY(tmp, H); + DEFINE_ARRAY(compressed, BYTE); // alias in address + DEFINE_ARRAY(book, H); + DEFINE_ARRAY(revbook, BYTE); + + DEFINE_ARRAY(par_metadata, M); + DEFINE_ARRAY(par_nbit, M); + DEFINE_ARRAY(par_ncell, M); + DEFINE_ARRAY(par_entry, M); + + DEFINE_ARRAY(bitstream, H); + // helper + RTE rte; + // memory + static const int CELL_BITWIDTH = sizeof(H) * 8; + // timer + float milliseconds{0.0}; + float time_hist{0.0}, time_book{0.0}, time_lossless{0.0}; + + hf_book* book_desc; + hf_chunk* chunk_desc_d; + hf_chunk* chunk_desc_h; + hf_bitstream* bitstream_desc; + + public: + ~impl(); // dtor + impl(); // ctor + + // getter + float get_time_elapsed() const; + float get_time_book() const; + float get_time_lossless() const; + size_t get_workspace_nbyte(size_t) const; + size_t get_max_output_nbyte(size_t len) const; + static size_t get_revbook_nbyte(int); + // getter for internal array + H* expose_book() const; + BYTE* expose_revbook() const; + // compile-time + constexpr bool can_overlap_input_and_firstphase_encode(); + // public methods + void init(size_t const, int const, int const, bool dbg_print = false); + void build_codebook(uint32_t*, int const, cudaStream_t = nullptr); + void encode(T*, size_t const, BYTE*&, size_t&, cudaStream_t = nullptr); + void decode(BYTE*, T*, cudaStream_t = nullptr, bool = true); + void clear_buffer(); + + private: + void subfile_collect(Header&, size_t const, int const, int const, int const, cudaStream_t stream = nullptr); + void dbg_println(const std::string, void*, int); +}; + +} // namespace cusz + +#undef DEFINE_ARRAY + +#endif diff --git a/qtensor/compression/cusz/include/hf/hf_bookg.hh b/qtensor/compression/cusz/include/hf/hf_bookg.hh index 3d406f0f..f6187164 100644 --- a/qtensor/compression/cusz/include/hf/hf_bookg.hh +++ b/qtensor/compression/cusz/include/hf/hf_bookg.hh @@ -1,45 +1,45 @@ -/** - * @file huffman_parbook.cuh - * @author Cody Rivera (cjrivera1@crimson.ua.edu) - * @brief Parallel Huffman Construction to generates canonical forward codebook (header). - * Based on [Ostadzadeh et al. 2007] (https://dblp.org/rec/conf/pdpta/OstadzadehEZMB07.bib) - * "A Two-phase Practical Parallel Algorithm for Construction of Huffman Codes". - * @version 0.1 - * @date 2020-09-20 - * Created on: 2020-06 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#ifndef PAR_HUFFMAN_H -#define PAR_HUFFMAN_H - -// Parallel huffman global memory and kernels -namespace asz { - -/** - * @brief get codebook and reverse codebook in parallel - * - * @tparam T input type - * @tparam H codebook type - * @param freq input device array; frequency - * @param codebook output device array; codebook for encoding - * @param dict_size dictionary size; len of freq or codebook - * @param reverse_codebook output device array; reverse codebook for decoding - * @param time_book the returned time - */ -template -void hf_buildbook_g( - uint32_t* freq, - int const booksize, - H* codebook, - uint8_t* reverse_codebook, - int const revbook_nbyte, - float* time_book, - cudaStream_t = nullptr); - -} // namespace asz - -#endif +/** + * @file huffman_parbook.cuh + * @author Cody Rivera (cjrivera1@crimson.ua.edu) + * @brief Parallel Huffman Construction to generates canonical forward codebook (header). + * Based on [Ostadzadeh et al. 2007] (https://dblp.org/rec/conf/pdpta/OstadzadehEZMB07.bib) + * "A Two-phase Practical Parallel Algorithm for Construction of Huffman Codes". + * @version 0.1 + * @date 2020-09-20 + * Created on: 2020-06 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef PAR_HUFFMAN_H +#define PAR_HUFFMAN_H + +// Parallel huffman global memory and kernels +namespace asz { + +/** + * @brief get codebook and reverse codebook in parallel + * + * @tparam T input type + * @tparam H codebook type + * @param freq input device array; frequency + * @param codebook output device array; codebook for encoding + * @param dict_size dictionary size; len of freq or codebook + * @param reverse_codebook output device array; reverse codebook for decoding + * @param time_book the returned time + */ +template +void hf_buildbook_g( + uint32_t* freq, + int const booksize, + H* codebook, + uint8_t* reverse_codebook, + int const revbook_nbyte, + float* time_book, + cudaStream_t = nullptr); + +} // namespace asz + +#endif diff --git a/qtensor/compression/cusz/include/hf/hf_codecg.hh b/qtensor/compression/cusz/include/hf/hf_codecg.hh index 10cb1570..faad837a 100644 --- a/qtensor/compression/cusz/include/hf/hf_codecg.hh +++ b/qtensor/compression/cusz/include/hf/hf_codecg.hh @@ -1,82 +1,82 @@ -/** - * @file launch_lossless.cuh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-06-13 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef ABAACE49_2C9E_4E3C_AEFF_B016276142E1 -#define ABAACE49_2C9E_4E3C_AEFF_B016276142E1 - -#include -#include - -#include "hf_struct.h" - -template -struct PackedWordByWidth; - -template <> -struct PackedWordByWidth<4> { - uint32_t word : 24; - uint32_t bits : 8; -}; - -template <> -struct PackedWordByWidth<8> { - uint64_t word : 56; - uint64_t bits : 8; -}; - -namespace asz { - -template -void hf_encode_coarse( - T* uncompressed, - H* d_internal_coded, - size_t const len, - uint32_t* d_freq, - H* d_book, - int const booklen, - H* d_bitstream, - M* d_par_metadata, - M* h_par_metadata, - int const sublen, - int const pardeg, - int numSMs, - uint8_t*& out_compressed, - size_t& out_compressed_len, - float& time_lossless, - cudaStream_t stream); - -template -void hf_encode_coarse_rev1( - T* uncompressed, - size_t const len, - hf_book* book_desc, - hf_bitstream* bitstream_desc, - uint8_t*& out_compressed, // 22-10-12 buggy - size_t& out_compressed_len, // 22-10-12 buggy - float& time_lossless, - cudaStream_t stream); - -template -void hf_decode_coarse( - H* d_bitstream, - uint8_t* d_revbook, - int const revbook_nbyte, - M* d_par_nbit, - M* d_par_entry, - int const sublen, - int const pardeg, - T* out_decompressed, - float& time_lossless, - cudaStream_t stream); - -} // namespace asz - -#endif /* ABAACE49_2C9E_4E3C_AEFF_B016276142E1 */ +/** + * @file launch_lossless.cuh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-06-13 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef ABAACE49_2C9E_4E3C_AEFF_B016276142E1 +#define ABAACE49_2C9E_4E3C_AEFF_B016276142E1 + +#include +#include + +#include "hf_struct.h" + +template +struct PackedWordByWidth; + +template <> +struct PackedWordByWidth<4> { + uint32_t word : 24; + uint32_t bits : 8; +}; + +template <> +struct PackedWordByWidth<8> { + uint64_t word : 56; + uint64_t bits : 8; +}; + +namespace asz { + +template +void hf_encode_coarse( + T* uncompressed, + H* d_internal_coded, + size_t const len, + uint32_t* d_freq, + H* d_book, + int const booklen, + H* d_bitstream, + M* d_par_metadata, + M* h_par_metadata, + int const sublen, + int const pardeg, + int numSMs, + uint8_t*& out_compressed, + size_t& out_compressed_len, + float& time_lossless, + cudaStream_t stream); + +template +void hf_encode_coarse_rev1( + T* uncompressed, + size_t const len, + hf_book* book_desc, + hf_bitstream* bitstream_desc, + uint8_t*& out_compressed, // 22-10-12 buggy + size_t& out_compressed_len, // 22-10-12 buggy + float& time_lossless, + cudaStream_t stream); + +template +void hf_decode_coarse( + H* d_bitstream, + uint8_t* d_revbook, + int const revbook_nbyte, + M* d_par_nbit, + M* d_par_entry, + int const sublen, + int const pardeg, + T* out_decompressed, + float& time_lossless, + cudaStream_t stream); + +} // namespace asz + +#endif /* ABAACE49_2C9E_4E3C_AEFF_B016276142E1 */ diff --git a/qtensor/compression/cusz/include/hf/hf_struct.h b/qtensor/compression/cusz/include/hf/hf_struct.h index c289a795..20ccf206 100644 --- a/qtensor/compression/cusz/include/hf/hf_struct.h +++ b/qtensor/compression/cusz/include/hf/hf_struct.h @@ -1,53 +1,53 @@ -/** - * @file hf_struct.h - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-09-14 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef DA6883A3_A70F_4690_A4FA_56644987725A -#define DA6883A3_A70F_4690_A4FA_56644987725A - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include - -// raw pointer array; regardless of being on host or device -typedef struct hf_book { - uint32_t* freq; - // undertermined on definition; could be uint32_t* and uint64_t* - void* book; - int booklen; -} hf_book; - -// typedef struct hf_revbook { -// } hf_revbook; - -typedef struct hf_chunk { - void* bits; // how many bits each chunk - void* cells; // how many cells each chunk - void* entries; // jump to the chunk -} hf_chunk; - -typedef struct hf_bitstream { - void* buffer; - void* bitstream; - hf_chunk* d_metadata; - hf_chunk* h_metadata; - int sublen; // data chunksize - int pardeg; // runtime paralleism degree - int numSMs; // number of streaming multiprocessor -} hf_bitstream; - -#ifdef __cplusplus -} -#endif - -#endif /* DA6883A3_A70F_4690_A4FA_56644987725A */ +/** + * @file hf_struct.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-09-14 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef DA6883A3_A70F_4690_A4FA_56644987725A +#define DA6883A3_A70F_4690_A4FA_56644987725A + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +// raw pointer array; regardless of being on host or device +typedef struct hf_book { + uint32_t* freq; + // undertermined on definition; could be uint32_t* and uint64_t* + void* book; + int booklen; +} hf_book; + +// typedef struct hf_revbook { +// } hf_revbook; + +typedef struct hf_chunk { + void* bits; // how many bits each chunk + void* cells; // how many cells each chunk + void* entries; // jump to the chunk +} hf_chunk; + +typedef struct hf_bitstream { + void* buffer; + void* bitstream; + hf_chunk* d_metadata; + hf_chunk* h_metadata; + int sublen; // data chunksize + int pardeg; // runtime paralleism degree + int numSMs; // number of streaming multiprocessor +} hf_bitstream; + +#ifdef __cplusplus +} +#endif + +#endif /* DA6883A3_A70F_4690_A4FA_56644987725A */ diff --git a/qtensor/compression/cusz/include/kernel/claunch_cuda.h b/qtensor/compression/cusz/include/kernel/claunch_cuda.h index f19943c1..f160b5a3 100644 --- a/qtensor/compression/cusz/include/kernel/claunch_cuda.h +++ b/qtensor/compression/cusz/include/kernel/claunch_cuda.h @@ -1,49 +1,49 @@ -/** - * @file claunch_cuda.h - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-07-24 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef KERNEL_CUDA_H -#define KERNEL_CUDA_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include - -#include "../cusz/type.h" -// #include "../hf/hf_struct.h" - -#define C_SPLINE3(Tliteral, Eliteral, FPliteral, T, E, FP) \ - cusz_error_status claunch_construct_Spline3_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ - bool NO_R_SEPARATE, T* data, dim3 const len3, T* anchor, dim3 const an_len3, E* errctrl, dim3 const ec_len3, \ - double const eb, int const radius, float* time_elapsed, cudaStream_t stream); \ - \ - cusz_error_status claunch_reconstruct_Spline3_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ - T* xdata, dim3 const len3, T* anchor, dim3 const an_len3, E* errctrl, dim3 const ec_len3, double const eb, \ - int const radius, float* time_elapsed, cudaStream_t stream); - -C_SPLINE3(fp32, ui8, fp32, float, uint8_t, float); -C_SPLINE3(fp32, ui16, fp32, float, uint16_t, float); -C_SPLINE3(fp32, ui32, fp32, float, uint32_t, float); -C_SPLINE3(fp32, fp32, fp32, float, float, float); - -#undef C_SPLINE3 - -#undef C_COARSE_HUFFMAN_DECODE - -#ifdef __cplusplus -} -#endif - -#endif +/** + * @file claunch_cuda.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-07-24 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef KERNEL_CUDA_H +#define KERNEL_CUDA_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +#include "../cusz/type.h" +// #include "../hf/hf_struct.h" + +#define C_SPLINE3(Tliteral, Eliteral, FPliteral, T, E, FP) \ + cusz_error_status claunch_construct_Spline3_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + bool NO_R_SEPARATE, T* data, dim3 const len3, T* anchor, dim3 const an_len3, E* errctrl, dim3 const ec_len3, \ + double const eb, int const radius, float* time_elapsed, cudaStream_t stream); \ + \ + cusz_error_status claunch_reconstruct_Spline3_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + T* xdata, dim3 const len3, T* anchor, dim3 const an_len3, E* errctrl, dim3 const ec_len3, double const eb, \ + int const radius, float* time_elapsed, cudaStream_t stream); + +C_SPLINE3(fp32, ui8, fp32, float, uint8_t, float); +C_SPLINE3(fp32, ui16, fp32, float, uint16_t, float); +C_SPLINE3(fp32, ui32, fp32, float, uint32_t, float); +C_SPLINE3(fp32, fp32, fp32, float, float, float); + +#undef C_SPLINE3 + +#undef C_COARSE_HUFFMAN_DECODE + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/qtensor/compression/cusz/include/kernel/cpplaunch_cuda.hh b/qtensor/compression/cusz/include/kernel/cpplaunch_cuda.hh index 5c8ee08d..7d35d59e 100644 --- a/qtensor/compression/cusz/include/kernel/cpplaunch_cuda.hh +++ b/qtensor/compression/cusz/include/kernel/cpplaunch_cuda.hh @@ -1,51 +1,51 @@ -/** - * @file cpplaunch_cuda.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-07-27 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef COMPONENT_CALL_KERNEL_HH -#define COMPONENT_CALL_KERNEL_HH - -#include "../cusz/type.h" -#include "../hf/hf_struct.h" - -namespace cusz { - -// 22-10-27 revise later -template -cusz_error_status cpplaunch_construct_Spline3( - bool NO_R_SEPARATE, - T* data, - dim3 const len3, - T* anchor, - dim3 const an_len3, - E* eq, - dim3 const ec_len3, - double const eb, - int const radius, - float* time_elapsed, - cudaStream_t stream); - -// 22-10-27 revise later -template -cusz_error_status cpplaunch_reconstruct_Spline3( - T* xdata, - dim3 const len3, - T* anchor, - dim3 const an_len3, - E* eq, - dim3 const ec_len3, - double const eb, - int const radius, - float* time_elapsed, - cudaStream_t stream); - -} // namespace cusz - -#endif +/** + * @file cpplaunch_cuda.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-07-27 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef COMPONENT_CALL_KERNEL_HH +#define COMPONENT_CALL_KERNEL_HH + +#include "../cusz/type.h" +#include "../hf/hf_struct.h" + +namespace cusz { + +// 22-10-27 revise later +template +cusz_error_status cpplaunch_construct_Spline3( + bool NO_R_SEPARATE, + T* data, + dim3 const len3, + T* anchor, + dim3 const an_len3, + E* eq, + dim3 const ec_len3, + double const eb, + int const radius, + float* time_elapsed, + cudaStream_t stream); + +// 22-10-27 revise later +template +cusz_error_status cpplaunch_reconstruct_Spline3( + T* xdata, + dim3 const len3, + T* anchor, + dim3 const an_len3, + E* eq, + dim3 const ec_len3, + double const eb, + int const radius, + float* time_elapsed, + cudaStream_t stream); + +} // namespace cusz + +#endif diff --git a/qtensor/compression/cusz/include/kernel/dryrun.cuh b/qtensor/compression/cusz/include/kernel/dryrun.cuh index e96b3b96..d32800c1 100644 --- a/qtensor/compression/cusz/include/kernel/dryrun.cuh +++ b/qtensor/compression/cusz/include/kernel/dryrun.cuh @@ -1,47 +1,47 @@ -/** - * @file dryrun.cuh - * @author Jiannan Tian - * @brief cuSZ dryrun mode, checking data quality from lossy compression. - * @version 0.3 - * @date 2020-09-20 - * (create) 2020-05-14, (release) 2020-09-20, (rev1) 2021-01-25, (rev2) 2021-06-21 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#ifndef CUSZ_KERNEL_DRYRUN_CUH -#define CUSZ_KERNEL_DRYRUN_CUH - -namespace cusz { - -template -// template -__global__ void dualquant_dryrun_kernel(Data* in_data, Data* out_xdata, size_t len, FP ebx2_r, FP ebx2) -{ - { - constexpr auto NTHREAD = BLOCK / SEQ; - __shared__ Data shmem[BLOCK]; - auto id_base = blockIdx.x * BLOCK; - -#pragma unroll - for (auto i = 0; i < SEQ; i++) { - auto id = id_base + threadIdx.x + i * NTHREAD; - if (id < len) { - shmem[threadIdx.x + i * NTHREAD] = round(in_data[id] * ebx2_r) * ebx2; - out_xdata[id] = shmem[threadIdx.x + i * NTHREAD]; - } - } - } - - // simplistic - // { - // auto id = blockIdx.x * blockDim.x + threadIdx.x; - // if (id < len) out_xdata[id] = round(in_data[id] * ebx2_r) * ebx2; - // } -} - -} // namespace cusz - +/** + * @file dryrun.cuh + * @author Jiannan Tian + * @brief cuSZ dryrun mode, checking data quality from lossy compression. + * @version 0.3 + * @date 2020-09-20 + * (create) 2020-05-14, (release) 2020-09-20, (rev1) 2021-01-25, (rev2) 2021-06-21 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef CUSZ_KERNEL_DRYRUN_CUH +#define CUSZ_KERNEL_DRYRUN_CUH + +namespace cusz { + +template +// template +__global__ void dualquant_dryrun_kernel(Data* in_data, Data* out_xdata, size_t len, FP ebx2_r, FP ebx2) +{ + { + constexpr auto NTHREAD = BLOCK / SEQ; + __shared__ Data shmem[BLOCK]; + auto id_base = blockIdx.x * BLOCK; + +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto id = id_base + threadIdx.x + i * NTHREAD; + if (id < len) { + shmem[threadIdx.x + i * NTHREAD] = round(in_data[id] * ebx2_r) * ebx2; + out_xdata[id] = shmem[threadIdx.x + i * NTHREAD]; + } + } + } + + // simplistic + // { + // auto id = blockIdx.x * blockDim.x + threadIdx.x; + // if (id < len) out_xdata[id] = round(in_data[id] * ebx2_r) * ebx2; + // } +} + +} // namespace cusz + #endif \ No newline at end of file diff --git a/qtensor/compression/cusz/include/kernel/launch_spm.cuh b/qtensor/compression/cusz/include/kernel/launch_spm.cuh index fe4cfaae..4f0bcdd9 100644 --- a/qtensor/compression/cusz/include/kernel/launch_spm.cuh +++ b/qtensor/compression/cusz/include/kernel/launch_spm.cuh @@ -1,348 +1,348 @@ -/** - * @file launch_sparse_method.cuh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-06-13 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef CUSZ_LAUNCH_SPARSE_METHOD_CUH -#define CUSZ_LAUNCH_SPARSE_METHOD_CUH - -#include -#include - -#include "../common.hh" -#include "../utils.hh" -#include "../utils/cusparse_err.cuh" - -// #if CUDART_VERSION >= 11020 - -template -void launch_cusparse_gather_cuda11200_onward( - cusparseHandle_t handle, - T* in_dense, - uint32_t const num_rows, - uint32_t const num_cols, - cusparseDnMatDescr_t dnmat, - cusparseSpMatDescr_t spmat, - void* d_buffer, - size_t& d_buffer_size, - M* d_rowptr, - M* d_colidx, - T* d_val, - int64_t& nnz, - float& milliseconds, - cudaStream_t stream) -{ - auto ld = num_rows; - - auto gather11_init_mat = [&]() { - // create dense matrix wrapper - CHECK_CUSPARSE( - cusparseCreateDnMat(&dnmat, num_rows, num_cols, ld, in_dense, cuszCUSPARSE::type, CUSPARSE_ORDER_ROW)); - - // create CSR wrapper - CHECK_CUSPARSE(cusparseCreateCsr( - &spmat, num_rows, num_cols, 0, d_rowptr, nullptr, nullptr, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, cuszCUSPARSE::type)); - }; - - auto gather11_init_buffer = [&]() { - { // allocate an external buffer if needed - cuda_timer_t t; - t.timer_start(stream); - - CHECK_CUSPARSE(cusparseDenseToSparse_bufferSize( - handle, dnmat, spmat, CUSPARSE_DENSETOSPARSE_ALG_DEFAULT, &d_buffer_size)); - - t.timer_end(stream); - milliseconds += t.get_time_elapsed(); - - CHECK_CUDA(cudaMalloc(&d_buffer, d_buffer_size)); - } - }; - - auto gather11_analysis = [&]() { - cuda_timer_t t; - t.timer_start(stream); - - CHECK_CUSPARSE( - cusparseDenseToSparse_analysis(handle, dnmat, spmat, CUSPARSE_DENSETOSPARSE_ALG_DEFAULT, d_buffer)); - - t.timer_end(stream); - milliseconds += t.get_time_elapsed(); - }; - - int64_t num_rows_tmp, num_cols_tmp; - - auto gather11_get_nnz = [&]() { - // get number of non-zero elements - CHECK_CUSPARSE(cusparseSpMatGetSize(spmat, &num_rows_tmp, &num_cols_tmp, &nnz)); - }; - - auto gather11_get_rowptr = [&]() { - // reset offsets, column indices, and values pointers - CHECK_CUSPARSE(cusparseCsrSetPointers(spmat, d_rowptr, d_colidx, d_val)); - }; - - auto gather11_dn2csr = [&]() { - cuda_timer_t t; - t.timer_start(stream); - - CHECK_CUSPARSE( - cusparseDenseToSparse_convert(handle, dnmat, spmat, CUSPARSE_DENSETOSPARSE_ALG_DEFAULT, d_buffer)); - - t.timer_end(stream); - milliseconds += t.get_time_elapsed(); - }; - - /********************************************************************************/ - milliseconds = 0; - - CHECK_CUSPARSE(cusparseCreate(&handle)); - if (stream) CHECK_CUSPARSE(cusparseSetStream(handle, stream)); // TODO move out - - gather11_init_mat(); - gather11_init_buffer(); - gather11_analysis(); - gather11_get_nnz(); - gather11_get_rowptr(); - gather11_dn2csr(); - - // destroy matrix/vector descriptors - CHECK_CUSPARSE(cusparseDestroyDnMat(dnmat)); - CHECK_CUSPARSE(cusparseDestroySpMat(spmat)); - CHECK_CUSPARSE(cusparseDestroy(handle)); -} - -// void SpcodecCSR::impl::scatter_CUDA_11020(BYTE* in_csr, T* out_dense, cudaStream_t stream, bool -// header_on_device) - -template -void launch_cusparse_scatter_cuda11200_onward( - cusparseHandle_t handle, - int* d_rowptr, - int* d_colidx, - T* d_val, - int const num_rows, - int const num_cols, - int const nnz, - cusparseDnMatDescr_t dnmat, - cusparseSpMatDescr_t spmat, - void* d_buffer, - size_t& d_buffer_size, - T* out_dense, - float& milliseconds, - cudaStream_t stream) -{ - auto ld = num_rows; - - auto scatter11_init_mat = [&]() { - CHECK_CUSPARSE(cusparseCreateCsr( - &spmat, num_rows, num_cols, nnz, d_rowptr, d_colidx, d_val, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, cuszCUSPARSE::type)); - - CHECK_CUSPARSE( - cusparseCreateDnMat(&dnmat, num_rows, num_cols, ld, out_dense, cuszCUSPARSE::type, CUSPARSE_ORDER_ROW)); - }; - - auto scatter11_init_buffer = [&]() { - cuda_timer_t t; - t.timer_start(stream); - - // allocate an external buffer if needed - CHECK_CUSPARSE( - cusparseSparseToDense_bufferSize(handle, spmat, dnmat, CUSPARSE_SPARSETODENSE_ALG_DEFAULT, &d_buffer_size)); - - t.timer_end(stream); - milliseconds += t.get_time_elapsed(); - - CHECK_CUDA(cudaMalloc(&d_buffer, d_buffer_size)); - }; - - auto scatter11_csr2dn = [&]() { - cuda_timer_t t; - t.timer_start(stream); - - CHECK_CUSPARSE(cusparseSparseToDense(handle, spmat, dnmat, CUSPARSE_SPARSETODENSE_ALG_DEFAULT, d_buffer)); - - t.timer_end(stream); - milliseconds += t.get_time_elapsed(); - }; - - /******************************************************************************/ - milliseconds = 0; - - CHECK_CUSPARSE(cusparseCreate(&handle)); - if (stream) CHECK_CUSPARSE(cusparseSetStream(handle, stream)); - - scatter11_init_mat(); - scatter11_init_buffer(); - scatter11_csr2dn(); - - // destroy matrix/vector descriptors - CHECK_CUSPARSE(cusparseDestroySpMat(spmat)); - CHECK_CUSPARSE(cusparseDestroyDnMat(dnmat)); - CHECK_CUSPARSE(cusparseDestroy(handle)); -} - -// #elif CUDART_VERSION >= 10000 - -template -void launch_cusparse_gather_before_cuda11200( - cusparseHandle_t handle, - T* in_dense, - uint32_t const num_rows, - uint32_t const num_cols, - cusparseMatDescr_t mat_desc, - void* d_work, - size_t& lwork_in_bytes, - M* d_rowptr, - M* d_colidx, - T* d_val, - int& nnz, // int is for compatibility; cuSPARSE of CUDA 11 changed data type - float& milliseconds, - cudaStream_t stream) -{ - auto ld = num_rows; - - float threshold{0}; - auto has_ext_stream{false}; - - /******************************************************************************/ - - auto gather10_init_and_probe = [&]() { - { // init - - CHECK_CUSPARSE(cusparseCreateMatDescr(&mat_desc)); // 4. create rte.mat_desc - CHECK_CUSPARSE(cusparseSetMatIndexBase(mat_desc, CUSPARSE_INDEX_BASE_ZERO)); // zero based - CHECK_CUSPARSE(cusparseSetMatType(mat_desc, CUSPARSE_MATRIX_TYPE_GENERAL)); // type - } - - { // probe - cuda_timer_t t; - t.timer_start(stream); - - CHECK_CUSPARSE(cusparseSpruneDense2csr_bufferSizeExt( - handle, num_rows, num_cols, in_dense, ld, &threshold, mat_desc, d_val, d_rowptr, d_colidx, - &lwork_in_bytes)); - - t.timer_end(stream); - milliseconds += t.get_time_elapsed(); - } - - if (nullptr != d_work) cudaFree(d_work); - CHECK_CUDA(cudaMalloc((void**)&d_work, lwork_in_bytes)); // TODO where to release d_work? - }; - - auto gather10_compute_rowptr_and_nnz = [&]() { // step 4 - cuda_timer_t t; - t.timer_start(stream); - - CHECK_CUSPARSE(cusparseSpruneDense2csrNnz( - handle, num_rows, num_cols, in_dense, ld, &threshold, mat_desc, d_rowptr, &nnz, d_work)); - - t.timer_end(stream); - milliseconds += t.get_time_elapsed(); - CHECK_CUDA(cudaStreamSynchronize(stream)); - - }; - - auto gather10_compute_colidx_and_val = [&]() { // step 5 - cuda_timer_t t; - t.timer_start(stream); - - CHECK_CUSPARSE(cusparseSpruneDense2csr( // - handle, num_rows, num_cols, in_dense, ld, &threshold, mat_desc, d_val, d_rowptr, d_colidx, d_work)); - - t.timer_end(stream); - milliseconds += t.get_time_elapsed(); - CHECK_CUDA(cudaStreamSynchronize(stream)); - }; - - /********************************************************************************/ - milliseconds = 0; - - if (stream) - has_ext_stream = true; - else - CHECK_CUDA(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); // 1. create stream - CHECK_CUSPARSE(cusparseCreate(&handle)); // 2. create handle - CHECK_CUSPARSE(cusparseSetStream(handle, stream)); // 3. bind stream - - gather10_init_and_probe(); - gather10_compute_rowptr_and_nnz(); - if (nnz == 0) { return; } - gather10_compute_colidx_and_val(); - - // TODO no need to destroy? - if (handle) cusparseDestroy(handle); - if (mat_desc) cusparseDestroyMatDescr(mat_desc); - if ((not has_ext_stream) and stream) cudaStreamDestroy(stream); - /********************************************************************************/ -} - -// #endif - -template -void launch_cusparse_scatter_before_cuda11200( - cusparseHandle_t handle, - int* d_rowptr, - int* d_colidx, - T* d_val, - int const num_rows, - int const num_cols, - int const nnz, - cusparseMatDescr_t mat_desc, - void* d_buffer, - size_t& d_buffer_size, - T* out_dense, - float& milliseconds, - cudaStream_t stream) -{ - auto ld = num_rows; - - auto has_external_stream = false; - - /******************************************************************************/ - - auto scatter10_init = [&]() { - CHECK_CUSPARSE(cusparseCreateMatDescr(&mat_desc)); // 4. create descr - CHECK_CUSPARSE(cusparseSetMatIndexBase(mat_desc, CUSPARSE_INDEX_BASE_ZERO)); // zero based - CHECK_CUSPARSE(cusparseSetMatType(mat_desc, CUSPARSE_MATRIX_TYPE_GENERAL)); // type - }; - - auto scatter10_sparse2dense = [&]() { - cuda_timer_t t; - t.timer_start(stream); - - CHECK_CUSPARSE( - cusparseScsr2dense(handle, num_rows, num_cols, mat_desc, d_val, d_rowptr, d_colidx, out_dense, ld)); - - t.timer_end(); - milliseconds += t.get_time_elapsed(); - CHECK_CUDA(cudaStreamSynchronize(stream)); - }; - - /******************************************************************************/ - if (stream) - has_external_stream = true; - else - CHECK_CUDA(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - CHECK_CUSPARSE(cusparseCreate(&handle)); - CHECK_CUSPARSE(cusparseSetStream(handle, stream)); - - scatter10_init(); - scatter10_sparse2dense(); - - if (handle) cusparseDestroy(handle); - if (mat_desc) cusparseDestroyMatDescr(mat_desc); - if ((not has_external_stream) and stream) cudaStreamDestroy(stream); - /******************************************************************************/ -} - -#endif +/** + * @file launch_sparse_method.cuh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-06-13 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_LAUNCH_SPARSE_METHOD_CUH +#define CUSZ_LAUNCH_SPARSE_METHOD_CUH + +#include +#include + +#include "../common.hh" +#include "../utils.hh" +#include "../utils/cusparse_err.cuh" + +// #if CUDART_VERSION >= 11020 + +template +void launch_cusparse_gather_cuda11200_onward( + cusparseHandle_t handle, + T* in_dense, + uint32_t const num_rows, + uint32_t const num_cols, + cusparseDnMatDescr_t dnmat, + cusparseSpMatDescr_t spmat, + void* d_buffer, + size_t& d_buffer_size, + M* d_rowptr, + M* d_colidx, + T* d_val, + int64_t& nnz, + float& milliseconds, + cudaStream_t stream) +{ + auto ld = num_rows; + + auto gather11_init_mat = [&]() { + // create dense matrix wrapper + CHECK_CUSPARSE( + cusparseCreateDnMat(&dnmat, num_rows, num_cols, ld, in_dense, cuszCUSPARSE::type, CUSPARSE_ORDER_ROW)); + + // create CSR wrapper + CHECK_CUSPARSE(cusparseCreateCsr( + &spmat, num_rows, num_cols, 0, d_rowptr, nullptr, nullptr, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, cuszCUSPARSE::type)); + }; + + auto gather11_init_buffer = [&]() { + { // allocate an external buffer if needed + cuda_timer_t t; + t.timer_start(stream); + + CHECK_CUSPARSE(cusparseDenseToSparse_bufferSize( + handle, dnmat, spmat, CUSPARSE_DENSETOSPARSE_ALG_DEFAULT, &d_buffer_size)); + + t.timer_end(stream); + milliseconds += t.get_time_elapsed(); + + CHECK_CUDA(cudaMalloc(&d_buffer, d_buffer_size)); + } + }; + + auto gather11_analysis = [&]() { + cuda_timer_t t; + t.timer_start(stream); + + CHECK_CUSPARSE( + cusparseDenseToSparse_analysis(handle, dnmat, spmat, CUSPARSE_DENSETOSPARSE_ALG_DEFAULT, d_buffer)); + + t.timer_end(stream); + milliseconds += t.get_time_elapsed(); + }; + + int64_t num_rows_tmp, num_cols_tmp; + + auto gather11_get_nnz = [&]() { + // get number of non-zero elements + CHECK_CUSPARSE(cusparseSpMatGetSize(spmat, &num_rows_tmp, &num_cols_tmp, &nnz)); + }; + + auto gather11_get_rowptr = [&]() { + // reset offsets, column indices, and values pointers + CHECK_CUSPARSE(cusparseCsrSetPointers(spmat, d_rowptr, d_colidx, d_val)); + }; + + auto gather11_dn2csr = [&]() { + cuda_timer_t t; + t.timer_start(stream); + + CHECK_CUSPARSE( + cusparseDenseToSparse_convert(handle, dnmat, spmat, CUSPARSE_DENSETOSPARSE_ALG_DEFAULT, d_buffer)); + + t.timer_end(stream); + milliseconds += t.get_time_elapsed(); + }; + + /********************************************************************************/ + milliseconds = 0; + + CHECK_CUSPARSE(cusparseCreate(&handle)); + if (stream) CHECK_CUSPARSE(cusparseSetStream(handle, stream)); // TODO move out + + gather11_init_mat(); + gather11_init_buffer(); + gather11_analysis(); + gather11_get_nnz(); + gather11_get_rowptr(); + gather11_dn2csr(); + + // destroy matrix/vector descriptors + CHECK_CUSPARSE(cusparseDestroyDnMat(dnmat)); + CHECK_CUSPARSE(cusparseDestroySpMat(spmat)); + CHECK_CUSPARSE(cusparseDestroy(handle)); +} + +// void SpcodecCSR::impl::scatter_CUDA_11020(BYTE* in_csr, T* out_dense, cudaStream_t stream, bool +// header_on_device) + +template +void launch_cusparse_scatter_cuda11200_onward( + cusparseHandle_t handle, + int* d_rowptr, + int* d_colidx, + T* d_val, + int const num_rows, + int const num_cols, + int const nnz, + cusparseDnMatDescr_t dnmat, + cusparseSpMatDescr_t spmat, + void* d_buffer, + size_t& d_buffer_size, + T* out_dense, + float& milliseconds, + cudaStream_t stream) +{ + auto ld = num_rows; + + auto scatter11_init_mat = [&]() { + CHECK_CUSPARSE(cusparseCreateCsr( + &spmat, num_rows, num_cols, nnz, d_rowptr, d_colidx, d_val, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, cuszCUSPARSE::type)); + + CHECK_CUSPARSE( + cusparseCreateDnMat(&dnmat, num_rows, num_cols, ld, out_dense, cuszCUSPARSE::type, CUSPARSE_ORDER_ROW)); + }; + + auto scatter11_init_buffer = [&]() { + cuda_timer_t t; + t.timer_start(stream); + + // allocate an external buffer if needed + CHECK_CUSPARSE( + cusparseSparseToDense_bufferSize(handle, spmat, dnmat, CUSPARSE_SPARSETODENSE_ALG_DEFAULT, &d_buffer_size)); + + t.timer_end(stream); + milliseconds += t.get_time_elapsed(); + + CHECK_CUDA(cudaMalloc(&d_buffer, d_buffer_size)); + }; + + auto scatter11_csr2dn = [&]() { + cuda_timer_t t; + t.timer_start(stream); + + CHECK_CUSPARSE(cusparseSparseToDense(handle, spmat, dnmat, CUSPARSE_SPARSETODENSE_ALG_DEFAULT, d_buffer)); + + t.timer_end(stream); + milliseconds += t.get_time_elapsed(); + }; + + /******************************************************************************/ + milliseconds = 0; + + CHECK_CUSPARSE(cusparseCreate(&handle)); + if (stream) CHECK_CUSPARSE(cusparseSetStream(handle, stream)); + + scatter11_init_mat(); + scatter11_init_buffer(); + scatter11_csr2dn(); + + // destroy matrix/vector descriptors + CHECK_CUSPARSE(cusparseDestroySpMat(spmat)); + CHECK_CUSPARSE(cusparseDestroyDnMat(dnmat)); + CHECK_CUSPARSE(cusparseDestroy(handle)); +} + +// #elif CUDART_VERSION >= 10000 + +template +void launch_cusparse_gather_before_cuda11200( + cusparseHandle_t handle, + T* in_dense, + uint32_t const num_rows, + uint32_t const num_cols, + cusparseMatDescr_t mat_desc, + void* d_work, + size_t& lwork_in_bytes, + M* d_rowptr, + M* d_colidx, + T* d_val, + int& nnz, // int is for compatibility; cuSPARSE of CUDA 11 changed data type + float& milliseconds, + cudaStream_t stream) +{ + auto ld = num_rows; + + float threshold{0}; + auto has_ext_stream{false}; + + /******************************************************************************/ + + auto gather10_init_and_probe = [&]() { + { // init + + CHECK_CUSPARSE(cusparseCreateMatDescr(&mat_desc)); // 4. create rte.mat_desc + CHECK_CUSPARSE(cusparseSetMatIndexBase(mat_desc, CUSPARSE_INDEX_BASE_ZERO)); // zero based + CHECK_CUSPARSE(cusparseSetMatType(mat_desc, CUSPARSE_MATRIX_TYPE_GENERAL)); // type + } + + { // probe + cuda_timer_t t; + t.timer_start(stream); + + CHECK_CUSPARSE(cusparseSpruneDense2csr_bufferSizeExt( + handle, num_rows, num_cols, in_dense, ld, &threshold, mat_desc, d_val, d_rowptr, d_colidx, + &lwork_in_bytes)); + + t.timer_end(stream); + milliseconds += t.get_time_elapsed(); + } + + if (nullptr != d_work) cudaFree(d_work); + CHECK_CUDA(cudaMalloc((void**)&d_work, lwork_in_bytes)); // TODO where to release d_work? + }; + + auto gather10_compute_rowptr_and_nnz = [&]() { // step 4 + cuda_timer_t t; + t.timer_start(stream); + + CHECK_CUSPARSE(cusparseSpruneDense2csrNnz( + handle, num_rows, num_cols, in_dense, ld, &threshold, mat_desc, d_rowptr, &nnz, d_work)); + + t.timer_end(stream); + milliseconds += t.get_time_elapsed(); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + }; + + auto gather10_compute_colidx_and_val = [&]() { // step 5 + cuda_timer_t t; + t.timer_start(stream); + + CHECK_CUSPARSE(cusparseSpruneDense2csr( // + handle, num_rows, num_cols, in_dense, ld, &threshold, mat_desc, d_val, d_rowptr, d_colidx, d_work)); + + t.timer_end(stream); + milliseconds += t.get_time_elapsed(); + CHECK_CUDA(cudaStreamSynchronize(stream)); + }; + + /********************************************************************************/ + milliseconds = 0; + + if (stream) + has_ext_stream = true; + else + CHECK_CUDA(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); // 1. create stream + CHECK_CUSPARSE(cusparseCreate(&handle)); // 2. create handle + CHECK_CUSPARSE(cusparseSetStream(handle, stream)); // 3. bind stream + + gather10_init_and_probe(); + gather10_compute_rowptr_and_nnz(); + if (nnz == 0) { return; } + gather10_compute_colidx_and_val(); + + // TODO no need to destroy? + if (handle) cusparseDestroy(handle); + if (mat_desc) cusparseDestroyMatDescr(mat_desc); + if ((not has_ext_stream) and stream) cudaStreamDestroy(stream); + /********************************************************************************/ +} + +// #endif + +template +void launch_cusparse_scatter_before_cuda11200( + cusparseHandle_t handle, + int* d_rowptr, + int* d_colidx, + T* d_val, + int const num_rows, + int const num_cols, + int const nnz, + cusparseMatDescr_t mat_desc, + void* d_buffer, + size_t& d_buffer_size, + T* out_dense, + float& milliseconds, + cudaStream_t stream) +{ + auto ld = num_rows; + + auto has_external_stream = false; + + /******************************************************************************/ + + auto scatter10_init = [&]() { + CHECK_CUSPARSE(cusparseCreateMatDescr(&mat_desc)); // 4. create descr + CHECK_CUSPARSE(cusparseSetMatIndexBase(mat_desc, CUSPARSE_INDEX_BASE_ZERO)); // zero based + CHECK_CUSPARSE(cusparseSetMatType(mat_desc, CUSPARSE_MATRIX_TYPE_GENERAL)); // type + }; + + auto scatter10_sparse2dense = [&]() { + cuda_timer_t t; + t.timer_start(stream); + + CHECK_CUSPARSE( + cusparseScsr2dense(handle, num_rows, num_cols, mat_desc, d_val, d_rowptr, d_colidx, out_dense, ld)); + + t.timer_end(); + milliseconds += t.get_time_elapsed(); + CHECK_CUDA(cudaStreamSynchronize(stream)); + }; + + /******************************************************************************/ + if (stream) + has_external_stream = true; + else + CHECK_CUDA(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + CHECK_CUSPARSE(cusparseCreate(&handle)); + CHECK_CUSPARSE(cusparseSetStream(handle, stream)); + + scatter10_init(); + scatter10_sparse2dense(); + + if (handle) cusparseDestroy(handle); + if (mat_desc) cusparseDestroyMatDescr(mat_desc); + if ((not has_external_stream) and stream) cudaStreamDestroy(stream); + /******************************************************************************/ +} + +#endif diff --git a/qtensor/compression/cusz/include/kernel/lorenzo_all.h b/qtensor/compression/cusz/include/kernel/lorenzo_all.h index 89f6f38f..de9f087e 100644 --- a/qtensor/compression/cusz/include/kernel/lorenzo_all.h +++ b/qtensor/compression/cusz/include/kernel/lorenzo_all.h @@ -1,44 +1,44 @@ -/** - * @file kernel_cuda.h - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-11-02 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef BD8A19DE_E881_4A26_9464_C51DAC6B14E1 -#define BD8A19DE_E881_4A26_9464_C51DAC6B14E1 - -#ifdef __cplusplus -extern "C" { -#endif - -#include "cusz/type.h" - -#define C_LORENZOI(Tliteral, Eliteral, FPliteral, T, E, FP) \ - cusz_error_status compress_predict_lorenzo_ivar_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ - T* const data, dim3 const len3, double const eb, E* delta, bool* signum, float* time_elapsed, \ - cudaStream_t stream); \ - cusz_error_status decompress_predict_lorenzo_ivar_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ - E* delta, bool* signum, dim3 const len3, double const eb, T* xdata, float* time_elapsed, cudaStream_t stream); - -C_LORENZOI(fp32, ui8, fp32, float, uint8_t, float); -C_LORENZOI(fp32, ui16, fp32, float, uint16_t, float); -C_LORENZOI(fp32, ui32, fp32, float, uint32_t, float); -C_LORENZOI(fp32, fp32, fp32, float, float, float); - -C_LORENZOI(fp64, ui8, fp64, double, uint8_t, double); -C_LORENZOI(fp64, ui16, fp64, double, uint16_t, double); -C_LORENZOI(fp64, ui32, fp64, double, uint32_t, double); -C_LORENZOI(fp64, fp32, fp64, double, float, double); - -#undef C_LORENZOI - -#ifdef __cplusplus -} -#endif - -#endif /* BD8A19DE_E881_4A26_9464_C51DAC6B14E1 */ +/** + * @file kernel_cuda.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-02 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef BD8A19DE_E881_4A26_9464_C51DAC6B14E1 +#define BD8A19DE_E881_4A26_9464_C51DAC6B14E1 + +#ifdef __cplusplus +extern "C" { +#endif + +#include "cusz/type.h" + +#define C_LORENZOI(Tliteral, Eliteral, FPliteral, T, E, FP) \ + cusz_error_status compress_predict_lorenzo_ivar_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + T* const data, dim3 const len3, double const eb, E* delta, bool* signum, float* time_elapsed, \ + cudaStream_t stream); \ + cusz_error_status decompress_predict_lorenzo_ivar_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + E* delta, bool* signum, dim3 const len3, double const eb, T* xdata, float* time_elapsed, cudaStream_t stream); + +C_LORENZOI(fp32, ui8, fp32, float, uint8_t, float); +C_LORENZOI(fp32, ui16, fp32, float, uint16_t, float); +C_LORENZOI(fp32, ui32, fp32, float, uint32_t, float); +C_LORENZOI(fp32, fp32, fp32, float, float, float); + +C_LORENZOI(fp64, ui8, fp64, double, uint8_t, double); +C_LORENZOI(fp64, ui16, fp64, double, uint16_t, double); +C_LORENZOI(fp64, ui32, fp64, double, uint32_t, double); +C_LORENZOI(fp64, fp32, fp64, double, float, double); + +#undef C_LORENZOI + +#ifdef __cplusplus +} +#endif + +#endif /* BD8A19DE_E881_4A26_9464_C51DAC6B14E1 */ diff --git a/qtensor/compression/cusz/include/kernel/lorenzo_all.hh b/qtensor/compression/cusz/include/kernel/lorenzo_all.hh index f7308fe1..d87baffa 100644 --- a/qtensor/compression/cusz/include/kernel/lorenzo_all.hh +++ b/qtensor/compression/cusz/include/kernel/lorenzo_all.hh @@ -1,96 +1,96 @@ -/** - * @file kernel_cuda.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-11-01 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef C8C37773_7EF2_439B_B0EF_14D0058DC714 -#define C8C37773_7EF2_439B_B0EF_14D0058DC714 - -#include -#include "cusz/type.h" - -template -cusz_error_status compress_predict_lorenzo_i( - T* const data, // input - dim3 const len3, // - double const eb, // input (config) - int const radius, // - EQ* const eq, // output - T* outlier, // - uint32_t* outlier_idx, // - uint32_t* num_outliers, // - float* time_elapsed, // optional - cudaStream_t stream); // - -template -cusz_error_status decompress_predict_lorenzo_i( - EQ* eq, // input - dim3 const len3, // - T* outlier, // - uint32_t* outlier_idx, // - uint32_t const num_outliers, // - double const eb, // input (config) - int const radius, // - T* xdata, // output - float* time_elapsed, // optional - cudaStream_t stream); - -namespace asz { -namespace experimental { - -template -cusz_error_status compress_predict_lorenzo_ivar( - T* data, - dim3 const len3, - double const eb, - DeltaT* delta, - bool* signum, - float* time_elapsed, - cudaStream_t stream); - -template -cusz_error_status decompress_predict_lorenzo_ivar( - DeltaT* delta, - bool* signum, - dim3 const len3, - double const eb, - T* xdata, - float* time_elapsed, - cudaStream_t stream); - -} // namespace experimental -} // namespace asz - -template -cusz_error_status compress_predict_lorenzo_iproto( - T* const data, // input - dim3 const len3, // - double const eb, // input (config) - int const radius, // - EQ* const eq, // output - T* outlier, // - uint32_t* outlier_idx, // - uint32_t* num_outliers, // - float* time_elapsed, // optional - cudaStream_t stream); // - -template -cusz_error_status decompress_predict_lorenzo_iproto( - EQ* eq, // input - dim3 const len3, // - T* outlier, // - uint32_t* outlier_idx, // - uint32_t const num_outliers, // - double const eb, // input (config) - int const radius, // - T* xdata, // output - float* time_elapsed, // optional - cudaStream_t stream); - -#endif /* C8C37773_7EF2_439B_B0EF_14D0058DC714 */ +/** + * @file kernel_cuda.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-01 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef C8C37773_7EF2_439B_B0EF_14D0058DC714 +#define C8C37773_7EF2_439B_B0EF_14D0058DC714 + +#include +#include "cusz/type.h" + +template +cusz_error_status compress_predict_lorenzo_i( + T* const data, // input + dim3 const len3, // + double const eb, // input (config) + int const radius, // + EQ* const eq, // output + T* outlier, // + uint32_t* outlier_idx, // + uint32_t* num_outliers, // + float* time_elapsed, // optional + cudaStream_t stream); // + +template +cusz_error_status decompress_predict_lorenzo_i( + EQ* eq, // input + dim3 const len3, // + T* outlier, // + uint32_t* outlier_idx, // + uint32_t const num_outliers, // + double const eb, // input (config) + int const radius, // + T* xdata, // output + float* time_elapsed, // optional + cudaStream_t stream); + +namespace asz { +namespace experimental { + +template +cusz_error_status compress_predict_lorenzo_ivar( + T* data, + dim3 const len3, + double const eb, + DeltaT* delta, + bool* signum, + float* time_elapsed, + cudaStream_t stream); + +template +cusz_error_status decompress_predict_lorenzo_ivar( + DeltaT* delta, + bool* signum, + dim3 const len3, + double const eb, + T* xdata, + float* time_elapsed, + cudaStream_t stream); + +} // namespace experimental +} // namespace asz + +template +cusz_error_status compress_predict_lorenzo_iproto( + T* const data, // input + dim3 const len3, // + double const eb, // input (config) + int const radius, // + EQ* const eq, // output + T* outlier, // + uint32_t* outlier_idx, // + uint32_t* num_outliers, // + float* time_elapsed, // optional + cudaStream_t stream); // + +template +cusz_error_status decompress_predict_lorenzo_iproto( + EQ* eq, // input + dim3 const len3, // + T* outlier, // + uint32_t* outlier_idx, // + uint32_t const num_outliers, // + double const eb, // input (config) + int const radius, // + T* xdata, // output + float* time_elapsed, // optional + cudaStream_t stream); + +#endif /* C8C37773_7EF2_439B_B0EF_14D0058DC714 */ diff --git a/qtensor/compression/cusz/include/kernel/spv_gpu.h b/qtensor/compression/cusz/include/kernel/spv_gpu.h index fb50119c..496dd4eb 100644 --- a/qtensor/compression/cusz/include/kernel/spv_gpu.h +++ b/qtensor/compression/cusz/include/kernel/spv_gpu.h @@ -1,42 +1,42 @@ -/** - * @file spv_gpu.h - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-10-29 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef B1B21251_C3C3_4BC1_B4E0_75D9D86EE7F3 -#define B1B21251_C3C3_4BC1_B4E0_75D9D86EE7F3 - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include - -#define SPV(Tliteral, Mliteral, T, M) \ - void spv_gather_T##Tliteral##_M##Mliteral( \ - T* in, size_t const in_len, T* d_val, uint32_t* d_idx, int* nnz, float* milliseconds, cudaStream_t stream); \ - \ - void spv_scatter_T##Tliteral##_M##Mliteral( \ - T* d_val, uint32_t* d_idx, int const nnz, T* decoded, float* milliseconds, cudaStream_t stream); - -SPV(ui8, ui32, uint8_t, uint32_t) -SPV(ui16, ui32, uint16_t, uint32_t) -SPV(ui32, ui32, uint32_t, uint32_t) -SPV(ui64, ui32, uint64_t, uint32_t) -SPV(fp32, ui32, float, uint32_t) -SPV(fp64, ui32, double, uint32_t) - -#undef SPV - -#ifdef __cplusplus -} -#endif - -#endif /* B1B21251_C3C3_4BC1_B4E0_75D9D86EE7F3 */ +/** + * @file spv_gpu.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-29 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef B1B21251_C3C3_4BC1_B4E0_75D9D86EE7F3 +#define B1B21251_C3C3_4BC1_B4E0_75D9D86EE7F3 + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +#define SPV(Tliteral, Mliteral, T, M) \ + void spv_gather_T##Tliteral##_M##Mliteral( \ + T* in, size_t const in_len, T* d_val, uint32_t* d_idx, int* nnz, float* milliseconds, cudaStream_t stream); \ + \ + void spv_scatter_T##Tliteral##_M##Mliteral( \ + T* d_val, uint32_t* d_idx, int const nnz, T* decoded, float* milliseconds, cudaStream_t stream); + +SPV(ui8, ui32, uint8_t, uint32_t) +SPV(ui16, ui32, uint16_t, uint32_t) +SPV(ui32, ui32, uint32_t, uint32_t) +SPV(ui64, ui32, uint64_t, uint32_t) +SPV(fp32, ui32, float, uint32_t) +SPV(fp64, ui32, double, uint32_t) + +#undef SPV + +#ifdef __cplusplus +} +#endif + +#endif /* B1B21251_C3C3_4BC1_B4E0_75D9D86EE7F3 */ diff --git a/qtensor/compression/cusz/include/kernel/spv_gpu.hh b/qtensor/compression/cusz/include/kernel/spv_gpu.hh index 6b978abc..c2f021df 100644 --- a/qtensor/compression/cusz/include/kernel/spv_gpu.hh +++ b/qtensor/compression/cusz/include/kernel/spv_gpu.hh @@ -1,33 +1,33 @@ -/** - * @file spv_gpu.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-10-29 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef A54D2009_1D4F_4113_9E26_9695A3669224 -#define A54D2009_1D4F_4113_9E26_9695A3669224 -#include - -namespace psz { - -template -void spv_gather( - T* in, - size_t const in_len, - T* d_val, - uint32_t* d_idx, - int* nnz, - float* milliseconds, - cudaStream_t stream); - -template -void spv_scatter(T* d_val, uint32_t* d_idx, int const nnz, T* decoded, float* milliseconds, cudaStream_t stream); - -} // namespace psz - -#endif /* A54D2009_1D4F_4113_9E26_9695A3669224 */ +/** + * @file spv_gpu.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-29 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef A54D2009_1D4F_4113_9E26_9695A3669224 +#define A54D2009_1D4F_4113_9E26_9695A3669224 +#include + +namespace psz { + +template +void spv_gather( + T* in, + size_t const in_len, + T* d_val, + uint32_t* d_idx, + int* nnz, + float* milliseconds, + cudaStream_t stream); + +template +void spv_scatter(T* d_val, uint32_t* d_idx, int const nnz, T* decoded, float* milliseconds, cudaStream_t stream); + +} // namespace psz + +#endif /* A54D2009_1D4F_4113_9E26_9695A3669224 */ diff --git a/qtensor/compression/cusz/include/kernel/v2_lorenzo.hh b/qtensor/compression/cusz/include/kernel/v2_lorenzo.hh index 861a2e2c..7c8d4ce0 100644 --- a/qtensor/compression/cusz/include/kernel/v2_lorenzo.hh +++ b/qtensor/compression/cusz/include/kernel/v2_lorenzo.hh @@ -1,32 +1,32 @@ -/** - * @file v2_lorenzo.hh - * @author Jiannan Tian - * @brief - * @version 0.4 - * @date 2023-01-23 - * - * (C) 2023 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef CD52BDA6_9376_43FF_BFDA_693204FA8762 -#define CD52BDA6_9376_43FF_BFDA_693204FA8762 - -#include "compaction.hh" -#include "cusz/type.h" - -template -cusz_error_status v2_compress_predict_lorenzo_i( - T* const data, // input - dim3 const data_len3, // - double const eb, // input (config) - int const radius, // - E* const eq, // output - dim3 const eq_len3, // - T* const anchor, // - dim3 const anchor_len3, // - CompactionDRAM outlier, // - float* time_elapsed, // optional - cudaStream_t stream); // - -#endif /* CD52BDA6_9376_43FF_BFDA_693204FA8762 */ +/** + * @file v2_lorenzo.hh + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-01-23 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef CD52BDA6_9376_43FF_BFDA_693204FA8762 +#define CD52BDA6_9376_43FF_BFDA_693204FA8762 + +#include "compaction.hh" +#include "cusz/type.h" + +template +cusz_error_status v2_compress_predict_lorenzo_i( + T* const data, // input + dim3 const data_len3, // + double const eb, // input (config) + int const radius, // + E* const eq, // output + dim3 const eq_len3, // + T* const anchor, // + dim3 const anchor_len3, // + CompactionDRAM outlier, // + float* time_elapsed, // optional + cudaStream_t stream); // + +#endif /* CD52BDA6_9376_43FF_BFDA_693204FA8762 */ diff --git a/qtensor/compression/cusz/include/pipeline/compaction_g.inl b/qtensor/compression/cusz/include/pipeline/compaction_g.inl index fd312c82..7a854101 100644 --- a/qtensor/compression/cusz/include/pipeline/compaction_g.inl +++ b/qtensor/compression/cusz/include/pipeline/compaction_g.inl @@ -1,73 +1,73 @@ -/** - * @file compaction_g.inl - * @author Jiannan Tian - * @brief - * @version 0.4 - * @date 2022-12-22 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef F712F74C_7488_4445_83EE_EE7F88A64BBA -#define F712F74C_7488_4445_83EE_EE7F88A64BBA - -#include -#include -#include "compaction.hh" - -#include -#include - -// TODO filename -> `compaction` -template -struct CompactionDRAM { - using type = T; - T* val; - uint32_t* idx; - uint32_t* count; - uint32_t* h_count; - - void allocate(size_t len, bool device = true) - { - if (device) { - cudaMalloc(&idx, sizeof(uint32_t) * len); - cudaMalloc(&val, sizeof(T) * len); - cudaMalloc(&count, sizeof(T) * 1); - cudaMallocHost(&h_count, sizeof(T) * 1); - } - else { - cudaMallocHost(&idx, sizeof(uint32_t) * len); - cudaMallocHost(&val, sizeof(T) * len); - cudaMallocHost(&count, sizeof(T) * 1); - - memset(count, 0x0, sizeof(T) * 1); - } - } - - void make_count_host_accessible(cudaStream_t stream) - { - cudaMemcpyAsync(h_count, count, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream); - } - - uint32_t access_count_on_host() { return *h_count; } - - void allocate_managed(size_t len) - { - cudaMallocManaged(&idx, sizeof(uint32_t) * len); - cudaMallocManaged(&val, sizeof(T) * len); - cudaMallocManaged(&count, sizeof(T) * 1); - - cudaMemset(count, 0x0, sizeof(T) * 1); - } - - void destroy() - { - if (h_count) cudaFreeHost(h_count); - cudaFree(idx); - cudaFree(val); - cudaFree(count); - } -}; - -#endif /* F712F74C_7488_4445_83EE_EE7F88A64BBA */ +/** + * @file compaction_g.inl + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2022-12-22 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef F712F74C_7488_4445_83EE_EE7F88A64BBA +#define F712F74C_7488_4445_83EE_EE7F88A64BBA + +#include +#include +#include "compaction.hh" + +#include +#include + +// TODO filename -> `compaction` +template +struct CompactionDRAM { + using type = T; + T* val; + uint32_t* idx; + uint32_t* count; + uint32_t* h_count; + + void allocate(size_t len, bool device = true) + { + if (device) { + cudaMalloc(&idx, sizeof(uint32_t) * len); + cudaMalloc(&val, sizeof(T) * len); + cudaMalloc(&count, sizeof(T) * 1); + cudaMallocHost(&h_count, sizeof(T) * 1); + } + else { + cudaMallocHost(&idx, sizeof(uint32_t) * len); + cudaMallocHost(&val, sizeof(T) * len); + cudaMallocHost(&count, sizeof(T) * 1); + + memset(count, 0x0, sizeof(T) * 1); + } + } + + void make_count_host_accessible(cudaStream_t stream) + { + cudaMemcpyAsync(h_count, count, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream); + } + + uint32_t access_count_on_host() { return *h_count; } + + void allocate_managed(size_t len) + { + cudaMallocManaged(&idx, sizeof(uint32_t) * len); + cudaMallocManaged(&val, sizeof(T) * len); + cudaMallocManaged(&count, sizeof(T) * 1); + + cudaMemset(count, 0x0, sizeof(T) * 1); + } + + void destroy() + { + if (h_count) cudaFreeHost(h_count); + cudaFree(idx); + cudaFree(val); + cudaFree(count); + } +}; + +#endif /* F712F74C_7488_4445_83EE_EE7F88A64BBA */ diff --git a/qtensor/compression/cusz/include/pipeline/v2_compressor.hh b/qtensor/compression/cusz/include/pipeline/v2_compressor.hh index fa843f5f..5e0c8a83 100644 --- a/qtensor/compression/cusz/include/pipeline/v2_compressor.hh +++ b/qtensor/compression/cusz/include/pipeline/v2_compressor.hh @@ -1,146 +1,146 @@ -/** - * @file v2_compressor.hh - * @author Jiannan Tian - * @brief - * @version 0.4 - * @date 2023-01-29 - * - * (C) 2023 by Indiana University, Argonne National Laboratory - * - */ - -#include -#include - -#include "common/type_traits.hh" -#include "compaction.hh" -#include "component.hh" -#include "context.hh" -#include "header.h" - -// TODO move outward -#include "compaction_g.inl" - -using Context = cusz::Context; - -namespace psz { - -template -class v2_Compressor { - public: - using BYTE = uint8_t; - - using T = typename CONFIG::Predictor::Origin; - using FP = typename CONFIG::Predictor::Precision; - using E = typename CONFIG::Predictor::ErrCtrl; - using H = typename CONFIG::Codec::Encoded; - using M = typename CONFIG::Codec::MetadataT; - using H_FB = typename CONFIG::FallbackCodec::Encoded; - - using TimeRecord = std::vector>; - using timerecord_t = TimeRecord*; - - private: - class impl; - std::unique_ptr pimpl; - - public: - ~v2_Compressor(); - v2_Compressor(); - v2_Compressor(const v2_Compressor&); - v2_Compressor& operator=(const v2_Compressor&); - v2_Compressor(v2_Compressor&&); - v2_Compressor& operator=(v2_Compressor&&); - - // methods - void init(Context*); - void init(v2_header*); - void destroy(); - void compress(Context*, T*, BYTE*&, size_t&, cudaStream_t = nullptr, bool = false); - void decompress(v2_header*, BYTE*, T*, cudaStream_t = nullptr, bool = true); - void clear_buffer(); - // getter - void export_header(v2_header&); - void export_header(v2_header*); - void export_timerecord(TimeRecord*); -}; - -template -class v2_Compressor::impl { - public: - using Codec = typename CONFIG::Codec; - using BYTE = uint8_t; - using T = typename CONFIG::Predictor::Origin; - using FP = typename CONFIG::Predictor::Precision; - using EQ = uint32_t; - using H = typename CONFIG::Codec::Encoded; - using M = uint32_t; - using IDX = uint32_t; - using H_FB = typename CONFIG::FallbackCodec::Encoded; - - using TimeRecord = std::vector>; - using timerecord_t = TimeRecord*; - - private: - // state - // bool use_fallback_codec{false}; - // bool fallback_codec_allocated{false}; - - BYTE* d_reserved_for_archive{nullptr}; - - // profiling - // TimeRecord timerecord; - // header - v2_header header; - // components - - Codec* codec; - - // arrays - T* d_anchor; - uint32_t* d_errctrl; - uint32_t* d_freq; - CompactionDRAM outlier; - - int sp_factor{20}; - - struct { - float construct, hist, encode; - } comp_time; - - struct { - float scatter, decode, reconstruct; - } decomp_time; - - dim3 data_len3; - size_t data_len; - - public: - ~impl(); - impl(); - - // public methods - void init(Context* config); - void init(v2_header* config); - - void compress(Context*, T*, BYTE*&, size_t&, cudaStream_t = nullptr, bool = false); - void decompress(v2_header*, BYTE*, T*, cudaStream_t = nullptr, bool = true); - - // getter - void export_header(v2_header&); - void export_header(v2_header*); - // void export_timerecord(TimeRecord*); - BYTE* var_archive() { return d_reserved_for_archive; }; - - private: - // helper - template - void __init(ContextOrHeader*); - - // void collect_compress_timerecord(); - // void collect_decompress_timerecord(); - void destroy(); - // getter -}; - -} // namespace psz +/** + * @file v2_compressor.hh + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-01-29 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#include +#include + +#include "common/type_traits.hh" +#include "compaction.hh" +#include "component.hh" +#include "context.hh" +#include "header.h" + +// TODO move outward +#include "compaction_g.inl" + +using Context = cusz::Context; + +namespace psz { + +template +class v2_Compressor { + public: + using BYTE = uint8_t; + + using T = typename CONFIG::Predictor::Origin; + using FP = typename CONFIG::Predictor::Precision; + using E = typename CONFIG::Predictor::ErrCtrl; + using H = typename CONFIG::Codec::Encoded; + using M = typename CONFIG::Codec::MetadataT; + using H_FB = typename CONFIG::FallbackCodec::Encoded; + + using TimeRecord = std::vector>; + using timerecord_t = TimeRecord*; + + private: + class impl; + std::unique_ptr pimpl; + + public: + ~v2_Compressor(); + v2_Compressor(); + v2_Compressor(const v2_Compressor&); + v2_Compressor& operator=(const v2_Compressor&); + v2_Compressor(v2_Compressor&&); + v2_Compressor& operator=(v2_Compressor&&); + + // methods + void init(Context*); + void init(v2_header*); + void destroy(); + void compress(Context*, T*, BYTE*&, size_t&, cudaStream_t = nullptr, bool = false); + void decompress(v2_header*, BYTE*, T*, cudaStream_t = nullptr, bool = true); + void clear_buffer(); + // getter + void export_header(v2_header&); + void export_header(v2_header*); + void export_timerecord(TimeRecord*); +}; + +template +class v2_Compressor::impl { + public: + using Codec = typename CONFIG::Codec; + using BYTE = uint8_t; + using T = typename CONFIG::Predictor::Origin; + using FP = typename CONFIG::Predictor::Precision; + using EQ = uint32_t; + using H = typename CONFIG::Codec::Encoded; + using M = uint32_t; + using IDX = uint32_t; + using H_FB = typename CONFIG::FallbackCodec::Encoded; + + using TimeRecord = std::vector>; + using timerecord_t = TimeRecord*; + + private: + // state + // bool use_fallback_codec{false}; + // bool fallback_codec_allocated{false}; + + BYTE* d_reserved_for_archive{nullptr}; + + // profiling + // TimeRecord timerecord; + // header + v2_header header; + // components + + Codec* codec; + + // arrays + T* d_anchor; + uint32_t* d_errctrl; + uint32_t* d_freq; + CompactionDRAM outlier; + + int sp_factor{20}; + + struct { + float construct, hist, encode; + } comp_time; + + struct { + float scatter, decode, reconstruct; + } decomp_time; + + dim3 data_len3; + size_t data_len; + + public: + ~impl(); + impl(); + + // public methods + void init(Context* config); + void init(v2_header* config); + + void compress(Context*, T*, BYTE*&, size_t&, cudaStream_t = nullptr, bool = false); + void decompress(v2_header*, BYTE*, T*, cudaStream_t = nullptr, bool = true); + + // getter + void export_header(v2_header&); + void export_header(v2_header*); + // void export_timerecord(TimeRecord*); + BYTE* var_archive() { return d_reserved_for_archive; }; + + private: + // helper + template + void __init(ContextOrHeader*); + + // void collect_compress_timerecord(); + // void collect_decompress_timerecord(); + void destroy(); + // getter +}; + +} // namespace psz diff --git a/qtensor/compression/cusz/include/stat/compare.h b/qtensor/compression/cusz/include/stat/compare.h index 9575d72a..bc60fb0b 100644 --- a/qtensor/compression/cusz/include/stat/compare.h +++ b/qtensor/compression/cusz/include/stat/compare.h @@ -1,57 +1,57 @@ -/** - * @file compare.h - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-10-09 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef CE05A256_23CB_4243_8839_B1FDA9C540D2 -#define CE05A256_23CB_4243_8839_B1FDA9C540D2 - -#ifdef __cplus_plus -extern "C" { -#endif - -#include -#include -#include "../cusz/type.h" - -#define DESCRIPTION(Tliteral, T) void thrustgpu_get_extrema_rawptr_T##Tliteral(T* d_ptr, size_t len, T res[4]); - -#define COMPARE_LOSSLESS(Tliteral, T) \ - bool cppstd_identical_T##Tliteral(T* d1, T* d2, size_t const len); \ - bool thrustgpu_identical_T##Tliteral(T* d1, T* d2, size_t const len); - -#define COMPARE_LOSSY(Tliteral, T) \ - bool cppstd_error_bounded_T##Tliteral(T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx); \ - void cppstd_assess_quality_T##Tliteral(cusz_stats* s, T* xdata, T* odata, size_t const len); \ - \ - bool thrustgpu_error_bounded_T##Tliteral(T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx); \ - void thrustgpu_assess_quality_T##Tliteral(cusz_stats* s, T* xdata, T* odata, size_t const len); - -DESCRIPTION(ui8, uint8_t) -DESCRIPTION(ui16, uint16_t) -DESCRIPTION(ui32, uint32_t) -DESCRIPTION(fp32, float) -DESCRIPTION(fp64, double) - -COMPARE_LOSSLESS(fp32, float) -COMPARE_LOSSLESS(fp64, double) -COMPARE_LOSSLESS(ui8, uint8_t) -COMPARE_LOSSLESS(ui16, uint16_t) -COMPARE_LOSSLESS(ui32, uint32_t) - -COMPARE_LOSSY(fp32, float) -COMPARE_LOSSY(fp64, double) - -#undef CPPSTD_COMPARE - -#ifdef __cplus_plus -} -#endif - -#endif /* CE05A256_23CB_4243_8839_B1FDA9C540D2 */ +/** + * @file compare.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-09 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef CE05A256_23CB_4243_8839_B1FDA9C540D2 +#define CE05A256_23CB_4243_8839_B1FDA9C540D2 + +#ifdef __cplus_plus +extern "C" { +#endif + +#include +#include +#include "../cusz/type.h" + +#define DESCRIPTION(Tliteral, T) void thrustgpu_get_extrema_rawptr_T##Tliteral(T* d_ptr, size_t len, T res[4]); + +#define COMPARE_LOSSLESS(Tliteral, T) \ + bool cppstd_identical_T##Tliteral(T* d1, T* d2, size_t const len); \ + bool thrustgpu_identical_T##Tliteral(T* d1, T* d2, size_t const len); + +#define COMPARE_LOSSY(Tliteral, T) \ + bool cppstd_error_bounded_T##Tliteral(T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx); \ + void cppstd_assess_quality_T##Tliteral(cusz_stats* s, T* xdata, T* odata, size_t const len); \ + \ + bool thrustgpu_error_bounded_T##Tliteral(T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx); \ + void thrustgpu_assess_quality_T##Tliteral(cusz_stats* s, T* xdata, T* odata, size_t const len); + +DESCRIPTION(ui8, uint8_t) +DESCRIPTION(ui16, uint16_t) +DESCRIPTION(ui32, uint32_t) +DESCRIPTION(fp32, float) +DESCRIPTION(fp64, double) + +COMPARE_LOSSLESS(fp32, float) +COMPARE_LOSSLESS(fp64, double) +COMPARE_LOSSLESS(ui8, uint8_t) +COMPARE_LOSSLESS(ui16, uint16_t) +COMPARE_LOSSLESS(ui32, uint32_t) + +COMPARE_LOSSY(fp32, float) +COMPARE_LOSSY(fp64, double) + +#undef CPPSTD_COMPARE + +#ifdef __cplus_plus +} +#endif + +#endif /* CE05A256_23CB_4243_8839_B1FDA9C540D2 */ diff --git a/qtensor/compression/cusz/include/stat/compare_cpu.hh b/qtensor/compression/cusz/include/stat/compare_cpu.hh index 19846adc..3cd6c421 100644 --- a/qtensor/compression/cusz/include/stat/compare_cpu.hh +++ b/qtensor/compression/cusz/include/stat/compare_cpu.hh @@ -1,62 +1,62 @@ -/** - * @file compare_cpu.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-10-09 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef C93C3857_8821_4988_B6F0_4E885060F642 -#define C93C3857_8821_4988_B6F0_4E885060F642 - -#include "compare.h" - -namespace psz { - -template -bool cppstd_identical(T* d1, T* d2, size_t const len); - -template -bool cppstd_error_bounded(T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx); - -template -void cppstd_assess_quality(cusz_stats* s, T* xdata, T* odata, size_t const len); - -} // namespace psz - -#define CPPSTD_COMPARE_LOSSLESS(Tliteral, T) \ - template <> \ - bool psz::cppstd_identical(T * d1, T * d2, size_t const len) \ - { \ - return cppstd_identical_T##Tliteral(d1, d2, len); \ - } - -#define CPPSTD_COMPARE_LOSSY(Tliteral, T) \ - template <> \ - bool psz::cppstd_error_bounded(T * a, T * b, size_t const len, double const eb, size_t* first_faulty_idx) \ - { \ - return cppstd_error_bounded_T##Tliteral(a, b, len, eb, first_faulty_idx); \ - } \ - \ - template <> \ - void psz::cppstd_assess_quality(cusz_stats * s, T * xdata, T * odata, size_t const len) \ - { \ - cppstd_assess_quality_T##Tliteral(s, xdata, odata, len); \ - } - -CPPSTD_COMPARE_LOSSLESS(fp32, float) -CPPSTD_COMPARE_LOSSLESS(fp64, double) -CPPSTD_COMPARE_LOSSLESS(ui8, uint8_t) -CPPSTD_COMPARE_LOSSLESS(ui16, uint16_t) -CPPSTD_COMPARE_LOSSLESS(ui32, uint32_t) - -CPPSTD_COMPARE_LOSSY(fp32, float); -CPPSTD_COMPARE_LOSSY(fp64, double); - -#undef CPPSTD_COMPARE_LOSSLESS -#undef CPPSTD_COMPARE_LOSSY - -#endif /* C93C3857_8821_4988_B6F0_4E885060F642 */ +/** + * @file compare_cpu.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-09 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef C93C3857_8821_4988_B6F0_4E885060F642 +#define C93C3857_8821_4988_B6F0_4E885060F642 + +#include "compare.h" + +namespace psz { + +template +bool cppstd_identical(T* d1, T* d2, size_t const len); + +template +bool cppstd_error_bounded(T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx); + +template +void cppstd_assess_quality(cusz_stats* s, T* xdata, T* odata, size_t const len); + +} // namespace psz + +#define CPPSTD_COMPARE_LOSSLESS(Tliteral, T) \ + template <> \ + bool psz::cppstd_identical(T * d1, T * d2, size_t const len) \ + { \ + return cppstd_identical_T##Tliteral(d1, d2, len); \ + } + +#define CPPSTD_COMPARE_LOSSY(Tliteral, T) \ + template <> \ + bool psz::cppstd_error_bounded(T * a, T * b, size_t const len, double const eb, size_t* first_faulty_idx) \ + { \ + return cppstd_error_bounded_T##Tliteral(a, b, len, eb, first_faulty_idx); \ + } \ + \ + template <> \ + void psz::cppstd_assess_quality(cusz_stats * s, T * xdata, T * odata, size_t const len) \ + { \ + cppstd_assess_quality_T##Tliteral(s, xdata, odata, len); \ + } + +CPPSTD_COMPARE_LOSSLESS(fp32, float) +CPPSTD_COMPARE_LOSSLESS(fp64, double) +CPPSTD_COMPARE_LOSSLESS(ui8, uint8_t) +CPPSTD_COMPARE_LOSSLESS(ui16, uint16_t) +CPPSTD_COMPARE_LOSSLESS(ui32, uint32_t) + +CPPSTD_COMPARE_LOSSY(fp32, float); +CPPSTD_COMPARE_LOSSY(fp64, double); + +#undef CPPSTD_COMPARE_LOSSLESS +#undef CPPSTD_COMPARE_LOSSY + +#endif /* C93C3857_8821_4988_B6F0_4E885060F642 */ diff --git a/qtensor/compression/cusz/include/stat/compare_gpu.hh b/qtensor/compression/cusz/include/stat/compare_gpu.hh index 482c2fab..78013ca7 100644 --- a/qtensor/compression/cusz/include/stat/compare_gpu.hh +++ b/qtensor/compression/cusz/include/stat/compare_gpu.hh @@ -1,33 +1,33 @@ -/** - * @file compare_gpu.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-10-09 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef B0EE0E82_B3AA_4946_A589_A3A6A83DD862 -#define B0EE0E82_B3AA_4946_A589_A3A6A83DD862 - -#include "compare.h" - -namespace psz { - -template -void thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]); - -template -bool thrustgpu_identical(T* d1, T* d2, size_t const len); - -template -bool thrustgpu_error_bounded(T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx); - -template -void thrustgpu_assess_quality(cusz_stats* s, T* xdata, T* odata, size_t const len); - -} // namespace psz - -#endif /* B0EE0E82_B3AA_4946_A589_A3A6A83DD862 */ +/** + * @file compare_gpu.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-09 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef B0EE0E82_B3AA_4946_A589_A3A6A83DD862 +#define B0EE0E82_B3AA_4946_A589_A3A6A83DD862 + +#include "compare.h" + +namespace psz { + +template +void thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]); + +template +bool thrustgpu_identical(T* d1, T* d2, size_t const len); + +template +bool thrustgpu_error_bounded(T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx); + +template +void thrustgpu_assess_quality(cusz_stats* s, T* xdata, T* odata, size_t const len); + +} // namespace psz + +#endif /* B0EE0E82_B3AA_4946_A589_A3A6A83DD862 */ diff --git a/qtensor/compression/cusz/include/stat/stat.h b/qtensor/compression/cusz/include/stat/stat.h index 971d94bc..ade8deea 100644 --- a/qtensor/compression/cusz/include/stat/stat.h +++ b/qtensor/compression/cusz/include/stat/stat.h @@ -1,29 +1,29 @@ -/** - * @file stat.h - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-11-02 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef BBBB5712_FF60_4262_B927_85B113FD26BA -#define BBBB5712_FF60_4262_B927_85B113FD26BA - -#include "cusz/type.h" - -#define HIST_C(Tname, T) \ - cusz_error_status histogram_T##Tname( \ - T* in_data, size_t const in_len, uint32_t* out_freq, int const num_buckets, float* milliseconds, \ - cudaStream_t stream); - -HIST_C(ui8, uint8_t) -HIST_C(ui16, uint16_t) -HIST_C(ui32, uint32_t) -HIST_C(ui64, uint64_t) - -#undef HIST_C - -#endif /* BBBB5712_FF60_4262_B927_85B113FD26BA */ +/** + * @file stat.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-02 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef BBBB5712_FF60_4262_B927_85B113FD26BA +#define BBBB5712_FF60_4262_B927_85B113FD26BA + +#include "cusz/type.h" + +#define HIST_C(Tname, T) \ + cusz_error_status histogram_T##Tname( \ + T* in_data, size_t const in_len, uint32_t* out_freq, int const num_buckets, float* milliseconds, \ + cudaStream_t stream); + +HIST_C(ui8, uint8_t) +HIST_C(ui16, uint16_t) +HIST_C(ui32, uint32_t) +HIST_C(ui64, uint64_t) + +#undef HIST_C + +#endif /* BBBB5712_FF60_4262_B927_85B113FD26BA */ diff --git a/qtensor/compression/cusz/include/stat/stat.hh b/qtensor/compression/cusz/include/stat/stat.hh index 636192a4..fedf6417 100644 --- a/qtensor/compression/cusz/include/stat/stat.hh +++ b/qtensor/compression/cusz/include/stat/stat.hh @@ -1,15 +1,15 @@ -/** - * @file stat.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-11-02 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef B005D07B_D92D_4DF0_90D0_87A7B7C310C9 -#define B005D07B_D92D_4DF0_90D0_87A7B7C310C9 - -#endif /* B005D07B_D92D_4DF0_90D0_87A7B7C310C9 */ +/** + * @file stat.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-02 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef B005D07B_D92D_4DF0_90D0_87A7B7C310C9 +#define B005D07B_D92D_4DF0_90D0_87A7B7C310C9 + +#endif /* B005D07B_D92D_4DF0_90D0_87A7B7C310C9 */ diff --git a/qtensor/compression/cusz/include/stat/stat_g.hh b/qtensor/compression/cusz/include/stat/stat_g.hh index a76ea6f9..45f2f84d 100644 --- a/qtensor/compression/cusz/include/stat/stat_g.hh +++ b/qtensor/compression/cusz/include/stat/stat_g.hh @@ -1,44 +1,44 @@ -/** - * @file stat_g.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-11-02 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef D8B68EB9_A86B_4AEA_AD4C_3DF22827E7C3 -#define D8B68EB9_A86B_4AEA_AD4C_3DF22827E7C3 - -#include -#include "cusz/type.h" - -namespace asz { -namespace stat { - -/** - * @brief Get frequency: a kernel wrapper - * - * @tparam T input type - * @param in_data input device array - * @param in_len input host var; len of in_data - * @param out_freq output device array - * @param nbin input host var; len of out_freq - * @param milliseconds output time elapsed - * @param stream optional stream - */ -template -cusz_error_status histogram( - T* in_data, - size_t const in_len, - uint32_t* out_freq, - int const nbin, - float* milliseconds, - cudaStream_t stream = nullptr); - -} // namespace stat -} // namespace asz - -#endif /* D8B68EB9_A86B_4AEA_AD4C_3DF22827E7C3 */ +/** + * @file stat_g.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-02 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef D8B68EB9_A86B_4AEA_AD4C_3DF22827E7C3 +#define D8B68EB9_A86B_4AEA_AD4C_3DF22827E7C3 + +#include +#include "cusz/type.h" + +namespace asz { +namespace stat { + +/** + * @brief Get frequency: a kernel wrapper + * + * @tparam T input type + * @param in_data input device array + * @param in_len input host var; len of in_data + * @param out_freq output device array + * @param nbin input host var; len of out_freq + * @param milliseconds output time elapsed + * @param stream optional stream + */ +template +cusz_error_status histogram( + T* in_data, + size_t const in_len, + uint32_t* out_freq, + int const nbin, + float* milliseconds, + cudaStream_t stream = nullptr); + +} // namespace stat +} // namespace asz + +#endif /* D8B68EB9_A86B_4AEA_AD4C_3DF22827E7C3 */ diff --git a/qtensor/compression/cusz/include/utils.hh b/qtensor/compression/cusz/include/utils.hh index 68ec1d2b..fd15517c 100644 --- a/qtensor/compression/cusz/include/utils.hh +++ b/qtensor/compression/cusz/include/utils.hh @@ -1,21 +1,21 @@ -/** - * @file utils.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2021-07-12 - * - * (C) 2021 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef UTILS_HH -#define UTILS_HH - -#include "utils/cuda_err.cuh" -#include "utils/cuda_mem.cuh" -#include "utils/format.hh" -#include "utils/io.hh" -#include "utils/strhelper.hh" - +/** + * @file utils.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-07-12 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef UTILS_HH +#define UTILS_HH + +#include "utils/cuda_err.cuh" +#include "utils/cuda_mem.cuh" +#include "utils/format.hh" +#include "utils/io.hh" +#include "utils/strhelper.hh" + #endif \ No newline at end of file diff --git a/qtensor/compression/cusz/include/utils/cuda_err.cuh b/qtensor/compression/cusz/include/utils/cuda_err.cuh index 0812c60e..5b80b04b 100644 --- a/qtensor/compression/cusz/include/utils/cuda_err.cuh +++ b/qtensor/compression/cusz/include/utils/cuda_err.cuh @@ -1,185 +1,185 @@ -#ifndef CUDA_ERR_CUH -#define CUDA_ERR_CUH - -/** - * @file cuda_err.cuh - * @author Jiannan Tian - * @brief CUDA runtime error handling macros. - * @version 0.2 - * @date 2020-09-20 - * Created on: 2019-10-08 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#include -#include -#include -#include -#include - -struct cusz_cuda_exception : public std::exception { - cusz_cuda_exception(const char* err, int err_code, const char* file, int line) { - std::stringstream ss; - ss << "CUDA API failed at \e[31m\e[1m" << file << ':' << line << "\e[0m with error: " << err << '(' << err_code << ')'; - err_msg = ss.str(); - } - const char* what() const noexcept { - return err_msg.c_str(); - } - std::string err_msg; -}; - -// back compatibility start -static void HandleError(cudaError_t err, const char* file, int line) -{ - if (err != cudaSuccess) { - throw cusz_cuda_exception(cudaGetErrorString(err), err, file, line); - } -} -#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__)) -// back compatibility end - -static void check_cuda_error(cudaError_t status, const char* file, int line) -{ - if (cudaSuccess != status) { - /* - printf("\nCUDA error/status reference (as of CUDA 11):\n"); - printf("cudaSuccess -> %d\n", cudaSuccess); - printf("cudaErrorInvalidValue -> %d\n", cudaErrorInvalidValue); - printf("cudaErrorMemoryAllocation -> %d\n", cudaErrorMemoryAllocation); - printf("cudaErrorInitializationError -> %d\n", cudaErrorInitializationError); - printf("cudaErrorCudartUnloading -> %d\n", cudaErrorCudartUnloading); - printf("cudaErrorProfilerDisabled -> %d\n", cudaErrorProfilerDisabled); - printf("cudaErrorProfilerNotInitialized (Deprecated)-> %d\n", cudaErrorProfilerNotInitialized); - printf("cudaErrorProfilerAlreadyStarted (Deprecated)-> %d\n", cudaErrorProfilerAlreadyStarted); - printf("cudaErrorProfilerAlreadyStopped (Deprecated)-> %d\n", cudaErrorProfilerAlreadyStopped); - printf("cudaErrorInvalidConfiguration -> %d\n", cudaErrorInvalidConfiguration); - printf("cudaErrorInvalidPitchValue -> %d\n", cudaErrorInvalidPitchValue); - printf("cudaErrorInvalidSymbol -> %d\n", cudaErrorInvalidSymbol); - printf("cudaErrorInvalidHostPointer (Deprecated)-> %d\n", cudaErrorInvalidHostPointer); - printf("cudaErrorInvalidDevicePointer (Deprecated)-> %d\n", cudaErrorInvalidDevicePointer); - printf("cudaErrorInvalidTexture -> %d\n", cudaErrorInvalidTexture); - printf("cudaErrorInvalidTextureBinding -> %d\n", cudaErrorInvalidTextureBinding); - printf("cudaErrorInvalidChannelDescriptor -> %d\n", cudaErrorInvalidChannelDescriptor); - printf("cudaErrorInvalidMemcpyDirection -> %d\n", cudaErrorInvalidMemcpyDirection); - printf("cudaErrorAddressOfConstant (Deprecated)-> %d\n", cudaErrorAddressOfConstant); - printf("cudaErrorTextureFetchFailed (Deprecated)-> %d\n", cudaErrorTextureFetchFailed); - printf("cudaErrorTextureNotBound (Deprecated)-> %d\n", cudaErrorTextureNotBound); - printf("cudaErrorSynchronizationError (Deprecated)-> %d\n", cudaErrorSynchronizationError); - printf("cudaErrorInvalidFilterSetting -> %d\n", cudaErrorInvalidFilterSetting); - printf("cudaErrorInvalidNormSetting -> %d\n", cudaErrorInvalidNormSetting); - printf("cudaErrorMixedDeviceExecution (Deprecated)-> %d\n", cudaErrorMixedDeviceExecution); - printf("cudaErrorNotYetImplemented (Deprecated)-> %d\n", cudaErrorNotYetImplemented); - printf("cudaErrorMemoryValueTooLarge (Deprecated)-> %d\n", cudaErrorMemoryValueTooLarge); - printf("cudaErrorInsufficientDriver -> %d\n", cudaErrorInsufficientDriver); - printf("cudaErrorInvalidSurface -> %d\n", cudaErrorInvalidSurface); - printf("cudaErrorDuplicateVariableName -> %d\n", cudaErrorDuplicateVariableName); - printf("cudaErrorDuplicateTextureName -> %d\n", cudaErrorDuplicateTextureName); - printf("cudaErrorDuplicateSurfaceName -> %d\n", cudaErrorDuplicateSurfaceName); - printf("cudaErrorDevicesUnavailable -> %d\n", cudaErrorDevicesUnavailable); - printf("cudaErrorIncompatibleDriverContext -> %d\n", cudaErrorIncompatibleDriverContext); - printf("cudaErrorMissingConfiguration -> %d\n", cudaErrorMissingConfiguration); - printf("cudaErrorPriorLaunchFailure (Deprecated)-> %d\n", cudaErrorPriorLaunchFailure); - printf("cudaErrorLaunchMaxDepthExceeded -> %d\n", cudaErrorLaunchMaxDepthExceeded); - printf("cudaErrorLaunchFileScopedTex -> %d\n", cudaErrorLaunchFileScopedTex); - printf("cudaErrorLaunchFileScopedSurf -> %d\n", cudaErrorLaunchFileScopedSurf); - printf("cudaErrorSyncDepthExceeded -> %d\n", cudaErrorSyncDepthExceeded); - printf("cudaErrorLaunchPendingCountExceeded -> %d\n", cudaErrorLaunchPendingCountExceeded); - printf("cudaErrorInvalidDeviceFunction -> %d\n", cudaErrorInvalidDeviceFunction); - printf("cudaErrorNoDevice -> %d\n", cudaErrorNoDevice); - printf("cudaErrorInvalidDevice -> %d\n", cudaErrorInvalidDevice); - printf("cudaErrorStartupFailure -> %d\n", cudaErrorStartupFailure); - printf("cudaErrorInvalidKernelImage -> %d\n", cudaErrorInvalidKernelImage); - #if (CUDART_VERSION == 1100) - printf("cudaErrorDeviceUninitialized -> %d\n", cudaErrorDeviceUninitialized); - #endif - printf("cudaErrorMapBufferObjectFailed -> %d\n", cudaErrorMapBufferObjectFailed); - printf("cudaErrorUnmapBufferObjectFailed -> %d\n", cudaErrorUnmapBufferObjectFailed); - #if (CUDART_VERSION == 1010) - printf("cudaErrorArrayIsMapped -> %d\n", cudaErrorArrayIsMapped); - printf("cudaErrorAlreadyMapped -> %d\n", cudaErrorAlreadyMapped); - #endif - printf("cudaErrorNoKernelImageForDevice -> %d\n", cudaErrorNoKernelImageForDevice); - #if (CUDART_VERSION == 1010) - printf("cudaErrorAlreadyAcquired -> %d\n", cudaErrorAlreadyAcquired); - printf("cudaErrorNotMapped -> %d\n", cudaErrorNotMapped); - printf("cudaErrorNotMappedAsArray -> %d\n", cudaErrorNotMappedAsArray); - printf("cudaErrorNotMappedAsPointer -> %d\n", cudaErrorNotMappedAsPointer); - #endif - printf("cudaErrorECCUncorrectable -> %d\n", cudaErrorECCUncorrectable); - printf("cudaErrorUnsupportedLimit -> %d\n", cudaErrorUnsupportedLimit); - printf("cudaErrorDeviceAlreadyInUse -> %d\n", cudaErrorDeviceAlreadyInUse); - printf("cudaErrorPeerAccessUnsupported -> %d\n", cudaErrorPeerAccessUnsupported); - printf("cudaErrorInvalidPtx -> %d\n", cudaErrorInvalidPtx); - printf("cudaErrorInvalidGraphicsContext -> %d\n", cudaErrorInvalidGraphicsContext); - printf("cudaErrorNvlinkUncorrectable -> %d\n", cudaErrorNvlinkUncorrectable); - printf("cudaErrorJitCompilerNotFound -> %d\n", cudaErrorJitCompilerNotFound); - #if (CUDART_VERSION == 1010) - printf("cudaErrorInvalidSource -> %d\n", cudaErrorInvalidSource); - printf("cudaErrorFileNotFound -> %d\n", cudaErrorFileNotFound); - #endif - printf("cudaErrorSharedObjectSymbolNotFound -> %d\n", cudaErrorSharedObjectSymbolNotFound); - printf("cudaErrorSharedObjectInitFailed -> %d\n", cudaErrorSharedObjectInitFailed); - printf("cudaErrorOperatingSystem -> %d\n", cudaErrorOperatingSystem); - printf("cudaErrorInvalidResourceHandle -> %d\n", cudaErrorInvalidResourceHandle); - #if (CUDART_VERSION == 1010) - printf("cudaErrorIllegalState -> %d\n", cudaErrorIllegalState); - printf("cudaErrorSymbolNotFound -> %d\n", cudaErrorSymbolNotFound); - #endif - printf("cudaErrorNotReady -> %d\n", cudaErrorNotReady); - printf("cudaErrorIllegalAddress -> %d\n", cudaErrorIllegalAddress); - printf("cudaErrorLaunchOutOfResources -> %d\n", cudaErrorLaunchOutOfResources); - printf("cudaErrorLaunchTimeout -> %d\n", cudaErrorLaunchTimeout); - #if (CUDART_VERSION == 1010) - printf("cudaErrorLaunchIncompatibleTexturing-> %d\n", cudaErrorLaunchIncompatibleTexturing); - #endif - printf("cudaErrorPeerAccessAlreadyEnabled -> %d\n", cudaErrorPeerAccessAlreadyEnabled); - printf("cudaErrorPeerAccessNotEnabled -> %d\n", cudaErrorPeerAccessNotEnabled); - printf("cudaErrorSetOnActiveProcess -> %d\n", cudaErrorSetOnActiveProcess); - #if (CUDART_VERSION == 1010) - printf("cudaErrorContextIsDestroyed -> %d\n", cudaErrorContextIsDestroyed); - #endif - printf("cudaErrorAssert -> %d\n", cudaErrorAssert); - printf("cudaErrorTooManyPeers -> %d\n", cudaErrorTooManyPeers); - printf("cudaErrorHostMemoryAlreadyRegistered-> %d\n", cudaErrorHostMemoryAlreadyRegistered); - printf("cudaErrorHostMemoryNotRegistered -> %d\n", cudaErrorHostMemoryNotRegistered); - printf("cudaErrorHardwareStackError -> %d\n", cudaErrorHardwareStackError); - printf("cudaErrorIllegalInstruction -> %d\n", cudaErrorIllegalInstruction); - printf("cudaErrorMisalignedAddress -> %d\n", cudaErrorMisalignedAddress); - printf("cudaErrorInvalidAddressSpace -> %d\n", cudaErrorInvalidAddressSpace); - printf("cudaErrorInvalidPc -> %d\n", cudaErrorInvalidPc); - printf("cudaErrorLaunchFailure -> %d\n", cudaErrorLaunchFailure); - printf("cudaErrorCooperativeLaunchTooLarge -> %d\n", cudaErrorCooperativeLaunchTooLarge); - printf("cudaErrorNotPermitted -> %d\n", cudaErrorNotPermitted); - printf("cudaErrorNotSupported -> %d\n", cudaErrorNotSupported); - #if (CUDART_VERSION == 1010) - printf("cudaErrorSystemNotReady -> %d\n", cudaErrorSystemNotReady); - printf("cudaErrorSystemDriverMismatch -> %d\n", cudaErrorSystemDriverMismatch); - printf("cudaErrorCompatNotSupportedOnDevice -> %d\n", cudaErrorCompatNotSupportedOnDevice); - printf("cudaErrorStreamCaptureUnsupported -> %d\n", cudaErrorStreamCaptureUnsupported); - printf("cudaErrorStreamCaptureInvalidated -> %d\n", cudaErrorStreamCaptureInvalidated); - printf("cudaErrorStreamCaptureMerge -> %d\n", cudaErrorStreamCaptureMerge); - printf("cudaErrorStreamCaptureUnmatched -> %d\n", cudaErrorStreamCaptureUnmatched); - printf("cudaErrorStreamCaptureUnjoined -> %d\n", cudaErrorStreamCaptureUnjoined); - printf("cudaErrorStreamCaptureIsolation -> %d\n", cudaErrorStreamCaptureIsolation); - printf("cudaErrorStreamCaptureImplicit -> %d\n", cudaErrorStreamCaptureImplicit); - printf("cudaErrorCapturedEvent -> %d\n", cudaErrorCapturedEvent); - printf("cudaErrorStreamCaptureWrongThread -> %d\n", cudaErrorStreamCaptureWrongThread); - #endif - #if (CUDART_VERSION == 1100) - printf("cudaErrorTimeout -> %d\n", cudaErrorTimeout); - printf("cudaErrorGraphExecUpdateFailure -> %d\n", cudaErrorGraphExecUpdateFailure); - #endif - printf("cudaErrorUnknown -> %d\n", cudaErrorUnknown); - printf("cudaErrorApiFailureBase (Deprecated)-> %d\n", cudaErrorApiFailureBase); - */ - throw cusz_cuda_exception(cudaGetErrorString(status), status, file, line); - } -} - -#define CHECK_CUDA(err) (check_cuda_error(err, __FILE__, __LINE__)) - -#endif +#ifndef CUDA_ERR_CUH +#define CUDA_ERR_CUH + +/** + * @file cuda_err.cuh + * @author Jiannan Tian + * @brief CUDA runtime error handling macros. + * @version 0.2 + * @date 2020-09-20 + * Created on: 2019-10-08 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include +#include +#include +#include +#include + +struct cusz_cuda_exception : public std::exception { + cusz_cuda_exception(const char* err, int err_code, const char* file, int line) { + std::stringstream ss; + ss << "CUDA API failed at \e[31m\e[1m" << file << ':' << line << "\e[0m with error: " << err << '(' << err_code << ')'; + err_msg = ss.str(); + } + const char* what() const noexcept { + return err_msg.c_str(); + } + std::string err_msg; +}; + +// back compatibility start +static void HandleError(cudaError_t err, const char* file, int line) +{ + if (err != cudaSuccess) { + throw cusz_cuda_exception(cudaGetErrorString(err), err, file, line); + } +} +#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__)) +// back compatibility end + +static void check_cuda_error(cudaError_t status, const char* file, int line) +{ + if (cudaSuccess != status) { + /* + printf("\nCUDA error/status reference (as of CUDA 11):\n"); + printf("cudaSuccess -> %d\n", cudaSuccess); + printf("cudaErrorInvalidValue -> %d\n", cudaErrorInvalidValue); + printf("cudaErrorMemoryAllocation -> %d\n", cudaErrorMemoryAllocation); + printf("cudaErrorInitializationError -> %d\n", cudaErrorInitializationError); + printf("cudaErrorCudartUnloading -> %d\n", cudaErrorCudartUnloading); + printf("cudaErrorProfilerDisabled -> %d\n", cudaErrorProfilerDisabled); + printf("cudaErrorProfilerNotInitialized (Deprecated)-> %d\n", cudaErrorProfilerNotInitialized); + printf("cudaErrorProfilerAlreadyStarted (Deprecated)-> %d\n", cudaErrorProfilerAlreadyStarted); + printf("cudaErrorProfilerAlreadyStopped (Deprecated)-> %d\n", cudaErrorProfilerAlreadyStopped); + printf("cudaErrorInvalidConfiguration -> %d\n", cudaErrorInvalidConfiguration); + printf("cudaErrorInvalidPitchValue -> %d\n", cudaErrorInvalidPitchValue); + printf("cudaErrorInvalidSymbol -> %d\n", cudaErrorInvalidSymbol); + printf("cudaErrorInvalidHostPointer (Deprecated)-> %d\n", cudaErrorInvalidHostPointer); + printf("cudaErrorInvalidDevicePointer (Deprecated)-> %d\n", cudaErrorInvalidDevicePointer); + printf("cudaErrorInvalidTexture -> %d\n", cudaErrorInvalidTexture); + printf("cudaErrorInvalidTextureBinding -> %d\n", cudaErrorInvalidTextureBinding); + printf("cudaErrorInvalidChannelDescriptor -> %d\n", cudaErrorInvalidChannelDescriptor); + printf("cudaErrorInvalidMemcpyDirection -> %d\n", cudaErrorInvalidMemcpyDirection); + printf("cudaErrorAddressOfConstant (Deprecated)-> %d\n", cudaErrorAddressOfConstant); + printf("cudaErrorTextureFetchFailed (Deprecated)-> %d\n", cudaErrorTextureFetchFailed); + printf("cudaErrorTextureNotBound (Deprecated)-> %d\n", cudaErrorTextureNotBound); + printf("cudaErrorSynchronizationError (Deprecated)-> %d\n", cudaErrorSynchronizationError); + printf("cudaErrorInvalidFilterSetting -> %d\n", cudaErrorInvalidFilterSetting); + printf("cudaErrorInvalidNormSetting -> %d\n", cudaErrorInvalidNormSetting); + printf("cudaErrorMixedDeviceExecution (Deprecated)-> %d\n", cudaErrorMixedDeviceExecution); + printf("cudaErrorNotYetImplemented (Deprecated)-> %d\n", cudaErrorNotYetImplemented); + printf("cudaErrorMemoryValueTooLarge (Deprecated)-> %d\n", cudaErrorMemoryValueTooLarge); + printf("cudaErrorInsufficientDriver -> %d\n", cudaErrorInsufficientDriver); + printf("cudaErrorInvalidSurface -> %d\n", cudaErrorInvalidSurface); + printf("cudaErrorDuplicateVariableName -> %d\n", cudaErrorDuplicateVariableName); + printf("cudaErrorDuplicateTextureName -> %d\n", cudaErrorDuplicateTextureName); + printf("cudaErrorDuplicateSurfaceName -> %d\n", cudaErrorDuplicateSurfaceName); + printf("cudaErrorDevicesUnavailable -> %d\n", cudaErrorDevicesUnavailable); + printf("cudaErrorIncompatibleDriverContext -> %d\n", cudaErrorIncompatibleDriverContext); + printf("cudaErrorMissingConfiguration -> %d\n", cudaErrorMissingConfiguration); + printf("cudaErrorPriorLaunchFailure (Deprecated)-> %d\n", cudaErrorPriorLaunchFailure); + printf("cudaErrorLaunchMaxDepthExceeded -> %d\n", cudaErrorLaunchMaxDepthExceeded); + printf("cudaErrorLaunchFileScopedTex -> %d\n", cudaErrorLaunchFileScopedTex); + printf("cudaErrorLaunchFileScopedSurf -> %d\n", cudaErrorLaunchFileScopedSurf); + printf("cudaErrorSyncDepthExceeded -> %d\n", cudaErrorSyncDepthExceeded); + printf("cudaErrorLaunchPendingCountExceeded -> %d\n", cudaErrorLaunchPendingCountExceeded); + printf("cudaErrorInvalidDeviceFunction -> %d\n", cudaErrorInvalidDeviceFunction); + printf("cudaErrorNoDevice -> %d\n", cudaErrorNoDevice); + printf("cudaErrorInvalidDevice -> %d\n", cudaErrorInvalidDevice); + printf("cudaErrorStartupFailure -> %d\n", cudaErrorStartupFailure); + printf("cudaErrorInvalidKernelImage -> %d\n", cudaErrorInvalidKernelImage); + #if (CUDART_VERSION == 1100) + printf("cudaErrorDeviceUninitialized -> %d\n", cudaErrorDeviceUninitialized); + #endif + printf("cudaErrorMapBufferObjectFailed -> %d\n", cudaErrorMapBufferObjectFailed); + printf("cudaErrorUnmapBufferObjectFailed -> %d\n", cudaErrorUnmapBufferObjectFailed); + #if (CUDART_VERSION == 1010) + printf("cudaErrorArrayIsMapped -> %d\n", cudaErrorArrayIsMapped); + printf("cudaErrorAlreadyMapped -> %d\n", cudaErrorAlreadyMapped); + #endif + printf("cudaErrorNoKernelImageForDevice -> %d\n", cudaErrorNoKernelImageForDevice); + #if (CUDART_VERSION == 1010) + printf("cudaErrorAlreadyAcquired -> %d\n", cudaErrorAlreadyAcquired); + printf("cudaErrorNotMapped -> %d\n", cudaErrorNotMapped); + printf("cudaErrorNotMappedAsArray -> %d\n", cudaErrorNotMappedAsArray); + printf("cudaErrorNotMappedAsPointer -> %d\n", cudaErrorNotMappedAsPointer); + #endif + printf("cudaErrorECCUncorrectable -> %d\n", cudaErrorECCUncorrectable); + printf("cudaErrorUnsupportedLimit -> %d\n", cudaErrorUnsupportedLimit); + printf("cudaErrorDeviceAlreadyInUse -> %d\n", cudaErrorDeviceAlreadyInUse); + printf("cudaErrorPeerAccessUnsupported -> %d\n", cudaErrorPeerAccessUnsupported); + printf("cudaErrorInvalidPtx -> %d\n", cudaErrorInvalidPtx); + printf("cudaErrorInvalidGraphicsContext -> %d\n", cudaErrorInvalidGraphicsContext); + printf("cudaErrorNvlinkUncorrectable -> %d\n", cudaErrorNvlinkUncorrectable); + printf("cudaErrorJitCompilerNotFound -> %d\n", cudaErrorJitCompilerNotFound); + #if (CUDART_VERSION == 1010) + printf("cudaErrorInvalidSource -> %d\n", cudaErrorInvalidSource); + printf("cudaErrorFileNotFound -> %d\n", cudaErrorFileNotFound); + #endif + printf("cudaErrorSharedObjectSymbolNotFound -> %d\n", cudaErrorSharedObjectSymbolNotFound); + printf("cudaErrorSharedObjectInitFailed -> %d\n", cudaErrorSharedObjectInitFailed); + printf("cudaErrorOperatingSystem -> %d\n", cudaErrorOperatingSystem); + printf("cudaErrorInvalidResourceHandle -> %d\n", cudaErrorInvalidResourceHandle); + #if (CUDART_VERSION == 1010) + printf("cudaErrorIllegalState -> %d\n", cudaErrorIllegalState); + printf("cudaErrorSymbolNotFound -> %d\n", cudaErrorSymbolNotFound); + #endif + printf("cudaErrorNotReady -> %d\n", cudaErrorNotReady); + printf("cudaErrorIllegalAddress -> %d\n", cudaErrorIllegalAddress); + printf("cudaErrorLaunchOutOfResources -> %d\n", cudaErrorLaunchOutOfResources); + printf("cudaErrorLaunchTimeout -> %d\n", cudaErrorLaunchTimeout); + #if (CUDART_VERSION == 1010) + printf("cudaErrorLaunchIncompatibleTexturing-> %d\n", cudaErrorLaunchIncompatibleTexturing); + #endif + printf("cudaErrorPeerAccessAlreadyEnabled -> %d\n", cudaErrorPeerAccessAlreadyEnabled); + printf("cudaErrorPeerAccessNotEnabled -> %d\n", cudaErrorPeerAccessNotEnabled); + printf("cudaErrorSetOnActiveProcess -> %d\n", cudaErrorSetOnActiveProcess); + #if (CUDART_VERSION == 1010) + printf("cudaErrorContextIsDestroyed -> %d\n", cudaErrorContextIsDestroyed); + #endif + printf("cudaErrorAssert -> %d\n", cudaErrorAssert); + printf("cudaErrorTooManyPeers -> %d\n", cudaErrorTooManyPeers); + printf("cudaErrorHostMemoryAlreadyRegistered-> %d\n", cudaErrorHostMemoryAlreadyRegistered); + printf("cudaErrorHostMemoryNotRegistered -> %d\n", cudaErrorHostMemoryNotRegistered); + printf("cudaErrorHardwareStackError -> %d\n", cudaErrorHardwareStackError); + printf("cudaErrorIllegalInstruction -> %d\n", cudaErrorIllegalInstruction); + printf("cudaErrorMisalignedAddress -> %d\n", cudaErrorMisalignedAddress); + printf("cudaErrorInvalidAddressSpace -> %d\n", cudaErrorInvalidAddressSpace); + printf("cudaErrorInvalidPc -> %d\n", cudaErrorInvalidPc); + printf("cudaErrorLaunchFailure -> %d\n", cudaErrorLaunchFailure); + printf("cudaErrorCooperativeLaunchTooLarge -> %d\n", cudaErrorCooperativeLaunchTooLarge); + printf("cudaErrorNotPermitted -> %d\n", cudaErrorNotPermitted); + printf("cudaErrorNotSupported -> %d\n", cudaErrorNotSupported); + #if (CUDART_VERSION == 1010) + printf("cudaErrorSystemNotReady -> %d\n", cudaErrorSystemNotReady); + printf("cudaErrorSystemDriverMismatch -> %d\n", cudaErrorSystemDriverMismatch); + printf("cudaErrorCompatNotSupportedOnDevice -> %d\n", cudaErrorCompatNotSupportedOnDevice); + printf("cudaErrorStreamCaptureUnsupported -> %d\n", cudaErrorStreamCaptureUnsupported); + printf("cudaErrorStreamCaptureInvalidated -> %d\n", cudaErrorStreamCaptureInvalidated); + printf("cudaErrorStreamCaptureMerge -> %d\n", cudaErrorStreamCaptureMerge); + printf("cudaErrorStreamCaptureUnmatched -> %d\n", cudaErrorStreamCaptureUnmatched); + printf("cudaErrorStreamCaptureUnjoined -> %d\n", cudaErrorStreamCaptureUnjoined); + printf("cudaErrorStreamCaptureIsolation -> %d\n", cudaErrorStreamCaptureIsolation); + printf("cudaErrorStreamCaptureImplicit -> %d\n", cudaErrorStreamCaptureImplicit); + printf("cudaErrorCapturedEvent -> %d\n", cudaErrorCapturedEvent); + printf("cudaErrorStreamCaptureWrongThread -> %d\n", cudaErrorStreamCaptureWrongThread); + #endif + #if (CUDART_VERSION == 1100) + printf("cudaErrorTimeout -> %d\n", cudaErrorTimeout); + printf("cudaErrorGraphExecUpdateFailure -> %d\n", cudaErrorGraphExecUpdateFailure); + #endif + printf("cudaErrorUnknown -> %d\n", cudaErrorUnknown); + printf("cudaErrorApiFailureBase (Deprecated)-> %d\n", cudaErrorApiFailureBase); + */ + throw cusz_cuda_exception(cudaGetErrorString(status), status, file, line); + } +} + +#define CHECK_CUDA(err) (check_cuda_error(err, __FILE__, __LINE__)) + +#endif diff --git a/qtensor/compression/cusz/include/utils/cuda_mem.cuh b/qtensor/compression/cusz/include/utils/cuda_mem.cuh index 723028ab..46e52e33 100644 --- a/qtensor/compression/cusz/include/utils/cuda_mem.cuh +++ b/qtensor/compression/cusz/include/utils/cuda_mem.cuh @@ -1,100 +1,100 @@ -#ifndef UTILS_CUDA_MEM_CUH -#define UTILS_CUDA_MEM_CUH - -/** - * @file cuda_mem.cuh - * @author Jiannan Tian - * @brief CUDA memory operation wrappers. - * @version 0.2 - * @date 2020-09-20 - * Created on 2020-04-30 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#include -#include -#include -#include -#include -#include - -template -static inline bool __is_aligned_at(const void* ptr) -{ // - return reinterpret_cast(ptr) % NUM == 0; -}; - -template -static size_t __cusz_get_alignable_len(size_t len) -{ - return ((sizeof(T) * len - 1) / NUM + 1) * NUM; -} - -static const int CUSZ_ALIGN_NUM = 128; - -/** - * @brief when using memory pool, alignment at 128 is necessary - * - * @tparam SRC - * @tparam DST - * @param src - * @return DST* - */ -template -DST* designate(SRC* src) -{ - // TODO check alignment - auto aligned = __is_aligned_at(src); - if (not aligned) throw std::runtime_error("not aligned at " + std::to_string(CUSZ_ALIGN_NUM) + " bytes"); - - return reinterpret_cast(src); -} - -template -DST* free_repurpose(SRC* src) -{ - // aligning at 4 byte; does not raise misalignment - // may not result in optimal performance considering coalescing - auto aligned = __is_aligned_at<4>(src); - if (not aligned) throw std::runtime_error("not aligned at 4 bytes"); - - return reinterpret_cast(src); -} - -namespace mem { - -enum MemcpyDirection { h2d, d2h }; - -template -inline T* create_CUDA_space(size_t len, uint8_t filling_val = 0x00) -{ - T* d_var; - cudaMalloc(&d_var, len * sizeof(T)); - cudaMemset(d_var, filling_val, len * sizeof(T)); - return d_var; -} - -template -inline T* create_devspace_memcpy_h2d(T* var, size_t l) -{ - T* d_var; - cudaMalloc(&d_var, l * sizeof(T)); - cudaMemcpy(d_var, var, l * sizeof(T), cudaMemcpyHostToDevice); - return d_var; -} -template -inline T* create_devspace_memcpy_d2h(T* d_var, size_t l) -{ - // auto var = new T[l]; - T* var; - cudaMallocHost(&var, l * sizeof(T)); - cudaMemcpy(var, d_var, l * sizeof(T), cudaMemcpyDeviceToHost); - return var; -} - -} // namespace mem - -#endif +#ifndef UTILS_CUDA_MEM_CUH +#define UTILS_CUDA_MEM_CUH + +/** + * @file cuda_mem.cuh + * @author Jiannan Tian + * @brief CUDA memory operation wrappers. + * @version 0.2 + * @date 2020-09-20 + * Created on 2020-04-30 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include +#include +#include +#include +#include +#include + +template +static inline bool __is_aligned_at(const void* ptr) +{ // + return reinterpret_cast(ptr) % NUM == 0; +}; + +template +static size_t __cusz_get_alignable_len(size_t len) +{ + return ((sizeof(T) * len - 1) / NUM + 1) * NUM; +} + +static const int CUSZ_ALIGN_NUM = 128; + +/** + * @brief when using memory pool, alignment at 128 is necessary + * + * @tparam SRC + * @tparam DST + * @param src + * @return DST* + */ +template +DST* designate(SRC* src) +{ + // TODO check alignment + auto aligned = __is_aligned_at(src); + if (not aligned) throw std::runtime_error("not aligned at " + std::to_string(CUSZ_ALIGN_NUM) + " bytes"); + + return reinterpret_cast(src); +} + +template +DST* free_repurpose(SRC* src) +{ + // aligning at 4 byte; does not raise misalignment + // may not result in optimal performance considering coalescing + auto aligned = __is_aligned_at<4>(src); + if (not aligned) throw std::runtime_error("not aligned at 4 bytes"); + + return reinterpret_cast(src); +} + +namespace mem { + +enum MemcpyDirection { h2d, d2h }; + +template +inline T* create_CUDA_space(size_t len, uint8_t filling_val = 0x00) +{ + T* d_var; + cudaMalloc(&d_var, len * sizeof(T)); + cudaMemset(d_var, filling_val, len * sizeof(T)); + return d_var; +} + +template +inline T* create_devspace_memcpy_h2d(T* var, size_t l) +{ + T* d_var; + cudaMalloc(&d_var, l * sizeof(T)); + cudaMemcpy(d_var, var, l * sizeof(T), cudaMemcpyHostToDevice); + return d_var; +} +template +inline T* create_devspace_memcpy_d2h(T* d_var, size_t l) +{ + // auto var = new T[l]; + T* var; + cudaMallocHost(&var, l * sizeof(T)); + cudaMemcpy(var, d_var, l * sizeof(T), cudaMemcpyDeviceToHost); + return var; +} + +} // namespace mem + +#endif diff --git a/qtensor/compression/cusz/include/utils/cusparse_err.cuh b/qtensor/compression/cusz/include/utils/cusparse_err.cuh index 2086ca44..e2f77bb6 100644 --- a/qtensor/compression/cusz/include/utils/cusparse_err.cuh +++ b/qtensor/compression/cusz/include/utils/cusparse_err.cuh @@ -1,60 +1,60 @@ -#ifndef UTILS_CUSPARSE_ERR_CUH -#define UTILS_CUSPARSE_ERR_CUH - -/** - * @file cuda_err.cuh - * @author Jiannan Tian - * @brief CUDA runtime error handling macros. - * @version 0.2 - * @date 2020-09-20 - * Created on: 2019-10-08 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#include -#include -#include - -// block cusparse for generic testing -#ifndef noCUSPARSE - -static void check_cusparse_error(cusparseStatus_t status, const char* file, int line) -{ - if (CUSPARSE_STATUS_SUCCESS != status) { - printf("\nCUSPARSE status reference (as of CUDA 11):\n"); - printf("CUSPARSE_STATUS_SUCCESS -> %d\n", CUSPARSE_STATUS_SUCCESS); - printf("CUSPARSE_STATUS_NOT_INITIALIZED -> %d\n", CUSPARSE_STATUS_NOT_INITIALIZED); - printf("CUSPARSE_STATUS_ALLOC_FAILED -> %d\n", CUSPARSE_STATUS_ALLOC_FAILED); - printf("CUSPARSE_STATUS_INVALID_VALUE -> %d\n", CUSPARSE_STATUS_INVALID_VALUE); - printf("CUSPARSE_STATUS_ARCH_MISMATCH -> %d\n", CUSPARSE_STATUS_ARCH_MISMATCH); - printf("CUSPARSE_STATUS_EXECUTION_FAILED -> %d\n", CUSPARSE_STATUS_EXECUTION_FAILED); - printf("CUSPARSE_STATUS_INTERNAL_ERROR -> %d\n", CUSPARSE_STATUS_INTERNAL_ERROR); - printf("CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED -> %d\n", CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED); -#if (CUDART_VERSION == 1010) - printf("CUSPARSE_STATUS_NOT_SUPPORTED -> %d\n", CUSPARSE_STATUS_NOT_SUPPORTED); -#endif -#if (CUDART_VERSION == 1100) - printf("CUSPARSE_STATUS_INSUFFICIENT_RESOURCES -> %d\n", CUSPARSE_STATUS_INSUFFICIENT_RESOURCES); -#endif -#if (CUDART_VERSION == 1100) - printf("CUSPARSE_STATUS_INSUFFICIENT_RESOURCES -> %d\n", CUSPARSE_STATUS_INSUFFICIENT_RESOURCES); -#endif - printf("\n"); - -#if (CUDART_VERSION >= 1010) - printf( - "CUSPARSE API failed at \e[31m\e[1m%s:%d\e[0m with error: %s (%d)\n", file, line, - cusparseGetErrorString(status), status); -#endif - exit(EXIT_FAILURE); - } -} - -#define CHECK_CUSPARSE(err) (check_cusparse_error(err, __FILE__, __LINE__)) - -#endif - -#endif +#ifndef UTILS_CUSPARSE_ERR_CUH +#define UTILS_CUSPARSE_ERR_CUH + +/** + * @file cuda_err.cuh + * @author Jiannan Tian + * @brief CUDA runtime error handling macros. + * @version 0.2 + * @date 2020-09-20 + * Created on: 2019-10-08 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include +#include +#include + +// block cusparse for generic testing +#ifndef noCUSPARSE + +static void check_cusparse_error(cusparseStatus_t status, const char* file, int line) +{ + if (CUSPARSE_STATUS_SUCCESS != status) { + printf("\nCUSPARSE status reference (as of CUDA 11):\n"); + printf("CUSPARSE_STATUS_SUCCESS -> %d\n", CUSPARSE_STATUS_SUCCESS); + printf("CUSPARSE_STATUS_NOT_INITIALIZED -> %d\n", CUSPARSE_STATUS_NOT_INITIALIZED); + printf("CUSPARSE_STATUS_ALLOC_FAILED -> %d\n", CUSPARSE_STATUS_ALLOC_FAILED); + printf("CUSPARSE_STATUS_INVALID_VALUE -> %d\n", CUSPARSE_STATUS_INVALID_VALUE); + printf("CUSPARSE_STATUS_ARCH_MISMATCH -> %d\n", CUSPARSE_STATUS_ARCH_MISMATCH); + printf("CUSPARSE_STATUS_EXECUTION_FAILED -> %d\n", CUSPARSE_STATUS_EXECUTION_FAILED); + printf("CUSPARSE_STATUS_INTERNAL_ERROR -> %d\n", CUSPARSE_STATUS_INTERNAL_ERROR); + printf("CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED -> %d\n", CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED); +#if (CUDART_VERSION == 1010) + printf("CUSPARSE_STATUS_NOT_SUPPORTED -> %d\n", CUSPARSE_STATUS_NOT_SUPPORTED); +#endif +#if (CUDART_VERSION == 1100) + printf("CUSPARSE_STATUS_INSUFFICIENT_RESOURCES -> %d\n", CUSPARSE_STATUS_INSUFFICIENT_RESOURCES); +#endif +#if (CUDART_VERSION == 1100) + printf("CUSPARSE_STATUS_INSUFFICIENT_RESOURCES -> %d\n", CUSPARSE_STATUS_INSUFFICIENT_RESOURCES); +#endif + printf("\n"); + +#if (CUDART_VERSION >= 1010) + printf( + "CUSPARSE API failed at \e[31m\e[1m%s:%d\e[0m with error: %s (%d)\n", file, line, + cusparseGetErrorString(status), status); +#endif + exit(EXIT_FAILURE); + } +} + +#define CHECK_CUSPARSE(err) (check_cusparse_error(err, __FILE__, __LINE__)) + +#endif + +#endif diff --git a/qtensor/compression/cusz/include/utils/format.hh b/qtensor/compression/cusz/include/utils/format.hh index 196f7248..ae1d6079 100644 --- a/qtensor/compression/cusz/include/utils/format.hh +++ b/qtensor/compression/cusz/include/utils/format.hh @@ -1,57 +1,57 @@ -#ifndef UTILS_FORMAT_HH -#define UTILS_FORMAT_HH - -/** - * @file format.hh - * @author Jiannan Tian - * @brief Formatting for log print (header). - * @version 0.2 - * @date 2020-09-20 - * Created on 2020-04-27 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#include -#include -#include - - -const std::string LOG_NULL = " "; -const std::string LOG_INFO = " :: "; -const std::string LOG_ERR = " ERR "; -const std::string LOG_WARN = "WARN "; -const std::string LOG_DBG = " dbg "; -const std::string LOG_EXCEPTION = " !! "; - -// https://stackoverflow.com/a/26080768/8740097 CC BY-SA 3.0 -template -void build(std::ostream& o, T t) -{ - o << t << " "; -} - -template -void build(std::ostream& o, T t, Args... args) // recursive variadic function -{ - build(o, t); - build(o, args...); -} - -template -void LOGGING(const std::string& log_head, Args... args) -{ - std::ostringstream oss; - oss << log_head; - build(oss, args...); - - oss.seekp(0, std::ios::end); - std::stringstream::pos_type offset = oss.tellp(); - if (log_head == LOG_DBG) { std::cout << "\e[2m"; } // dbg - std::cout << oss.str() << std::endl; // print content - if (log_head == LOG_DBG) std::cout << "\e[0m"; // finish printing dbg -} - -#endif // FORMAT_HH +#ifndef UTILS_FORMAT_HH +#define UTILS_FORMAT_HH + +/** + * @file format.hh + * @author Jiannan Tian + * @brief Formatting for log print (header). + * @version 0.2 + * @date 2020-09-20 + * Created on 2020-04-27 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include +#include +#include + + +const std::string LOG_NULL = " "; +const std::string LOG_INFO = " :: "; +const std::string LOG_ERR = " ERR "; +const std::string LOG_WARN = "WARN "; +const std::string LOG_DBG = " dbg "; +const std::string LOG_EXCEPTION = " !! "; + +// https://stackoverflow.com/a/26080768/8740097 CC BY-SA 3.0 +template +void build(std::ostream& o, T t) +{ + o << t << " "; +} + +template +void build(std::ostream& o, T t, Args... args) // recursive variadic function +{ + build(o, t); + build(o, args...); +} + +template +void LOGGING(const std::string& log_head, Args... args) +{ + std::ostringstream oss; + oss << log_head; + build(oss, args...); + + oss.seekp(0, std::ios::end); + std::stringstream::pos_type offset = oss.tellp(); + if (log_head == LOG_DBG) { std::cout << "\e[2m"; } // dbg + std::cout << oss.str() << std::endl; // print content + if (log_head == LOG_DBG) std::cout << "\e[0m"; // finish printing dbg +} + +#endif // FORMAT_HH diff --git a/qtensor/compression/cusz/include/utils/io.hh b/qtensor/compression/cusz/include/utils/io.hh index de71334d..574432ef 100644 --- a/qtensor/compression/cusz/include/utils/io.hh +++ b/qtensor/compression/cusz/include/utils/io.hh @@ -1,59 +1,59 @@ -#ifndef UTILS_IO_HH -#define UTILS_IO_HH - -/** - * @file io.hh - * @author Jiannan Tian - * @brief Read and write binary. - * @version 0.2 - * @date 2020-09-20 - * Created on 2019-08-27 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#include -#include - -namespace io { - -template -T* read_binary_to_new_array(const std::string& fname, size_t dtype_len) -{ - std::ifstream ifs(fname.c_str(), std::ios::binary | std::ios::in); - if (not ifs.is_open()) { - std::cerr << "fail to open " << fname << std::endl; - exit(1); - } - auto _a = new T[dtype_len](); - ifs.read(reinterpret_cast(_a), std::streamsize(dtype_len * sizeof(T))); - ifs.close(); - return _a; -} - -template -void read_binary_to_array(const std::string& fname, T* _a, size_t dtype_len) -{ - std::ifstream ifs(fname.c_str(), std::ios::binary | std::ios::in); - if (not ifs.is_open()) { - std::cerr << "fail to open " << fname << std::endl; - exit(1); - } - ifs.read(reinterpret_cast(_a), std::streamsize(dtype_len * sizeof(T))); - ifs.close(); -} - -template -void write_array_to_binary(const std::string& fname, T* const _a, size_t const dtype_len) -{ - std::ofstream ofs(fname.c_str(), std::ios::binary | std::ios::out); - if (not ofs.is_open()) return; - ofs.write(reinterpret_cast(_a), std::streamsize(dtype_len * sizeof(T))); - ofs.close(); -} - -} // namespace io - -#endif // IO_HH +#ifndef UTILS_IO_HH +#define UTILS_IO_HH + +/** + * @file io.hh + * @author Jiannan Tian + * @brief Read and write binary. + * @version 0.2 + * @date 2020-09-20 + * Created on 2019-08-27 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include +#include + +namespace io { + +template +T* read_binary_to_new_array(const std::string& fname, size_t dtype_len) +{ + std::ifstream ifs(fname.c_str(), std::ios::binary | std::ios::in); + if (not ifs.is_open()) { + std::cerr << "fail to open " << fname << std::endl; + exit(1); + } + auto _a = new T[dtype_len](); + ifs.read(reinterpret_cast(_a), std::streamsize(dtype_len * sizeof(T))); + ifs.close(); + return _a; +} + +template +void read_binary_to_array(const std::string& fname, T* _a, size_t dtype_len) +{ + std::ifstream ifs(fname.c_str(), std::ios::binary | std::ios::in); + if (not ifs.is_open()) { + std::cerr << "fail to open " << fname << std::endl; + exit(1); + } + ifs.read(reinterpret_cast(_a), std::streamsize(dtype_len * sizeof(T))); + ifs.close(); +} + +template +void write_array_to_binary(const std::string& fname, T* const _a, size_t const dtype_len) +{ + std::ofstream ofs(fname.c_str(), std::ios::binary | std::ios::out); + if (not ofs.is_open()) return; + ofs.write(reinterpret_cast(_a), std::streamsize(dtype_len * sizeof(T))); + ofs.close(); +} + +} // namespace io + +#endif // IO_HH diff --git a/qtensor/compression/cusz/include/utils/print_gpu.h b/qtensor/compression/cusz/include/utils/print_gpu.h index 67dcc30a..d4cded5e 100644 --- a/qtensor/compression/cusz/include/utils/print_gpu.h +++ b/qtensor/compression/cusz/include/utils/print_gpu.h @@ -1,45 +1,45 @@ -/** - * @file print.h - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-10-28 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef E02AE628_9C8A_4100_8C73_A3B74B7128F6 -#define E02AE628_9C8A_4100_8C73_A3B74B7128F6 - -#ifdef __cplusplus -extern "C" { -#endif - -#define PRINT_INT_LESS_THAN_64(Tliteral, T) void peek_device_data_T##Tliteral(T* d_arr, size_t num, size_t offset); - -PRINT_INT_LESS_THAN_64(i8, int8_t) -PRINT_INT_LESS_THAN_64(i16, int16_t) -PRINT_INT_LESS_THAN_64(i32, int32_t) - -void peek_device_data_Ti64(int64_t* d_arr, size_t num, size_t offset); - -#define PRINT_UINT_LESS_THAN_64(Tliteral, T) void peek_device_data_T##Tliteral(T* d_arr, size_t num, size_t offset); - -PRINT_UINT_LESS_THAN_64(ui8, uint8_t) -PRINT_UINT_LESS_THAN_64(ui16, uint16_t) -PRINT_UINT_LESS_THAN_64(ui32, uint32_t) - -void peek_device_data_Tui64(uint64_t* d_arr, size_t num, size_t offset); - -void peek_device_data_Tfp32(float* d_arr, size_t num, size_t offset); -void peek_device_data_Tfp64(double* d_arr, size_t num, size_t offset); - -#undef PRINT_INT_LESS_THAN_64 -#undef PRINT_UINT_LESS_THAN_64 - -#ifdef __cplusplus -} -#endif - -#endif /* E02AE628_9C8A_4100_8C73_A3B74B7128F6 */ +/** + * @file print.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-28 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef E02AE628_9C8A_4100_8C73_A3B74B7128F6 +#define E02AE628_9C8A_4100_8C73_A3B74B7128F6 + +#ifdef __cplusplus +extern "C" { +#endif + +#define PRINT_INT_LESS_THAN_64(Tliteral, T) void peek_device_data_T##Tliteral(T* d_arr, size_t num, size_t offset); + +PRINT_INT_LESS_THAN_64(i8, int8_t) +PRINT_INT_LESS_THAN_64(i16, int16_t) +PRINT_INT_LESS_THAN_64(i32, int32_t) + +void peek_device_data_Ti64(int64_t* d_arr, size_t num, size_t offset); + +#define PRINT_UINT_LESS_THAN_64(Tliteral, T) void peek_device_data_T##Tliteral(T* d_arr, size_t num, size_t offset); + +PRINT_UINT_LESS_THAN_64(ui8, uint8_t) +PRINT_UINT_LESS_THAN_64(ui16, uint16_t) +PRINT_UINT_LESS_THAN_64(ui32, uint32_t) + +void peek_device_data_Tui64(uint64_t* d_arr, size_t num, size_t offset); + +void peek_device_data_Tfp32(float* d_arr, size_t num, size_t offset); +void peek_device_data_Tfp64(double* d_arr, size_t num, size_t offset); + +#undef PRINT_INT_LESS_THAN_64 +#undef PRINT_UINT_LESS_THAN_64 + +#ifdef __cplusplus +} +#endif + +#endif /* E02AE628_9C8A_4100_8C73_A3B74B7128F6 */ diff --git a/qtensor/compression/cusz/include/utils/print_gpu.hh b/qtensor/compression/cusz/include/utils/print_gpu.hh index cffcbf22..c3236f62 100644 --- a/qtensor/compression/cusz/include/utils/print_gpu.hh +++ b/qtensor/compression/cusz/include/utils/print_gpu.hh @@ -1,21 +1,21 @@ -/** - * @file print_gpu.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-10-29 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "print_gpu.h" - -namespace psz { - -template -void peek_device_data(T* d_arr, size_t num, size_t offset = 0); - -} // namespace psz - -#undef PEEK_DEVICE_DATA +/** + * @file print_gpu.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-29 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "print_gpu.h" + +namespace psz { + +template +void peek_device_data(T* d_arr, size_t num, size_t offset = 0); + +} // namespace psz + +#undef PEEK_DEVICE_DATA diff --git a/qtensor/compression/cusz/include/utils/strhelper.hh b/qtensor/compression/cusz/include/utils/strhelper.hh index 6768edeb..a95dc96f 100644 --- a/qtensor/compression/cusz/include/utils/strhelper.hh +++ b/qtensor/compression/cusz/include/utils/strhelper.hh @@ -1,144 +1,144 @@ -/** - * @file strhelper.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2021-09-19 - * - * (C) 2021 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef CUSZ_UTILS_STRHELPER_HH -#define CUSZ_UTILS_STRHELPER_HH - -#include -#include -#include -#include -#include -#include -#include "format.hh" - -using std::cerr; -using std::endl; - -using ss_t = std::stringstream; -using map_t = std::unordered_map; -using str_list = std::vector; - -struct StrHelper { - static unsigned int str2int(const char* s) - { - char* end; - auto res = std::strtol(s, &end, 10); - if (*end) { - const char* notif = "invalid option value, non-convertible part: "; - cerr << LOG_ERR << notif << "\e[1m" << s << "\e[0m" << endl; - } - return res; - } - - static unsigned int str2int(std::string s) { return str2int(s.c_str()); } - - static double str2fp(const char* s) - { - char* end; - auto res = std::strtod(s, &end); - if (*end) { - const char* notif = "invalid option value, non-convertible part: "; - cerr << LOG_ERR << notif << "\e[1m" << end << "\e[0m" << endl; - } - return res; - } - - static double str2fp(std::string s) { return str2fp(s.c_str()); } - - static bool is_kv_pair(std::string s) { return s.find("=") != std::string::npos; } - - static std::pair separate_kv(std::string& s) - { - std::string delimiter = "="; - - if (s.find(delimiter) == std::string::npos) - throw std::runtime_error("\e[1mnot a correct key-value syntax, must be \"opt=value\"\e[0m"); - - std::string k = s.substr(0, s.find(delimiter)); - std::string v = s.substr(s.find(delimiter) + delimiter.length(), std::string::npos); - - return std::make_pair(k, v); - } - - static void parse_strlist_as_kv(const char* in_str, map_t& kv_list) - { - ss_t ss(in_str); - while (ss.good()) { - std::string tmp; - std::getline(ss, tmp, ','); - kv_list.insert(separate_kv(tmp)); - } - } - - static void parse_strlist(const char* in_str, str_list& list) - { - ss_t ss(in_str); - while (ss.good()) { - std::string tmp; - std::getline(ss, tmp, ','); - list.push_back(tmp); - } - } - - static std::pair parse_kv_onoff(std::string in_str) - { - auto kv_literal = "(.*?)=(on|ON|off|OFF)"; - std::regex kv_pattern(kv_literal); - std::regex onoff_pattern("on|ON|off|OFF"); - - bool onoff = false; - std::string k, v; - - std::smatch kv_match; - if (std::regex_match(in_str, kv_match, kv_pattern)) { - // the 1st match: whole string - // the 2nd: k, the 3rd: v - if (kv_match.size() == 3) { - k = kv_match[1].str(), v = kv_match[2].str(); - - std::smatch v_match; - if (std::regex_match(v, v_match, onoff_pattern)) { // - onoff = (v == "on") or (v == "ON"); - } - else { - throw std::runtime_error("not legal (k=v)-syntax"); - } - } - } - return std::make_pair(k, onoff); - } - - static std::string doc_format(const std::string& s) - { - std::regex gray("%(.*?)%"); - std::string gray_text("\e[37m$1\e[0m"); - - std::regex bful("@(.*?)@"); - std::string bful_text("\e[1m\e[4m$1\e[0m"); - std::regex bf("\\*(.*?)\\*"); - std::string bf_text("\e[1m$1\e[0m"); - std::regex ul(R"(_((\w|-|\d|\.)+?)_)"); - std::string ul_text("\e[4m$1\e[0m"); - std::regex red(R"(\^\^(.*?)\^\^)"); - std::string red_text("\e[31m$1\e[0m"); - - auto a = std::regex_replace(s, bful, bful_text); - auto b = std::regex_replace(a, bf, bf_text); - auto c = std::regex_replace(b, ul, ul_text); - auto d = std::regex_replace(c, red, red_text); - auto e = std::regex_replace(d, gray, gray_text); - - return e; - } -}; - -#endif +/** + * @file strhelper.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-09-19 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_UTILS_STRHELPER_HH +#define CUSZ_UTILS_STRHELPER_HH + +#include +#include +#include +#include +#include +#include +#include "format.hh" + +using std::cerr; +using std::endl; + +using ss_t = std::stringstream; +using map_t = std::unordered_map; +using str_list = std::vector; + +struct StrHelper { + static unsigned int str2int(const char* s) + { + char* end; + auto res = std::strtol(s, &end, 10); + if (*end) { + const char* notif = "invalid option value, non-convertible part: "; + cerr << LOG_ERR << notif << "\e[1m" << s << "\e[0m" << endl; + } + return res; + } + + static unsigned int str2int(std::string s) { return str2int(s.c_str()); } + + static double str2fp(const char* s) + { + char* end; + auto res = std::strtod(s, &end); + if (*end) { + const char* notif = "invalid option value, non-convertible part: "; + cerr << LOG_ERR << notif << "\e[1m" << end << "\e[0m" << endl; + } + return res; + } + + static double str2fp(std::string s) { return str2fp(s.c_str()); } + + static bool is_kv_pair(std::string s) { return s.find("=") != std::string::npos; } + + static std::pair separate_kv(std::string& s) + { + std::string delimiter = "="; + + if (s.find(delimiter) == std::string::npos) + throw std::runtime_error("\e[1mnot a correct key-value syntax, must be \"opt=value\"\e[0m"); + + std::string k = s.substr(0, s.find(delimiter)); + std::string v = s.substr(s.find(delimiter) + delimiter.length(), std::string::npos); + + return std::make_pair(k, v); + } + + static void parse_strlist_as_kv(const char* in_str, map_t& kv_list) + { + ss_t ss(in_str); + while (ss.good()) { + std::string tmp; + std::getline(ss, tmp, ','); + kv_list.insert(separate_kv(tmp)); + } + } + + static void parse_strlist(const char* in_str, str_list& list) + { + ss_t ss(in_str); + while (ss.good()) { + std::string tmp; + std::getline(ss, tmp, ','); + list.push_back(tmp); + } + } + + static std::pair parse_kv_onoff(std::string in_str) + { + auto kv_literal = "(.*?)=(on|ON|off|OFF)"; + std::regex kv_pattern(kv_literal); + std::regex onoff_pattern("on|ON|off|OFF"); + + bool onoff = false; + std::string k, v; + + std::smatch kv_match; + if (std::regex_match(in_str, kv_match, kv_pattern)) { + // the 1st match: whole string + // the 2nd: k, the 3rd: v + if (kv_match.size() == 3) { + k = kv_match[1].str(), v = kv_match[2].str(); + + std::smatch v_match; + if (std::regex_match(v, v_match, onoff_pattern)) { // + onoff = (v == "on") or (v == "ON"); + } + else { + throw std::runtime_error("not legal (k=v)-syntax"); + } + } + } + return std::make_pair(k, onoff); + } + + static std::string doc_format(const std::string& s) + { + std::regex gray("%(.*?)%"); + std::string gray_text("\e[37m$1\e[0m"); + + std::regex bful("@(.*?)@"); + std::string bful_text("\e[1m\e[4m$1\e[0m"); + std::regex bf("\\*(.*?)\\*"); + std::string bf_text("\e[1m$1\e[0m"); + std::regex ul(R"(_((\w|-|\d|\.)+?)_)"); + std::string ul_text("\e[4m$1\e[0m"); + std::regex red(R"(\^\^(.*?)\^\^)"); + std::string red_text("\e[31m$1\e[0m"); + + auto a = std::regex_replace(s, bful, bful_text); + auto b = std::regex_replace(a, bf, bf_text); + auto c = std::regex_replace(b, ul, ul_text); + auto d = std::regex_replace(c, red, red_text); + auto e = std::regex_replace(d, gray, gray_text); + + return e; + } +}; + +#endif diff --git a/qtensor/compression/cusz/include/utils/timer.h b/qtensor/compression/cusz/include/utils/timer.h index c38cb0dd..41efb730 100644 --- a/qtensor/compression/cusz/include/utils/timer.h +++ b/qtensor/compression/cusz/include/utils/timer.h @@ -1,92 +1,92 @@ -/** - * @file timer.h - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-10-31 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef B36B7228_E9EC_4E61_A1DC_19A4352C4EB3 -#define B36B7228_E9EC_4E61_A1DC_19A4352C4EB3 - -#ifdef __cplusplus -extern "C" { -#endif - -#include "../cusz/type.h" - -struct asz_timer; -typedef struct asz_timer asz_timer; -typedef struct asz_timer asz_cputimer; - -struct asz_cudatimer; -typedef struct asz_cudatimer asz_cudatimer; - -// top-level/dispatcher -// asz_timer* asz_timer_create(asz_policy const p, void* stream); -// void asz_timer_destroy(asz_timer* t); -// void asz_timer_start(asz_timer* t); -// void asz_timer_end(asz_timer* t); -// double asz_time_elapsed(asz_timer* t); - -asz_timer* asz_cputimer_create(); -void asz_cputimer_destroy(asz_timer* t); -void asz_cputimer_start(asz_timer* t); -void asz_cputimer_end(asz_timer* t); -double asz_cputime_elapsed(asz_timer* t); - -// 22-11-01 adding wrapper incurs unexpeted overhead in timing -asz_cudatimer* asz_cudatimer_create(); -void asz_cudatimer_destroy(asz_cudatimer* t); -void asz_cudatimer_start(asz_cudatimer* t); -void asz_cudatimer_end(asz_cudatimer* t); -double asz_cudatime_elapsed(asz_cudatimer* t); - -asz_cudatimer* asz_cudastreamtimer_create(void* stream); -void asz_cudastreamtimer_destroy(asz_cudatimer* t); -void asz_cudastreamtimer_start(asz_cudatimer* t); -void asz_cudastreamtimer_end(asz_cudatimer* t); -double asz_cudastreamtime_elapsed(asz_cudatimer* t); - -// 22-11-01 CUDA timing snippet instead -#define CREATE_CUDAEVENT_PAIR \ - cudaEvent_t a, b; \ - cudaEventCreate(&a); \ - cudaEventCreate(&b); - -#define DESTROY_CUDAEVENT_PAIR \ - cudaEventDestroy(a); \ - cudaEventDestroy(b); - -#define START_CUDAEVENT_RECORDING(STREAM) cudaEventRecord(a, STREAM); -#define STOP_CUDAEVENT_RECORDING(STREAM) \ - cudaEventRecord(b, STREAM); \ - cudaEventSynchronize(b); - -#define TIME_ELAPSED_CUDAEVENT(PTR_MILLISEC) cudaEventElapsedTime(PTR_MILLISEC, a, b); - -// 22-11-01 HIP timing snippet instead -#define CREATE_HIPEVENT_PAIR \ - hipEvent_t a, b; \ - hipEventCreate(&a); \ - hipEventCreate(&b); - -#define DESTROY_HIPEVENT_PAIR \ - hipEventDestroy(a); \ - hipEventDestroy(b); - -#define START_HIPEVENT_RECORDING(STREAM) hipEventRecord(a, STREAM); -#define STOP_HIPEVENT_RECORDING(STREAM) \ - hipEventRecord(b, STREAM); \ - hipEventSynchronize(b); - -#define TIME_ELAPSED_HIPEVENT(PTR_MILLISEC) hipEventElapsedTime(PTR_MILLISEC, a, b); - -#ifdef __cplusplus -} -#endif - -#endif /* B36B7228_E9EC_4E61_A1DC_19A4352C4EB3 */ +/** + * @file timer.h + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-31 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef B36B7228_E9EC_4E61_A1DC_19A4352C4EB3 +#define B36B7228_E9EC_4E61_A1DC_19A4352C4EB3 + +#ifdef __cplusplus +extern "C" { +#endif + +#include "../cusz/type.h" + +struct asz_timer; +typedef struct asz_timer asz_timer; +typedef struct asz_timer asz_cputimer; + +struct asz_cudatimer; +typedef struct asz_cudatimer asz_cudatimer; + +// top-level/dispatcher +// asz_timer* asz_timer_create(asz_policy const p, void* stream); +// void asz_timer_destroy(asz_timer* t); +// void asz_timer_start(asz_timer* t); +// void asz_timer_end(asz_timer* t); +// double asz_time_elapsed(asz_timer* t); + +asz_timer* asz_cputimer_create(); +void asz_cputimer_destroy(asz_timer* t); +void asz_cputimer_start(asz_timer* t); +void asz_cputimer_end(asz_timer* t); +double asz_cputime_elapsed(asz_timer* t); + +// 22-11-01 adding wrapper incurs unexpeted overhead in timing +asz_cudatimer* asz_cudatimer_create(); +void asz_cudatimer_destroy(asz_cudatimer* t); +void asz_cudatimer_start(asz_cudatimer* t); +void asz_cudatimer_end(asz_cudatimer* t); +double asz_cudatime_elapsed(asz_cudatimer* t); + +asz_cudatimer* asz_cudastreamtimer_create(void* stream); +void asz_cudastreamtimer_destroy(asz_cudatimer* t); +void asz_cudastreamtimer_start(asz_cudatimer* t); +void asz_cudastreamtimer_end(asz_cudatimer* t); +double asz_cudastreamtime_elapsed(asz_cudatimer* t); + +// 22-11-01 CUDA timing snippet instead +#define CREATE_CUDAEVENT_PAIR \ + cudaEvent_t a, b; \ + cudaEventCreate(&a); \ + cudaEventCreate(&b); + +#define DESTROY_CUDAEVENT_PAIR \ + cudaEventDestroy(a); \ + cudaEventDestroy(b); + +#define START_CUDAEVENT_RECORDING(STREAM) cudaEventRecord(a, STREAM); +#define STOP_CUDAEVENT_RECORDING(STREAM) \ + cudaEventRecord(b, STREAM); \ + cudaEventSynchronize(b); + +#define TIME_ELAPSED_CUDAEVENT(PTR_MILLISEC) cudaEventElapsedTime(PTR_MILLISEC, a, b); + +// 22-11-01 HIP timing snippet instead +#define CREATE_HIPEVENT_PAIR \ + hipEvent_t a, b; \ + hipEventCreate(&a); \ + hipEventCreate(&b); + +#define DESTROY_HIPEVENT_PAIR \ + hipEventDestroy(a); \ + hipEventDestroy(b); + +#define START_HIPEVENT_RECORDING(STREAM) hipEventRecord(a, STREAM); +#define STOP_HIPEVENT_RECORDING(STREAM) \ + hipEventRecord(b, STREAM); \ + hipEventSynchronize(b); + +#define TIME_ELAPSED_HIPEVENT(PTR_MILLISEC) hipEventElapsedTime(PTR_MILLISEC, a, b); + +#ifdef __cplusplus +} +#endif + +#endif /* B36B7228_E9EC_4E61_A1DC_19A4352C4EB3 */ diff --git a/qtensor/compression/cusz/include/utils/timer.hh b/qtensor/compression/cusz/include/utils/timer.hh index 6ba7d35b..c820d451 100644 --- a/qtensor/compression/cusz/include/utils/timer.hh +++ b/qtensor/compression/cusz/include/utils/timer.hh @@ -1,153 +1,153 @@ -/** - * @file timer.hh - * @author Jiannan Tian - * @brief High-resolution timer wrapper from and util functions for timing both CPU and CUDA function - * @version 0.2 - * @date 2021-01-05 - * (created) 2019-08-26 (rev) 2021-12-23 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#ifndef UTILS_TIMER_HH -#define UTILS_TIMER_HH - -#include -#include - -using hires = std::chrono::high_resolution_clock; -using duration_t = std::chrono::duration; -using hires_clock_t = std::chrono::time_point; - -typedef struct Timer { - hires_clock_t start, end; - - void timer_start() { start = hires::now(); } - void timer_end() { end = hires::now(); } - double get_time_elapsed() { return static_cast(end - start).count(); } - -} host_timer_t; - -#ifdef __CUDACC__ - -/** - * @brief CUDA event based timer. Synopsis: - * cuda_timer_t t; - * t.timer_start(); - * kernel<<>>(...); - * t.timer_end(); - * cudaStreamSynchronize(stream); - * auto ms = t.get_time_elapsed(); - * - */ -typedef struct CUDATimer { - cudaEvent_t start, stop; - float milliseconds; - - // stream not involved - void timer_start() - { - cudaEventCreate(&start); - cudaEventCreate(&stop); - cudaEventRecord(start); - } - - void timer_end() - { - cudaEventRecord(stop); - cudaEventSynchronize(stop); - } - - // stream involved - void timer_start(cudaStream_t stream) - { - cudaEventCreate(&start); - cudaEventCreate(&stop); - - cudaEventRecord(start, stream); // set event as not occurred - } - - void timer_end(cudaStream_t stream) - { - cudaEventRecord(stop, stream); - cudaEventSynchronize(stop); // block host until `stream` meets `stop` - } - - // get time - float get_time_elapsed() - { - cudaEventElapsedTime(&milliseconds, start, stop); - return milliseconds; - } - -} cuda_timer_t; - -#endif - -// TODO handle return; testing -/** - * @brief A timer wrapper for arbitrary function (no handling return for now); - * Adapted from https://stackoverflow.com/a/33900479/8740097 (CC BY-SA 3.0) - * - * @tparam F auto function type - * @tparam Args variadic function argument type - * @param func non-return function to be timed - * @param args variadic function arguments - * @return double time in seconds - */ -template -double TimeThisRoutine(F func, Args&&... args) -{ - auto t0 = hires::now(); - func(std::forward(args)...); - return static_cast(hires::now() - t0).count(); -} - -#ifdef __CUDACC__ -typedef struct CUDAKernelConfig { - dim3 dim_grid; - dim3 dim_block; - size_t shmem_nbyte{0}; - cudaStream_t stream; - -} kernelcfg; - -// TODO use cudaEvent -/** - * @brief A timer wrapper for arbitrary CUDA function - * - * @tparam F auto function type - * @tparam Args variadic function argument type - * @param func CUDA kernel function to be time - * @param cfg CUDA kernel config - * @param args variadic function arguments - * @return double time in seconds - */ -template -float TimeThisCUDARoutine(F func, kernelcfg cfg, Args&&... args) -{ - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - cudaEventRecord(start); - func<<>>( // - args... - // std::forward(args)... // also works - ); - cudaEventRecord(stop); - cudaEventSynchronize(stop); - - cudaStreamSynchronize(cfg.stream); - - float milliseconds; - cudaEventElapsedTime(&milliseconds, start, stop); - - return milliseconds; -} - -#endif - -#endif // UTILS_TIMER_HH +/** + * @file timer.hh + * @author Jiannan Tian + * @brief High-resolution timer wrapper from and util functions for timing both CPU and CUDA function + * @version 0.2 + * @date 2021-01-05 + * (created) 2019-08-26 (rev) 2021-12-23 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef UTILS_TIMER_HH +#define UTILS_TIMER_HH + +#include +#include + +using hires = std::chrono::high_resolution_clock; +using duration_t = std::chrono::duration; +using hires_clock_t = std::chrono::time_point; + +typedef struct Timer { + hires_clock_t start, end; + + void timer_start() { start = hires::now(); } + void timer_end() { end = hires::now(); } + double get_time_elapsed() { return static_cast(end - start).count(); } + +} host_timer_t; + +#ifdef __CUDACC__ + +/** + * @brief CUDA event based timer. Synopsis: + * cuda_timer_t t; + * t.timer_start(); + * kernel<<>>(...); + * t.timer_end(); + * cudaStreamSynchronize(stream); + * auto ms = t.get_time_elapsed(); + * + */ +typedef struct CUDATimer { + cudaEvent_t start, stop; + float milliseconds; + + // stream not involved + void timer_start() + { + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); + } + + void timer_end() + { + cudaEventRecord(stop); + cudaEventSynchronize(stop); + } + + // stream involved + void timer_start(cudaStream_t stream) + { + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start, stream); // set event as not occurred + } + + void timer_end(cudaStream_t stream) + { + cudaEventRecord(stop, stream); + cudaEventSynchronize(stop); // block host until `stream` meets `stop` + } + + // get time + float get_time_elapsed() + { + cudaEventElapsedTime(&milliseconds, start, stop); + return milliseconds; + } + +} cuda_timer_t; + +#endif + +// TODO handle return; testing +/** + * @brief A timer wrapper for arbitrary function (no handling return for now); + * Adapted from https://stackoverflow.com/a/33900479/8740097 (CC BY-SA 3.0) + * + * @tparam F auto function type + * @tparam Args variadic function argument type + * @param func non-return function to be timed + * @param args variadic function arguments + * @return double time in seconds + */ +template +double TimeThisRoutine(F func, Args&&... args) +{ + auto t0 = hires::now(); + func(std::forward(args)...); + return static_cast(hires::now() - t0).count(); +} + +#ifdef __CUDACC__ +typedef struct CUDAKernelConfig { + dim3 dim_grid; + dim3 dim_block; + size_t shmem_nbyte{0}; + cudaStream_t stream; + +} kernelcfg; + +// TODO use cudaEvent +/** + * @brief A timer wrapper for arbitrary CUDA function + * + * @tparam F auto function type + * @tparam Args variadic function argument type + * @param func CUDA kernel function to be time + * @param cfg CUDA kernel config + * @param args variadic function arguments + * @return double time in seconds + */ +template +float TimeThisCUDARoutine(F func, kernelcfg cfg, Args&&... args) +{ + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start); + func<<>>( // + args... + // std::forward(args)... // also works + ); + cudaEventRecord(stop); + cudaEventSynchronize(stop); + + cudaStreamSynchronize(cfg.stream); + + float milliseconds; + cudaEventElapsedTime(&milliseconds, start, stop); + + return milliseconds; +} + +#endif + +#endif // UTILS_TIMER_HH diff --git a/qtensor/compression/cusz/src/cli/cli.cu b/qtensor/compression/cusz/src/cli/cli.cu index 01c61565..64084cba 100644 --- a/qtensor/compression/cusz/src/cli/cli.cu +++ b/qtensor/compression/cusz/src/cli/cli.cu @@ -1,14 +1,14 @@ -/** - * @file cli.cu - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-03-07 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#include "cli.cuh" - -template class cusz::CLI; +/** + * @file cli.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-03-07 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#include "cli.cuh" + +template class cusz::CLI; diff --git a/qtensor/compression/cusz/src/cli/cli.cuh b/qtensor/compression/cusz/src/cli/cli.cuh index da94a347..14a9103d 100644 --- a/qtensor/compression/cusz/src/cli/cli.cuh +++ b/qtensor/compression/cusz/src/cli/cli.cuh @@ -1,195 +1,195 @@ -/** - * @file cli.cuh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-02-20 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef CLI_CUH -#define CLI_CUH - -#include -#include - -#include "cli/analyzer.hh" -#include "cli/dryrun_part.cuh" -#include "cli/query.hh" -#include "cli/timerecord_viewer.hh" -#include "cusz.h" -#include "framework.hh" - -namespace cusz { - -template -class CLI { - private: - using Header = cuszHEADER; - using T = Data; - - const static auto HOST = cusz::LOC::HOST; - const static auto DEVICE = cusz::LOC::DEVICE; - const static auto HOST_DEVICE = cusz::LOC::HOST_DEVICE; - - using context_t = cuszCTX*; - using header_t = cuszHEADER*; - - public: - CLI() = default; - - template - static void cli_dryrun(context_t ctx, bool dualquant = true) - { - BaseCompressor analysis; - - uint3 xyz{ctx->x, ctx->y, ctx->z}; - cudaStream_t stream; - cudaStreamCreate(&stream); - - if (not dualquant) { - analysis.init_dualquant_dryrun(xyz); - analysis.dualquant_dryrun(ctx->fname.fname, ctx->eb, ctx->mode == "r2r", stream); - analysis.destroy_dualquant_dryrun(); - } - else { - analysis.init_generic_dryrun(xyz); - analysis.generic_dryrun(ctx->fname.fname, ctx->eb, 512, ctx->mode == "r2r", stream); - analysis.destroy_generic_dryrun(); - } - cudaStreamDestroy(stream); - } - - private: - void write_compressed_to_disk(std::string compressed_name, BYTE* compressed, size_t compressed_len) - { - Capsule file("cusza"); - file.set_len(compressed_len) - .set_dptr(compressed) - .mallochost() - .device2host() - .tofile(compressed_name) - .freehost() - .free(); - } - - void try_write_decompressed_to_disk(Capsule& xdata, std::string basename, bool skip_write) - { - if (not skip_write) xdata.device2host().tofile(basename + ".cuszx"); - } - - // template - void cli_construct(context_t ctx, cusz_compressor* compressor, cudaStream_t stream) - { - Capsule input("uncompressed"); - BYTE* compressed; - size_t compressed_len; - Header header; - auto len = ctx->get_len(); - auto basename = ctx->fname.fname; - - auto load_uncompressed = [&](std::string fname) { - input - .set_len(len) // - .mallochost() - .malloc() - .fromfile(fname) - .host2device(); - }; - - auto adjust_eb = [&]() { - if (ctx->mode == "r2r") ctx->eb *= input.prescan().get_rng(); - }; - - /******************************************************************************/ - - load_uncompressed(basename); - adjust_eb(); - - TimeRecord timerecord; - - cusz_config* config = new cusz_config{.eb = ctx->eb, .mode = Rel}; - cusz_len uncomp_len = cusz_len{ctx->x, ctx->y, ctx->z, 1}; - - cusz_compress( - compressor, config, input.dptr(), uncomp_len, &compressed, &compressed_len, &header, (void*)&timerecord, - stream); - - if (ctx->report.time) TimeRecordViewer::view_compression(&timerecord, input.nbyte(), compressed_len); - write_compressed_to_disk(basename + ".cusza", compressed, compressed_len); - } - - // template - void cli_reconstruct(context_t ctx, cusz_compressor* compressor, cudaStream_t stream) - { - Capsule compressed("compressed"); - Capsule decompressed("decompressed"), original("cmp"); - auto header = new Header; - auto basename = (*ctx).fname.fname; - - auto load_compressed = [&](std::string compressed_name) { - auto compressed_len = ConfigHelper::get_filesize(compressed_name); - compressed - .set_len(compressed_len) // - .mallochost() - .malloc() - .fromfile(compressed_name) - .host2device(); - }; - - /******************************************************************************/ - - load_compressed(basename + ".cusza"); - memcpy(header, compressed.hptr(), sizeof(Header)); - auto len = ConfigHelper::get_uncompressed_len(header); - - decompressed // - .set_len(len) - .mallochost() - .malloc(); - original.set_len(len); - - TimeRecord timerecord; - - cusz_len decomp_len = cusz_len{header->x, header->y, header->z, 1}; - - cusz_decompress( - compressor, header, compressed.dptr(), ConfigHelper::get_filesize(header), decompressed.dptr(), decomp_len, - (void*)&timerecord, stream); - - if (ctx->report.time) TimeRecordViewer::view_decompression(&timerecord, decompressed.nbyte()); - QualityViewer::view(header, decompressed, original, (*ctx).fname.origin_cmp); - try_write_decompressed_to_disk(decompressed, basename, (*ctx).skip.write2disk); - - decompressed.freehost().free(); - } - - public: - // TODO determine dtype & predictor in here - void dispatch(context_t ctx) - { - // TODO disable predictor selection; to specify in another way - // auto predictor = (*ctx).predictor; - - cusz_framework* framework = cusz_default_framework(); - cusz_compressor* compressor = cusz_create(framework, FP32); - - cudaStream_t stream; - CHECK_CUDA(cudaStreamCreate(&stream)); - - // TODO hardcoded predictor type - if ((*ctx).cli_task.dryrun) cli_dryrun::Predictor>(ctx); - - if ((*ctx).cli_task.construct) cli_construct(ctx, compressor, stream); - - if ((*ctx).cli_task.reconstruct) cli_reconstruct(ctx, compressor, stream); - - if (stream) cudaStreamDestroy(stream); - } -}; - -} // namespace cusz - -#endif +/** + * @file cli.cuh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-02-20 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CLI_CUH +#define CLI_CUH + +#include +#include + +#include "cli/analyzer.hh" +#include "cli/dryrun_part.cuh" +#include "cli/query.hh" +#include "cli/timerecord_viewer.hh" +#include "cusz.h" +#include "framework.hh" + +namespace cusz { + +template +class CLI { + private: + using Header = cuszHEADER; + using T = Data; + + const static auto HOST = cusz::LOC::HOST; + const static auto DEVICE = cusz::LOC::DEVICE; + const static auto HOST_DEVICE = cusz::LOC::HOST_DEVICE; + + using context_t = cuszCTX*; + using header_t = cuszHEADER*; + + public: + CLI() = default; + + template + static void cli_dryrun(context_t ctx, bool dualquant = true) + { + BaseCompressor analysis; + + uint3 xyz{ctx->x, ctx->y, ctx->z}; + cudaStream_t stream; + cudaStreamCreate(&stream); + + if (not dualquant) { + analysis.init_dualquant_dryrun(xyz); + analysis.dualquant_dryrun(ctx->fname.fname, ctx->eb, ctx->mode == "r2r", stream); + analysis.destroy_dualquant_dryrun(); + } + else { + analysis.init_generic_dryrun(xyz); + analysis.generic_dryrun(ctx->fname.fname, ctx->eb, 512, ctx->mode == "r2r", stream); + analysis.destroy_generic_dryrun(); + } + cudaStreamDestroy(stream); + } + + private: + void write_compressed_to_disk(std::string compressed_name, BYTE* compressed, size_t compressed_len) + { + Capsule file("cusza"); + file.set_len(compressed_len) + .set_dptr(compressed) + .mallochost() + .device2host() + .tofile(compressed_name) + .freehost() + .free(); + } + + void try_write_decompressed_to_disk(Capsule& xdata, std::string basename, bool skip_write) + { + if (not skip_write) xdata.device2host().tofile(basename + ".cuszx"); + } + + // template + void cli_construct(context_t ctx, cusz_compressor* compressor, cudaStream_t stream) + { + Capsule input("uncompressed"); + BYTE* compressed; + size_t compressed_len; + Header header; + auto len = ctx->get_len(); + auto basename = ctx->fname.fname; + + auto load_uncompressed = [&](std::string fname) { + input + .set_len(len) // + .mallochost() + .malloc() + .fromfile(fname) + .host2device(); + }; + + auto adjust_eb = [&]() { + if (ctx->mode == "r2r") ctx->eb *= input.prescan().get_rng(); + }; + + /******************************************************************************/ + + load_uncompressed(basename); + adjust_eb(); + + TimeRecord timerecord; + + cusz_config* config = new cusz_config{.eb = ctx->eb, .mode = Rel}; + cusz_len uncomp_len = cusz_len{ctx->x, ctx->y, ctx->z, 1}; + + cusz_compress( + compressor, config, input.dptr(), uncomp_len, &compressed, &compressed_len, &header, (void*)&timerecord, + stream); + + if (ctx->report.time) TimeRecordViewer::view_compression(&timerecord, input.nbyte(), compressed_len); + write_compressed_to_disk(basename + ".cusza", compressed, compressed_len); + } + + // template + void cli_reconstruct(context_t ctx, cusz_compressor* compressor, cudaStream_t stream) + { + Capsule compressed("compressed"); + Capsule decompressed("decompressed"), original("cmp"); + auto header = new Header; + auto basename = (*ctx).fname.fname; + + auto load_compressed = [&](std::string compressed_name) { + auto compressed_len = ConfigHelper::get_filesize(compressed_name); + compressed + .set_len(compressed_len) // + .mallochost() + .malloc() + .fromfile(compressed_name) + .host2device(); + }; + + /******************************************************************************/ + + load_compressed(basename + ".cusza"); + memcpy(header, compressed.hptr(), sizeof(Header)); + auto len = ConfigHelper::get_uncompressed_len(header); + + decompressed // + .set_len(len) + .mallochost() + .malloc(); + original.set_len(len); + + TimeRecord timerecord; + + cusz_len decomp_len = cusz_len{header->x, header->y, header->z, 1}; + + cusz_decompress( + compressor, header, compressed.dptr(), ConfigHelper::get_filesize(header), decompressed.dptr(), decomp_len, + (void*)&timerecord, stream); + + if (ctx->report.time) TimeRecordViewer::view_decompression(&timerecord, decompressed.nbyte()); + QualityViewer::view(header, decompressed, original, (*ctx).fname.origin_cmp); + try_write_decompressed_to_disk(decompressed, basename, (*ctx).skip.write2disk); + + decompressed.freehost().free(); + } + + public: + // TODO determine dtype & predictor in here + void dispatch(context_t ctx) + { + // TODO disable predictor selection; to specify in another way + // auto predictor = (*ctx).predictor; + + cusz_framework* framework = cusz_default_framework(); + cusz_compressor* compressor = cusz_create(framework, FP32); + + cudaStream_t stream; + CHECK_CUDA(cudaStreamCreate(&stream)); + + // TODO hardcoded predictor type + if ((*ctx).cli_task.dryrun) cli_dryrun::Predictor>(ctx); + + if ((*ctx).cli_task.construct) cli_construct(ctx, compressor, stream); + + if ((*ctx).cli_task.reconstruct) cli_reconstruct(ctx, compressor, stream); + + if (stream) cudaStreamDestroy(stream); + } +}; + +} // namespace cusz + +#endif diff --git a/qtensor/compression/cusz/src/cli/dryrun_part.cu b/qtensor/compression/cusz/src/cli/dryrun_part.cu index 41311b6b..c3a8a1c4 100644 --- a/qtensor/compression/cusz/src/cli/dryrun_part.cu +++ b/qtensor/compression/cusz/src/cli/dryrun_part.cu @@ -1,17 +1,17 @@ -/** - * @file base_compressor.cu - * @author Jiannan Tian - * @brief Predictor-only Base Compressor; can also be used for dryrun. - * @version 0.3 - * @date 2021-10-05 - * - * (C) 2021 by Washington State University, Argonne National Laboratory - * - */ - -#include "dryrun_part.cuh" - -template class cusz::BaseCompressor::type, - ErrCtrlTrait<2>::type, - FastLowPrecisionTrait::type>>; +/** + * @file base_compressor.cu + * @author Jiannan Tian + * @brief Predictor-only Base Compressor; can also be used for dryrun. + * @version 0.3 + * @date 2021-10-05 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#include "dryrun_part.cuh" + +template class cusz::BaseCompressor::type, + ErrCtrlTrait<2>::type, + FastLowPrecisionTrait::type>>; diff --git a/qtensor/compression/cusz/src/cli/dryrun_part.cuh b/qtensor/compression/cusz/src/cli/dryrun_part.cuh index e6fd4579..0013e790 100644 --- a/qtensor/compression/cusz/src/cli/dryrun_part.cuh +++ b/qtensor/compression/cusz/src/cli/dryrun_part.cuh @@ -1,196 +1,196 @@ -/** - * @file base_compressor.cuh - * @author Jiannan Tian - * @brief Predictor-only Base Compressor; can also be used for dryrun. - * @version 0.3 - * @date 2021-10-05 - * - * (C) 2021 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef BASE_COMPRESSOR_CUH -#define BASE_COMPRESSOR_CUH - -#include "cli/analyzer.hh" -#include "cli/quality_viewer.hh" -#include "cli/verify.hh" -#include "common.hh" -#include "component.hh" -#include "context.hh" -#include "kernel/dryrun.cuh" -#include "stat/compare_gpu.hh" -#include "utils.hh" - -/** - * @brief bare metal, can run predictor to check data quality and compressibility - * - * @tparam T for data type - * @tparam E for error control type - */ - -namespace cusz { - -template -class BaseCompressor { - public: - using BYTE = uint8_t; - using T = typename Predictor::Origin; - using FP = typename Predictor::Precision; - using E = typename Predictor::ErrCtrl; - - private: - struct NonCritical { - Predictor* p; - Capsule original; - Capsule errctrl; // TODO change to 4-byte - Capsule outlier; - Capsule anchor; - Capsule reconst; - - NonCritical(dim3 size) { p = new Predictor; } - }; - - struct NonCritical* nc; - - protected: - cuszCTX* ctx; - - int dict_size; - double eb; - - dim3 xyz; - - public: - /** - * @brief Generic dryrun; performing predictor.construct() and .reconstruct() - * - * @param fname filename - * @param eb (host variable) error bound; future: absolute error bound only - * @param radius (host variable) limiting radius - * @param r2r if relative-to-value-range - * @param stream CUDA stream - * @return BaseCompressor& this object instance - */ - BaseCompressor& generic_dryrun(const std::string fname, double eb, int radius, bool r2r, cudaStream_t stream) - { - if (not nc) throw std::runtime_error("NonCritical struct has no instance."); - - // LOGGING(LOG_INFO, "invoke dry-run"); - - nc->original.fromfile(fname).host2device_async(stream); - CHECK_CUDA(cudaStreamSynchronize(stream)); - - if (r2r) { - double max, min, rng; - nc->original.prescan(max, min, rng); - eb *= rng; - } - - auto xyz = dim3(ctx->x, ctx->y, ctx->z); - - // nc->p->construct( - // LorenzoI, xyz, nc->original.dptr, nc->anchor.dptr, nc->errctrl.dptr, nc->outlier.dptr, eb, radius, - // stream); - // nc->p->reconstruct( - // LorenzoI, xyz, nc->outlier.dptr, nc->anchor.dptr, nc->errctrl.dptr, nc->reconst.dptr, eb, radius, - // stream); - - nc->reconst.device2host_async(stream); - CHECK_CUDA(cudaStreamSynchronize(stream)); - - cusz_stats stat; - psz::thrustgpu_assess_quality(&stat, nc->reconst.hptr(), nc->original.hptr(), nc->p->get_len_data()); - cusz::QualityViewer::print_metrics_cross(&stat, 0, true); - - return *this; - } - - /** - * @brief Dual-quant dryrun; performing integerization & its reverse procedure - * - * @param eb (host variable) error bound; future: absolute error bound only - * @param r2r if relative-to-value-range - * @param stream CUDA stream - * @return BaseCompressor& this object instance - */ - BaseCompressor& dualquant_dryrun(const std::string fname, double eb, bool r2r, cudaStream_t stream) - { - auto len = nc->original.len(); - - nc->original.fromfile(fname).host2device_async(stream); - CHECK_CUDA(cudaStreamSynchronize(stream)); - - if (r2r) { - double max, min, rng; - nc->original.prescan(max, min, rng); - eb *= rng; - } - - auto ebx2_r = 1 / (eb * 2); - auto ebx2 = eb * 2; - - cusz::dualquant_dryrun_kernel // - <<>> // - (nc->original.dptr(), nc->reconst.dptr(), len, ebx2_r, ebx2); - - nc->reconst.device2host_async(stream); - CHECK_CUDA(cudaStreamSynchronize(stream)); - - cusz_stats stat; - psz::thrustgpu_assess_quality(&stat, nc->reconst.hptr(), nc->original.hptr(), len); - cusz::QualityViewer::print_metrics_cross(&stat, 0, true); - - return *this; - } - - public: - BaseCompressor() = default; - - ~BaseCompressor() {} - - public: - // dry run - void init_generic_dryrun(dim3 size) - { // - auto len = size.x * size.y * size.z; - nc = new struct NonCritical(size); - - nc->original.set_len(len).mallochost().malloc(); - nc->outlier.set_len(len).mallochost().malloc(); - nc->errctrl.set_len(len).mallochost().malloc(); - nc->anchor.set_len(nc->p->get_len_anchor()).mallochost().malloc(); - nc->reconst.set_len(len).mallochost().malloc(); - } - - void destroy_generic_dryrun() - { - delete nc->p; - nc->original.freehost().free(); - nc->outlier.freehost().free(); - nc->errctrl.freehost().free(); - nc->anchor.freehost().free(); - nc->reconst.freehost().free(); - delete nc; - } - - void init_dualquant_dryrun(dim3 size) - { - auto len = size.x * size.y * size.z; - nc = new struct NonCritical(size); - nc->original.set_len(len).mallochost().malloc(); - nc->reconst.set_len(len).mallochost().malloc(); - } - - void destroy_dualquant_dryrun() - { - nc->original.freehost().free(); - nc->reconst.freehost().free(); - - delete nc; - } -}; - -} // namespace cusz - -#endif +/** + * @file base_compressor.cuh + * @author Jiannan Tian + * @brief Predictor-only Base Compressor; can also be used for dryrun. + * @version 0.3 + * @date 2021-10-05 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef BASE_COMPRESSOR_CUH +#define BASE_COMPRESSOR_CUH + +#include "cli/analyzer.hh" +#include "cli/quality_viewer.hh" +#include "cli/verify.hh" +#include "common.hh" +#include "component.hh" +#include "context.hh" +#include "kernel/dryrun.cuh" +#include "stat/compare_gpu.hh" +#include "utils.hh" + +/** + * @brief bare metal, can run predictor to check data quality and compressibility + * + * @tparam T for data type + * @tparam E for error control type + */ + +namespace cusz { + +template +class BaseCompressor { + public: + using BYTE = uint8_t; + using T = typename Predictor::Origin; + using FP = typename Predictor::Precision; + using E = typename Predictor::ErrCtrl; + + private: + struct NonCritical { + Predictor* p; + Capsule original; + Capsule errctrl; // TODO change to 4-byte + Capsule outlier; + Capsule anchor; + Capsule reconst; + + NonCritical(dim3 size) { p = new Predictor; } + }; + + struct NonCritical* nc; + + protected: + cuszCTX* ctx; + + int dict_size; + double eb; + + dim3 xyz; + + public: + /** + * @brief Generic dryrun; performing predictor.construct() and .reconstruct() + * + * @param fname filename + * @param eb (host variable) error bound; future: absolute error bound only + * @param radius (host variable) limiting radius + * @param r2r if relative-to-value-range + * @param stream CUDA stream + * @return BaseCompressor& this object instance + */ + BaseCompressor& generic_dryrun(const std::string fname, double eb, int radius, bool r2r, cudaStream_t stream) + { + if (not nc) throw std::runtime_error("NonCritical struct has no instance."); + + // LOGGING(LOG_INFO, "invoke dry-run"); + + nc->original.fromfile(fname).host2device_async(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + if (r2r) { + double max, min, rng; + nc->original.prescan(max, min, rng); + eb *= rng; + } + + auto xyz = dim3(ctx->x, ctx->y, ctx->z); + + // nc->p->construct( + // LorenzoI, xyz, nc->original.dptr, nc->anchor.dptr, nc->errctrl.dptr, nc->outlier.dptr, eb, radius, + // stream); + // nc->p->reconstruct( + // LorenzoI, xyz, nc->outlier.dptr, nc->anchor.dptr, nc->errctrl.dptr, nc->reconst.dptr, eb, radius, + // stream); + + nc->reconst.device2host_async(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + cusz_stats stat; + psz::thrustgpu_assess_quality(&stat, nc->reconst.hptr(), nc->original.hptr(), nc->p->get_len_data()); + cusz::QualityViewer::print_metrics_cross(&stat, 0, true); + + return *this; + } + + /** + * @brief Dual-quant dryrun; performing integerization & its reverse procedure + * + * @param eb (host variable) error bound; future: absolute error bound only + * @param r2r if relative-to-value-range + * @param stream CUDA stream + * @return BaseCompressor& this object instance + */ + BaseCompressor& dualquant_dryrun(const std::string fname, double eb, bool r2r, cudaStream_t stream) + { + auto len = nc->original.len(); + + nc->original.fromfile(fname).host2device_async(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + if (r2r) { + double max, min, rng; + nc->original.prescan(max, min, rng); + eb *= rng; + } + + auto ebx2_r = 1 / (eb * 2); + auto ebx2 = eb * 2; + + cusz::dualquant_dryrun_kernel // + <<>> // + (nc->original.dptr(), nc->reconst.dptr(), len, ebx2_r, ebx2); + + nc->reconst.device2host_async(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + cusz_stats stat; + psz::thrustgpu_assess_quality(&stat, nc->reconst.hptr(), nc->original.hptr(), len); + cusz::QualityViewer::print_metrics_cross(&stat, 0, true); + + return *this; + } + + public: + BaseCompressor() = default; + + ~BaseCompressor() {} + + public: + // dry run + void init_generic_dryrun(dim3 size) + { // + auto len = size.x * size.y * size.z; + nc = new struct NonCritical(size); + + nc->original.set_len(len).mallochost().malloc(); + nc->outlier.set_len(len).mallochost().malloc(); + nc->errctrl.set_len(len).mallochost().malloc(); + nc->anchor.set_len(nc->p->get_len_anchor()).mallochost().malloc(); + nc->reconst.set_len(len).mallochost().malloc(); + } + + void destroy_generic_dryrun() + { + delete nc->p; + nc->original.freehost().free(); + nc->outlier.freehost().free(); + nc->errctrl.freehost().free(); + nc->anchor.freehost().free(); + nc->reconst.freehost().free(); + delete nc; + } + + void init_dualquant_dryrun(dim3 size) + { + auto len = size.x * size.y * size.z; + nc = new struct NonCritical(size); + nc->original.set_len(len).mallochost().malloc(); + nc->reconst.set_len(len).mallochost().malloc(); + } + + void destroy_dualquant_dryrun() + { + nc->original.freehost().free(); + nc->reconst.freehost().free(); + + delete nc; + } +}; + +} // namespace cusz + +#endif diff --git a/qtensor/compression/cusz/src/cli_bin.cu b/qtensor/compression/cusz/src/cli_bin.cu index f3e50d64..c59c00f9 100644 --- a/qtensor/compression/cusz/src/cli_bin.cu +++ b/qtensor/compression/cusz/src/cli_bin.cu @@ -1,27 +1,27 @@ -/** - * @file cusz-cli.cu - * @author Jiannan Tian - * @brief Driver program of cuSZ. - * @version 0.1 - * @date 2020-09-20 - * (created) 2019-12-30 (rev) 2022-02-20 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#include "cli/cli.cuh" - -int main(int argc, char** argv) -{ - auto ctx = new cuszCTX(argc, argv); - - if (ctx->verbose) { - Diagnostics::GetMachineProperties(); - GpuDiagnostics::GetDeviceProperty(); - } - - cusz::CLI cusz_cli; - cusz_cli.dispatch(ctx); -} +/** + * @file cusz-cli.cu + * @author Jiannan Tian + * @brief Driver program of cuSZ. + * @version 0.1 + * @date 2020-09-20 + * (created) 2019-12-30 (rev) 2022-02-20 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include "cli/cli.cuh" + +int main(int argc, char** argv) +{ + auto ctx = new cuszCTX(argc, argv); + + if (ctx->verbose) { + Diagnostics::GetMachineProperties(); + GpuDiagnostics::GetDeviceProperty(); + } + + cusz::CLI cusz_cli; + cusz_cli.dispatch(ctx); +} diff --git a/qtensor/compression/cusz/src/compressor.cc b/qtensor/compression/cusz/src/compressor.cc index 7b62db5a..7482293b 100644 --- a/qtensor/compression/cusz/src/compressor.cc +++ b/qtensor/compression/cusz/src/compressor.cc @@ -1,149 +1,149 @@ -/** - * @file compressor.cc - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-04-23 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#include "compressor.hh" -#include "common/configs.hh" -#include "framework.hh" - -namespace cusz { - -template -Compressor::~Compressor() -{ - pimpl.reset(); -} - -template -Compressor::Compressor() : pimpl{std::make_unique()} -{ -} - -template -Compressor::Compressor(const Compressor& old) : pimpl{std::make_unique(*old.pimpl)} -{ -} - -template -Compressor& Compressor::operator=(const Compressor& old) -{ - *pimpl = *old.pimpl; - return *this; -} - -template -Compressor::Compressor(Compressor&&) = default; - -template -Compressor& Compressor::operator=(Compressor&&) = default; - -//------------------------------------------------------------------------------ - -template -void Compressor::init(Context* config, bool dbg_print) -{ - pimpl->init(config, dbg_print); -} - -template -void Compressor::init(Header* config, bool dbg_print) -{ - pimpl->init(config, dbg_print); -} - -template -void Compressor::compress( - Context* config, - Compressor::T* uncompressed, - BYTE*& compressed, - size_t& compressed_len, - cudaStream_t stream, - bool dbg_print) -{ - pimpl->compress(config, uncompressed, compressed, compressed_len, stream, dbg_print); -} - -template -void Compressor::decompress( - Header* config, - BYTE* compressed, - Compressor::T* decompressed, - cudaStream_t stream, - bool dbg_print) -{ - pimpl->decompress(config, compressed, decompressed, stream, dbg_print); -} - -template -void Compressor::clear_buffer() -{ - pimpl->clear_buffer(); -} - -// getter - -template -void Compressor::export_header(Header& header) -{ - pimpl->export_header(header); -} - -template -void Compressor::export_header(Header* header) -{ - pimpl->export_header(header); -} - -template -void Compressor::export_timerecord(TimeRecord* ext_timerecord) -{ - pimpl->export_timerecord(ext_timerecord); -} - -} // namespace cusz - -// extra helper -namespace cusz { - -int CompressorHelper::autotune_coarse_parvle(Context* ctx) -{ - auto tune_coarse_huffman_sublen = [](size_t len) { - int current_dev = 0; - cudaSetDevice(current_dev); - cudaDeviceProp dev_prop{}; - cudaGetDeviceProperties(&dev_prop, current_dev); - - auto nSM = dev_prop.multiProcessorCount; - auto allowed_block_dim = dev_prop.maxThreadsPerBlock; - auto deflate_nthread = allowed_block_dim * nSM / HuffmanHelper::DEFLATE_CONSTANT; - auto optimal_sublen = ConfigHelper::get_npart(len, deflate_nthread); - optimal_sublen = ConfigHelper::get_npart(optimal_sublen, HuffmanHelper::BLOCK_DIM_DEFLATE) * - HuffmanHelper::BLOCK_DIM_DEFLATE; - - return optimal_sublen; - }; - - auto get_coarse_pardeg = [&](size_t len, int& sublen, int& pardeg) { - sublen = tune_coarse_huffman_sublen(len); - pardeg = ConfigHelper::get_npart(len, sublen); - }; - - // TODO should be move to somewhere else, e.g., cusz::par_optmizer - if (ctx->use.autotune_vle_pardeg) - get_coarse_pardeg(ctx->data_len, ctx->vle_sublen, ctx->vle_pardeg); - else - ctx->vle_pardeg = ConfigHelper::get_npart(ctx->data_len, ctx->vle_sublen); - - return ctx->vle_pardeg; -} - -} // namespace cusz - -template class cusz::Compressor>; +/** + * @file compressor.cc + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-23 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#include "compressor.hh" +#include "common/configs.hh" +#include "framework.hh" + +namespace cusz { + +template +Compressor::~Compressor() +{ + pimpl.reset(); +} + +template +Compressor::Compressor() : pimpl{std::make_unique()} +{ +} + +template +Compressor::Compressor(const Compressor& old) : pimpl{std::make_unique(*old.pimpl)} +{ +} + +template +Compressor& Compressor::operator=(const Compressor& old) +{ + *pimpl = *old.pimpl; + return *this; +} + +template +Compressor::Compressor(Compressor&&) = default; + +template +Compressor& Compressor::operator=(Compressor&&) = default; + +//------------------------------------------------------------------------------ + +template +void Compressor::init(Context* config, bool dbg_print) +{ + pimpl->init(config, dbg_print); +} + +template +void Compressor::init(Header* config, bool dbg_print) +{ + pimpl->init(config, dbg_print); +} + +template +void Compressor::compress( + Context* config, + Compressor::T* uncompressed, + BYTE*& compressed, + size_t& compressed_len, + cudaStream_t stream, + bool dbg_print) +{ + pimpl->compress(config, uncompressed, compressed, compressed_len, stream, dbg_print); +} + +template +void Compressor::decompress( + Header* config, + BYTE* compressed, + Compressor::T* decompressed, + cudaStream_t stream, + bool dbg_print) +{ + pimpl->decompress(config, compressed, decompressed, stream, dbg_print); +} + +template +void Compressor::clear_buffer() +{ + pimpl->clear_buffer(); +} + +// getter + +template +void Compressor::export_header(Header& header) +{ + pimpl->export_header(header); +} + +template +void Compressor::export_header(Header* header) +{ + pimpl->export_header(header); +} + +template +void Compressor::export_timerecord(TimeRecord* ext_timerecord) +{ + pimpl->export_timerecord(ext_timerecord); +} + +} // namespace cusz + +// extra helper +namespace cusz { + +int CompressorHelper::autotune_coarse_parvle(Context* ctx) +{ + auto tune_coarse_huffman_sublen = [](size_t len) { + int current_dev = 0; + cudaSetDevice(current_dev); + cudaDeviceProp dev_prop{}; + cudaGetDeviceProperties(&dev_prop, current_dev); + + auto nSM = dev_prop.multiProcessorCount; + auto allowed_block_dim = dev_prop.maxThreadsPerBlock; + auto deflate_nthread = allowed_block_dim * nSM / HuffmanHelper::DEFLATE_CONSTANT; + auto optimal_sublen = ConfigHelper::get_npart(len, deflate_nthread); + optimal_sublen = ConfigHelper::get_npart(optimal_sublen, HuffmanHelper::BLOCK_DIM_DEFLATE) * + HuffmanHelper::BLOCK_DIM_DEFLATE; + + return optimal_sublen; + }; + + auto get_coarse_pardeg = [&](size_t len, int& sublen, int& pardeg) { + sublen = tune_coarse_huffman_sublen(len); + pardeg = ConfigHelper::get_npart(len, sublen); + }; + + // TODO should be move to somewhere else, e.g., cusz::par_optmizer + if (ctx->use.autotune_vle_pardeg) + get_coarse_pardeg(ctx->data_len, ctx->vle_sublen, ctx->vle_pardeg); + else + ctx->vle_pardeg = ConfigHelper::get_npart(ctx->data_len, ctx->vle_sublen); + + return ctx->vle_pardeg; +} + +} // namespace cusz + +template class cusz::Compressor>; diff --git a/qtensor/compression/cusz/src/context.cc b/qtensor/compression/cusz/src/context.cc index c85f3d24..3356323b 100644 --- a/qtensor/compression/cusz/src/context.cc +++ b/qtensor/compression/cusz/src/context.cc @@ -1,493 +1,493 @@ -/** - * @file argparse.cc - * @author Jiannan Tian - * @brief Argument parser. - * @version 0.1 - * @date 2020-09-20 - * Created on: 20-04-24 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#include -#include -#include -#include -#include -#include - -#include "cli/document.hh" -#include "context.hh" - -namespace cusz { -const char* VERSION_TEXT = "2023-01-23 (unstable; pre-0.4)"; -const int VERSION = 20230123; -const int COMPATIBILITY = 0; -} // namespace cusz - -namespace { - -void set_preprocess(cusz::context_t ctx, const char* in_str) -{ - str_list opts; - StrHelper::parse_strlist(in_str, opts); - - for (auto k : opts) { - // TODO - } -} - -void set_report(cusz::context_t ctx, const char* in_str) -{ - str_list opts; - StrHelper::parse_strlist(in_str, opts); - - for (auto o : opts) { - if (StrHelper::is_kv_pair(o)) { - auto kv = StrHelper::parse_kv_onoff(o); - - if (kv.first == "cr") - ctx->report.cr = kv.second; - else if (kv.first == "compressibility") - ctx->report.compressibility = kv.second; - else if (kv.first == "time") - ctx->report.time = kv.second; - } - else { - if (o == "cr") - ctx->report.cr = true; - else if (o == "compressibility") - ctx->report.compressibility = true; - else if (o == "time") - ctx->report.time = true; - } - } -} - -void set_config(cusz::context_t ctx, const char* in_str, bool dbg_print = false) -{ - map_t opts; - StrHelper::parse_strlist_as_kv(in_str, opts); - - if (dbg_print) { - for (auto kv : opts) printf("%-*s %-s\n", 10, kv.first.c_str(), kv.second.c_str()); - std::cout << "\n"; - } - - std::string k, v; - char* end; - - auto optmatch = [&](std::vector vs) -> bool { return ConfigHelper::check_opt_in_list(k, vs); }; - auto is_enabled = [&](auto& v) -> bool { return v == "on" or v == "ON"; }; - - for (auto kv : opts) { - k = kv.first; - v = kv.second; - - if (optmatch({"type", "dtype"})) { - ConfigHelper::check_dtype(v, false); - ctx->dtype = v; - } - else if (optmatch({"eb", "errorbound"})) { - ctx->eb = StrHelper::str2fp(v); - } - else if (optmatch({"mode"})) { - ConfigHelper::check_cuszmode(v, true); - ctx->mode = v; - } - else if (optmatch({"len", "length"})) { - cuszCTX::parse_input_length(v.c_str(), ctx); - } - else if (optmatch({"alloclen"})) { - ctx->alloclen.len = StrHelper::str2int(v); - } - else if (optmatch({"demo"})) { - ctx->use.predefined_demo = true; - ctx->demo_dataset = std::string(v); - ctx->load_demo_sizes(); - } - else if (optmatch({"cap", "booklen", "dictsize"})) { - ctx->dict_size = StrHelper::str2int(v); - ctx->radius = ctx->dict_size / 2; - } - else if (optmatch({"radius"})) { - ctx->radius = StrHelper::str2int(v); - ctx->dict_size = ctx->radius * 2; - } - else if (optmatch({"huffbyte"})) { - ctx->huff_bytewidth = StrHelper::str2int(v); - ctx->codecs_in_use = ctx->codec_force_fallback() ? 0b11 /*use both*/ : 0b01 /*use 4-byte*/; - } - else if (optmatch({"huffchunk"})) { - ctx->vle_sublen = StrHelper::str2int(v); - ctx->use.autotune_vle_pardeg = false; - } - else if (optmatch({"predictor"})) { - ctx->predictor = std::string(v); - } - else if (optmatch({"codec"})) { - // placeholder - } - else if (optmatch({"spcodec"})) { - // placeholder - } - else if (optmatch({"anchor"}) and is_enabled(v)) { - ctx->use.anchor = true; - } - else if (optmatch({"nondestructive"}) and is_enabled(v)) { - // placeholder - } - else if (optmatch({"failfast"}) and is_enabled(v)) { - // placeholder - } - else if (optmatch({"releaseinput"}) and is_enabled(v)) { - ctx->use.release_input = true; - } - else if (optmatch({"pipeline"})) { - ctx->pipeline = v; - } - else if (optmatch({"density"})) { // refer to `SparseMethodSetup` in `config.hh` - ctx->nz_density = StrHelper::str2fp(v); - ctx->nz_density_factor = 1 / ctx->nz_density; - } - else if (optmatch({"densityfactor"})) { // refer to `SparseMethodSetup` in `config.hh` - ctx->nz_density_factor = StrHelper::str2fp(v); - ctx->nz_density = 1 / ctx->nz_density_factor; - } - else if (optmatch({"gpuverify"}) and is_enabled(v)) { - ctx->use.gpu_verify = true; - } - - // when to enable anchor - if (ctx->predictor == "spline3") { - // unconditionally use anchor when it is spline3 - ctx->use.anchor = true; - } - } -} - -void set_from_cli_input(cusz::context_t ctx, int const argc, char** const argv) -{ - int i = 1; - - auto check_next = [&]() { - if (i + 1 >= argc) throw std::runtime_error("out-of-range at" + std::string(argv[i])); - }; - - std::string opt; - auto optmatch = [&](std::vector vs) -> bool { return ConfigHelper::check_opt_in_list(opt, vs); }; - - while (i < argc) { - if (argv[i][0] == '-') { - opt = std::string(argv[i]); - - if (optmatch({"-c", "--config"})) { - check_next(); - set_config(ctx, argv[++i]); - } - else if (optmatch({"-R", "--report"})) { - check_next(); - set_report(ctx, argv[++i]); - } - else if (optmatch({"-h", "--help"})) { - cusz::Context::print_doc(true); - exit(0); - } - else if (optmatch({"-v", "--version"})) { - std::cout << ">>>> cusz build: " << cusz::VERSION_TEXT << "\n"; - exit(0); - } - else if (optmatch({"-m", "--mode"})) { - check_next(); - ctx->mode = std::string(argv[++i]); - if (ctx->mode == "r2r") ctx->preprocess.prescan = true; - } - else if (optmatch({"-e", "--eb", "--error-bound"})) { - check_next(); - char* end; - ctx->eb = std::strtod(argv[++i], &end); - } - else if (optmatch({"-p", "--predictor"})) { - check_next(); - ctx->predictor = std::string(argv[++i]); - } - else if (optmatch({"-c", "--codec"})) { - check_next(); - // placeholder - } - else if (optmatch({"-s", "--spcodec"})) { - check_next(); - // placeholder - } - else if (optmatch({"-t", "--type", "--dtype"})) { - check_next(); - std::string s = std::string(std::string(argv[++i])); - if (s == "f32" or s == "fp4") - ctx->dtype = "f32"; - else if (s == "f64" or s == "fp8") - ctx->dtype = "f64"; - } - else if (optmatch({"-i", "--input"})) { - check_next(); - ctx->fname.fname = std::string(argv[++i]); - } - else if (optmatch({"-l", "--len"})) { - check_next(); - cusz::Context::parse_input_length(argv[++i], ctx); - } - else if (optmatch({"-L", "--allocation-len"})) { - check_next(); - // placeholder - } - else if (optmatch({"-z", "--zip", "--compress"})) { - ctx->cli_task.construct = true; - } - else if (optmatch({"-x", "--unzip", "--decompress"})) { - ctx->cli_task.reconstruct = true; - } - else if (optmatch({"-r", "--dry-run"})) { - ctx->cli_task.dryrun = true; - } - else if (optmatch({"--anchor"})) { - ctx->use.anchor = true; - } - else if (optmatch({"--nondestructive", "--input-nondestructive"})) { - // placeholder - } - else if (optmatch({"--failfast"})) { - // placeholder - } - else if (optmatch({"-P", "--pre", "--preprocess"})) { - check_next(); - std::string pre(argv[++i]); - if (pre.find("binning") != std::string::npos) { ctx->preprocess.binning = true; } - } - else if (optmatch({"-T", "--post", "--postprocess"})) { - check_next(); - std::string post(argv[++i]); - if (post.find("gzip") != std::string::npos) { ctx->postcompress.cpu_gzip = true; } - if (post.find("nvcomp") != std::string::npos) { ctx->postcompress.gpu_nvcomp_cascade = true; } - } - else if (optmatch({"-V", "--verbose"})) { - ctx->verbose = true; - } - else if (optmatch({"--pipeline"})) { - check_next(); - ctx->pipeline = std::string(argv[++i]); - } - else if (optmatch({"--demo"})) { - check_next(); - ctx->use.predefined_demo = true; - ctx->demo_dataset = std::string(argv[++i]); - ctx->load_demo_sizes(); - } - else if (optmatch({"-S", "-X", "--skip", "--exclude"})) { - check_next(); - std::string exclude(argv[++i]); - if (exclude.find("huffman") != std::string::npos) { ctx->skip.huffman = true; } - if (exclude.find("write2disk") != std::string::npos) { ctx->skip.write2disk = true; } - } - else if (optmatch({"--opath"})) { - check_next(); - ctx->opath = std::string(argv[++i]); - } - else if (optmatch({"--origin", "--compare"})) { - check_next(); - ctx->fname.origin_cmp = std::string(argv[++i]); - } - else { - const char* notif_prefix = "invalid option value at position "; - char* notif; - int size = asprintf(¬if, "%d: %s", i, argv[i]); - cerr << LOG_ERR << notif_prefix << "\e[1m" << notif << "\e[0m" - << "\n"; - cerr << std::string(LOG_NULL.length() + strlen(notif_prefix), ' '); - cerr << "\e[1m"; - cerr << std::string(strlen(notif), '~'); - cerr << "\e[0m\n"; - - ctx->trap(-1); - } - } - else { - const char* notif_prefix = "invalid option at position "; - char* notif; - int size = asprintf(¬if, "%d: %s", i, argv[i]); - cerr << LOG_ERR << notif_prefix << "\e[1m" << notif - << "\e[0m" - "\n" - << std::string(LOG_NULL.length() + strlen(notif_prefix), ' ') // - << "\e[1m" // - << std::string(strlen(notif), '~') // - << "\e[0m\n"; - - ctx->trap(-1); - } - i++; - } -} - -} // namespace - -cuszCTX& cuszCTX::set_control_string(const char* in_str) -{ - set_config(this, in_str); - return *this; -} - -void cuszCTX::load_demo_sizes() -{ - const std::unordered_map> dataset_entries = { - {std::string("hacc"), {280953867, 1, 1, 1, 1}}, {std::string("hacc1b"), {1073726487, 1, 1, 1, 1}}, - {std::string("cesm"), {3600, 1800, 1, 1, 2}}, {std::string("hurricane"), {500, 500, 100, 1, 3}}, - {std::string("nyx-s"), {512, 512, 512, 1, 3}}, {std::string("nyx-m"), {1024, 1024, 1024, 1, 3}}, - {std::string("qmc"), {288, 69, 7935, 1, 3}}, {std::string("qmcpre"), {69, 69, 33120, 1, 3}}, - {std::string("exafel"), {388, 59200, 1, 1, 2}}, {std::string("rtm"), {235, 849, 849, 1, 3}}, - {std::string("parihaka"), {1168, 1126, 922, 1, 3}}}; - - if (not demo_dataset.empty()) { - auto f = dataset_entries.find(demo_dataset); - if (f == dataset_entries.end()) throw std::runtime_error("no such dataset as" + demo_dataset); - auto demo_xyzw = f->second; - - x = demo_xyzw[0], y = demo_xyzw[1], z = demo_xyzw[2], w = demo_xyzw[3]; - ndim = demo_xyzw[4]; - } - data_len = x * y * z * w; -} - -void cuszCTX::trap(int _status) { this->read_args_status = _status; } - -void cuszCTX::validate() -{ - bool to_abort = false; - if (fname.fname.empty()) { - cerr << LOG_ERR << "must specify input file" << endl; - to_abort = true; - } - - if (data_len == 1 and not use.predefined_demo) { - if (cli_task.construct or cli_task.dryrun) { - cerr << LOG_ERR << "wrong input size" << endl; - to_abort = true; - } - } - if (not cli_task.construct and not cli_task.reconstruct and not cli_task.dryrun) { - cerr << LOG_ERR << "select compress (-z), decompress (-x) or dry-run (-r)" << endl; - to_abort = true; - } - if (false == ConfigHelper::check_dtype(dtype, false)) { - if (cli_task.construct or cli_task.dryrun) { - std::cout << dtype << endl; - cerr << LOG_ERR << "must specify data type" << endl; - to_abort = true; - } - } - - if (quant_bytewidth == 1) - assert(dict_size <= 256); - else if (quant_bytewidth == 2) - assert(dict_size <= 65536); - - if (cli_task.dryrun and cli_task.construct and cli_task.reconstruct) { - cerr << LOG_WARN << "no need to dry-run, compress and decompress at the same time" << endl; - cerr << LOG_WARN << "dryrun only" << endl << endl; - cli_task.construct = false; - cli_task.reconstruct = false; - } - else if (cli_task.dryrun and cli_task.construct) { - cerr << LOG_WARN << "no need to dry-run and compress at the same time" << endl; - cerr << LOG_WARN << "dryrun only" << endl << endl; - cli_task.construct = false; - } - else if (cli_task.dryrun and cli_task.reconstruct) { - cerr << LOG_WARN << "no need to dry-run and decompress at the same time" << endl; - cerr << LOG_WARN << "will dryrun only" << endl << endl; - cli_task.reconstruct = false; - } - - if (to_abort) { - print_doc(); - exit(-1); - } -} - -cuszCTX::cuszCTX(int argc, char** const argv) -{ - std::string opt; - auto optmatch = [&](std::vector vs) -> bool { return ConfigHelper::check_opt_in_list(opt, vs); }; - - if (argc == 1) { - print_doc(); - exit(0); - } - - /******************************************************************************/ - /* phase 0: parse */ - set_from_cli_input(this, argc, argv); - - // special treatment - if (predictor == "spline3") { - // unconditionally use anchor when it is spline3 - use.anchor = true; - } - - /******************************************************************************/ - /* phase 1: check syntax */ - if (read_args_status != 0) { - std::cout << LOG_INFO << "Exiting..." << endl; - // after printing ALL argument errors - exit(-1); - } - - /******************************************************************************/ - /* phase 2: check if legal */ - validate(); - - /******************************************************************************/ - /* phase 3: sort out filenames */ - derive_fnames(); -} - -cuszCTX::cuszCTX(const char* in_str, bool dbg_print) -{ - /** - ** >>> syntax - ** comma-separated key-pairs - ** "key1=val1,key2=val2[,...]" - ** - ** >>> example - ** "predictor=lorenzo,size=3600x1800" - ** - **/ - - set_config(this, in_str, dbg_print); -} - -void cuszCTX::print_doc(bool full) -{ - std::cout << "\n>>>> cusz build: " << cusz::VERSION_TEXT << "\n"; - - if (full) - std::cout << StrHelper::doc_format(cusz_full_doc) << std::endl; - else - std::cout << cusz_short_doc << std::endl; -} - -void cuszCTX::derive_fnames() -{ - // (1) "fname" -> "", "fname" - // (2) "./fname" -> "./" "fname" - // (3) "/path/to/fname" -> "/path/to", "fname" - auto input_path = fname.fname.substr(0, fname.fname.rfind('/') + 1); - if (not cli_task.construct and cli_task.reconstruct) fname.fname = fname.fname.substr(0, fname.fname.rfind('.')); - fname.basename = fname.fname.substr(fname.fname.rfind('/') + 1); - - if (opath.empty()) opath = input_path.empty() ? opath = "" : opath = input_path; - opath += "/"; - - fname.path_basename = opath + fname.basename; - fname.compress_output = fname.path_basename + ".cusza"; -} +/** + * @file argparse.cc + * @author Jiannan Tian + * @brief Argument parser. + * @version 0.1 + * @date 2020-09-20 + * Created on: 20-04-24 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include +#include +#include +#include +#include +#include + +#include "cli/document.hh" +#include "context.hh" + +namespace cusz { +const char* VERSION_TEXT = "2023-01-23 (unstable; pre-0.4)"; +const int VERSION = 20230123; +const int COMPATIBILITY = 0; +} // namespace cusz + +namespace { + +void set_preprocess(cusz::context_t ctx, const char* in_str) +{ + str_list opts; + StrHelper::parse_strlist(in_str, opts); + + for (auto k : opts) { + // TODO + } +} + +void set_report(cusz::context_t ctx, const char* in_str) +{ + str_list opts; + StrHelper::parse_strlist(in_str, opts); + + for (auto o : opts) { + if (StrHelper::is_kv_pair(o)) { + auto kv = StrHelper::parse_kv_onoff(o); + + if (kv.first == "cr") + ctx->report.cr = kv.second; + else if (kv.first == "compressibility") + ctx->report.compressibility = kv.second; + else if (kv.first == "time") + ctx->report.time = kv.second; + } + else { + if (o == "cr") + ctx->report.cr = true; + else if (o == "compressibility") + ctx->report.compressibility = true; + else if (o == "time") + ctx->report.time = true; + } + } +} + +void set_config(cusz::context_t ctx, const char* in_str, bool dbg_print = false) +{ + map_t opts; + StrHelper::parse_strlist_as_kv(in_str, opts); + + if (dbg_print) { + for (auto kv : opts) printf("%-*s %-s\n", 10, kv.first.c_str(), kv.second.c_str()); + std::cout << "\n"; + } + + std::string k, v; + char* end; + + auto optmatch = [&](std::vector vs) -> bool { return ConfigHelper::check_opt_in_list(k, vs); }; + auto is_enabled = [&](auto& v) -> bool { return v == "on" or v == "ON"; }; + + for (auto kv : opts) { + k = kv.first; + v = kv.second; + + if (optmatch({"type", "dtype"})) { + ConfigHelper::check_dtype(v, false); + ctx->dtype = v; + } + else if (optmatch({"eb", "errorbound"})) { + ctx->eb = StrHelper::str2fp(v); + } + else if (optmatch({"mode"})) { + ConfigHelper::check_cuszmode(v, true); + ctx->mode = v; + } + else if (optmatch({"len", "length"})) { + cuszCTX::parse_input_length(v.c_str(), ctx); + } + else if (optmatch({"alloclen"})) { + ctx->alloclen.len = StrHelper::str2int(v); + } + else if (optmatch({"demo"})) { + ctx->use.predefined_demo = true; + ctx->demo_dataset = std::string(v); + ctx->load_demo_sizes(); + } + else if (optmatch({"cap", "booklen", "dictsize"})) { + ctx->dict_size = StrHelper::str2int(v); + ctx->radius = ctx->dict_size / 2; + } + else if (optmatch({"radius"})) { + ctx->radius = StrHelper::str2int(v); + ctx->dict_size = ctx->radius * 2; + } + else if (optmatch({"huffbyte"})) { + ctx->huff_bytewidth = StrHelper::str2int(v); + ctx->codecs_in_use = ctx->codec_force_fallback() ? 0b11 /*use both*/ : 0b01 /*use 4-byte*/; + } + else if (optmatch({"huffchunk"})) { + ctx->vle_sublen = StrHelper::str2int(v); + ctx->use.autotune_vle_pardeg = false; + } + else if (optmatch({"predictor"})) { + ctx->predictor = std::string(v); + } + else if (optmatch({"codec"})) { + // placeholder + } + else if (optmatch({"spcodec"})) { + // placeholder + } + else if (optmatch({"anchor"}) and is_enabled(v)) { + ctx->use.anchor = true; + } + else if (optmatch({"nondestructive"}) and is_enabled(v)) { + // placeholder + } + else if (optmatch({"failfast"}) and is_enabled(v)) { + // placeholder + } + else if (optmatch({"releaseinput"}) and is_enabled(v)) { + ctx->use.release_input = true; + } + else if (optmatch({"pipeline"})) { + ctx->pipeline = v; + } + else if (optmatch({"density"})) { // refer to `SparseMethodSetup` in `config.hh` + ctx->nz_density = StrHelper::str2fp(v); + ctx->nz_density_factor = 1 / ctx->nz_density; + } + else if (optmatch({"densityfactor"})) { // refer to `SparseMethodSetup` in `config.hh` + ctx->nz_density_factor = StrHelper::str2fp(v); + ctx->nz_density = 1 / ctx->nz_density_factor; + } + else if (optmatch({"gpuverify"}) and is_enabled(v)) { + ctx->use.gpu_verify = true; + } + + // when to enable anchor + if (ctx->predictor == "spline3") { + // unconditionally use anchor when it is spline3 + ctx->use.anchor = true; + } + } +} + +void set_from_cli_input(cusz::context_t ctx, int const argc, char** const argv) +{ + int i = 1; + + auto check_next = [&]() { + if (i + 1 >= argc) throw std::runtime_error("out-of-range at" + std::string(argv[i])); + }; + + std::string opt; + auto optmatch = [&](std::vector vs) -> bool { return ConfigHelper::check_opt_in_list(opt, vs); }; + + while (i < argc) { + if (argv[i][0] == '-') { + opt = std::string(argv[i]); + + if (optmatch({"-c", "--config"})) { + check_next(); + set_config(ctx, argv[++i]); + } + else if (optmatch({"-R", "--report"})) { + check_next(); + set_report(ctx, argv[++i]); + } + else if (optmatch({"-h", "--help"})) { + cusz::Context::print_doc(true); + exit(0); + } + else if (optmatch({"-v", "--version"})) { + std::cout << ">>>> cusz build: " << cusz::VERSION_TEXT << "\n"; + exit(0); + } + else if (optmatch({"-m", "--mode"})) { + check_next(); + ctx->mode = std::string(argv[++i]); + if (ctx->mode == "r2r") ctx->preprocess.prescan = true; + } + else if (optmatch({"-e", "--eb", "--error-bound"})) { + check_next(); + char* end; + ctx->eb = std::strtod(argv[++i], &end); + } + else if (optmatch({"-p", "--predictor"})) { + check_next(); + ctx->predictor = std::string(argv[++i]); + } + else if (optmatch({"-c", "--codec"})) { + check_next(); + // placeholder + } + else if (optmatch({"-s", "--spcodec"})) { + check_next(); + // placeholder + } + else if (optmatch({"-t", "--type", "--dtype"})) { + check_next(); + std::string s = std::string(std::string(argv[++i])); + if (s == "f32" or s == "fp4") + ctx->dtype = "f32"; + else if (s == "f64" or s == "fp8") + ctx->dtype = "f64"; + } + else if (optmatch({"-i", "--input"})) { + check_next(); + ctx->fname.fname = std::string(argv[++i]); + } + else if (optmatch({"-l", "--len"})) { + check_next(); + cusz::Context::parse_input_length(argv[++i], ctx); + } + else if (optmatch({"-L", "--allocation-len"})) { + check_next(); + // placeholder + } + else if (optmatch({"-z", "--zip", "--compress"})) { + ctx->cli_task.construct = true; + } + else if (optmatch({"-x", "--unzip", "--decompress"})) { + ctx->cli_task.reconstruct = true; + } + else if (optmatch({"-r", "--dry-run"})) { + ctx->cli_task.dryrun = true; + } + else if (optmatch({"--anchor"})) { + ctx->use.anchor = true; + } + else if (optmatch({"--nondestructive", "--input-nondestructive"})) { + // placeholder + } + else if (optmatch({"--failfast"})) { + // placeholder + } + else if (optmatch({"-P", "--pre", "--preprocess"})) { + check_next(); + std::string pre(argv[++i]); + if (pre.find("binning") != std::string::npos) { ctx->preprocess.binning = true; } + } + else if (optmatch({"-T", "--post", "--postprocess"})) { + check_next(); + std::string post(argv[++i]); + if (post.find("gzip") != std::string::npos) { ctx->postcompress.cpu_gzip = true; } + if (post.find("nvcomp") != std::string::npos) { ctx->postcompress.gpu_nvcomp_cascade = true; } + } + else if (optmatch({"-V", "--verbose"})) { + ctx->verbose = true; + } + else if (optmatch({"--pipeline"})) { + check_next(); + ctx->pipeline = std::string(argv[++i]); + } + else if (optmatch({"--demo"})) { + check_next(); + ctx->use.predefined_demo = true; + ctx->demo_dataset = std::string(argv[++i]); + ctx->load_demo_sizes(); + } + else if (optmatch({"-S", "-X", "--skip", "--exclude"})) { + check_next(); + std::string exclude(argv[++i]); + if (exclude.find("huffman") != std::string::npos) { ctx->skip.huffman = true; } + if (exclude.find("write2disk") != std::string::npos) { ctx->skip.write2disk = true; } + } + else if (optmatch({"--opath"})) { + check_next(); + ctx->opath = std::string(argv[++i]); + } + else if (optmatch({"--origin", "--compare"})) { + check_next(); + ctx->fname.origin_cmp = std::string(argv[++i]); + } + else { + const char* notif_prefix = "invalid option value at position "; + char* notif; + int size = asprintf(¬if, "%d: %s", i, argv[i]); + cerr << LOG_ERR << notif_prefix << "\e[1m" << notif << "\e[0m" + << "\n"; + cerr << std::string(LOG_NULL.length() + strlen(notif_prefix), ' '); + cerr << "\e[1m"; + cerr << std::string(strlen(notif), '~'); + cerr << "\e[0m\n"; + + ctx->trap(-1); + } + } + else { + const char* notif_prefix = "invalid option at position "; + char* notif; + int size = asprintf(¬if, "%d: %s", i, argv[i]); + cerr << LOG_ERR << notif_prefix << "\e[1m" << notif + << "\e[0m" + "\n" + << std::string(LOG_NULL.length() + strlen(notif_prefix), ' ') // + << "\e[1m" // + << std::string(strlen(notif), '~') // + << "\e[0m\n"; + + ctx->trap(-1); + } + i++; + } +} + +} // namespace + +cuszCTX& cuszCTX::set_control_string(const char* in_str) +{ + set_config(this, in_str); + return *this; +} + +void cuszCTX::load_demo_sizes() +{ + const std::unordered_map> dataset_entries = { + {std::string("hacc"), {280953867, 1, 1, 1, 1}}, {std::string("hacc1b"), {1073726487, 1, 1, 1, 1}}, + {std::string("cesm"), {3600, 1800, 1, 1, 2}}, {std::string("hurricane"), {500, 500, 100, 1, 3}}, + {std::string("nyx-s"), {512, 512, 512, 1, 3}}, {std::string("nyx-m"), {1024, 1024, 1024, 1, 3}}, + {std::string("qmc"), {288, 69, 7935, 1, 3}}, {std::string("qmcpre"), {69, 69, 33120, 1, 3}}, + {std::string("exafel"), {388, 59200, 1, 1, 2}}, {std::string("rtm"), {235, 849, 849, 1, 3}}, + {std::string("parihaka"), {1168, 1126, 922, 1, 3}}}; + + if (not demo_dataset.empty()) { + auto f = dataset_entries.find(demo_dataset); + if (f == dataset_entries.end()) throw std::runtime_error("no such dataset as" + demo_dataset); + auto demo_xyzw = f->second; + + x = demo_xyzw[0], y = demo_xyzw[1], z = demo_xyzw[2], w = demo_xyzw[3]; + ndim = demo_xyzw[4]; + } + data_len = x * y * z * w; +} + +void cuszCTX::trap(int _status) { this->read_args_status = _status; } + +void cuszCTX::validate() +{ + bool to_abort = false; + if (fname.fname.empty()) { + cerr << LOG_ERR << "must specify input file" << endl; + to_abort = true; + } + + if (data_len == 1 and not use.predefined_demo) { + if (cli_task.construct or cli_task.dryrun) { + cerr << LOG_ERR << "wrong input size" << endl; + to_abort = true; + } + } + if (not cli_task.construct and not cli_task.reconstruct and not cli_task.dryrun) { + cerr << LOG_ERR << "select compress (-z), decompress (-x) or dry-run (-r)" << endl; + to_abort = true; + } + if (false == ConfigHelper::check_dtype(dtype, false)) { + if (cli_task.construct or cli_task.dryrun) { + std::cout << dtype << endl; + cerr << LOG_ERR << "must specify data type" << endl; + to_abort = true; + } + } + + if (quant_bytewidth == 1) + assert(dict_size <= 256); + else if (quant_bytewidth == 2) + assert(dict_size <= 65536); + + if (cli_task.dryrun and cli_task.construct and cli_task.reconstruct) { + cerr << LOG_WARN << "no need to dry-run, compress and decompress at the same time" << endl; + cerr << LOG_WARN << "dryrun only" << endl << endl; + cli_task.construct = false; + cli_task.reconstruct = false; + } + else if (cli_task.dryrun and cli_task.construct) { + cerr << LOG_WARN << "no need to dry-run and compress at the same time" << endl; + cerr << LOG_WARN << "dryrun only" << endl << endl; + cli_task.construct = false; + } + else if (cli_task.dryrun and cli_task.reconstruct) { + cerr << LOG_WARN << "no need to dry-run and decompress at the same time" << endl; + cerr << LOG_WARN << "will dryrun only" << endl << endl; + cli_task.reconstruct = false; + } + + if (to_abort) { + print_doc(); + exit(-1); + } +} + +cuszCTX::cuszCTX(int argc, char** const argv) +{ + std::string opt; + auto optmatch = [&](std::vector vs) -> bool { return ConfigHelper::check_opt_in_list(opt, vs); }; + + if (argc == 1) { + print_doc(); + exit(0); + } + + /******************************************************************************/ + /* phase 0: parse */ + set_from_cli_input(this, argc, argv); + + // special treatment + if (predictor == "spline3") { + // unconditionally use anchor when it is spline3 + use.anchor = true; + } + + /******************************************************************************/ + /* phase 1: check syntax */ + if (read_args_status != 0) { + std::cout << LOG_INFO << "Exiting..." << endl; + // after printing ALL argument errors + exit(-1); + } + + /******************************************************************************/ + /* phase 2: check if legal */ + validate(); + + /******************************************************************************/ + /* phase 3: sort out filenames */ + derive_fnames(); +} + +cuszCTX::cuszCTX(const char* in_str, bool dbg_print) +{ + /** + ** >>> syntax + ** comma-separated key-pairs + ** "key1=val1,key2=val2[,...]" + ** + ** >>> example + ** "predictor=lorenzo,size=3600x1800" + ** + **/ + + set_config(this, in_str, dbg_print); +} + +void cuszCTX::print_doc(bool full) +{ + std::cout << "\n>>>> cusz build: " << cusz::VERSION_TEXT << "\n"; + + if (full) + std::cout << StrHelper::doc_format(cusz_full_doc) << std::endl; + else + std::cout << cusz_short_doc << std::endl; +} + +void cuszCTX::derive_fnames() +{ + // (1) "fname" -> "", "fname" + // (2) "./fname" -> "./" "fname" + // (3) "/path/to/fname" -> "/path/to", "fname" + auto input_path = fname.fname.substr(0, fname.fname.rfind('/') + 1); + if (not cli_task.construct and cli_task.reconstruct) fname.fname = fname.fname.substr(0, fname.fname.rfind('.')); + fname.basename = fname.fname.substr(fname.fname.rfind('/') + 1); + + if (opath.empty()) opath = input_path.empty() ? opath = "" : opath = input_path; + opath += "/"; + + fname.path_basename = opath + fname.basename; + fname.compress_output = fname.path_basename + ".cusza"; +} diff --git a/qtensor/compression/cusz/src/cusz/custom.cc b/qtensor/compression/cusz/src/cusz/custom.cc index ad9eff89..6717e842 100644 --- a/qtensor/compression/cusz/src/cusz/custom.cc +++ b/qtensor/compression/cusz/src/cusz/custom.cc @@ -1,34 +1,34 @@ -/** - * @file custom.cc - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-04-30 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#include "cusz/custom.h" - -extern "C" { - -cusz_custom_predictor cusz_default_predictor() { return {LorenzoI, false, false}; } -cusz_custom_quantization cusz_default_quantization() { return {512, false}; } -cusz_custom_codec cusz_default_codec() { return {Huffman, true, 0.5}; } -cusz_custom_huffman_codec cusz_default_huffman_codec() { return {Canonical, Device, Coarse, 1024, 768}; } -cusz_custom_spcodec cusz_default_spcodec() { return {SparseMat, 0.2}; } -cusz_custom_framework* cusz_default_framework() -{ - return new cusz_custom_framework{ - FP32, // placeholder; set in another function call - Auto, cusz_default_predictor(), cusz_default_quantization(), cusz_default_codec(), - // cusz_default_spcodec(), - cusz_default_huffman_codec()}; -} - -void cusz_set_datatype(cusz_custom_framework* config, cusz_datatype datatype) { config->datatype = datatype; } -void cusz_set_pipelinetype(cusz_custom_framework* config, cusz_pipelinetype pipeline) { config->pipeline = pipeline; } - -// end of extern C -} +/** + * @file custom.cc + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-30 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#include "cusz/custom.h" + +extern "C" { + +cusz_custom_predictor cusz_default_predictor() { return {LorenzoI, false, false}; } +cusz_custom_quantization cusz_default_quantization() { return {512, false}; } +cusz_custom_codec cusz_default_codec() { return {Huffman, true, 0.5}; } +cusz_custom_huffman_codec cusz_default_huffman_codec() { return {Canonical, Device, Coarse, 1024, 768}; } +cusz_custom_spcodec cusz_default_spcodec() { return {SparseMat, 0.2}; } +cusz_custom_framework* cusz_default_framework() +{ + return new cusz_custom_framework{ + FP32, // placeholder; set in another function call + Auto, cusz_default_predictor(), cusz_default_quantization(), cusz_default_codec(), + // cusz_default_spcodec(), + cusz_default_huffman_codec()}; +} + +void cusz_set_datatype(cusz_custom_framework* config, cusz_datatype datatype) { config->datatype = datatype; } +void cusz_set_pipelinetype(cusz_custom_framework* config, cusz_pipelinetype pipeline) { config->pipeline = pipeline; } + +// end of extern C +} diff --git a/qtensor/compression/cusz/src/cusz_lib.cc b/qtensor/compression/cusz/src/cusz_lib.cc index 723b80b1..d6bad3c6 100644 --- a/qtensor/compression/cusz/src/cusz_lib.cc +++ b/qtensor/compression/cusz/src/cusz_lib.cc @@ -1,115 +1,115 @@ -/** - * @file cusz_lib.cc - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-05-01 - * (rev.1) 2023-01-29 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#include - -#include -#include - -#include "component.hh" -#include "compressor.hh" -#include "context.hh" -#include "cusz.h" -#include "cusz/custom.h" -#include "cusz/type.h" -#include "framework.hh" - -cusz_compressor* cusz_create(cusz_framework* _framework, cusz_datatype _type) -{ - auto comp = new cusz_compressor{.framework = _framework, .type = _type}; - - if (comp->type == FP32) { - using DATA = float; - using Compressor = cusz::CompressorFP32; - - comp->compressor = new Compressor(); - } - else { - throw std::runtime_error("Type is not supported."); - } - - return comp; -} - -cusz_error_status cusz_release(cusz_compressor* comp) -{ - delete comp; - return CUSZ_SUCCESS; -} - -cusz_error_status cusz_compress( - cusz_compressor* comp, - cusz_config* config, - void* uncompressed, - cusz_len const uncomp_len, - uint8_t** compressed, - size_t* comp_bytes, - cusz_header* header, - void* record, - cudaStream_t stream) -{ - // cusz::TimeRecord cpp_record; - - auto context = new cusz_context(); - (*context) - .set_len(uncomp_len.x, uncomp_len.y, uncomp_len.z, uncomp_len.w) - .set_eb(config->eb) - .set_control_string(config->eb == Rel ? "mode=r2r" : "mode=abs"); - - // Be cautious of autotuning! The default value of pardeg is not robust. - cusz::CompressorHelper::autotune_coarse_parvle(static_cast(context)); - - if (comp->type == FP32) { - using DATA = float; - using Compressor = cusz::CompressorFP32; - - // TODO add memlen & datalen comparison - static_cast(comp->compressor)->init(context); - static_cast(comp->compressor) - ->compress(context, static_cast(uncompressed), *compressed, *comp_bytes, stream); - static_cast(comp->compressor)->export_header(*header); - static_cast(comp->compressor)->export_timerecord((cusz::TimeRecord*)record); - } - else { - throw std::runtime_error(std::string(__FUNCTION__) + ": Type is not supported."); - } - - return CUSZ_SUCCESS; -} - -cusz_error_status cusz_decompress( - cusz_compressor* comp, - cusz_header* header, - uint8_t* compressed, - size_t const comp_len, - void* decompressed, - cusz_len const decomp_len, - void* record, - cudaStream_t stream) -{ - // cusz::TimeRecord cpp_record; - - if (comp->type == FP32) { - using DATA = float; - using Compressor = cusz::CompressorFP32; - - static_cast(comp->compressor)->init(header); - static_cast(comp->compressor) - ->decompress(header, compressed, static_cast(decompressed), stream); - static_cast(comp->compressor)->export_timerecord((cusz::TimeRecord*)record); - } - else { - throw std::runtime_error(std::string(__FUNCTION__) + ": Type is not supported."); - } - - return CUSZ_SUCCESS; +/** + * @file cusz_lib.cc + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-05-01 + * (rev.1) 2023-01-29 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#include + +#include +#include + +#include "component.hh" +#include "compressor.hh" +#include "context.hh" +#include "cusz.h" +#include "cusz/custom.h" +#include "cusz/type.h" +#include "framework.hh" + +cusz_compressor* cusz_create(cusz_framework* _framework, cusz_datatype _type) +{ + auto comp = new cusz_compressor{.framework = _framework, .type = _type}; + + if (comp->type == FP32) { + using DATA = float; + using Compressor = cusz::CompressorFP32; + + comp->compressor = new Compressor(); + } + else { + throw std::runtime_error("Type is not supported."); + } + + return comp; +} + +cusz_error_status cusz_release(cusz_compressor* comp) +{ + delete comp; + return CUSZ_SUCCESS; +} + +cusz_error_status cusz_compress( + cusz_compressor* comp, + cusz_config* config, + void* uncompressed, + cusz_len const uncomp_len, + uint8_t** compressed, + size_t* comp_bytes, + cusz_header* header, + void* record, + cudaStream_t stream) +{ + // cusz::TimeRecord cpp_record; + + auto context = new cusz_context(); + (*context) + .set_len(uncomp_len.x, uncomp_len.y, uncomp_len.z, uncomp_len.w) + .set_eb(config->eb) + .set_control_string(config->eb == Rel ? "mode=r2r" : "mode=abs"); + + // Be cautious of autotuning! The default value of pardeg is not robust. + cusz::CompressorHelper::autotune_coarse_parvle(static_cast(context)); + + if (comp->type == FP32) { + using DATA = float; + using Compressor = cusz::CompressorFP32; + + // TODO add memlen & datalen comparison + static_cast(comp->compressor)->init(context); + static_cast(comp->compressor) + ->compress(context, static_cast(uncompressed), *compressed, *comp_bytes, stream); + static_cast(comp->compressor)->export_header(*header); + static_cast(comp->compressor)->export_timerecord((cusz::TimeRecord*)record); + } + else { + throw std::runtime_error(std::string(__FUNCTION__) + ": Type is not supported."); + } + + return CUSZ_SUCCESS; +} + +cusz_error_status cusz_decompress( + cusz_compressor* comp, + cusz_header* header, + uint8_t* compressed, + size_t const comp_len, + void* decompressed, + cusz_len const decomp_len, + void* record, + cudaStream_t stream) +{ + // cusz::TimeRecord cpp_record; + + if (comp->type == FP32) { + using DATA = float; + using Compressor = cusz::CompressorFP32; + + static_cast(comp->compressor)->init(header); + static_cast(comp->compressor) + ->decompress(header, compressed, static_cast(decompressed), stream); + static_cast(comp->compressor)->export_timerecord((cusz::TimeRecord*)record); + } + else { + throw std::runtime_error(std::string(__FUNCTION__) + ": Type is not supported."); + } + + return CUSZ_SUCCESS; } \ No newline at end of file diff --git a/qtensor/compression/cusz/src/cusz_version.h.in b/qtensor/compression/cusz/src/cusz_version.h.in index 1bd3344f..09a2d3d7 100644 --- a/qtensor/compression/cusz/src/cusz_version.h.in +++ b/qtensor/compression/cusz/src/cusz_version.h.in @@ -1,3 +1,3 @@ -#define CUSZ_MAJOR_VERSION @PROJECT_VERSION_MAJOR@ -#define CUSZ_MINOR_VERSION @PROJECT_VERSION_MINOR@ -#define CUSZ_PATCH_VERSION @PROJECT_VERSION_PATCH@ +#define CUSZ_MAJOR_VERSION @PROJECT_VERSION_MAJOR@ +#define CUSZ_MINOR_VERSION @PROJECT_VERSION_MINOR@ +#define CUSZ_PATCH_VERSION @PROJECT_VERSION_PATCH@ diff --git a/qtensor/compression/cusz/src/cusz_wrapper.cu b/qtensor/compression/cusz/src/cusz_wrapper.cu index 2827123d..a9b1f760 100644 --- a/qtensor/compression/cusz/src/cusz_wrapper.cu +++ b/qtensor/compression/cusz/src/cusz_wrapper.cu @@ -1,154 +1,154 @@ -//#include "cuszx_entry.h" -//#include "szx_defines.h" -//#include "szx_BytesToolkit.h" -//#include "szx_TypeManager.h" -//#include "timingGPU.h" - -#include "cusz.h" -#include "cli/quality_viewer.hh" -#include "cli/timerecord_viewer.hh" -#include "utils/io.hh" -#include "utils/print_gpu.hh" - -// template -extern "C"{ -unsigned char* cusz_device_compress(float *data, float r2r_error,size_t len,size_t *outSize) -{ - /* For demo, we use 3600x1800 CESM data. */ - - cusz_header header; - uint8_t* exposed_compressed; - uint8_t* compressed; - size_t compressed_len; - - float *d_uncompressed, *h_uncompressed; - float *d_decompressed, *h_decompressed; - - d_uncompressed = data; - - cudaStream_t stream; - cudaStreamCreate(&stream); - - // using default - // cusz_framework* framework = cusz_default_framework(); - // alternatively - cusz_framework fw = cusz_framework{ - .pipeline = Auto, - .predictor = cusz_custom_predictor{.type = LorenzoI}, - .quantization = cusz_custom_quantization{.radius = 512}, - .codec = cusz_custom_codec{.type = Huffman}}; - cusz_framework* framework = &fw; - - // Brace initializing a struct pointer is not supported by all host compilers - // when nvcc forwards. - // cusz_framework* framework = new cusz_framework{ - // .pipeline = Auto, - // .predictor = cusz_custom_predictor{.type = LorenzoI}, - // .quantization = cusz_custom_quantization{.radius = 512}, - // .codec = cusz_custom_codec{.type = Huffman}}; - - - cusz_compressor* comp = cusz_create(framework, FP32); - cusz_config* config = new cusz_config{.eb = r2r_error, .mode = Rel}; - cusz_len uncomp_len = cusz_len{len, 1, 1, 1}; // x, y, z, w - cusz_len decomp_len = uncomp_len; - - cusz::TimeRecord compress_timerecord; - - - { - cusz_compress( - comp, config, d_uncompressed, uncomp_len, &exposed_compressed, &compressed_len, &header, - (void*)&compress_timerecord, stream); - - /* User can interpret the collected time information in other ways. */ - cusz::TimeRecordViewer::view_compression(&compress_timerecord, len * sizeof(float), compressed_len); - - /* verify header */ - printf("header.%-*s : %x\n", 12, "(addr)", &header); - printf("header.%-*s : %lu, %lu, %lu\n", 12, "{x,y,z}", header.x, header.y, header.z); - printf("header.%-*s : %lu\n", 12, "filesize", ConfigHelper::get_filesize(&header)); - } - - /* If needed, User should perform a memcopy to transfer `exposed_compressed` before `compressor` is destroyed. */ - cudaMalloc(&compressed, compressed_len); - cudaMemcpy(compressed, exposed_compressed, compressed_len, cudaMemcpyDeviceToDevice); - cudaFree(exposed_compressed); - cudaStreamDestroy(stream); - *outSize = compressed_len; - return compressed; -} - -float* cusz_device_decompress(uint8_t* cmpbytes, size_t len, size_t compressed_len, float r2r_error){ - cusz::TimeRecord decompress_timerecord; - cudaStream_t stream; - cusz_header header; - float* d_decompressed; - cudaMalloc(&d_decompressed, sizeof(float) * len); - - cusz_framework fw = cusz_framework{ - .pipeline = Auto, - .predictor = cusz_custom_predictor{.type = LorenzoI}, - .quantization = cusz_custom_quantization{.radius = 512}, - .codec = cusz_custom_codec{.type = Huffman}}; - cusz_framework* framework = &fw; - - cusz_compressor* comp = cusz_create(framework, FP32); - cusz_config* config = new cusz_config{.eb = r2r_error, .mode = Rel}; - cusz_len uncomp_len = cusz_len{len, 1, 1, 1}; // x, y, z, w - cusz_len decomp_len = uncomp_len; - - - cudaStreamCreate(&stream); - { - cusz_decompress( - comp, &header, cmpbytes, compressed_len, d_decompressed, decomp_len, - (void*)&decompress_timerecord, stream); - - cusz::TimeRecordViewer::view_decompression(&decompress_timerecord, len * sizeof(float)); - } - - - cusz_release(comp); - - // cudaFree(cmpbytes); - cudaStreamDestroy(stream); - return d_decompressed; -} - - - // unsigned char* cuSZx_integrated_compress(float *data, float r2r_threshold, float r2r_err, size_t nbEle, int blockSize, size_t *outSize){ - // float max,min; - // unsigned char* bytes; - // max = data[0]; - // min = data[0]; - // for (size_t i = 0; i < nbEle; i++) - // { - // if(data[i] > max) max = data[i]; - // if(data[i] < min) min = data[i]; - // } - - // float threshold = r2r_threshold*(max-min); - // float errBound = r2r_err*(max-min); - // bytes = cuSZx_fast_compress_args_unpredictable_blocked_float(data, outSize, errBound, nbEle, blockSize, threshold); - // // printf("outSize %p\n", bytes); - // return bytes; - // } - - // float* cuSZx_integrated_decompress(unsigned char *bytes, size_t nbEle){ - // // printf("test\n"); - // float**data; - // cuSZx_fast_decompress_args_unpredictable_blocked_float(data, nbEle, bytes); - // return *data; - // } - - // unsigned char* cuSZx_device_compress(float *oriData, size_t *outSize, float absErrBound, size_t nbEle, int blockSize, float threshold){ - // return device_ptr_cuSZx_compress_float(oriData, outSize, absErrBound, nbEle, blockSize, threshold); - // } - - // float* cuSZx_device_decompress(size_t nbEle, unsigned char* cmpBytes){ - // return device_ptr_cuSZx_decompress_float(nbEle, cmpBytes); - // } - - -} +//#include "cuszx_entry.h" +//#include "szx_defines.h" +//#include "szx_BytesToolkit.h" +//#include "szx_TypeManager.h" +//#include "timingGPU.h" + +#include "cusz.h" +#include "cli/quality_viewer.hh" +#include "cli/timerecord_viewer.hh" +#include "utils/io.hh" +#include "utils/print_gpu.hh" + +// template +extern "C"{ +unsigned char* cusz_device_compress(float *data, float r2r_error,size_t len,size_t *outSize) +{ + /* For demo, we use 3600x1800 CESM data. */ + + cusz_header header; + uint8_t* exposed_compressed; + uint8_t* compressed; + size_t compressed_len; + + float *d_uncompressed, *h_uncompressed; + float *d_decompressed, *h_decompressed; + + d_uncompressed = data; + + cudaStream_t stream; + cudaStreamCreate(&stream); + + // using default + // cusz_framework* framework = cusz_default_framework(); + // alternatively + cusz_framework fw = cusz_framework{ + .pipeline = Auto, + .predictor = cusz_custom_predictor{.type = LorenzoI}, + .quantization = cusz_custom_quantization{.radius = 512}, + .codec = cusz_custom_codec{.type = Huffman}}; + cusz_framework* framework = &fw; + + // Brace initializing a struct pointer is not supported by all host compilers + // when nvcc forwards. + // cusz_framework* framework = new cusz_framework{ + // .pipeline = Auto, + // .predictor = cusz_custom_predictor{.type = LorenzoI}, + // .quantization = cusz_custom_quantization{.radius = 512}, + // .codec = cusz_custom_codec{.type = Huffman}}; + + + cusz_compressor* comp = cusz_create(framework, FP32); + cusz_config* config = new cusz_config{.eb = r2r_error, .mode = Rel}; + cusz_len uncomp_len = cusz_len{len, 1, 1, 1}; // x, y, z, w + cusz_len decomp_len = uncomp_len; + + cusz::TimeRecord compress_timerecord; + + + { + cusz_compress( + comp, config, d_uncompressed, uncomp_len, &exposed_compressed, &compressed_len, &header, + (void*)&compress_timerecord, stream); + + /* User can interpret the collected time information in other ways. */ + cusz::TimeRecordViewer::view_compression(&compress_timerecord, len * sizeof(float), compressed_len); + + /* verify header */ + printf("header.%-*s : %x\n", 12, "(addr)", &header); + printf("header.%-*s : %lu, %lu, %lu\n", 12, "{x,y,z}", header.x, header.y, header.z); + printf("header.%-*s : %lu\n", 12, "filesize", ConfigHelper::get_filesize(&header)); + } + + /* If needed, User should perform a memcopy to transfer `exposed_compressed` before `compressor` is destroyed. */ + cudaMalloc(&compressed, compressed_len); + cudaMemcpy(compressed, exposed_compressed, compressed_len, cudaMemcpyDeviceToDevice); + cudaFree(exposed_compressed); + cudaStreamDestroy(stream); + *outSize = compressed_len; + return compressed; +} + +float* cusz_device_decompress(uint8_t* cmpbytes, size_t len, size_t compressed_len, float r2r_error){ + cusz::TimeRecord decompress_timerecord; + cudaStream_t stream; + cusz_header header; + float* d_decompressed; + cudaMalloc(&d_decompressed, sizeof(float) * len); + + cusz_framework fw = cusz_framework{ + .pipeline = Auto, + .predictor = cusz_custom_predictor{.type = LorenzoI}, + .quantization = cusz_custom_quantization{.radius = 512}, + .codec = cusz_custom_codec{.type = Huffman}}; + cusz_framework* framework = &fw; + + cusz_compressor* comp = cusz_create(framework, FP32); + cusz_config* config = new cusz_config{.eb = r2r_error, .mode = Rel}; + cusz_len uncomp_len = cusz_len{len, 1, 1, 1}; // x, y, z, w + cusz_len decomp_len = uncomp_len; + + + cudaStreamCreate(&stream); + { + cusz_decompress( + comp, &header, cmpbytes, compressed_len, d_decompressed, decomp_len, + (void*)&decompress_timerecord, stream); + + cusz::TimeRecordViewer::view_decompression(&decompress_timerecord, len * sizeof(float)); + } + + + cusz_release(comp); + + // cudaFree(cmpbytes); + cudaStreamDestroy(stream); + return d_decompressed; +} + + + // unsigned char* cuSZx_integrated_compress(float *data, float r2r_threshold, float r2r_err, size_t nbEle, int blockSize, size_t *outSize){ + // float max,min; + // unsigned char* bytes; + // max = data[0]; + // min = data[0]; + // for (size_t i = 0; i < nbEle; i++) + // { + // if(data[i] > max) max = data[i]; + // if(data[i] < min) min = data[i]; + // } + + // float threshold = r2r_threshold*(max-min); + // float errBound = r2r_err*(max-min); + // bytes = cuSZx_fast_compress_args_unpredictable_blocked_float(data, outSize, errBound, nbEle, blockSize, threshold); + // // printf("outSize %p\n", bytes); + // return bytes; + // } + + // float* cuSZx_integrated_decompress(unsigned char *bytes, size_t nbEle){ + // // printf("test\n"); + // float**data; + // cuSZx_fast_decompress_args_unpredictable_blocked_float(data, nbEle, bytes); + // return *data; + // } + + // unsigned char* cuSZx_device_compress(float *oriData, size_t *outSize, float absErrBound, size_t nbEle, int blockSize, float threshold){ + // return device_ptr_cuSZx_compress_float(oriData, outSize, absErrBound, nbEle, blockSize, threshold); + // } + + // float* cuSZx_device_decompress(size_t nbEle, unsigned char* cmpBytes){ + // return device_ptr_cuSZx_decompress_float(nbEle, cmpBytes); + // } + + +} diff --git a/qtensor/compression/cusz/src/cusz_wrapper.py b/qtensor/compression/cusz/src/cusz_wrapper.py index e588c492..682bd3e6 100644 --- a/qtensor/compression/cusz/src/cusz_wrapper.py +++ b/qtensor/compression/cusz/src/cusz_wrapper.py @@ -1,173 +1,173 @@ -import numpy as np -import ctypes -from ctypes import * -import random -from qtensor.tools.lazy_import import cupy as cp -import time -import torch - -from pathlib import Path -LIB_PATH = str(Path(__file__).parent/'libcusz_wrapper.so') -CUSZ_PATH = str(Path(__file__).parent/'libcusz.so') -# unsigned char* cuSZx_integrated_compress(float *data, float r2r_threshold, float r2r_err, size_t nbEle, int blockSize, size_t *outSize) - -# unsigned char* cusz_device_compress(float *data, float r2r_error,size_t len,size_t *outSize) - -def get_device_compress(): - dll_base = ctypes.CDLL(CUSZ_PATH, mode=ctypes.RTLD_GLOBAL) - dll = ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) - func = dll.cusz_device_compress - # Returns: unsigned char *bytes - # Needs: float *data, float r2r_error,size_t len,size_t *outSize - func.argtypes = [POINTER(c_float), c_float, c_size_t, POINTER(c_size_t)] - func.restype = POINTER(c_ubyte) - return func - -# float* cusz_device_decompress(uint8_t* cmpbytes, size_t len, size_t compressed_len, float r2r_error){ - -def get_device_decompress(): - - dll_base = ctypes.CDLL(CUSZ_PATH, mode=ctypes.RTLD_GLOBAL) - dll = ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) - func = dll.cusz_device_decompress - # Returns: float *newData - # Needs: size_t nbEle, unsigned char *cmpBytes - func.argtypes = [POINTER(c_ubyte), c_size_t, c_size_t, c_float] - func.restype = POINTER(c_float) - return func - - -def cusz_device_compress(oriData, absErrBound, nbEle, blockSize,threshold): - __cuszx_device_compress = get_device_compress() - #print(nbEle) - ori_nbEle = nbEle - variable = ctypes.c_size_t(0) - outSize = ctypes.pointer(variable) - - oriData = oriData.flatten() - ori_real = oriData.real - ori_imag = oriData.imag - oriData = cp.concatenate((ori_real, ori_imag)) - #nbEle = len(oriData) - sample = oriData[::2] - #print(nbEle) - d = cp.amax(oriData) - cp.amin(oriData) - #print("max min time (s): " +str(time.time()-v_time)) - d = d.get() - if d.dtype == np.complex64: - #d = min(d.real, d.imag) - d = d.real - # absErrBound = absErrBound*(d) - threshold = threshold*(d) - s_1 = time.time() - #print(cp.get_array_module(oriData)) - truth_values = abs(oriData)<=threshold - oriData[truth_values] = 0.0 - - nbEle = oriData.shape[0] - - - oriData_p = ctypes.cast(oriData.data.ptr, ctypes.POINTER(c_float)) - #print("starting") - # float *data, float r2r_error,size_t len,size_t *outSize - o_bytes = __cuszx_device_compress(oriData_p,np.float32(absErrBound), np.ulonglong(nbEle), outSize) - - - return (o_bytes,outSize.contents.value, absErrBound), outSize - - -def cusz_device_decompress(nbEle, cmpBytes, owner, dtype): - __cuszx_device_decompress=get_device_decompress() - (cmpBytes, cmpsize, err_bound) = cmpBytes - - nbEle_p = ctypes.c_size_t(nbEle) - # uint8_t* cmpbytes, size_t len, size_t compressed_len, float r2r_error - newData = __cuszx_device_decompress(cmpBytes,nbEle_p, ctypes.c_size_t(cmpsize), np.float32(err_bound)) - - # decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, num_elements_eff) - # -- Workaround to convert GPU pointer to int - p_decompressed_ptr = ctypes.addressof(newData) - # cast to int64 pointer - # (effectively converting pointer to pointer to addr to pointer to int64) - p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) - decompressed_int = p_decompressed_int.contents - # -- - pointer_for_free = decompressed_int.value - # self.decompressed_own.append(decompressed_int.value) - mem = cp.cuda.UnownedMemory(decompressed_int.value, nbEle*4, owner, device_id=0) - mem_ptr = cp.cuda.memory.MemoryPointer(mem, 0) - #print("mem ptr") - #print(mem_ptr) - arr = cp.ndarray(shape=(nbEle,), dtype=np.float32, memptr=mem_ptr) - - # res = cp.zeros((nbEle,)) - # ## need to convert newData to cupy - # cp.place(res,bitmap,arr) - - c_res = cp.zeros(int(nbEle/2), np.complex64) - c_res.real = arr[0:int(nbEle/2)] - c_res.imag = arr[int(nbEle/2):] - return (c_res, pointer_for_free) - -### Example of device compress/decompress wrapper usage -class Comp(): - def __init__(self): - self.name = "dummy" - -def free_compressed(ptr): - p_ptr = ctypes.addressof(ptr) - p_int = ctypes.cast(p_ptr, ctypes.POINTER(ctypes.c_uint64)) - decomp_int = p_int.contents - cp.cuda.runtime.free(decomp_int.value) - - -if __name__ == "__main__": - - DATA_SIZE = int(1024) - MAX_D = 10.0 - MIN_D = -10.0 - RANGE = MAX_D - MIN_D - r2r_threshold = 0.002 - r2r_error = 0.0001 - - in_vector = np.fromfile("all_sample.bin", dtype=np.complex64) - #print(np.max(in_vector)) - DATA_SIZE = len(in_vector) - #range_vr = np.max(in_vector)-np.min(in_vector) - #r2r_threshold = r2r_threshold*range_vr - #r2r_error = r2r_error*range_vr - #in_vector = np.zeros((DATA_SIZE,)) - #for i in range(0,int(DATA_SIZE/4)): - # in_vector[i] = 0.0 - #for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): - # in_vector[i] = 5.0 - #for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): - # in_vector[i] = random.uniform(MIN_D, MAX_D) - #for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): - # in_vector[i] = -7.0 - #for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): - # in_vector[i] = 0.001 - - print(DATA_SIZE) - #in_vector = in_vector.astype('float32') - in_vector_gpu = cp.asarray(in_vector) - - # variable = ctypes.c_size_t(0) - # outSize = ctypes.pointer(variable) - for i in range(200): - s_time = time.time() - o_bytes, outSize = cusz_device_compress(in_vector_gpu, r2r_error, DATA_SIZE, 256, r2r_threshold) - print("Time python: "+str(time.time()-s_time)) - print(outSize[0]) - print("Compress Success...starting decompress ") - comp = Comp() - - s_time = time.time() - (d_bytes,ptr )= cusz_device_decompress(DATA_SIZE*2, o_bytes, comp, in_vector_gpu.dtype) - - free_compressed(o_bytes[0]) - cp.cuda.runtime.free(ptr) - print("Time python: "+str(time.time()-s_time)) - #for i in d_bytes: - # print(i) - print("Decompress Success") +import numpy as np +import ctypes +from ctypes import * +import random +from qtensor.tools.lazy_import import cupy as cp +import time +import torch + +from pathlib import Path +LIB_PATH = str(Path(__file__).parent/'libcusz_wrapper.so') +CUSZ_PATH = str(Path(__file__).parent/'libcusz.so') +# unsigned char* cuSZx_integrated_compress(float *data, float r2r_threshold, float r2r_err, size_t nbEle, int blockSize, size_t *outSize) + +# unsigned char* cusz_device_compress(float *data, float r2r_error,size_t len,size_t *outSize) + +def get_device_compress(): + dll_base = ctypes.CDLL(CUSZ_PATH, mode=ctypes.RTLD_GLOBAL) + dll = ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) + func = dll.cusz_device_compress + # Returns: unsigned char *bytes + # Needs: float *data, float r2r_error,size_t len,size_t *outSize + func.argtypes = [POINTER(c_float), c_float, c_size_t, POINTER(c_size_t)] + func.restype = POINTER(c_ubyte) + return func + +# float* cusz_device_decompress(uint8_t* cmpbytes, size_t len, size_t compressed_len, float r2r_error){ + +def get_device_decompress(): + + dll_base = ctypes.CDLL(CUSZ_PATH, mode=ctypes.RTLD_GLOBAL) + dll = ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) + func = dll.cusz_device_decompress + # Returns: float *newData + # Needs: size_t nbEle, unsigned char *cmpBytes + func.argtypes = [POINTER(c_ubyte), c_size_t, c_size_t, c_float] + func.restype = POINTER(c_float) + return func + + +def cusz_device_compress(oriData, absErrBound, nbEle, blockSize,threshold): + __cuszx_device_compress = get_device_compress() + #print(nbEle) + ori_nbEle = nbEle + variable = ctypes.c_size_t(0) + outSize = ctypes.pointer(variable) + + oriData = oriData.flatten() + ori_real = oriData.real + ori_imag = oriData.imag + oriData = cp.concatenate((ori_real, ori_imag)) + #nbEle = len(oriData) + sample = oriData[::2] + #print(nbEle) + d = cp.amax(oriData) - cp.amin(oriData) + #print("max min time (s): " +str(time.time()-v_time)) + d = d.get() + if d.dtype == np.complex64: + #d = min(d.real, d.imag) + d = d.real + # absErrBound = absErrBound*(d) + threshold = threshold*(d) + s_1 = time.time() + #print(cp.get_array_module(oriData)) + truth_values = abs(oriData)<=threshold + oriData[truth_values] = 0.0 + + nbEle = oriData.shape[0] + + + oriData_p = ctypes.cast(oriData.data.ptr, ctypes.POINTER(c_float)) + #print("starting") + # float *data, float r2r_error,size_t len,size_t *outSize + o_bytes = __cuszx_device_compress(oriData_p,np.float32(absErrBound), np.ulonglong(nbEle), outSize) + + + return (o_bytes,outSize.contents.value, absErrBound), outSize + + +def cusz_device_decompress(nbEle, cmpBytes, owner, dtype): + __cuszx_device_decompress=get_device_decompress() + (cmpBytes, cmpsize, err_bound) = cmpBytes + + nbEle_p = ctypes.c_size_t(nbEle) + # uint8_t* cmpbytes, size_t len, size_t compressed_len, float r2r_error + newData = __cuszx_device_decompress(cmpBytes,nbEle_p, ctypes.c_size_t(cmpsize), np.float32(err_bound)) + + # decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, num_elements_eff) + # -- Workaround to convert GPU pointer to int + p_decompressed_ptr = ctypes.addressof(newData) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + decompressed_int = p_decompressed_int.contents + # -- + pointer_for_free = decompressed_int.value + # self.decompressed_own.append(decompressed_int.value) + mem = cp.cuda.UnownedMemory(decompressed_int.value, nbEle*4, owner, device_id=0) + mem_ptr = cp.cuda.memory.MemoryPointer(mem, 0) + #print("mem ptr") + #print(mem_ptr) + arr = cp.ndarray(shape=(nbEle,), dtype=np.float32, memptr=mem_ptr) + + # res = cp.zeros((nbEle,)) + # ## need to convert newData to cupy + # cp.place(res,bitmap,arr) + + c_res = cp.zeros(int(nbEle/2), np.complex64) + c_res.real = arr[0:int(nbEle/2)] + c_res.imag = arr[int(nbEle/2):] + return (c_res, pointer_for_free) + +### Example of device compress/decompress wrapper usage +class Comp(): + def __init__(self): + self.name = "dummy" + +def free_compressed(ptr): + p_ptr = ctypes.addressof(ptr) + p_int = ctypes.cast(p_ptr, ctypes.POINTER(ctypes.c_uint64)) + decomp_int = p_int.contents + cp.cuda.runtime.free(decomp_int.value) + + +if __name__ == "__main__": + + DATA_SIZE = int(1024) + MAX_D = 10.0 + MIN_D = -10.0 + RANGE = MAX_D - MIN_D + r2r_threshold = 0.002 + r2r_error = 0.0001 + + in_vector = np.fromfile("all_sample.bin", dtype=np.complex64) + #print(np.max(in_vector)) + DATA_SIZE = len(in_vector) + #range_vr = np.max(in_vector)-np.min(in_vector) + #r2r_threshold = r2r_threshold*range_vr + #r2r_error = r2r_error*range_vr + #in_vector = np.zeros((DATA_SIZE,)) + #for i in range(0,int(DATA_SIZE/4)): + # in_vector[i] = 0.0 + #for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): + # in_vector[i] = 5.0 + #for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): + # in_vector[i] = random.uniform(MIN_D, MAX_D) + #for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): + # in_vector[i] = -7.0 + #for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): + # in_vector[i] = 0.001 + + print(DATA_SIZE) + #in_vector = in_vector.astype('float32') + in_vector_gpu = cp.asarray(in_vector) + + # variable = ctypes.c_size_t(0) + # outSize = ctypes.pointer(variable) + for i in range(200): + s_time = time.time() + o_bytes, outSize = cusz_device_compress(in_vector_gpu, r2r_error, DATA_SIZE, 256, r2r_threshold) + print("Time python: "+str(time.time()-s_time)) + print(outSize[0]) + print("Compress Success...starting decompress ") + comp = Comp() + + s_time = time.time() + (d_bytes,ptr )= cusz_device_decompress(DATA_SIZE*2, o_bytes, comp, in_vector_gpu.dtype) + + free_compressed(o_bytes[0]) + cp.cuda.runtime.free(ptr) + print("Time python: "+str(time.time()-s_time)) + #for i in d_bytes: + # print(i) + print("Decompress Success") diff --git a/qtensor/compression/cusz/src/detail/compare_cpu.inl b/qtensor/compression/cusz/src/detail/compare_cpu.inl index 1617fc38..b09eb558 100644 --- a/qtensor/compression/cusz/src/detail/compare_cpu.inl +++ b/qtensor/compression/cusz/src/detail/compare_cpu.inl @@ -1,109 +1,109 @@ -/** - * @file _compare.hh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-10-08 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef C0E747B4_066F_4B04_A3D2_00E1A3B7D682 -#define C0E747B4_066F_4B04_A3D2_00E1A3B7D682 - -#include -#include -#include -#include -#include "cusz/type.h" - -namespace psz { -namespace detail { - -template -bool cppstd_identical(T* d1, T* d2, size_t const len) -{ - return std::equal(d1, d1 + len, d2); -} - -template -bool cppstd_error_bounded(T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx = nullptr) -{ - // debugging - - bool eb_ed = true; - for (size_t i = 0; i < len; i++) { - if (fabs(a[i] - b[i]) > 1.001 * eb) { - if (first_faulty_idx) *first_faulty_idx = i; - return false; - } - } - return true; -} - -template -void cppstd_assess_quality(cusz_stats* s, T* xdata, T* odata, size_t const len) -{ - double max_odata = odata[0], min_odata = odata[0]; - double max_xdata = xdata[0], min_xdata = xdata[0]; - double max_abserr = max_abserr = fabs(xdata[0] - odata[0]); - - double sum_0 = 0, sum_x = 0; - for (size_t i = 0; i < len; i++) sum_0 += odata[i], sum_x += xdata[i]; - - double mean_odata = sum_0 / len, mean_xdata = sum_x / len; - double sum_var_odata = 0, sum_var_xdata = 0, sum_err2 = 0, sum_corr = 0, rel_abserr = 0; - - double max_pwrrel_abserr = 0; - size_t max_abserr_index = 0; - for (size_t i = 0; i < len; i++) { - max_odata = max_odata < odata[i] ? odata[i] : max_odata; - min_odata = min_odata > odata[i] ? odata[i] : min_odata; - - max_xdata = max_xdata < odata[i] ? odata[i] : max_xdata; - min_xdata = min_xdata > xdata[i] ? xdata[i] : min_xdata; - - float abserr = fabs(xdata[i] - odata[i]); - if (odata[i] != 0) { - rel_abserr = abserr / fabs(odata[i]); - max_pwrrel_abserr = max_pwrrel_abserr < rel_abserr ? rel_abserr : max_pwrrel_abserr; - } - max_abserr_index = max_abserr < abserr ? i : max_abserr_index; - max_abserr = max_abserr < abserr ? abserr : max_abserr; - sum_corr += (odata[i] - mean_odata) * (xdata[i] - mean_xdata); - sum_var_odata += (odata[i] - mean_odata) * (odata[i] - mean_odata); - sum_var_xdata += (xdata[i] - mean_xdata) * (xdata[i] - mean_xdata); - sum_err2 += abserr * abserr; - } - double std_odata = sqrt(sum_var_odata / len); - double std_xdata = sqrt(sum_var_xdata / len); - double ee = sum_corr / len; - - s->len = len; - - s->odata.max = max_odata; - s->odata.min = min_odata; - s->odata.rng = max_odata - min_odata; - s->odata.std = std_odata; - - s->xdata.max = max_xdata; - s->xdata.min = min_xdata; - s->xdata.rng = max_xdata - min_xdata; - s->xdata.std = std_xdata; - - s->max_err.idx = max_abserr_index; - s->max_err.abs = max_abserr; - s->max_err.rel = max_abserr / s->odata.rng; - s->max_err.pwrrel = max_pwrrel_abserr; - - s->reduced.coeff = ee / std_odata / std_xdata; - s->reduced.MSE = sum_err2 / len; - s->reduced.NRMSE = sqrt(s->reduced.MSE) / s->odata.rng; - s->reduced.PSNR = 20 * log10(s->odata.rng) - 10 * log10(s->reduced.MSE); -} - -} // namespace detail -} // namespace psz - -#endif /* C0E747B4_066F_4B04_A3D2_00E1A3B7D682 */ +/** + * @file _compare.hh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-08 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef C0E747B4_066F_4B04_A3D2_00E1A3B7D682 +#define C0E747B4_066F_4B04_A3D2_00E1A3B7D682 + +#include +#include +#include +#include +#include "cusz/type.h" + +namespace psz { +namespace detail { + +template +bool cppstd_identical(T* d1, T* d2, size_t const len) +{ + return std::equal(d1, d1 + len, d2); +} + +template +bool cppstd_error_bounded(T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx = nullptr) +{ + // debugging + + bool eb_ed = true; + for (size_t i = 0; i < len; i++) { + if (fabs(a[i] - b[i]) > 1.001 * eb) { + if (first_faulty_idx) *first_faulty_idx = i; + return false; + } + } + return true; +} + +template +void cppstd_assess_quality(cusz_stats* s, T* xdata, T* odata, size_t const len) +{ + double max_odata = odata[0], min_odata = odata[0]; + double max_xdata = xdata[0], min_xdata = xdata[0]; + double max_abserr = max_abserr = fabs(xdata[0] - odata[0]); + + double sum_0 = 0, sum_x = 0; + for (size_t i = 0; i < len; i++) sum_0 += odata[i], sum_x += xdata[i]; + + double mean_odata = sum_0 / len, mean_xdata = sum_x / len; + double sum_var_odata = 0, sum_var_xdata = 0, sum_err2 = 0, sum_corr = 0, rel_abserr = 0; + + double max_pwrrel_abserr = 0; + size_t max_abserr_index = 0; + for (size_t i = 0; i < len; i++) { + max_odata = max_odata < odata[i] ? odata[i] : max_odata; + min_odata = min_odata > odata[i] ? odata[i] : min_odata; + + max_xdata = max_xdata < odata[i] ? odata[i] : max_xdata; + min_xdata = min_xdata > xdata[i] ? xdata[i] : min_xdata; + + float abserr = fabs(xdata[i] - odata[i]); + if (odata[i] != 0) { + rel_abserr = abserr / fabs(odata[i]); + max_pwrrel_abserr = max_pwrrel_abserr < rel_abserr ? rel_abserr : max_pwrrel_abserr; + } + max_abserr_index = max_abserr < abserr ? i : max_abserr_index; + max_abserr = max_abserr < abserr ? abserr : max_abserr; + sum_corr += (odata[i] - mean_odata) * (xdata[i] - mean_xdata); + sum_var_odata += (odata[i] - mean_odata) * (odata[i] - mean_odata); + sum_var_xdata += (xdata[i] - mean_xdata) * (xdata[i] - mean_xdata); + sum_err2 += abserr * abserr; + } + double std_odata = sqrt(sum_var_odata / len); + double std_xdata = sqrt(sum_var_xdata / len); + double ee = sum_corr / len; + + s->len = len; + + s->odata.max = max_odata; + s->odata.min = min_odata; + s->odata.rng = max_odata - min_odata; + s->odata.std = std_odata; + + s->xdata.max = max_xdata; + s->xdata.min = min_xdata; + s->xdata.rng = max_xdata - min_xdata; + s->xdata.std = std_xdata; + + s->max_err.idx = max_abserr_index; + s->max_err.abs = max_abserr; + s->max_err.rel = max_abserr / s->odata.rng; + s->max_err.pwrrel = max_pwrrel_abserr; + + s->reduced.coeff = ee / std_odata / std_xdata; + s->reduced.MSE = sum_err2 / len; + s->reduced.NRMSE = sqrt(s->reduced.MSE) / s->odata.rng; + s->reduced.PSNR = 20 * log10(s->odata.rng) - 10 * log10(s->reduced.MSE); +} + +} // namespace detail +} // namespace psz + +#endif /* C0E747B4_066F_4B04_A3D2_00E1A3B7D682 */ diff --git a/qtensor/compression/cusz/src/detail/compare_gpu.inl b/qtensor/compression/cusz/src/detail/compare_gpu.inl index 12ec3475..851fc4a2 100644 --- a/qtensor/compression/cusz/src/detail/compare_gpu.inl +++ b/qtensor/compression/cusz/src/detail/compare_gpu.inl @@ -1,193 +1,193 @@ -/** - * @file _compare.cuh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-10-08 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef F7DF2FE5_571E_48C1_965D_0B19D1CC14D4 -#define F7DF2FE5_571E_48C1_965D_0B19D1CC14D4 - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "cusz/type.h" - -namespace psz { -namespace detail { - -static const int MINVAL = 0; -static const int MAXVAL = 1; -static const int AVGVAL = 2; -static const int RNG = 3; - -template -bool thrustgpu_identical(T* d1, T* d2, size_t const len) -{ - return thrust::equal(thrust::device, d1, d1 + len, d2); -} - -template -bool thrustgpu_error_bounded(T* a, T* b, size_t const len, double eb, size_t* first_faulty_idx = nullptr) -{ - thrust::device_ptr a_ = thrust::device_pointer_cast(a); - thrust::device_ptr b_ = thrust::device_pointer_cast(b); - thrust::constant_iterator eb_(eb); - using tup = thrust::tuple; - - auto ab_begin = thrust::make_zip_iterator(thrust::make_tuple(a_, b_, eb_)); - auto ab_end = thrust::make_zip_iterator(thrust::make_tuple(a_ + len, b_ + len, eb_)); - - // Let compiler figure out the type. - auto iter = thrust::find_if(thrust::device, ab_begin, ab_end, [] __device__(tup t) { - // debug use - // if (fabs(thrust::get<1>(t) - thrust::get<0>(t)) > thrust::get<2>(t)) - // printf("a: %f\tb: %f\teb: %lf\n", (float)thrust::get<1>(t), (float)thrust::get<0>(t), thrust::get<2>(t)); - - return fabs(thrust::get<1>(t) - thrust::get<0>(t)) > 1.001 * thrust::get<2>(t); - }); - - if (iter == ab_end) { return true; } - else { - // *first_faulty_idx = iter - ab_begin; - return false; - } -} - -template -void thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]) -{ - thrust::device_ptr g_ptr = thrust::device_pointer_cast(d_ptr); - - auto minel = thrust::min_element(g_ptr, g_ptr + len) - g_ptr; - auto maxel = thrust::max_element(g_ptr, g_ptr + len) - g_ptr; - res[MINVAL] = *(g_ptr + minel); - res[MAXVAL] = *(g_ptr + maxel); - res[RNG] = res[MAXVAL] - res[MINVAL]; - - auto sum = thrust::reduce(g_ptr, g_ptr + len, (T)0.0, thrust::plus()); - res[AVGVAL] = sum / len; -} - -template -void thrustgpu_get_extrema(thrust::device_ptr g_ptr, size_t len, T res[4]) -{ - auto minel = thrust::min_element(g_ptr, g_ptr + len) - g_ptr; - auto maxel = thrust::max_element(g_ptr, g_ptr + len) - g_ptr; - res[MINVAL] = *(g_ptr + minel); - res[MAXVAL] = *(g_ptr + maxel); - res[RNG] = res[MAXVAL] - res[MINVAL]; - - auto sum = thrust::reduce(g_ptr, g_ptr + len, (T)0.0, thrust::plus()); - res[AVGVAL] = sum / len; -} - -template -void thrustgpu_get_maxerr( - T* reconstructed, // in - T* original, // in - size_t len, // in - T& maximum_val, // out - size_t& maximum_loc, // out - bool destructive = false) -{ - T* diff; - - if (destructive) { - diff = original; // aliasing - } - else { - cudaMalloc(&diff, sizeof(T) * len); - } - - auto expr = [=] __device__(T rel, T oel) { return rel - oel; }; - - // typesafe (also with exec-policy binding) - thrust::device_ptr r(reconstructed); - thrust::device_ptr o(original); - thrust::device_ptr d(diff); - - thrust::transform(r, r + len, o, d, expr); - - auto maximum_ptr = thrust::max_element(d, d + len); - maximum_val = *maximum_ptr; - maximum_loc = maximum_ptr - d; - - if (not destructive) { cudaFree(diff); } -} - -template -void thrustgpu_assess_quality(cusz_stats* s, T* xdata, T* odata, size_t len) -{ - using tup = thrust::tuple; - - thrust::device_ptr p_odata = thrust::device_pointer_cast(odata); // origin - thrust::device_ptr p_xdata = thrust::device_pointer_cast(xdata); - - T odata_res[4], xdata_res[4]; - - thrustgpu_get_extrema(p_odata, len, odata_res); - thrustgpu_get_extrema(p_xdata, len, xdata_res); - - auto begin = thrust::make_zip_iterator(thrust::make_tuple(p_odata, p_xdata)); - auto end = thrust::make_zip_iterator(thrust::make_tuple(p_odata + len, p_xdata + len)); - - // clang-format off - auto corr = [=] __host__ __device__(tup t) { return (thrust::get<0>(t) - odata[AVGVAL]) * (thrust::get<1>(t) - xdata[AVGVAL]); }; - auto err2 = [] __host__ __device__(tup t) { T f = thrust::get<0>(t) - thrust::get<1>(t); return f * f; }; - auto var_odata = [=] __host__ __device__(T a) { T f = a - odata[AVGVAL]; return f * f; }; - auto var_xdata = [=] __host__ __device__(T a) { T f = a - xdata[AVGVAL]; return f * f; }; - - auto sum_err2 = thrust::transform_reduce(begin, end, err2, 0.0f, thrust::plus()); - auto sum_corr = thrust::transform_reduce(begin, end, corr, 0.0f, thrust::plus()); - auto sum_var_odata = thrust::transform_reduce(p_odata, p_odata + len, var_odata, 0.0f, thrust::plus()); - auto sum_var_xdata = thrust::transform_reduce(p_xdata, p_xdata + len, var_xdata, 0.0f, thrust::plus()); - // clang-format on - - double std_odata = sqrt(sum_var_odata / len); - double std_xdata = sqrt(sum_var_xdata / len); - double ee = sum_corr / len; - - // ----------------------------------------------------------------------------- - T max_abserr{0}; - size_t max_abserr_index{0}; - thrustgpu_get_maxerr(xdata, odata, len, max_abserr, max_abserr_index, false); - // ----------------------------------------------------------------------------- - - s->len = len; - - s->odata.max = odata_res[MAXVAL]; - s->odata.min = odata_res[MINVAL]; - s->odata.rng = odata_res[MAXVAL] - odata_res[MINVAL]; - s->odata.std = std_odata; - - s->xdata.max = xdata_res[MAXVAL]; - s->xdata.min = xdata_res[MINVAL]; - s->xdata.rng = xdata_res[MAXVAL] - xdata_res[MINVAL]; - s->xdata.std = std_xdata; - - s->max_err.idx = max_abserr_index; - s->max_err.abs = max_abserr; - s->max_err.rel = max_abserr / s->odata.rng; - s->max_err.pwrrel = NAN; - - s->reduced.coeff = ee / std_odata / std_xdata; - s->reduced.MSE = sum_err2 / len; - s->reduced.NRMSE = sqrt(s->reduced.MSE) / s->odata.rng; - s->reduced.PSNR = 20 * log10(s->odata.rng) - 10 * log10(s->reduced.MSE); -} - -} // namespace detail -} // namespace psz - -#endif /* F7DF2FE5_571E_48C1_965D_0B19D1CC14D4 */ +/** + * @file _compare.cuh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-08 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef F7DF2FE5_571E_48C1_965D_0B19D1CC14D4 +#define F7DF2FE5_571E_48C1_965D_0B19D1CC14D4 + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cusz/type.h" + +namespace psz { +namespace detail { + +static const int MINVAL = 0; +static const int MAXVAL = 1; +static const int AVGVAL = 2; +static const int RNG = 3; + +template +bool thrustgpu_identical(T* d1, T* d2, size_t const len) +{ + return thrust::equal(thrust::device, d1, d1 + len, d2); +} + +template +bool thrustgpu_error_bounded(T* a, T* b, size_t const len, double eb, size_t* first_faulty_idx = nullptr) +{ + thrust::device_ptr a_ = thrust::device_pointer_cast(a); + thrust::device_ptr b_ = thrust::device_pointer_cast(b); + thrust::constant_iterator eb_(eb); + using tup = thrust::tuple; + + auto ab_begin = thrust::make_zip_iterator(thrust::make_tuple(a_, b_, eb_)); + auto ab_end = thrust::make_zip_iterator(thrust::make_tuple(a_ + len, b_ + len, eb_)); + + // Let compiler figure out the type. + auto iter = thrust::find_if(thrust::device, ab_begin, ab_end, [] __device__(tup t) { + // debug use + // if (fabs(thrust::get<1>(t) - thrust::get<0>(t)) > thrust::get<2>(t)) + // printf("a: %f\tb: %f\teb: %lf\n", (float)thrust::get<1>(t), (float)thrust::get<0>(t), thrust::get<2>(t)); + + return fabs(thrust::get<1>(t) - thrust::get<0>(t)) > 1.001 * thrust::get<2>(t); + }); + + if (iter == ab_end) { return true; } + else { + // *first_faulty_idx = iter - ab_begin; + return false; + } +} + +template +void thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]) +{ + thrust::device_ptr g_ptr = thrust::device_pointer_cast(d_ptr); + + auto minel = thrust::min_element(g_ptr, g_ptr + len) - g_ptr; + auto maxel = thrust::max_element(g_ptr, g_ptr + len) - g_ptr; + res[MINVAL] = *(g_ptr + minel); + res[MAXVAL] = *(g_ptr + maxel); + res[RNG] = res[MAXVAL] - res[MINVAL]; + + auto sum = thrust::reduce(g_ptr, g_ptr + len, (T)0.0, thrust::plus()); + res[AVGVAL] = sum / len; +} + +template +void thrustgpu_get_extrema(thrust::device_ptr g_ptr, size_t len, T res[4]) +{ + auto minel = thrust::min_element(g_ptr, g_ptr + len) - g_ptr; + auto maxel = thrust::max_element(g_ptr, g_ptr + len) - g_ptr; + res[MINVAL] = *(g_ptr + minel); + res[MAXVAL] = *(g_ptr + maxel); + res[RNG] = res[MAXVAL] - res[MINVAL]; + + auto sum = thrust::reduce(g_ptr, g_ptr + len, (T)0.0, thrust::plus()); + res[AVGVAL] = sum / len; +} + +template +void thrustgpu_get_maxerr( + T* reconstructed, // in + T* original, // in + size_t len, // in + T& maximum_val, // out + size_t& maximum_loc, // out + bool destructive = false) +{ + T* diff; + + if (destructive) { + diff = original; // aliasing + } + else { + cudaMalloc(&diff, sizeof(T) * len); + } + + auto expr = [=] __device__(T rel, T oel) { return rel - oel; }; + + // typesafe (also with exec-policy binding) + thrust::device_ptr r(reconstructed); + thrust::device_ptr o(original); + thrust::device_ptr d(diff); + + thrust::transform(r, r + len, o, d, expr); + + auto maximum_ptr = thrust::max_element(d, d + len); + maximum_val = *maximum_ptr; + maximum_loc = maximum_ptr - d; + + if (not destructive) { cudaFree(diff); } +} + +template +void thrustgpu_assess_quality(cusz_stats* s, T* xdata, T* odata, size_t len) +{ + using tup = thrust::tuple; + + thrust::device_ptr p_odata = thrust::device_pointer_cast(odata); // origin + thrust::device_ptr p_xdata = thrust::device_pointer_cast(xdata); + + T odata_res[4], xdata_res[4]; + + thrustgpu_get_extrema(p_odata, len, odata_res); + thrustgpu_get_extrema(p_xdata, len, xdata_res); + + auto begin = thrust::make_zip_iterator(thrust::make_tuple(p_odata, p_xdata)); + auto end = thrust::make_zip_iterator(thrust::make_tuple(p_odata + len, p_xdata + len)); + + // clang-format off + auto corr = [=] __host__ __device__(tup t) { return (thrust::get<0>(t) - odata[AVGVAL]) * (thrust::get<1>(t) - xdata[AVGVAL]); }; + auto err2 = [] __host__ __device__(tup t) { T f = thrust::get<0>(t) - thrust::get<1>(t); return f * f; }; + auto var_odata = [=] __host__ __device__(T a) { T f = a - odata[AVGVAL]; return f * f; }; + auto var_xdata = [=] __host__ __device__(T a) { T f = a - xdata[AVGVAL]; return f * f; }; + + auto sum_err2 = thrust::transform_reduce(begin, end, err2, 0.0f, thrust::plus()); + auto sum_corr = thrust::transform_reduce(begin, end, corr, 0.0f, thrust::plus()); + auto sum_var_odata = thrust::transform_reduce(p_odata, p_odata + len, var_odata, 0.0f, thrust::plus()); + auto sum_var_xdata = thrust::transform_reduce(p_xdata, p_xdata + len, var_xdata, 0.0f, thrust::plus()); + // clang-format on + + double std_odata = sqrt(sum_var_odata / len); + double std_xdata = sqrt(sum_var_xdata / len); + double ee = sum_corr / len; + + // ----------------------------------------------------------------------------- + T max_abserr{0}; + size_t max_abserr_index{0}; + thrustgpu_get_maxerr(xdata, odata, len, max_abserr, max_abserr_index, false); + // ----------------------------------------------------------------------------- + + s->len = len; + + s->odata.max = odata_res[MAXVAL]; + s->odata.min = odata_res[MINVAL]; + s->odata.rng = odata_res[MAXVAL] - odata_res[MINVAL]; + s->odata.std = std_odata; + + s->xdata.max = xdata_res[MAXVAL]; + s->xdata.min = xdata_res[MINVAL]; + s->xdata.rng = xdata_res[MAXVAL] - xdata_res[MINVAL]; + s->xdata.std = std_xdata; + + s->max_err.idx = max_abserr_index; + s->max_err.abs = max_abserr; + s->max_err.rel = max_abserr / s->odata.rng; + s->max_err.pwrrel = NAN; + + s->reduced.coeff = ee / std_odata / std_xdata; + s->reduced.MSE = sum_err2 / len; + s->reduced.NRMSE = sqrt(s->reduced.MSE) / s->odata.rng; + s->reduced.PSNR = 20 * log10(s->odata.rng) - 10 * log10(s->reduced.MSE); +} + +} // namespace detail +} // namespace psz + +#endif /* F7DF2FE5_571E_48C1_965D_0B19D1CC14D4 */ diff --git a/qtensor/compression/cusz/src/detail/compressor_impl.cu b/qtensor/compression/cusz/src/detail/compressor_impl.cu index 83b819ae..3974e15b 100644 --- a/qtensor/compression/cusz/src/detail/compressor_impl.cu +++ b/qtensor/compression/cusz/src/detail/compressor_impl.cu @@ -1,18 +1,18 @@ -/** - * @file compressor.cu - * @author Jiannan Tian - * @brief cuSZ compressor of the default path - * @version 0.3 - * @date 2021-10-05 - * (create) 2020-02-12; (release) 2020-09-20; - * (rev.1) 2021-01-16; (rev.2) 2021-07-12; (rev.3) 2021-09-06; (rev.4) 2021-10-05 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#include "compressor_impl.inl" -#include "framework.hh" - -template class cusz::Compressor>::impl; +/** + * @file compressor.cu + * @author Jiannan Tian + * @brief cuSZ compressor of the default path + * @version 0.3 + * @date 2021-10-05 + * (create) 2020-02-12; (release) 2020-09-20; + * (rev.1) 2021-01-16; (rev.2) 2021-07-12; (rev.3) 2021-09-06; (rev.4) 2021-10-05 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include "compressor_impl.inl" +#include "framework.hh" + +template class cusz::Compressor>::impl; diff --git a/qtensor/compression/cusz/src/detail/compressor_impl.inl b/qtensor/compression/cusz/src/detail/compressor_impl.inl index a36f339a..46704ba6 100644 --- a/qtensor/compression/cusz/src/detail/compressor_impl.inl +++ b/qtensor/compression/cusz/src/detail/compressor_impl.inl @@ -1,479 +1,479 @@ -/** - * @file compressor_impl.cuh - * @author Jiannan Tian - * @brief cuSZ compressor of the default path - * @version 0.3 - * @date 2021-10-05 - * (create) 2020-02-12; (release) 2020-09-20; - * (rev.1) 2021-01-16; (rev.2) 2021-07-12; (rev.3) 2021-09-06; (rev.4) 2021-10-05 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#ifndef CUSZ_DEFAULT_PATH_CUH -#define CUSZ_DEFAULT_PATH_CUH - -#include -#include -#include -#include - -#include "component.hh" -#include "compressor.hh" -#include "header.h" -#include "kernel/cpplaunch_cuda.hh" -#include "stat/stat_g.hh" -#include "utils/cuda_err.cuh" - -#define DEFINE_DEV(VAR, TYPE) TYPE* d_##VAR{nullptr}; -#define DEFINE_HOST(VAR, TYPE) TYPE* h_##VAR{nullptr}; -#define FREEDEV(VAR) CHECK_CUDA(cudaFree(d_##VAR)); -#define FREEHOST(VAR) CHECK_CUDA(cudaFreeHost(h_##VAR)); - -#define PRINT_ENTRY(VAR) printf("%d %-*s: %'10u\n", (int)Header::VAR, 14, #VAR, header.entry[Header::VAR]); - -#define DEVICE2DEVICE_COPY(VAR, FIELD) \ - if (nbyte[Header::FIELD] != 0 and VAR != nullptr) { \ - auto dst = d_reserved_compressed + header.entry[Header::FIELD]; \ - auto src = reinterpret_cast(VAR); \ - CHECK_CUDA(cudaMemcpyAsync(dst, src, nbyte[Header::FIELD], cudaMemcpyDeviceToDevice, stream)); \ - } - -#define ACCESSOR(SYM, TYPE) reinterpret_cast(in_compressed + header->entry[Header::SYM]) - -namespace cusz { - -constexpr auto kHOST = cusz::LOC::HOST; -constexpr auto kDEVICE = cusz::LOC::DEVICE; -constexpr auto kHOST_DEVICE = cusz::LOC::HOST_DEVICE; - -#define TEMPLATE_TYPE template -#define IMPL Compressor::impl - -TEMPLATE_TYPE -uint32_t IMPL::get_len_data() { return data_len3.x * data_len3.y * data_len3.z; } - -TEMPLATE_TYPE -IMPL::impl() -{ - predictor = new Predictor; - - spcodec = new Spcodec; - codec = new Codec; - fb_codec = new FallbackCodec; -} - -TEMPLATE_TYPE -void IMPL::destroy() -{ - if (spcodec) delete spcodec; - if (codec) delete codec; - if (fb_codec) delete codec; - if (predictor) delete predictor; -} - -TEMPLATE_TYPE -IMPL::~impl() { destroy(); } - -//------------------------------------------------------------------------------ - -// TODO -TEMPLATE_TYPE -void IMPL::init(Context* config, bool dbg_print) { init_detail(config, dbg_print); } - -TEMPLATE_TYPE -void IMPL::init(Header* config, bool dbg_print) { init_detail(config, dbg_print); } - -template -void peek_devdata(T* d_arr, size_t num = 20) -{ - thrust::for_each(thrust::device, d_arr, d_arr + num, [=] __device__ __host__(const T i) { printf("%u\t", i); }); - printf("\n"); -} - -TEMPLATE_TYPE -void IMPL::compress( - Context* config, - T* uncompressed, - BYTE*& compressed, - size_t& compressed_len, - cudaStream_t stream, - bool dbg_print) -{ - auto const eb = config->eb; - auto const radius = config->radius; - auto const pardeg = config->vle_pardeg; - auto const codecs_in_use = config->codecs_in_use; - auto const nz_density_factor = config->nz_density_factor; - - if (dbg_print) { - std::cout << "eb\t" << eb << endl; - std::cout << "radius\t" << radius << endl; - std::cout << "pardeg\t" << pardeg << endl; - std::cout << "codecs_in_use\t" << codecs_in_use << endl; - std::cout << "nz_density_factor\t" << nz_density_factor << endl; - } - - data_len3 = dim3(config->x, config->y, config->z); - auto codec_force_fallback = config->codec_force_fallback(); - - header.codecs_in_use = codecs_in_use; - header.nz_density_factor = nz_density_factor; - - T* d_anchor{nullptr}; // predictor out1 - E* d_errctrl{nullptr}; // predictor out2 - T* d_outlier{nullptr}; // predictor out3 - BYTE* d_spfmt{nullptr}; - size_t spfmt_outlen{0}; - - BYTE* d_codec_out{nullptr}; - size_t codec_outlen{0}; - - size_t data_len, errctrl_len, sublen, spcodec_inlen; - auto booklen = radius * 2; - - auto derive_lengths_after_prediction = [&]() { - data_len = predictor->get_len_data(); - errctrl_len = data_len; - spcodec_inlen = data_len; - sublen = ConfigHelper::get_npart(data_len, pardeg); - - // std::cout << "datalen\t" << data_len << '\n'; - // std::cout << "errctrl_len\t" << errctrl_len << '\n'; - // std::cout << "spcodec_inlen\t" << spcodec_inlen << '\n'; - // std::cout << "sublen\t" << sublen << '\n'; - }; - - auto update_header = [&]() { - header.x = data_len3.x; - header.y = data_len3.y; - header.z = data_len3.z; - header.w = 1; // placeholder - header.radius = radius; - header.vle_pardeg = pardeg; - header.eb = eb; - header.byte_vle = use_fallback_codec ? 8 : 4; - }; - - /******************************************************************************/ - - // Prediction is the dependency of the rest procedures. - predictor->construct(LorenzoI, data_len3, uncompressed, &d_anchor, &d_errctrl, &d_outlier, eb, radius, stream); - // peek_devdata(d_errctrl); - - derive_lengths_after_prediction(); - /******************************************************************************/ - - asz::stat::histogram(d_errctrl, errctrl_len, d_freq, booklen, &time_hist, stream); - - /* debug */ CHECK_CUDA(cudaStreamSynchronize(stream)); - - // TODO remove duplicate get_frequency inside encode_with_exception() - encode_with_exception( - d_errctrl, errctrl_len, // input - d_freq, booklen, sublen, pardeg, codec_force_fallback, // config - d_codec_out, codec_outlen, // output - stream, dbg_print); - - (*spcodec).encode(d_outlier, spcodec_inlen, d_spfmt, spfmt_outlen, stream, dbg_print); - - /* debug */ CHECK_CUDA(cudaStreamSynchronize(stream)); - - /******************************************************************************/ - - update_header(); - subfile_collect( - d_anchor, (*predictor).get_len_anchor(), // - d_codec_out, codec_outlen, // - d_spfmt, spfmt_outlen, // - stream, dbg_print); - - // output - compressed_len = ConfigHelper::get_filesize(&header); - compressed = d_reserved_compressed; - - collect_compress_timerecord(); - - // considering that codec can be consecutively in use, and can compress data of different huff-byte - use_fallback_codec = false; -} - -TEMPLATE_TYPE -void IMPL::clear_buffer() -{ // - (*predictor).clear_buffer(); - (*codec).clear_buffer(); - (*spcodec).clear_buffer(); -} - -TEMPLATE_TYPE -void IMPL::decompress(Header* header, BYTE* in_compressed, T* out_decompressed, cudaStream_t stream, bool dbg_print) -{ - // TODO host having copy of header when compressing - if (not header) { - header = new Header; - CHECK_CUDA(cudaMemcpyAsync(header, in_compressed, sizeof(Header), cudaMemcpyDeviceToHost, stream)); - CHECK_CUDA(cudaStreamSynchronize(stream)); - } - - data_len3 = dim3(header->x, header->y, header->z); - - use_fallback_codec = header->byte_vle == 8; - double const eb = header->eb; - int const radius = header->radius; - auto const vle_pardeg = header->vle_pardeg; - - // The inputs of components are from `compressed`. - auto d_anchor = ACCESSOR(ANCHOR, T); - auto d_vle = ACCESSOR(VLE, BYTE); - auto d_sp = ACCESSOR(SPFMT, BYTE); - - // wire the workspace - auto d_errctrl = (*predictor).expose_quant(); // reuse space - - // wire and aliasing - auto d_outlier = out_decompressed; - auto d_outlier_xdata = out_decompressed; - - auto spcodec_do = [&]() { (*spcodec).decode(d_sp, d_outlier, stream); }; - auto decode_with_exception = [&]() { - if (not use_fallback_codec) { // - (*codec).decode(d_vle, d_errctrl); - } - else { - if (not fallback_codec_allocated) { - (*fb_codec).init((*predictor).get_len_quant(), radius * 2, vle_pardeg, /*dbg print*/ false); - fallback_codec_allocated = true; - } - (*fb_codec).decode(d_vle, d_errctrl); - } - }; - auto predictor_do = [&]() { - (*predictor).reconstruct(LorenzoI, data_len3, d_outlier_xdata, d_anchor, d_errctrl, eb, radius, stream); - }; - - // process - spcodec_do(), decode_with_exception(), predictor_do(); - - collect_decompress_timerecord(); - - // clear state for the next decompression after reporting - use_fallback_codec = false; -} - -// public getter -TEMPLATE_TYPE -void IMPL::export_header(Header& ext_header) { ext_header = header; } - -TEMPLATE_TYPE -void IMPL::export_header(Header* ext_header) { *ext_header = header; } - -TEMPLATE_TYPE -void IMPL::export_timerecord(TimeRecord* ext_timerecord) -{ - if (ext_timerecord) *ext_timerecord = timerecord; -} - -// helper -TEMPLATE_TYPE -void IMPL::init_codec(size_t codec_in_len, unsigned int codec_config, int max_booklen, int pardeg, bool dbg_print) -{ - if (codec_config == 0b00) throw std::runtime_error("Argument codec_config must have set bit(s)."); - if (codec_config bitand 0b01) { - if (dbg_print) LOGGING(LOG_INFO, "allocated 4-byte codec"); - (*codec).init(codec_in_len, max_booklen, pardeg, dbg_print); - } - if (codec_config bitand 0b10) { - if (dbg_print) LOGGING(LOG_INFO, "allocated 8-byte (fallback) codec"); - (*fb_codec).init(codec_in_len, max_booklen, pardeg, dbg_print); - fallback_codec_allocated = true; - } -}; - -TEMPLATE_TYPE -template -void IMPL::init_detail(CONFIG* config, bool dbg_print) -{ - const auto cfg_radius = config->radius; - const auto cfg_pardeg = config->vle_pardeg; - const auto density_factor = config->nz_density_factor; - const auto codec_config = config->codecs_in_use; - const auto cfg_max_booklen = cfg_radius * 2; - const auto x = config->x; - const auto y = config->y; - const auto z = config->z; - - size_t spcodec_in_len, codec_in_len; - - (*predictor).init(LorenzoI, x, y, z, dbg_print); - - spcodec_in_len = (*predictor).get_alloclen_data(); - codec_in_len = (*predictor).get_alloclen_quant(); - - (*spcodec).init(spcodec_in_len, density_factor, dbg_print); - - { - auto bytes = sizeof(cusz::FREQ) * cfg_max_booklen; - cudaMalloc(&d_freq, bytes); - cudaMemset(d_freq, 0x0, bytes); - - // cudaMalloc(&d_freq_another, bytes); - // cudaMemset(d_freq_another, 0x0, bytes); - } - - init_codec(codec_in_len, codec_config, cfg_max_booklen, cfg_pardeg, dbg_print); - - CHECK_CUDA(cudaMalloc(&d_reserved_compressed, (*predictor).get_alloclen_data() * sizeof(T) / 2)); -} - -TEMPLATE_TYPE -void IMPL::collect_compress_timerecord() -{ -#define COLLECT_TIME(NAME, TIME) timerecord.push_back({const_cast(NAME), TIME}); - - if (not timerecord.empty()) timerecord.clear(); - - COLLECT_TIME("predict", (*predictor).get_time_elapsed()); - COLLECT_TIME("histogram", time_hist); - - if (not use_fallback_codec) { - COLLECT_TIME("book", (*codec).get_time_book()); - COLLECT_TIME("huff-enc", (*codec).get_time_lossless()); - } - else { - COLLECT_TIME("book", (*fb_codec).get_time_book()); - COLLECT_TIME("huff-enc", (*fb_codec).get_time_lossless()); - } - - COLLECT_TIME("outlier", (*spcodec).get_time_elapsed()); -} - -TEMPLATE_TYPE -void IMPL::collect_decompress_timerecord() -{ - if (not timerecord.empty()) timerecord.clear(); - - COLLECT_TIME("outlier", (*spcodec).get_time_elapsed()); - - if (not use_fallback_codec) { // - COLLECT_TIME("huff-dec", (*codec).get_time_lossless()); - } - else { // - COLLECT_TIME("huff-dec", (*fb_codec).get_time_lossless()); - } - - COLLECT_TIME("predict", (*predictor).get_time_elapsed()); -} - -TEMPLATE_TYPE -void IMPL::encode_with_exception( - E* d_in, - size_t inlen, - cusz::FREQ* d_freq, - int booklen, - int sublen, - int pardeg, - bool codec_force_fallback, - BYTE*& d_out, - size_t& outlen, - cudaStream_t stream, - bool dbg_print) -{ - auto build_codebook_using = [&](auto encoder) { encoder->build_codebook(d_freq, booklen, stream); }; - auto encode_with = [&](auto encoder) { encoder->encode(d_in, inlen, d_out, outlen, stream); }; - - auto try_fallback_alloc = [&]() { - use_fallback_codec = true; - if (not fallback_codec_allocated) { - LOGGING(LOG_EXCEPTION, "online allocate fallback (8-byte) codec"); - fb_codec->init(inlen, booklen, pardeg, dbg_print); - fallback_codec_allocated = true; - } - }; - - /******************************************************************************/ - if (not codec_force_fallback) { - try { - build_codebook_using(codec); - encode_with(codec); - } - catch (const std::runtime_error& e) { - LOGGING(LOG_EXCEPTION, "switch to fallback codec"); - try_fallback_alloc(); - - build_codebook_using(fb_codec); - encode_with(fb_codec); - } - } - else { - LOGGING(LOG_INFO, "force switch to fallback codec"); - try_fallback_alloc(); - - build_codebook_using(fb_codec); - encode_with(fb_codec); - } -} - -TEMPLATE_TYPE -void IMPL::subfile_collect( - T* d_anchor, - size_t anchor_len, - BYTE* d_codec_out, - size_t codec_outlen, - BYTE* d_spfmt_out, - size_t spfmt_outlen, - cudaStream_t stream, - bool dbg_print) -{ - header.self_bytes = sizeof(Header); - uint32_t nbyte[Header::END]; - nbyte[Header::HEADER] = sizeof(Header); - nbyte[Header::ANCHOR] = sizeof(T) * anchor_len; - nbyte[Header::VLE] = sizeof(BYTE) * codec_outlen; - nbyte[Header::SPFMT] = sizeof(BYTE) * spfmt_outlen; - - header.entry[0] = 0; - // *.END + 1; need to know the ending position - for (auto i = 1; i < Header::END + 1; i++) { header.entry[i] = nbyte[i - 1]; } - for (auto i = 1; i < Header::END + 1; i++) { header.entry[i] += header.entry[i - 1]; } - - auto debug_header_entry = [&]() { - printf("\nsubfile collect in compressor:\n"); - printf(" ENTRIES\n"); - - PRINT_ENTRY(HEADER); - PRINT_ENTRY(ANCHOR); - PRINT_ENTRY(VLE); - PRINT_ENTRY(SPFMT); - PRINT_ENTRY(END); - printf("\n"); - }; - - if (dbg_print) debug_header_entry(); - - CHECK_CUDA(cudaMemcpyAsync(d_reserved_compressed, &header, sizeof(header), cudaMemcpyHostToDevice, stream)); - - DEVICE2DEVICE_COPY(d_anchor, ANCHOR) - DEVICE2DEVICE_COPY(d_codec_out, VLE) - DEVICE2DEVICE_COPY(d_spfmt_out, SPFMT) - - /* debug */ CHECK_CUDA(cudaStreamSynchronize(stream)); -} - -} // namespace cusz - -#undef FREEDEV -#undef FREEHOST -#undef DEFINE_DEV -#undef DEFINE_HOST -#undef DEVICE2DEVICE_COPY -#undef PRINT_ENTRY -#undef ACCESSOR -#undef COLLECT_TIME - -#undef TEMPLATE_TYPE -#undef IMPL - -#endif +/** + * @file compressor_impl.cuh + * @author Jiannan Tian + * @brief cuSZ compressor of the default path + * @version 0.3 + * @date 2021-10-05 + * (create) 2020-02-12; (release) 2020-09-20; + * (rev.1) 2021-01-16; (rev.2) 2021-07-12; (rev.3) 2021-09-06; (rev.4) 2021-10-05 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef CUSZ_DEFAULT_PATH_CUH +#define CUSZ_DEFAULT_PATH_CUH + +#include +#include +#include +#include + +#include "component.hh" +#include "compressor.hh" +#include "header.h" +#include "kernel/cpplaunch_cuda.hh" +#include "stat/stat_g.hh" +#include "utils/cuda_err.cuh" + +#define DEFINE_DEV(VAR, TYPE) TYPE* d_##VAR{nullptr}; +#define DEFINE_HOST(VAR, TYPE) TYPE* h_##VAR{nullptr}; +#define FREEDEV(VAR) CHECK_CUDA(cudaFree(d_##VAR)); +#define FREEHOST(VAR) CHECK_CUDA(cudaFreeHost(h_##VAR)); + +#define PRINT_ENTRY(VAR) printf("%d %-*s: %'10u\n", (int)Header::VAR, 14, #VAR, header.entry[Header::VAR]); + +#define DEVICE2DEVICE_COPY(VAR, FIELD) \ + if (nbyte[Header::FIELD] != 0 and VAR != nullptr) { \ + auto dst = d_reserved_compressed + header.entry[Header::FIELD]; \ + auto src = reinterpret_cast(VAR); \ + CHECK_CUDA(cudaMemcpyAsync(dst, src, nbyte[Header::FIELD], cudaMemcpyDeviceToDevice, stream)); \ + } + +#define ACCESSOR(SYM, TYPE) reinterpret_cast(in_compressed + header->entry[Header::SYM]) + +namespace cusz { + +constexpr auto kHOST = cusz::LOC::HOST; +constexpr auto kDEVICE = cusz::LOC::DEVICE; +constexpr auto kHOST_DEVICE = cusz::LOC::HOST_DEVICE; + +#define TEMPLATE_TYPE template +#define IMPL Compressor::impl + +TEMPLATE_TYPE +uint32_t IMPL::get_len_data() { return data_len3.x * data_len3.y * data_len3.z; } + +TEMPLATE_TYPE +IMPL::impl() +{ + predictor = new Predictor; + + spcodec = new Spcodec; + codec = new Codec; + fb_codec = new FallbackCodec; +} + +TEMPLATE_TYPE +void IMPL::destroy() +{ + if (spcodec) delete spcodec; + if (codec) delete codec; + if (fb_codec) delete codec; + if (predictor) delete predictor; +} + +TEMPLATE_TYPE +IMPL::~impl() { destroy(); } + +//------------------------------------------------------------------------------ + +// TODO +TEMPLATE_TYPE +void IMPL::init(Context* config, bool dbg_print) { init_detail(config, dbg_print); } + +TEMPLATE_TYPE +void IMPL::init(Header* config, bool dbg_print) { init_detail(config, dbg_print); } + +template +void peek_devdata(T* d_arr, size_t num = 20) +{ + thrust::for_each(thrust::device, d_arr, d_arr + num, [=] __device__ __host__(const T i) { printf("%u\t", i); }); + printf("\n"); +} + +TEMPLATE_TYPE +void IMPL::compress( + Context* config, + T* uncompressed, + BYTE*& compressed, + size_t& compressed_len, + cudaStream_t stream, + bool dbg_print) +{ + auto const eb = config->eb; + auto const radius = config->radius; + auto const pardeg = config->vle_pardeg; + auto const codecs_in_use = config->codecs_in_use; + auto const nz_density_factor = config->nz_density_factor; + + if (dbg_print) { + std::cout << "eb\t" << eb << endl; + std::cout << "radius\t" << radius << endl; + std::cout << "pardeg\t" << pardeg << endl; + std::cout << "codecs_in_use\t" << codecs_in_use << endl; + std::cout << "nz_density_factor\t" << nz_density_factor << endl; + } + + data_len3 = dim3(config->x, config->y, config->z); + auto codec_force_fallback = config->codec_force_fallback(); + + header.codecs_in_use = codecs_in_use; + header.nz_density_factor = nz_density_factor; + + T* d_anchor{nullptr}; // predictor out1 + E* d_errctrl{nullptr}; // predictor out2 + T* d_outlier{nullptr}; // predictor out3 + BYTE* d_spfmt{nullptr}; + size_t spfmt_outlen{0}; + + BYTE* d_codec_out{nullptr}; + size_t codec_outlen{0}; + + size_t data_len, errctrl_len, sublen, spcodec_inlen; + auto booklen = radius * 2; + + auto derive_lengths_after_prediction = [&]() { + data_len = predictor->get_len_data(); + errctrl_len = data_len; + spcodec_inlen = data_len; + sublen = ConfigHelper::get_npart(data_len, pardeg); + + // std::cout << "datalen\t" << data_len << '\n'; + // std::cout << "errctrl_len\t" << errctrl_len << '\n'; + // std::cout << "spcodec_inlen\t" << spcodec_inlen << '\n'; + // std::cout << "sublen\t" << sublen << '\n'; + }; + + auto update_header = [&]() { + header.x = data_len3.x; + header.y = data_len3.y; + header.z = data_len3.z; + header.w = 1; // placeholder + header.radius = radius; + header.vle_pardeg = pardeg; + header.eb = eb; + header.byte_vle = use_fallback_codec ? 8 : 4; + }; + + /******************************************************************************/ + + // Prediction is the dependency of the rest procedures. + predictor->construct(LorenzoI, data_len3, uncompressed, &d_anchor, &d_errctrl, &d_outlier, eb, radius, stream); + // peek_devdata(d_errctrl); + + derive_lengths_after_prediction(); + /******************************************************************************/ + + asz::stat::histogram(d_errctrl, errctrl_len, d_freq, booklen, &time_hist, stream); + + /* debug */ CHECK_CUDA(cudaStreamSynchronize(stream)); + + // TODO remove duplicate get_frequency inside encode_with_exception() + encode_with_exception( + d_errctrl, errctrl_len, // input + d_freq, booklen, sublen, pardeg, codec_force_fallback, // config + d_codec_out, codec_outlen, // output + stream, dbg_print); + + (*spcodec).encode(d_outlier, spcodec_inlen, d_spfmt, spfmt_outlen, stream, dbg_print); + + /* debug */ CHECK_CUDA(cudaStreamSynchronize(stream)); + + /******************************************************************************/ + + update_header(); + subfile_collect( + d_anchor, (*predictor).get_len_anchor(), // + d_codec_out, codec_outlen, // + d_spfmt, spfmt_outlen, // + stream, dbg_print); + + // output + compressed_len = ConfigHelper::get_filesize(&header); + compressed = d_reserved_compressed; + + collect_compress_timerecord(); + + // considering that codec can be consecutively in use, and can compress data of different huff-byte + use_fallback_codec = false; +} + +TEMPLATE_TYPE +void IMPL::clear_buffer() +{ // + (*predictor).clear_buffer(); + (*codec).clear_buffer(); + (*spcodec).clear_buffer(); +} + +TEMPLATE_TYPE +void IMPL::decompress(Header* header, BYTE* in_compressed, T* out_decompressed, cudaStream_t stream, bool dbg_print) +{ + // TODO host having copy of header when compressing + if (not header) { + header = new Header; + CHECK_CUDA(cudaMemcpyAsync(header, in_compressed, sizeof(Header), cudaMemcpyDeviceToHost, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + } + + data_len3 = dim3(header->x, header->y, header->z); + + use_fallback_codec = header->byte_vle == 8; + double const eb = header->eb; + int const radius = header->radius; + auto const vle_pardeg = header->vle_pardeg; + + // The inputs of components are from `compressed`. + auto d_anchor = ACCESSOR(ANCHOR, T); + auto d_vle = ACCESSOR(VLE, BYTE); + auto d_sp = ACCESSOR(SPFMT, BYTE); + + // wire the workspace + auto d_errctrl = (*predictor).expose_quant(); // reuse space + + // wire and aliasing + auto d_outlier = out_decompressed; + auto d_outlier_xdata = out_decompressed; + + auto spcodec_do = [&]() { (*spcodec).decode(d_sp, d_outlier, stream); }; + auto decode_with_exception = [&]() { + if (not use_fallback_codec) { // + (*codec).decode(d_vle, d_errctrl); + } + else { + if (not fallback_codec_allocated) { + (*fb_codec).init((*predictor).get_len_quant(), radius * 2, vle_pardeg, /*dbg print*/ false); + fallback_codec_allocated = true; + } + (*fb_codec).decode(d_vle, d_errctrl); + } + }; + auto predictor_do = [&]() { + (*predictor).reconstruct(LorenzoI, data_len3, d_outlier_xdata, d_anchor, d_errctrl, eb, radius, stream); + }; + + // process + spcodec_do(), decode_with_exception(), predictor_do(); + + collect_decompress_timerecord(); + + // clear state for the next decompression after reporting + use_fallback_codec = false; +} + +// public getter +TEMPLATE_TYPE +void IMPL::export_header(Header& ext_header) { ext_header = header; } + +TEMPLATE_TYPE +void IMPL::export_header(Header* ext_header) { *ext_header = header; } + +TEMPLATE_TYPE +void IMPL::export_timerecord(TimeRecord* ext_timerecord) +{ + if (ext_timerecord) *ext_timerecord = timerecord; +} + +// helper +TEMPLATE_TYPE +void IMPL::init_codec(size_t codec_in_len, unsigned int codec_config, int max_booklen, int pardeg, bool dbg_print) +{ + if (codec_config == 0b00) throw std::runtime_error("Argument codec_config must have set bit(s)."); + if (codec_config bitand 0b01) { + if (dbg_print) LOGGING(LOG_INFO, "allocated 4-byte codec"); + (*codec).init(codec_in_len, max_booklen, pardeg, dbg_print); + } + if (codec_config bitand 0b10) { + if (dbg_print) LOGGING(LOG_INFO, "allocated 8-byte (fallback) codec"); + (*fb_codec).init(codec_in_len, max_booklen, pardeg, dbg_print); + fallback_codec_allocated = true; + } +}; + +TEMPLATE_TYPE +template +void IMPL::init_detail(CONFIG* config, bool dbg_print) +{ + const auto cfg_radius = config->radius; + const auto cfg_pardeg = config->vle_pardeg; + const auto density_factor = config->nz_density_factor; + const auto codec_config = config->codecs_in_use; + const auto cfg_max_booklen = cfg_radius * 2; + const auto x = config->x; + const auto y = config->y; + const auto z = config->z; + + size_t spcodec_in_len, codec_in_len; + + (*predictor).init(LorenzoI, x, y, z, dbg_print); + + spcodec_in_len = (*predictor).get_alloclen_data(); + codec_in_len = (*predictor).get_alloclen_quant(); + + (*spcodec).init(spcodec_in_len, density_factor, dbg_print); + + { + auto bytes = sizeof(cusz::FREQ) * cfg_max_booklen; + cudaMalloc(&d_freq, bytes); + cudaMemset(d_freq, 0x0, bytes); + + // cudaMalloc(&d_freq_another, bytes); + // cudaMemset(d_freq_another, 0x0, bytes); + } + + init_codec(codec_in_len, codec_config, cfg_max_booklen, cfg_pardeg, dbg_print); + + CHECK_CUDA(cudaMalloc(&d_reserved_compressed, (*predictor).get_alloclen_data() * sizeof(T) / 2)); +} + +TEMPLATE_TYPE +void IMPL::collect_compress_timerecord() +{ +#define COLLECT_TIME(NAME, TIME) timerecord.push_back({const_cast(NAME), TIME}); + + if (not timerecord.empty()) timerecord.clear(); + + COLLECT_TIME("predict", (*predictor).get_time_elapsed()); + COLLECT_TIME("histogram", time_hist); + + if (not use_fallback_codec) { + COLLECT_TIME("book", (*codec).get_time_book()); + COLLECT_TIME("huff-enc", (*codec).get_time_lossless()); + } + else { + COLLECT_TIME("book", (*fb_codec).get_time_book()); + COLLECT_TIME("huff-enc", (*fb_codec).get_time_lossless()); + } + + COLLECT_TIME("outlier", (*spcodec).get_time_elapsed()); +} + +TEMPLATE_TYPE +void IMPL::collect_decompress_timerecord() +{ + if (not timerecord.empty()) timerecord.clear(); + + COLLECT_TIME("outlier", (*spcodec).get_time_elapsed()); + + if (not use_fallback_codec) { // + COLLECT_TIME("huff-dec", (*codec).get_time_lossless()); + } + else { // + COLLECT_TIME("huff-dec", (*fb_codec).get_time_lossless()); + } + + COLLECT_TIME("predict", (*predictor).get_time_elapsed()); +} + +TEMPLATE_TYPE +void IMPL::encode_with_exception( + E* d_in, + size_t inlen, + cusz::FREQ* d_freq, + int booklen, + int sublen, + int pardeg, + bool codec_force_fallback, + BYTE*& d_out, + size_t& outlen, + cudaStream_t stream, + bool dbg_print) +{ + auto build_codebook_using = [&](auto encoder) { encoder->build_codebook(d_freq, booklen, stream); }; + auto encode_with = [&](auto encoder) { encoder->encode(d_in, inlen, d_out, outlen, stream); }; + + auto try_fallback_alloc = [&]() { + use_fallback_codec = true; + if (not fallback_codec_allocated) { + LOGGING(LOG_EXCEPTION, "online allocate fallback (8-byte) codec"); + fb_codec->init(inlen, booklen, pardeg, dbg_print); + fallback_codec_allocated = true; + } + }; + + /******************************************************************************/ + if (not codec_force_fallback) { + try { + build_codebook_using(codec); + encode_with(codec); + } + catch (const std::runtime_error& e) { + LOGGING(LOG_EXCEPTION, "switch to fallback codec"); + try_fallback_alloc(); + + build_codebook_using(fb_codec); + encode_with(fb_codec); + } + } + else { + LOGGING(LOG_INFO, "force switch to fallback codec"); + try_fallback_alloc(); + + build_codebook_using(fb_codec); + encode_with(fb_codec); + } +} + +TEMPLATE_TYPE +void IMPL::subfile_collect( + T* d_anchor, + size_t anchor_len, + BYTE* d_codec_out, + size_t codec_outlen, + BYTE* d_spfmt_out, + size_t spfmt_outlen, + cudaStream_t stream, + bool dbg_print) +{ + header.self_bytes = sizeof(Header); + uint32_t nbyte[Header::END]; + nbyte[Header::HEADER] = sizeof(Header); + nbyte[Header::ANCHOR] = sizeof(T) * anchor_len; + nbyte[Header::VLE] = sizeof(BYTE) * codec_outlen; + nbyte[Header::SPFMT] = sizeof(BYTE) * spfmt_outlen; + + header.entry[0] = 0; + // *.END + 1; need to know the ending position + for (auto i = 1; i < Header::END + 1; i++) { header.entry[i] = nbyte[i - 1]; } + for (auto i = 1; i < Header::END + 1; i++) { header.entry[i] += header.entry[i - 1]; } + + auto debug_header_entry = [&]() { + printf("\nsubfile collect in compressor:\n"); + printf(" ENTRIES\n"); + + PRINT_ENTRY(HEADER); + PRINT_ENTRY(ANCHOR); + PRINT_ENTRY(VLE); + PRINT_ENTRY(SPFMT); + PRINT_ENTRY(END); + printf("\n"); + }; + + if (dbg_print) debug_header_entry(); + + CHECK_CUDA(cudaMemcpyAsync(d_reserved_compressed, &header, sizeof(header), cudaMemcpyHostToDevice, stream)); + + DEVICE2DEVICE_COPY(d_anchor, ANCHOR) + DEVICE2DEVICE_COPY(d_codec_out, VLE) + DEVICE2DEVICE_COPY(d_spfmt_out, SPFMT) + + /* debug */ CHECK_CUDA(cudaStreamSynchronize(stream)); +} + +} // namespace cusz + +#undef FREEDEV +#undef FREEHOST +#undef DEFINE_DEV +#undef DEFINE_HOST +#undef DEVICE2DEVICE_COPY +#undef PRINT_ENTRY +#undef ACCESSOR +#undef COLLECT_TIME + +#undef TEMPLATE_TYPE +#undef IMPL + +#endif diff --git a/qtensor/compression/cusz/src/detail/spmat.cu b/qtensor/compression/cusz/src/detail/spmat.cu index 141d2acb..b6a95bb2 100644 --- a/qtensor/compression/cusz/src/detail/spmat.cu +++ b/qtensor/compression/cusz/src/detail/spmat.cu @@ -1,14 +1,14 @@ -/** - * @file spmat.cu - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2021-09-28 - * - * (C) 2021 by Washington State University, Argonne National Laboratory - * - */ - -#include "detail/spmat.cuh" - -template struct cusz::SpcodecCSR::impl; +/** + * @file spmat.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-09-28 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#include "detail/spmat.cuh" + +template struct cusz::SpcodecCSR::impl; diff --git a/qtensor/compression/cusz/src/detail/spv_gpu.inl b/qtensor/compression/cusz/src/detail/spv_gpu.inl index 4775926e..4c724bd5 100644 --- a/qtensor/compression/cusz/src/detail/spv_gpu.inl +++ b/qtensor/compression/cusz/src/detail/spv_gpu.inl @@ -1,77 +1,77 @@ -/** - * @file spv_gpu.inl - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-08-22 - * (update) 2022-10-29 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef F88E11A6_6B61_4C6F_8B2E_30EEAAB4D204 -#define F88E11A6_6B61_4C6F_8B2E_30EEAAB4D204 - -#include -#include -#include -#include -#include -#include - -#include "utils/timer.h" - -namespace psz { -namespace detail { - -template -void spv_gather( - T* in, - size_t const in_len, - T* d_val, - uint32_t* d_idx, - int* nnz, - float* milliseconds, - cudaStream_t stream) -{ - using thrust::placeholders::_1; - - thrust::cuda::par.on(stream); - thrust::counting_iterator zero(0); - - CREATE_CUDAEVENT_PAIR; - START_CUDAEVENT_RECORDING(stream); - - // find out the indices - *nnz = thrust::copy_if(thrust::device, zero, zero + in_len, in, d_idx, _1 != 0) - d_idx; - - // fetch corresponding values - thrust::copy( - thrust::device, thrust::make_permutation_iterator(in, d_idx), - thrust::make_permutation_iterator(in + *nnz, d_idx + *nnz), d_val); - - STOP_CUDAEVENT_RECORDING(stream); - TIME_ELAPSED_CUDAEVENT(milliseconds); - DESTROY_CUDAEVENT_PAIR; -} - -template -void spv_scatter(T* d_val, uint32_t* d_idx, int const nnz, T* decoded, float* milliseconds, cudaStream_t stream) -{ - thrust::cuda::par.on(stream); - - CREATE_CUDAEVENT_PAIR; - START_CUDAEVENT_RECORDING(stream); - - thrust::scatter(thrust::device, d_val, d_val + nnz, d_idx, decoded); - - STOP_CUDAEVENT_RECORDING(stream); - TIME_ELAPSED_CUDAEVENT(milliseconds); - DESTROY_CUDAEVENT_PAIR; -} - -} // namespace detail -} // namespace psz - -#endif /* F88E11A6_6B61_4C6F_8B2E_30EEAAB4D204 */ +/** + * @file spv_gpu.inl + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-08-22 + * (update) 2022-10-29 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef F88E11A6_6B61_4C6F_8B2E_30EEAAB4D204 +#define F88E11A6_6B61_4C6F_8B2E_30EEAAB4D204 + +#include +#include +#include +#include +#include +#include + +#include "utils/timer.h" + +namespace psz { +namespace detail { + +template +void spv_gather( + T* in, + size_t const in_len, + T* d_val, + uint32_t* d_idx, + int* nnz, + float* milliseconds, + cudaStream_t stream) +{ + using thrust::placeholders::_1; + + thrust::cuda::par.on(stream); + thrust::counting_iterator zero(0); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + // find out the indices + *nnz = thrust::copy_if(thrust::device, zero, zero + in_len, in, d_idx, _1 != 0) - d_idx; + + // fetch corresponding values + thrust::copy( + thrust::device, thrust::make_permutation_iterator(in, d_idx), + thrust::make_permutation_iterator(in + *nnz, d_idx + *nnz), d_val); + + STOP_CUDAEVENT_RECORDING(stream); + TIME_ELAPSED_CUDAEVENT(milliseconds); + DESTROY_CUDAEVENT_PAIR; +} + +template +void spv_scatter(T* d_val, uint32_t* d_idx, int const nnz, T* decoded, float* milliseconds, cudaStream_t stream) +{ + thrust::cuda::par.on(stream); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + thrust::scatter(thrust::device, d_val, d_val + nnz, d_idx, decoded); + + STOP_CUDAEVENT_RECORDING(stream); + TIME_ELAPSED_CUDAEVENT(milliseconds); + DESTROY_CUDAEVENT_PAIR; +} + +} // namespace detail +} // namespace psz + +#endif /* F88E11A6_6B61_4C6F_8B2E_30EEAAB4D204 */ diff --git a/qtensor/compression/cusz/src/detail/spvec.cu b/qtensor/compression/cusz/src/detail/spvec.cu index e9b9ab6f..7ed562db 100644 --- a/qtensor/compression/cusz/src/detail/spvec.cu +++ b/qtensor/compression/cusz/src/detail/spvec.cu @@ -1,18 +1,18 @@ -/** - * @file spvec.cu - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-03-01 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#include "detail/spvec.cuh" - -template struct cusz::SpcodecVec::impl; -template struct cusz::SpcodecVec::impl; -template struct cusz::SpcodecVec::impl; -template struct cusz::SpcodecVec::impl; -// template struct cusz::SpcodecVec::impl; +/** + * @file spvec.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-03-01 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#include "detail/spvec.cuh" + +template struct cusz::SpcodecVec::impl; +template struct cusz::SpcodecVec::impl; +template struct cusz::SpcodecVec::impl; +template struct cusz::SpcodecVec::impl; +// template struct cusz::SpcodecVec::impl; diff --git a/qtensor/compression/cusz/src/experimental/Makefile b/qtensor/compression/cusz/src/experimental/Makefile index cecce6f5..22807665 100644 --- a/qtensor/compression/cusz/src/experimental/Makefile +++ b/qtensor/compression/cusz/src/experimental/Makefile @@ -1,7 +1,7 @@ -altlorenzo: - nvcc -lineinfo -std=c++17 \ - --extended-lambda \ - -DDPCPP_SHOWCASE \ - ../wrapper/extrap_lorenzo.cu \ - dpcpp_demo_lorenzo.cu \ - -o dpcpp_demo_lorenzo +altlorenzo: + nvcc -lineinfo -std=c++17 \ + --extended-lambda \ + -DDPCPP_SHOWCASE \ + ../wrapper/extrap_lorenzo.cu \ + dpcpp_demo_lorenzo.cu \ + -o dpcpp_demo_lorenzo diff --git a/qtensor/compression/cusz/src/experimental/dpcpp_demo_lorenzo.cu b/qtensor/compression/cusz/src/experimental/dpcpp_demo_lorenzo.cu index 375d648d..6d5123a0 100644 --- a/qtensor/compression/cusz/src/experimental/dpcpp_demo_lorenzo.cu +++ b/qtensor/compression/cusz/src/experimental/dpcpp_demo_lorenzo.cu @@ -1,120 +1,120 @@ -/** - * @file withwrapper_lorenzo.cu - * @author Jiannan Tian - * @brief A temporary test case using high-level component/API. - * @version 0.3 - * @date 2021-06-21 - * - * (C) 2021 by Washington State University, Argonne National Laboratory - * - */ - -#include -#include -#include -#include -#include -#include -#include "../utils/io.hh" -#include "../utils/verify.hh" - -#pragma message "--extended-lambda causes migration error (nvcc is incapable to be a wellrounded compiler)." -// #include "../utils/verify_gpu.cuh" -#include "../component/extrap_lorenzo.h" - -using std::cout; -using std::endl; - -using Data = float; -using Quant = uint16_t; -using FP = float; - -Data eb; -Data maxval, minval; - -// dim3 stride3; -size_t len1; -int radius = 512; - -namespace { - -#ifndef __CUDACC__ -struct __dim3_compat { - unsigned int x, y, z; - __dim3_compat(unsigned int _x, unsigned int _y, unsigned int _z){}; -}; - -using dim3 = __dim3_compat; -#endif - -auto get_npart = [](auto size, auto subsize) { - static_assert( - std::numeric_limits::is_integer and std::numeric_limits::is_integer, - "[get_npart] must be plain interger types."); - return (size + subsize - 1) / subsize; -}; -auto get_len_from_dim3 = [](dim3 size) { return size.x * size.y * size.z; }; -auto get_stride3 = [](dim3 size) -> dim3 { return dim3(1, size.x, size.x * size.y); }; - -} // namespace - -void test_lorenzo(std::string fname, int ndim, dim3 size3) -{ - cout << "filename: " << fname << '\n'; - - Data* h_data{nullptr}; - Data* d_data{nullptr}; - Data* h2_data{nullptr}; - Quant* d_quant{nullptr}; - - auto len1 = get_len_from_dim3(size3); - cout << "len1 from dim3: " << len1 << endl; - - cudaMallocHost(&h_data, len1 * sizeof(Data)); - io::read_binary_to_array(fname, h_data, len1); - cudaMallocHost(&h2_data, len1 * sizeof(Data)); - memcpy(h2_data, h_data, len1 * sizeof(Data)); - - cudaMalloc(&d_data, len1 * sizeof(Data)); - cudaMemcpy(d_data, h_data, len1 * sizeof(Data), cudaMemcpyHostToDevice); - cudaMalloc(&d_quant, len1 * sizeof(Quant)); - - auto maxval = *std::max_element(h_data, h_data + len1); - auto minval = *std::min_element(h_data, h_data + len1); - eb = 1e-3 * (maxval - minval); - - compress_lorenzo_construct(d_data, d_quant, size3, ndim, eb, radius); - decompress_lorenzo_reconstruct(d_data, d_quant, size3, ndim, eb, radius); - - cudaMemcpy(h_data, d_data, len1 * sizeof(Data), cudaMemcpyDeviceToHost); - - // TODO GPU verification does not print - // { - // Stat stat_gpu; - // verify_data_GPU(&stat_gpu, h_data, h2_data, len1); - // cusz::QualityViewer::print_metrics_cross(&stat_gpu, false, eb, 0, 1, false, true); - // } - { - Stat stat; - cusz::verify_data(&stat, h_data, h2_data, len1); - cusz::QualityViewer::print_metrics_cross(&stat, false, eb, 0, 1, false, false); - } - - // clear up - cudaFree(d_data); - cudaFree(d_quant); - cudaFreeHost(h_data); - cudaFreeHost(h2_data); -} - -int main() -{ - struct passwd* pw = getpwuid(getuid()); - const char* homedir = pw->pw_dir; - - test_lorenzo(std::string(homedir) + "/datafields/vx", 1, dim3(280953867, 1, 1)); - test_lorenzo(std::string(homedir) + "/datafields/CLDHGH", 2, dim3(3600, 1800, 1)); - test_lorenzo(std::string(homedir) + "/datafields/CLOUDf48", 3, dim3(500, 500, 100)); - - return 0; -} +/** + * @file withwrapper_lorenzo.cu + * @author Jiannan Tian + * @brief A temporary test case using high-level component/API. + * @version 0.3 + * @date 2021-06-21 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#include +#include +#include +#include +#include +#include +#include "../utils/io.hh" +#include "../utils/verify.hh" + +#pragma message "--extended-lambda causes migration error (nvcc is incapable to be a wellrounded compiler)." +// #include "../utils/verify_gpu.cuh" +#include "../component/extrap_lorenzo.h" + +using std::cout; +using std::endl; + +using Data = float; +using Quant = uint16_t; +using FP = float; + +Data eb; +Data maxval, minval; + +// dim3 stride3; +size_t len1; +int radius = 512; + +namespace { + +#ifndef __CUDACC__ +struct __dim3_compat { + unsigned int x, y, z; + __dim3_compat(unsigned int _x, unsigned int _y, unsigned int _z){}; +}; + +using dim3 = __dim3_compat; +#endif + +auto get_npart = [](auto size, auto subsize) { + static_assert( + std::numeric_limits::is_integer and std::numeric_limits::is_integer, + "[get_npart] must be plain interger types."); + return (size + subsize - 1) / subsize; +}; +auto get_len_from_dim3 = [](dim3 size) { return size.x * size.y * size.z; }; +auto get_stride3 = [](dim3 size) -> dim3 { return dim3(1, size.x, size.x * size.y); }; + +} // namespace + +void test_lorenzo(std::string fname, int ndim, dim3 size3) +{ + cout << "filename: " << fname << '\n'; + + Data* h_data{nullptr}; + Data* d_data{nullptr}; + Data* h2_data{nullptr}; + Quant* d_quant{nullptr}; + + auto len1 = get_len_from_dim3(size3); + cout << "len1 from dim3: " << len1 << endl; + + cudaMallocHost(&h_data, len1 * sizeof(Data)); + io::read_binary_to_array(fname, h_data, len1); + cudaMallocHost(&h2_data, len1 * sizeof(Data)); + memcpy(h2_data, h_data, len1 * sizeof(Data)); + + cudaMalloc(&d_data, len1 * sizeof(Data)); + cudaMemcpy(d_data, h_data, len1 * sizeof(Data), cudaMemcpyHostToDevice); + cudaMalloc(&d_quant, len1 * sizeof(Quant)); + + auto maxval = *std::max_element(h_data, h_data + len1); + auto minval = *std::min_element(h_data, h_data + len1); + eb = 1e-3 * (maxval - minval); + + compress_lorenzo_construct(d_data, d_quant, size3, ndim, eb, radius); + decompress_lorenzo_reconstruct(d_data, d_quant, size3, ndim, eb, radius); + + cudaMemcpy(h_data, d_data, len1 * sizeof(Data), cudaMemcpyDeviceToHost); + + // TODO GPU verification does not print + // { + // Stat stat_gpu; + // verify_data_GPU(&stat_gpu, h_data, h2_data, len1); + // cusz::QualityViewer::print_metrics_cross(&stat_gpu, false, eb, 0, 1, false, true); + // } + { + Stat stat; + cusz::verify_data(&stat, h_data, h2_data, len1); + cusz::QualityViewer::print_metrics_cross(&stat, false, eb, 0, 1, false, false); + } + + // clear up + cudaFree(d_data); + cudaFree(d_quant); + cudaFreeHost(h_data); + cudaFreeHost(h2_data); +} + +int main() +{ + struct passwd* pw = getpwuid(getuid()); + const char* homedir = pw->pw_dir; + + test_lorenzo(std::string(homedir) + "/datafields/vx", 1, dim3(280953867, 1, 1)); + test_lorenzo(std::string(homedir) + "/datafields/CLDHGH", 2, dim3(3600, 1800, 1)); + test_lorenzo(std::string(homedir) + "/datafields/CLOUDf48", 3, dim3(500, 500, 100)); + + return 0; +} diff --git a/qtensor/compression/cusz/src/hf/detail/hf_bookg.inl b/qtensor/compression/cusz/src/hf/detail/hf_bookg.inl index 3fb9ef82..27890728 100644 --- a/qtensor/compression/cusz/src/hf/detail/hf_bookg.inl +++ b/qtensor/compression/cusz/src/hf/detail/hf_bookg.inl @@ -1,742 +1,742 @@ -/** - * @file huffman_parbook.cu - * @author Cody Rivera (cjrivera1@crimson.ua.edu) - * @brief Parallel Huffman Construction to generates canonical forward codebook. - * Based on [Ostadzadeh et al. 2007] (https://dblp.org/rec/conf/pdpta/OstadzadehEZMB07.bib) - * "A Two-phase Practical Parallel Algorithm for Construction of Huffman Codes". - * @version 0.1 - * @date 2020-10-24 - * (created) 2020-05 (rev) 2021-06-21 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#ifndef C883A574_4491_40E8_A083_1B6E8FB56670 -#define C883A574_4491_40E8_A083_1B6E8FB56670 - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "common.hh" -#include "hf/hf_bookg.hh" -#include "par_merge.inl" -#include "utils.hh" -#include "utils/timer.h" - -using std::cout; -using std::endl; -namespace cg = cooperative_groups; - -// GenerateCL Locals -__device__ int iNodesFront = 0; -__device__ int iNodesRear = 0; -__device__ int lNodesCur = 0; - -__device__ int iNodesSize = 0; -__device__ int curLeavesNum; - -__device__ int minFreq; - -__device__ int tempLength; - -__device__ int mergeFront; -__device__ int mergeRear; - -__device__ int lNodesIndex; - -// GenerateCW Locals -__device__ int CCL; -__device__ int CDPI; -__device__ int newCDPI; - -// Profiling -__device__ long long int s[10]; -__device__ long long int st[10]; - -// Mathematically correct mod -#define MOD(a, b) ((((a) % (b)) + (b)) % (b)) - -namespace par_huffman { -namespace detail { - -// clang-format off -template __global__ void GPU_FillArraySequence(T*, unsigned int); -template __global__ void GPU_GetFirstNonzeroIndex(T*, unsigned int, unsigned int*); -template __global__ void GPU_ReverseArray(T*, unsigned int); -template __global__ void GPU_ReorderByIndex(H*, T*, unsigned int); -// clang-format on - -} // namespace detail -} // namespace par_huffman - -namespace par_huffman { - -// Codeword length -template -__global__ void GPU_GenerateCL(F*, F*, int, F*, int*, F*, int*, F*, int*, int*, F*, int*, int*, uint32_t*, int, int); - -// Forward Codebook -template -__global__ void GPU_GenerateCW(F* CL, H* CW, H* first, H* entry, int size); - -} // namespace par_huffman - -// Parallel huffman code generation -// clang-format off -template -__global__ void par_huffman::GPU_GenerateCL( - F* histogram, F* CL, int size, - /* Global Arrays */ - F* lNodesFreq, int* lNodesLeader, - F* iNodesFreq, int* iNodesLeader, - F* tempFreq, int* tempIsLeaf, int* tempIndex, - F* copyFreq, int* copyIsLeaf, int* copyIndex, - uint32_t* diagonal_path_intersections, int mblocks, int mthreads) -{ - // clang-format on - - extern __shared__ int32_t shmem[]; - // Shared variables - int32_t& x_top = shmem[0]; - int32_t& y_top = shmem[1]; - int32_t& x_bottom = shmem[2]; - int32_t& y_bottom = shmem[3]; - int32_t& found = shmem[4]; - int32_t* oneorzero = &shmem[5]; - - unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; - const unsigned int i = thread; // Adaptation for easier porting - auto current_grid = cg::this_grid(); - - /* Initialization */ - if (thread < size) { - lNodesLeader[i] = -1; - CL[i] = 0; - } - - if (thread == 0) { - iNodesFront = 0; - iNodesRear = 0; - lNodesCur = 0; - - iNodesSize = 0; - } - current_grid.sync(); - - /* While there is not exactly one internal node */ - while (lNodesCur < size || iNodesSize > 1) { - /* Combine two most frequent nodes on same level */ - if (thread == 0) { - F midFreq[4]; - int midIsLeaf[4]; - for (int i = 0; i < 4; ++i) midFreq[i] = UINT_MAX; - - if (lNodesCur < size) { - midFreq[0] = lNodesFreq[lNodesCur]; - midIsLeaf[0] = 1; - } - if (lNodesCur < size - 1) { - midFreq[1] = lNodesFreq[lNodesCur + 1]; - midIsLeaf[1] = 1; - } - if (iNodesSize >= 1) { - midFreq[2] = iNodesFreq[iNodesFront]; - midIsLeaf[2] = 0; - } - if (iNodesSize >= 2) { - midFreq[3] = iNodesFreq[MOD(iNodesFront + 1, size)]; - midIsLeaf[3] = 0; - } - - /* Select the minimum of minimums - 4elt sorting network */ - /* TODO There's likely a good 1-warp faster way to do this */ - { - F tempFreq; - int tempIsLeaf; - if (midFreq[1] > midFreq[3]) { - tempFreq = midFreq[1]; - midFreq[1] = midFreq[3]; - midFreq[3] = tempFreq; - tempIsLeaf = midIsLeaf[1]; - midIsLeaf[1] = midIsLeaf[3]; - midIsLeaf[3] = tempIsLeaf; - } - if (midFreq[0] > midFreq[2]) { - tempFreq = midFreq[0]; - midFreq[0] = midFreq[2]; - midFreq[2] = tempFreq; - tempIsLeaf = midIsLeaf[0]; - midIsLeaf[0] = midIsLeaf[2]; - midIsLeaf[2] = tempIsLeaf; - } - if (midFreq[0] > midFreq[1]) { - tempFreq = midFreq[0]; - midFreq[0] = midFreq[1]; - midFreq[1] = tempFreq; - tempIsLeaf = midIsLeaf[0]; - midIsLeaf[0] = midIsLeaf[1]; - midIsLeaf[1] = tempIsLeaf; - } - if (midFreq[2] > midFreq[3]) { - tempFreq = midFreq[2]; - midFreq[2] = midFreq[3]; - midFreq[3] = tempFreq; - tempIsLeaf = midIsLeaf[2]; - midIsLeaf[2] = midIsLeaf[3]; - midIsLeaf[3] = tempIsLeaf; - } - if (midFreq[1] > midFreq[2]) { - tempFreq = midFreq[1]; - midFreq[1] = midFreq[2]; - midFreq[2] = tempFreq; - tempIsLeaf = midIsLeaf[1]; - midIsLeaf[1] = midIsLeaf[2]; - midIsLeaf[2] = tempIsLeaf; - } - } - - minFreq = midFreq[0]; - if (midFreq[1] < UINT_MAX) { minFreq += midFreq[1]; } - iNodesFreq[iNodesRear] = minFreq; - iNodesLeader[iNodesRear] = -1; - - /* If is leaf */ - if (midIsLeaf[0]) { - lNodesLeader[lNodesCur] = iNodesRear; - ++CL[lNodesCur], ++lNodesCur; - } - else { - iNodesLeader[iNodesFront] = iNodesRear; - iNodesFront = MOD(iNodesFront + 1, size); - } - if (midIsLeaf[1]) { - lNodesLeader[lNodesCur] = iNodesRear; - ++CL[lNodesCur], ++lNodesCur; - } - else { - iNodesLeader[iNodesFront] = iNodesRear; - iNodesFront = MOD(iNodesFront + 1, size); /* ? */ - } - - // iNodesRear = MOD(iNodesRear + 1, size); - - iNodesSize = MOD(iNodesRear - iNodesFront, size); - } - - // int curLeavesNum; - /* Select elements to copy -- parallelized */ - curLeavesNum = 0; - current_grid.sync(); - if (i >= lNodesCur && i < size) { - // Parallel component - int threadCurLeavesNum; - if (lNodesFreq[i] <= minFreq) { - threadCurLeavesNum = i - lNodesCur + 1; - // Atomic max -- Largest valid index - atomicMax(&curLeavesNum, threadCurLeavesNum); - } - - if (i - lNodesCur < curLeavesNum) { - copyFreq[i - lNodesCur] = lNodesFreq[i]; - copyIndex[i - lNodesCur] = i; - copyIsLeaf[i - lNodesCur] = 1; - } - } - - current_grid.sync(); - - /* Updates Iterators */ - if (thread == 0) { - mergeRear = iNodesRear; - mergeFront = iNodesFront; - - if ((curLeavesNum + iNodesSize) % 2 == 0) { iNodesFront = iNodesRear; } - /* Odd number of nodes to merge - leave out one*/ - else if ( - (iNodesSize != 0) // - and (curLeavesNum == 0 // - or (histogram[lNodesCur + curLeavesNum] <= iNodesFreq[MOD(iNodesRear - 1, size)])) // - ) { - mergeRear = MOD(mergeRear - 1, size); - iNodesFront = MOD(iNodesRear - 1, size); - } - else { - iNodesFront = iNodesRear; - --curLeavesNum; - } - - lNodesCur = lNodesCur + curLeavesNum; - iNodesRear = MOD(iNodesRear + 1, size); - } - current_grid.sync(); - - /* Parallelized Merging Phase */ - - /*if (thread == 0) { - merge(copyFreq, copyIndex, copyIsLeaf, 0, curLeavesNum, - iNodesFreq, mergeFront, mergeRear, size, - tempFreq, tempIndex, tempIsLeaf, tempLength); - }*/ - - parMerge( - copyFreq, copyIndex, copyIsLeaf, 0, curLeavesNum, // - iNodesFreq, mergeFront, mergeRear, size, // - tempFreq, tempIndex, tempIsLeaf, tempLength, // - diagonal_path_intersections, mblocks, mthreads, // - x_top, y_top, x_bottom, y_bottom, found, oneorzero); - current_grid.sync(); - - /* Melding phase -- New */ - if (thread < tempLength / 2) { - int ind = MOD(iNodesRear + i, size); - iNodesFreq[ind] = tempFreq[(2 * i)] + tempFreq[(2 * i) + 1]; - iNodesLeader[ind] = -1; - - if (tempIsLeaf[(2 * i)]) { - lNodesLeader[tempIndex[(2 * i)]] = ind; - ++CL[tempIndex[(2 * i)]]; - } - else { - iNodesLeader[tempIndex[(2 * i)]] = ind; - } - if (tempIsLeaf[(2 * i) + 1]) { - lNodesLeader[tempIndex[(2 * i) + 1]] = ind; - ++CL[tempIndex[(2 * i) + 1]]; - } - else { - iNodesLeader[tempIndex[(2 * i) + 1]] = ind; - } - } - current_grid.sync(); - - if (thread == 0) { iNodesRear = MOD(iNodesRear + (tempLength / 2), size); } - current_grid.sync(); - - /* Update leaders */ - if (thread < size) { - if (lNodesLeader[i] != -1) { - if (iNodesLeader[lNodesLeader[i]] != -1) { - lNodesLeader[i] = iNodesLeader[lNodesLeader[i]]; - ++CL[i]; - } - } - } - current_grid.sync(); - - if (thread == 0) { iNodesSize = MOD(iNodesRear - iNodesFront, size); } - current_grid.sync(); - } -} - -// Parallelized with atomic writes, but could replace with Jiannan's similar code -template -__global__ void par_huffman::GPU_GenerateCW(F* CL, H* CW, H* first, H* entry, int size) -{ - unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; - const unsigned int i = thread; // Porting convenience - auto current_grid = cg::this_grid(); - auto type_bw = sizeof(H) * 8; - - /* Reverse in place - Probably a more CUDA-appropriate way */ - if (thread < size / 2) { - F temp = CL[i]; - CL[i] = CL[size - i - 1]; - CL[size - i - 1] = temp; - } - current_grid.sync(); - - if (thread == 0) { - CCL = CL[0]; - CDPI = 0; - newCDPI = size - 1; - entry[CCL] = 0; - - // Edge case -- only one input symbol - CW[CDPI] = 0; - first[CCL] = CW[CDPI] ^ (((H)1 << (H)CL[CDPI]) - 1); - entry[CCL + 1] = 1; - } - current_grid.sync(); - - // Initialize first and entry arrays - if (thread < CCL) { - // Initialization of first to Max ensures that unused code - // lengths are skipped over in decoding. - first[i] = std::numeric_limits::max(); - entry[i] = 0; - } - // Initialize first element of entry - current_grid.sync(); - - while (CDPI < size - 1) { - // CDPI update - if (i < size - 1 && CL[i + 1] > CCL) { atomicMin(&newCDPI, i); } - current_grid.sync(); - - // Last element to update - const int updateEnd = (newCDPI >= size - 1) ? type_bw : CL[newCDPI + 1]; - // Fill base - const int curEntryVal = entry[CCL]; - // Number of elements of length CCL - const int numCCL = (newCDPI - CDPI + 1); - - // Get first codeword - if (i == 0) { - if (CDPI == 0) { CW[newCDPI] = 0; } - else { - CW[newCDPI] = CW[CDPI]; // Pre-stored - } - } - current_grid.sync(); - - if (i < size) { - // Parallel canonical codeword generation - if (i >= CDPI && i < newCDPI) { CW[i] = CW[newCDPI] + (newCDPI - i); } - } - - // Update entry and first arrays in O(1) time - if (thread > CCL && thread < updateEnd) { entry[i] = curEntryVal + numCCL; } - // Add number of entries to next CCL - if (thread == 0) { - if (updateEnd < type_bw) { entry[updateEnd] = curEntryVal + numCCL; } - } - current_grid.sync(); - - // Update first array in O(1) time - if (thread == CCL) { - // Flip least significant CL[CDPI] bits - first[CCL] = CW[CDPI] ^ (((H)1 << (H)CL[CDPI]) - 1); - } - if (thread > CCL && thread < updateEnd) { first[i] = std::numeric_limits::max(); } - current_grid.sync(); - - if (thread == 0) { - if (newCDPI < size - 1) { - int CLDiff = CL[newCDPI + 1] - CL[newCDPI]; - // Add and shift -- Next canonical code - CW[newCDPI + 1] = ((CW[CDPI] + 1) << CLDiff); - CCL = CL[newCDPI + 1]; - - ++newCDPI; - } - - // Update CDPI to newCDPI after codeword length increase - CDPI = newCDPI; - newCDPI = size - 1; - } - current_grid.sync(); - } - - if (thread < size) { - /* Make encoded codeword compatible with CUSZ */ - CW[i] = (CW[i] | (((H)CL[i] & (H)0xffu) << ((sizeof(H) * 8) - 8))) ^ (((H)1 << (H)CL[i]) - 1); - } - current_grid.sync(); - - /* Reverse partial codebook */ - if (thread < size / 2) { - H temp = CW[i]; - CW[i] = CW[size - i - 1]; - CW[size - i - 1] = temp; - } -} - -// TODO forceinilne? -// Helper implementations -template -__global__ void par_huffman::detail::GPU_FillArraySequence(T* array, unsigned int size) -{ - unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; - if (thread < size) { array[thread] = thread; } -} - -// Precondition -- Result is preset to be equal to size -template -__global__ void par_huffman::detail::GPU_GetFirstNonzeroIndex(T* array, unsigned int size, unsigned int* result) -{ - unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; - if (array[thread] != 0) { atomicMin(result, thread); } -} - -namespace par_huffman { -namespace detail { -__global__ void GPU_GetMaxCWLength(unsigned int* CL, unsigned int size, unsigned int* result) -{ - (void)size; - unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; - if (thread == 0) { *result = CL[0]; } -} - -} // namespace detail -} // namespace par_huffman - -/** - * @brief Reorders given a set of indices. Programmer must ensure that all index[i] - * are unique or else race conditions may occur - * - * @tparam T - * @tparam Q - * @param array e.g., codebook - * @param index e.g., input data - * @param size - * @return __global__ - */ -template -__global__ void par_huffman::detail::GPU_ReorderByIndex(H* array, T* index, unsigned int size) -{ - unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; - H temp; - T newIndex; - if (thread < size) { - temp = array[thread]; - newIndex = index[thread]; - array[(int)newIndex] = temp; - } -} - -// Reverses a given array. -template -__global__ void par_huffman::detail::GPU_ReverseArray(T* array, unsigned int size) -{ - unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; - if (thread < size / 2) { - T temp = array[thread]; - array[thread] = array[size - thread - 1]; - array[size - thread - 1] = temp; - } -} - -// Parallel codebook generation wrapper -template -void asz::hf_buildbook_g( - uint32_t* freq, - int const dict_size, - H* codebook, - uint8_t* reverse_codebook, - int const revbook_nbyte, - float* time_book, - cudaStream_t stream) -{ - // Metadata - auto type_bw = sizeof(H) * 8; - auto _d_first = reinterpret_cast(reverse_codebook); - auto _d_entry = reinterpret_cast(reverse_codebook + (sizeof(H) * type_bw)); - auto _d_qcode = reinterpret_cast(reverse_codebook + (sizeof(H) * 2 * type_bw)); - - CREATE_CUDAEVENT_PAIR; - START_CUDAEVENT_RECORDING(stream); - - // Sort Qcodes by frequency - int nblocks = (dict_size / 1024) + 1; - par_huffman::detail::GPU_FillArraySequence<<>>(_d_qcode, (unsigned int)dict_size); - cudaStreamSynchronize(stream); - - /** - * Originally from par_huffman_sortbyfreq.cu by Cody Rivera (cjrivera1@crimson.ua.edu) - * Sorts quantization codes by frequency, using a key-value sort. This functionality is placed in a separate - * compilation unit as thrust calls fail in par_huffman.cu. - * - * Resolved by - * 1) inlining function - * 2) using `thrust::device_pointer_cast(var)` instead of `thrust::device_pointer(var)` - */ - auto lambda_sort_by_freq = [] __host__(auto freq, auto len, auto qcode) { - thrust::sort_by_key( - thrust::device_pointer_cast(freq), thrust::device_pointer_cast(freq + len), - thrust::device_pointer_cast(qcode)); - }; - - lambda_sort_by_freq(freq, dict_size, _d_qcode); - cudaStreamSynchronize(stream); - - unsigned int* d_first_nonzero_index; - unsigned int first_nonzero_index = dict_size; - cudaMalloc(&d_first_nonzero_index, sizeof(unsigned int)); - cudaMemcpy(d_first_nonzero_index, &first_nonzero_index, sizeof(unsigned int), cudaMemcpyHostToDevice); - par_huffman::detail::GPU_GetFirstNonzeroIndex - <<>>(freq, dict_size, d_first_nonzero_index); - cudaStreamSynchronize(stream); - cudaMemcpy(&first_nonzero_index, d_first_nonzero_index, sizeof(unsigned int), cudaMemcpyDeviceToHost); - cudaFree(d_first_nonzero_index); - - int nz_dict_size = dict_size - first_nonzero_index; - unsigned int* _nz_d_freq = freq + first_nonzero_index; - H* _nz_d_codebook = codebook + first_nonzero_index; - int nz_nblocks = (nz_dict_size / 1024) + 1; - - // Memory Allocation -- Perhaps put in another wrapper - // clang-format off - unsigned int *CL = nullptr; - /*unsigned int* lNodesFreq*/ int *lNodesLeader = nullptr; - unsigned int *iNodesFreq = nullptr; int *iNodesLeader = nullptr; - unsigned int *tempFreq = nullptr; int *tempIsLeaf = nullptr; int *tempIndex = nullptr; - unsigned int *copyFreq = nullptr; int *copyIsLeaf = nullptr; int *copyIndex = nullptr; - cudaMalloc(&CL, nz_dict_size * sizeof(unsigned int) ); - cudaMalloc(&lNodesLeader, nz_dict_size * sizeof(int) ); - cudaMalloc(&iNodesFreq, nz_dict_size * sizeof(unsigned int) ); - cudaMalloc(&iNodesLeader, nz_dict_size * sizeof(int) ); - cudaMalloc(&tempFreq, nz_dict_size * sizeof(unsigned int) ); - cudaMalloc(&tempIsLeaf, nz_dict_size * sizeof(int) ); - cudaMalloc(&tempIndex, nz_dict_size * sizeof(int) ); - cudaMalloc(©Freq, nz_dict_size * sizeof(unsigned int) ); - cudaMalloc(©IsLeaf, nz_dict_size * sizeof(int) ); - cudaMalloc(©Index, nz_dict_size * sizeof(int) ); - cudaMemset(CL, 0, nz_dict_size * sizeof(int) ); - // clang-format on - - // Grid configuration for CL -- based on Cooperative Groups - int cg_mblocks; - int cg_blocks_sm; - int device_id; - int mthreads = 32; // 1 warp - cudaDeviceProp deviceProp; - cudaGetDevice(&device_id); - cudaGetDeviceProperties(&deviceProp, device_id); - cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &cg_blocks_sm, par_huffman::GPU_GenerateCL, mthreads, 5 * sizeof(int32_t) + 32 * sizeof(int32_t)); - cg_mblocks = deviceProp.multiProcessorCount * cg_blocks_sm; - - int ELTS_PER_SEQ_MERGE = 16; - int mblocks = std::min(cg_mblocks, (nz_dict_size / ELTS_PER_SEQ_MERGE) + 1); - - // Exit if not enough exposed parallelism -- TODO modify kernels so this is unneeded - int tthreads = mthreads * mblocks; - if (tthreads < nz_dict_size) { - cout << LOG_ERR << "Insufficient on-device parallelism to construct a " << nz_dict_size - << " non-zero item codebook" << endl; - cout << LOG_ERR << "Provided parallelism: " << mblocks << " blocks, " << mthreads << " threads, " << tthreads - << " total" << endl - << endl; - // cout << LOG_ERR << "Exiting cuSZ ..." << endl; - throw std::system_error(); - // exit(1); - } - - uint32_t* diagonal_path_intersections; - cudaMalloc(&diagonal_path_intersections, (2 * (mblocks + 1)) * sizeof(uint32_t)); - - // Codebook already init'ed - cudaStreamSynchronize(stream); - - // Call first kernel - // Collect arguments - void* CL_Args[] = {(void*)&_nz_d_freq, (void*)&CL, - (void*)&nz_dict_size, (void*)&_nz_d_freq, - (void*)&lNodesLeader, (void*)&iNodesFreq, - (void*)&iNodesLeader, (void*)&tempFreq, - (void*)&tempIsLeaf, (void*)&tempIndex, - (void*)©Freq, (void*)©IsLeaf, - (void*)©Index, (void*)&diagonal_path_intersections, - (void*)&mblocks, (void*)&mthreads}; - // Cooperative Launch - cudaLaunchCooperativeKernel( - (void*)par_huffman::GPU_GenerateCL, mblocks, mthreads, CL_Args, - 5 * sizeof(int32_t) + 32 * sizeof(int32_t)); - cudaStreamSynchronize(stream); - - // Exits if the highest codeword length is greater than what - // the adaptive representation can handle - // TODO do proper cleanup - - unsigned int* d_max_CL; - unsigned int max_CL; - cudaMalloc(&d_max_CL, sizeof(unsigned int)); - par_huffman::detail::GPU_GetMaxCWLength<<<1, 1>>>(CL, nz_dict_size, d_max_CL); - cudaStreamSynchronize(stream); - cudaMemcpy(&max_CL, d_max_CL, sizeof(unsigned int), cudaMemcpyDeviceToHost); - cudaFree(d_max_CL); - - int max_CW_bits = (sizeof(H) * 8) - 8; - if (max_CL > max_CW_bits) { - cout << LOG_ERR << "Cannot store all Huffman codewords in " << max_CW_bits + 8 << "-bit representation" << endl; - cout << LOG_ERR << "Huffman codeword representation requires at least " << max_CL + 8 - << " bits (longest codeword: " << max_CL << " bits)" << endl; - // cout << LOG_ERR << "(Consider running with -H 8 for 8-byte representation)" << endl << endl; - // cout << LOG_ERR << "Exiting cuSZ ..." << endl; - // exit(1); - throw std::runtime_error("Falling back to 8-byte Codec."); - } - - // Configure CW for 1024 threads/block - int cg_cw_mblocks = (cg_mblocks * mthreads) / 1024; - int cw_mblocks = std::min(cg_cw_mblocks, nz_nblocks); - - // Exit if not enough exposed parallelism -- TODO modify kernels so this is unneeded - int cw_tthreads = cw_mblocks * 1024; - if (cw_tthreads < nz_dict_size) { - cout << LOG_ERR << "Insufficient on-device parallelism to construct a " << nz_dict_size - << " non-zero item codebook" << endl; - cout << LOG_ERR << "Provided parallelism: " << cw_mblocks << " blocks, " << 1024 << " threads, " << cw_tthreads - << " total" << endl - << endl; - // cout << LOG_ERR << "Exiting cuSZ ..." << endl; - // exit(1); - throw std::system_error(); - } - - void* CW_Args[] = { - (void*)&CL, // - (void*)&_nz_d_codebook, // - (void*)&_d_first, // - (void*)&_d_entry, // - (void*)&nz_dict_size}; - - // Call second kernel - cudaLaunchCooperativeKernel( - (void*)par_huffman::GPU_GenerateCW, // - cw_mblocks, // - 1024, // - CW_Args); - cudaStreamSynchronize(stream); - -#ifdef D_DEBUG_PRINT - print_codebook<<<1, 32>>>(codebook, dict_size); // PASS - cudaStreamSynchronize(stream); -#endif - - // Reverse _d_qcode and codebook - par_huffman::detail::GPU_ReverseArray<<>>(codebook, (unsigned int)dict_size); - par_huffman::detail::GPU_ReverseArray<<>>(_d_qcode, (unsigned int)dict_size); - cudaStreamSynchronize(stream); - - par_huffman::detail::GPU_ReorderByIndex<<>>(codebook, _d_qcode, (unsigned int)dict_size); - cudaStreamSynchronize(stream); - - STOP_CUDAEVENT_RECORDING(stream); - TIME_ELAPSED_CUDAEVENT(time_book); - DESTROY_CUDAEVENT_PAIR; - - // Cleanup - cudaFree(CL); - cudaFree(lNodesLeader); - cudaFree(iNodesFreq); - cudaFree(iNodesLeader); - cudaFree(tempFreq); - cudaFree(tempIsLeaf); - cudaFree(tempIndex); - cudaFree(copyFreq); - cudaFree(copyIsLeaf); - cudaFree(copyIndex); - cudaFree(diagonal_path_intersections); - cudaStreamSynchronize(stream); - -#ifdef D_DEBUG_PRINT - print_codebook<<<1, 32>>>(codebook, dict_size); // PASS - cudaStreamSynchronize(stream); -#endif -} - -#endif /* C883A574_4491_40E8_A083_1B6E8FB56670 */ +/** + * @file huffman_parbook.cu + * @author Cody Rivera (cjrivera1@crimson.ua.edu) + * @brief Parallel Huffman Construction to generates canonical forward codebook. + * Based on [Ostadzadeh et al. 2007] (https://dblp.org/rec/conf/pdpta/OstadzadehEZMB07.bib) + * "A Two-phase Practical Parallel Algorithm for Construction of Huffman Codes". + * @version 0.1 + * @date 2020-10-24 + * (created) 2020-05 (rev) 2021-06-21 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef C883A574_4491_40E8_A083_1B6E8FB56670 +#define C883A574_4491_40E8_A083_1B6E8FB56670 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.hh" +#include "hf/hf_bookg.hh" +#include "par_merge.inl" +#include "utils.hh" +#include "utils/timer.h" + +using std::cout; +using std::endl; +namespace cg = cooperative_groups; + +// GenerateCL Locals +__device__ int iNodesFront = 0; +__device__ int iNodesRear = 0; +__device__ int lNodesCur = 0; + +__device__ int iNodesSize = 0; +__device__ int curLeavesNum; + +__device__ int minFreq; + +__device__ int tempLength; + +__device__ int mergeFront; +__device__ int mergeRear; + +__device__ int lNodesIndex; + +// GenerateCW Locals +__device__ int CCL; +__device__ int CDPI; +__device__ int newCDPI; + +// Profiling +__device__ long long int s[10]; +__device__ long long int st[10]; + +// Mathematically correct mod +#define MOD(a, b) ((((a) % (b)) + (b)) % (b)) + +namespace par_huffman { +namespace detail { + +// clang-format off +template __global__ void GPU_FillArraySequence(T*, unsigned int); +template __global__ void GPU_GetFirstNonzeroIndex(T*, unsigned int, unsigned int*); +template __global__ void GPU_ReverseArray(T*, unsigned int); +template __global__ void GPU_ReorderByIndex(H*, T*, unsigned int); +// clang-format on + +} // namespace detail +} // namespace par_huffman + +namespace par_huffman { + +// Codeword length +template +__global__ void GPU_GenerateCL(F*, F*, int, F*, int*, F*, int*, F*, int*, int*, F*, int*, int*, uint32_t*, int, int); + +// Forward Codebook +template +__global__ void GPU_GenerateCW(F* CL, H* CW, H* first, H* entry, int size); + +} // namespace par_huffman + +// Parallel huffman code generation +// clang-format off +template +__global__ void par_huffman::GPU_GenerateCL( + F* histogram, F* CL, int size, + /* Global Arrays */ + F* lNodesFreq, int* lNodesLeader, + F* iNodesFreq, int* iNodesLeader, + F* tempFreq, int* tempIsLeaf, int* tempIndex, + F* copyFreq, int* copyIsLeaf, int* copyIndex, + uint32_t* diagonal_path_intersections, int mblocks, int mthreads) +{ + // clang-format on + + extern __shared__ int32_t shmem[]; + // Shared variables + int32_t& x_top = shmem[0]; + int32_t& y_top = shmem[1]; + int32_t& x_bottom = shmem[2]; + int32_t& y_bottom = shmem[3]; + int32_t& found = shmem[4]; + int32_t* oneorzero = &shmem[5]; + + unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; + const unsigned int i = thread; // Adaptation for easier porting + auto current_grid = cg::this_grid(); + + /* Initialization */ + if (thread < size) { + lNodesLeader[i] = -1; + CL[i] = 0; + } + + if (thread == 0) { + iNodesFront = 0; + iNodesRear = 0; + lNodesCur = 0; + + iNodesSize = 0; + } + current_grid.sync(); + + /* While there is not exactly one internal node */ + while (lNodesCur < size || iNodesSize > 1) { + /* Combine two most frequent nodes on same level */ + if (thread == 0) { + F midFreq[4]; + int midIsLeaf[4]; + for (int i = 0; i < 4; ++i) midFreq[i] = UINT_MAX; + + if (lNodesCur < size) { + midFreq[0] = lNodesFreq[lNodesCur]; + midIsLeaf[0] = 1; + } + if (lNodesCur < size - 1) { + midFreq[1] = lNodesFreq[lNodesCur + 1]; + midIsLeaf[1] = 1; + } + if (iNodesSize >= 1) { + midFreq[2] = iNodesFreq[iNodesFront]; + midIsLeaf[2] = 0; + } + if (iNodesSize >= 2) { + midFreq[3] = iNodesFreq[MOD(iNodesFront + 1, size)]; + midIsLeaf[3] = 0; + } + + /* Select the minimum of minimums - 4elt sorting network */ + /* TODO There's likely a good 1-warp faster way to do this */ + { + F tempFreq; + int tempIsLeaf; + if (midFreq[1] > midFreq[3]) { + tempFreq = midFreq[1]; + midFreq[1] = midFreq[3]; + midFreq[3] = tempFreq; + tempIsLeaf = midIsLeaf[1]; + midIsLeaf[1] = midIsLeaf[3]; + midIsLeaf[3] = tempIsLeaf; + } + if (midFreq[0] > midFreq[2]) { + tempFreq = midFreq[0]; + midFreq[0] = midFreq[2]; + midFreq[2] = tempFreq; + tempIsLeaf = midIsLeaf[0]; + midIsLeaf[0] = midIsLeaf[2]; + midIsLeaf[2] = tempIsLeaf; + } + if (midFreq[0] > midFreq[1]) { + tempFreq = midFreq[0]; + midFreq[0] = midFreq[1]; + midFreq[1] = tempFreq; + tempIsLeaf = midIsLeaf[0]; + midIsLeaf[0] = midIsLeaf[1]; + midIsLeaf[1] = tempIsLeaf; + } + if (midFreq[2] > midFreq[3]) { + tempFreq = midFreq[2]; + midFreq[2] = midFreq[3]; + midFreq[3] = tempFreq; + tempIsLeaf = midIsLeaf[2]; + midIsLeaf[2] = midIsLeaf[3]; + midIsLeaf[3] = tempIsLeaf; + } + if (midFreq[1] > midFreq[2]) { + tempFreq = midFreq[1]; + midFreq[1] = midFreq[2]; + midFreq[2] = tempFreq; + tempIsLeaf = midIsLeaf[1]; + midIsLeaf[1] = midIsLeaf[2]; + midIsLeaf[2] = tempIsLeaf; + } + } + + minFreq = midFreq[0]; + if (midFreq[1] < UINT_MAX) { minFreq += midFreq[1]; } + iNodesFreq[iNodesRear] = minFreq; + iNodesLeader[iNodesRear] = -1; + + /* If is leaf */ + if (midIsLeaf[0]) { + lNodesLeader[lNodesCur] = iNodesRear; + ++CL[lNodesCur], ++lNodesCur; + } + else { + iNodesLeader[iNodesFront] = iNodesRear; + iNodesFront = MOD(iNodesFront + 1, size); + } + if (midIsLeaf[1]) { + lNodesLeader[lNodesCur] = iNodesRear; + ++CL[lNodesCur], ++lNodesCur; + } + else { + iNodesLeader[iNodesFront] = iNodesRear; + iNodesFront = MOD(iNodesFront + 1, size); /* ? */ + } + + // iNodesRear = MOD(iNodesRear + 1, size); + + iNodesSize = MOD(iNodesRear - iNodesFront, size); + } + + // int curLeavesNum; + /* Select elements to copy -- parallelized */ + curLeavesNum = 0; + current_grid.sync(); + if (i >= lNodesCur && i < size) { + // Parallel component + int threadCurLeavesNum; + if (lNodesFreq[i] <= minFreq) { + threadCurLeavesNum = i - lNodesCur + 1; + // Atomic max -- Largest valid index + atomicMax(&curLeavesNum, threadCurLeavesNum); + } + + if (i - lNodesCur < curLeavesNum) { + copyFreq[i - lNodesCur] = lNodesFreq[i]; + copyIndex[i - lNodesCur] = i; + copyIsLeaf[i - lNodesCur] = 1; + } + } + + current_grid.sync(); + + /* Updates Iterators */ + if (thread == 0) { + mergeRear = iNodesRear; + mergeFront = iNodesFront; + + if ((curLeavesNum + iNodesSize) % 2 == 0) { iNodesFront = iNodesRear; } + /* Odd number of nodes to merge - leave out one*/ + else if ( + (iNodesSize != 0) // + and (curLeavesNum == 0 // + or (histogram[lNodesCur + curLeavesNum] <= iNodesFreq[MOD(iNodesRear - 1, size)])) // + ) { + mergeRear = MOD(mergeRear - 1, size); + iNodesFront = MOD(iNodesRear - 1, size); + } + else { + iNodesFront = iNodesRear; + --curLeavesNum; + } + + lNodesCur = lNodesCur + curLeavesNum; + iNodesRear = MOD(iNodesRear + 1, size); + } + current_grid.sync(); + + /* Parallelized Merging Phase */ + + /*if (thread == 0) { + merge(copyFreq, copyIndex, copyIsLeaf, 0, curLeavesNum, + iNodesFreq, mergeFront, mergeRear, size, + tempFreq, tempIndex, tempIsLeaf, tempLength); + }*/ + + parMerge( + copyFreq, copyIndex, copyIsLeaf, 0, curLeavesNum, // + iNodesFreq, mergeFront, mergeRear, size, // + tempFreq, tempIndex, tempIsLeaf, tempLength, // + diagonal_path_intersections, mblocks, mthreads, // + x_top, y_top, x_bottom, y_bottom, found, oneorzero); + current_grid.sync(); + + /* Melding phase -- New */ + if (thread < tempLength / 2) { + int ind = MOD(iNodesRear + i, size); + iNodesFreq[ind] = tempFreq[(2 * i)] + tempFreq[(2 * i) + 1]; + iNodesLeader[ind] = -1; + + if (tempIsLeaf[(2 * i)]) { + lNodesLeader[tempIndex[(2 * i)]] = ind; + ++CL[tempIndex[(2 * i)]]; + } + else { + iNodesLeader[tempIndex[(2 * i)]] = ind; + } + if (tempIsLeaf[(2 * i) + 1]) { + lNodesLeader[tempIndex[(2 * i) + 1]] = ind; + ++CL[tempIndex[(2 * i) + 1]]; + } + else { + iNodesLeader[tempIndex[(2 * i) + 1]] = ind; + } + } + current_grid.sync(); + + if (thread == 0) { iNodesRear = MOD(iNodesRear + (tempLength / 2), size); } + current_grid.sync(); + + /* Update leaders */ + if (thread < size) { + if (lNodesLeader[i] != -1) { + if (iNodesLeader[lNodesLeader[i]] != -1) { + lNodesLeader[i] = iNodesLeader[lNodesLeader[i]]; + ++CL[i]; + } + } + } + current_grid.sync(); + + if (thread == 0) { iNodesSize = MOD(iNodesRear - iNodesFront, size); } + current_grid.sync(); + } +} + +// Parallelized with atomic writes, but could replace with Jiannan's similar code +template +__global__ void par_huffman::GPU_GenerateCW(F* CL, H* CW, H* first, H* entry, int size) +{ + unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; + const unsigned int i = thread; // Porting convenience + auto current_grid = cg::this_grid(); + auto type_bw = sizeof(H) * 8; + + /* Reverse in place - Probably a more CUDA-appropriate way */ + if (thread < size / 2) { + F temp = CL[i]; + CL[i] = CL[size - i - 1]; + CL[size - i - 1] = temp; + } + current_grid.sync(); + + if (thread == 0) { + CCL = CL[0]; + CDPI = 0; + newCDPI = size - 1; + entry[CCL] = 0; + + // Edge case -- only one input symbol + CW[CDPI] = 0; + first[CCL] = CW[CDPI] ^ (((H)1 << (H)CL[CDPI]) - 1); + entry[CCL + 1] = 1; + } + current_grid.sync(); + + // Initialize first and entry arrays + if (thread < CCL) { + // Initialization of first to Max ensures that unused code + // lengths are skipped over in decoding. + first[i] = std::numeric_limits::max(); + entry[i] = 0; + } + // Initialize first element of entry + current_grid.sync(); + + while (CDPI < size - 1) { + // CDPI update + if (i < size - 1 && CL[i + 1] > CCL) { atomicMin(&newCDPI, i); } + current_grid.sync(); + + // Last element to update + const int updateEnd = (newCDPI >= size - 1) ? type_bw : CL[newCDPI + 1]; + // Fill base + const int curEntryVal = entry[CCL]; + // Number of elements of length CCL + const int numCCL = (newCDPI - CDPI + 1); + + // Get first codeword + if (i == 0) { + if (CDPI == 0) { CW[newCDPI] = 0; } + else { + CW[newCDPI] = CW[CDPI]; // Pre-stored + } + } + current_grid.sync(); + + if (i < size) { + // Parallel canonical codeword generation + if (i >= CDPI && i < newCDPI) { CW[i] = CW[newCDPI] + (newCDPI - i); } + } + + // Update entry and first arrays in O(1) time + if (thread > CCL && thread < updateEnd) { entry[i] = curEntryVal + numCCL; } + // Add number of entries to next CCL + if (thread == 0) { + if (updateEnd < type_bw) { entry[updateEnd] = curEntryVal + numCCL; } + } + current_grid.sync(); + + // Update first array in O(1) time + if (thread == CCL) { + // Flip least significant CL[CDPI] bits + first[CCL] = CW[CDPI] ^ (((H)1 << (H)CL[CDPI]) - 1); + } + if (thread > CCL && thread < updateEnd) { first[i] = std::numeric_limits::max(); } + current_grid.sync(); + + if (thread == 0) { + if (newCDPI < size - 1) { + int CLDiff = CL[newCDPI + 1] - CL[newCDPI]; + // Add and shift -- Next canonical code + CW[newCDPI + 1] = ((CW[CDPI] + 1) << CLDiff); + CCL = CL[newCDPI + 1]; + + ++newCDPI; + } + + // Update CDPI to newCDPI after codeword length increase + CDPI = newCDPI; + newCDPI = size - 1; + } + current_grid.sync(); + } + + if (thread < size) { + /* Make encoded codeword compatible with CUSZ */ + CW[i] = (CW[i] | (((H)CL[i] & (H)0xffu) << ((sizeof(H) * 8) - 8))) ^ (((H)1 << (H)CL[i]) - 1); + } + current_grid.sync(); + + /* Reverse partial codebook */ + if (thread < size / 2) { + H temp = CW[i]; + CW[i] = CW[size - i - 1]; + CW[size - i - 1] = temp; + } +} + +// TODO forceinilne? +// Helper implementations +template +__global__ void par_huffman::detail::GPU_FillArraySequence(T* array, unsigned int size) +{ + unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thread < size) { array[thread] = thread; } +} + +// Precondition -- Result is preset to be equal to size +template +__global__ void par_huffman::detail::GPU_GetFirstNonzeroIndex(T* array, unsigned int size, unsigned int* result) +{ + unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; + if (array[thread] != 0) { atomicMin(result, thread); } +} + +namespace par_huffman { +namespace detail { +__global__ void GPU_GetMaxCWLength(unsigned int* CL, unsigned int size, unsigned int* result) +{ + (void)size; + unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thread == 0) { *result = CL[0]; } +} + +} // namespace detail +} // namespace par_huffman + +/** + * @brief Reorders given a set of indices. Programmer must ensure that all index[i] + * are unique or else race conditions may occur + * + * @tparam T + * @tparam Q + * @param array e.g., codebook + * @param index e.g., input data + * @param size + * @return __global__ + */ +template +__global__ void par_huffman::detail::GPU_ReorderByIndex(H* array, T* index, unsigned int size) +{ + unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; + H temp; + T newIndex; + if (thread < size) { + temp = array[thread]; + newIndex = index[thread]; + array[(int)newIndex] = temp; + } +} + +// Reverses a given array. +template +__global__ void par_huffman::detail::GPU_ReverseArray(T* array, unsigned int size) +{ + unsigned int thread = (blockIdx.x * blockDim.x) + threadIdx.x; + if (thread < size / 2) { + T temp = array[thread]; + array[thread] = array[size - thread - 1]; + array[size - thread - 1] = temp; + } +} + +// Parallel codebook generation wrapper +template +void asz::hf_buildbook_g( + uint32_t* freq, + int const dict_size, + H* codebook, + uint8_t* reverse_codebook, + int const revbook_nbyte, + float* time_book, + cudaStream_t stream) +{ + // Metadata + auto type_bw = sizeof(H) * 8; + auto _d_first = reinterpret_cast(reverse_codebook); + auto _d_entry = reinterpret_cast(reverse_codebook + (sizeof(H) * type_bw)); + auto _d_qcode = reinterpret_cast(reverse_codebook + (sizeof(H) * 2 * type_bw)); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + // Sort Qcodes by frequency + int nblocks = (dict_size / 1024) + 1; + par_huffman::detail::GPU_FillArraySequence<<>>(_d_qcode, (unsigned int)dict_size); + cudaStreamSynchronize(stream); + + /** + * Originally from par_huffman_sortbyfreq.cu by Cody Rivera (cjrivera1@crimson.ua.edu) + * Sorts quantization codes by frequency, using a key-value sort. This functionality is placed in a separate + * compilation unit as thrust calls fail in par_huffman.cu. + * + * Resolved by + * 1) inlining function + * 2) using `thrust::device_pointer_cast(var)` instead of `thrust::device_pointer(var)` + */ + auto lambda_sort_by_freq = [] __host__(auto freq, auto len, auto qcode) { + thrust::sort_by_key( + thrust::device_pointer_cast(freq), thrust::device_pointer_cast(freq + len), + thrust::device_pointer_cast(qcode)); + }; + + lambda_sort_by_freq(freq, dict_size, _d_qcode); + cudaStreamSynchronize(stream); + + unsigned int* d_first_nonzero_index; + unsigned int first_nonzero_index = dict_size; + cudaMalloc(&d_first_nonzero_index, sizeof(unsigned int)); + cudaMemcpy(d_first_nonzero_index, &first_nonzero_index, sizeof(unsigned int), cudaMemcpyHostToDevice); + par_huffman::detail::GPU_GetFirstNonzeroIndex + <<>>(freq, dict_size, d_first_nonzero_index); + cudaStreamSynchronize(stream); + cudaMemcpy(&first_nonzero_index, d_first_nonzero_index, sizeof(unsigned int), cudaMemcpyDeviceToHost); + cudaFree(d_first_nonzero_index); + + int nz_dict_size = dict_size - first_nonzero_index; + unsigned int* _nz_d_freq = freq + first_nonzero_index; + H* _nz_d_codebook = codebook + first_nonzero_index; + int nz_nblocks = (nz_dict_size / 1024) + 1; + + // Memory Allocation -- Perhaps put in another wrapper + // clang-format off + unsigned int *CL = nullptr; + /*unsigned int* lNodesFreq*/ int *lNodesLeader = nullptr; + unsigned int *iNodesFreq = nullptr; int *iNodesLeader = nullptr; + unsigned int *tempFreq = nullptr; int *tempIsLeaf = nullptr; int *tempIndex = nullptr; + unsigned int *copyFreq = nullptr; int *copyIsLeaf = nullptr; int *copyIndex = nullptr; + cudaMalloc(&CL, nz_dict_size * sizeof(unsigned int) ); + cudaMalloc(&lNodesLeader, nz_dict_size * sizeof(int) ); + cudaMalloc(&iNodesFreq, nz_dict_size * sizeof(unsigned int) ); + cudaMalloc(&iNodesLeader, nz_dict_size * sizeof(int) ); + cudaMalloc(&tempFreq, nz_dict_size * sizeof(unsigned int) ); + cudaMalloc(&tempIsLeaf, nz_dict_size * sizeof(int) ); + cudaMalloc(&tempIndex, nz_dict_size * sizeof(int) ); + cudaMalloc(©Freq, nz_dict_size * sizeof(unsigned int) ); + cudaMalloc(©IsLeaf, nz_dict_size * sizeof(int) ); + cudaMalloc(©Index, nz_dict_size * sizeof(int) ); + cudaMemset(CL, 0, nz_dict_size * sizeof(int) ); + // clang-format on + + // Grid configuration for CL -- based on Cooperative Groups + int cg_mblocks; + int cg_blocks_sm; + int device_id; + int mthreads = 32; // 1 warp + cudaDeviceProp deviceProp; + cudaGetDevice(&device_id); + cudaGetDeviceProperties(&deviceProp, device_id); + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &cg_blocks_sm, par_huffman::GPU_GenerateCL, mthreads, 5 * sizeof(int32_t) + 32 * sizeof(int32_t)); + cg_mblocks = deviceProp.multiProcessorCount * cg_blocks_sm; + + int ELTS_PER_SEQ_MERGE = 16; + int mblocks = std::min(cg_mblocks, (nz_dict_size / ELTS_PER_SEQ_MERGE) + 1); + + // Exit if not enough exposed parallelism -- TODO modify kernels so this is unneeded + int tthreads = mthreads * mblocks; + if (tthreads < nz_dict_size) { + cout << LOG_ERR << "Insufficient on-device parallelism to construct a " << nz_dict_size + << " non-zero item codebook" << endl; + cout << LOG_ERR << "Provided parallelism: " << mblocks << " blocks, " << mthreads << " threads, " << tthreads + << " total" << endl + << endl; + // cout << LOG_ERR << "Exiting cuSZ ..." << endl; + throw std::system_error(); + // exit(1); + } + + uint32_t* diagonal_path_intersections; + cudaMalloc(&diagonal_path_intersections, (2 * (mblocks + 1)) * sizeof(uint32_t)); + + // Codebook already init'ed + cudaStreamSynchronize(stream); + + // Call first kernel + // Collect arguments + void* CL_Args[] = {(void*)&_nz_d_freq, (void*)&CL, + (void*)&nz_dict_size, (void*)&_nz_d_freq, + (void*)&lNodesLeader, (void*)&iNodesFreq, + (void*)&iNodesLeader, (void*)&tempFreq, + (void*)&tempIsLeaf, (void*)&tempIndex, + (void*)©Freq, (void*)©IsLeaf, + (void*)©Index, (void*)&diagonal_path_intersections, + (void*)&mblocks, (void*)&mthreads}; + // Cooperative Launch + cudaLaunchCooperativeKernel( + (void*)par_huffman::GPU_GenerateCL, mblocks, mthreads, CL_Args, + 5 * sizeof(int32_t) + 32 * sizeof(int32_t)); + cudaStreamSynchronize(stream); + + // Exits if the highest codeword length is greater than what + // the adaptive representation can handle + // TODO do proper cleanup + + unsigned int* d_max_CL; + unsigned int max_CL; + cudaMalloc(&d_max_CL, sizeof(unsigned int)); + par_huffman::detail::GPU_GetMaxCWLength<<<1, 1>>>(CL, nz_dict_size, d_max_CL); + cudaStreamSynchronize(stream); + cudaMemcpy(&max_CL, d_max_CL, sizeof(unsigned int), cudaMemcpyDeviceToHost); + cudaFree(d_max_CL); + + int max_CW_bits = (sizeof(H) * 8) - 8; + if (max_CL > max_CW_bits) { + cout << LOG_ERR << "Cannot store all Huffman codewords in " << max_CW_bits + 8 << "-bit representation" << endl; + cout << LOG_ERR << "Huffman codeword representation requires at least " << max_CL + 8 + << " bits (longest codeword: " << max_CL << " bits)" << endl; + // cout << LOG_ERR << "(Consider running with -H 8 for 8-byte representation)" << endl << endl; + // cout << LOG_ERR << "Exiting cuSZ ..." << endl; + // exit(1); + throw std::runtime_error("Falling back to 8-byte Codec."); + } + + // Configure CW for 1024 threads/block + int cg_cw_mblocks = (cg_mblocks * mthreads) / 1024; + int cw_mblocks = std::min(cg_cw_mblocks, nz_nblocks); + + // Exit if not enough exposed parallelism -- TODO modify kernels so this is unneeded + int cw_tthreads = cw_mblocks * 1024; + if (cw_tthreads < nz_dict_size) { + cout << LOG_ERR << "Insufficient on-device parallelism to construct a " << nz_dict_size + << " non-zero item codebook" << endl; + cout << LOG_ERR << "Provided parallelism: " << cw_mblocks << " blocks, " << 1024 << " threads, " << cw_tthreads + << " total" << endl + << endl; + // cout << LOG_ERR << "Exiting cuSZ ..." << endl; + // exit(1); + throw std::system_error(); + } + + void* CW_Args[] = { + (void*)&CL, // + (void*)&_nz_d_codebook, // + (void*)&_d_first, // + (void*)&_d_entry, // + (void*)&nz_dict_size}; + + // Call second kernel + cudaLaunchCooperativeKernel( + (void*)par_huffman::GPU_GenerateCW, // + cw_mblocks, // + 1024, // + CW_Args); + cudaStreamSynchronize(stream); + +#ifdef D_DEBUG_PRINT + print_codebook<<<1, 32>>>(codebook, dict_size); // PASS + cudaStreamSynchronize(stream); +#endif + + // Reverse _d_qcode and codebook + par_huffman::detail::GPU_ReverseArray<<>>(codebook, (unsigned int)dict_size); + par_huffman::detail::GPU_ReverseArray<<>>(_d_qcode, (unsigned int)dict_size); + cudaStreamSynchronize(stream); + + par_huffman::detail::GPU_ReorderByIndex<<>>(codebook, _d_qcode, (unsigned int)dict_size); + cudaStreamSynchronize(stream); + + STOP_CUDAEVENT_RECORDING(stream); + TIME_ELAPSED_CUDAEVENT(time_book); + DESTROY_CUDAEVENT_PAIR; + + // Cleanup + cudaFree(CL); + cudaFree(lNodesLeader); + cudaFree(iNodesFreq); + cudaFree(iNodesLeader); + cudaFree(tempFreq); + cudaFree(tempIsLeaf); + cudaFree(tempIndex); + cudaFree(copyFreq); + cudaFree(copyIsLeaf); + cudaFree(copyIndex); + cudaFree(diagonal_path_intersections); + cudaStreamSynchronize(stream); + +#ifdef D_DEBUG_PRINT + print_codebook<<<1, 32>>>(codebook, dict_size); // PASS + cudaStreamSynchronize(stream); +#endif +} + +#endif /* C883A574_4491_40E8_A083_1B6E8FB56670 */ diff --git a/qtensor/compression/cusz/src/hf/detail/hf_codecg.inl b/qtensor/compression/cusz/src/hf/detail/hf_codecg.inl index 04c8883b..2e8cf159 100644 --- a/qtensor/compression/cusz/src/hf/detail/hf_codecg.inl +++ b/qtensor/compression/cusz/src/hf/detail/hf_codecg.inl @@ -1,296 +1,296 @@ -/** - * @file codec_huffman.cuh - * @author Jiannan Tian - * @brief Huffman kernel definitions - * @version 0.2 - * @date 2020-02-13 - * (created) 2020-02-02, (rev1) 2021-02-13, (rev2) 2021-12-29 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#ifndef CUSZ_KERNEL_CODEC_HUFFMAN_CUH -#define CUSZ_KERNEL_CODEC_HUFFMAN_CUH - -#include -#include -#include -#include -#include -#include - -#include "common.hh" -#include "hf/hf_bookg.hh" -#include "hf/hf_codecg.hh" -#include "hf/hf_struct.h" -#include "utils/cuda_err.cuh" -#include "utils/timer.h" - -#define TIX threadIdx.x -#define BIX blockIdx.x -#define BDX blockDim.x - -#if __has_include() -// #pragma message __FILE__ ": (CUDA 11 onward), cub from system path" -#include -#else -// #pragma message __FILE__ ": (CUDA 10 or earlier), cub from git submodule" -#include "../../third_party/cub/cub/cub.cuh" -#endif - -using BYTE = uint8_t; - -extern __shared__ char __codec_huffman_uninitialized[]; - -struct __helper { - __device__ __forceinline__ static unsigned int local_tid_1() { return threadIdx.x; } - __device__ __forceinline__ static unsigned int global_tid_1() { return blockIdx.x * blockDim.x + threadIdx.x; } - __device__ __forceinline__ static unsigned int block_stride_1() { return blockDim.x; } - __device__ __forceinline__ static unsigned int grid_stride_1() { return blockDim.x * gridDim.x; } - template - __device__ __forceinline__ static unsigned int global_tid() - { - return blockIdx.x * blockDim.x * SEQ + threadIdx.x; - } - template - __device__ __forceinline__ static unsigned int grid_stride() - { - return blockDim.x * gridDim.x * SEQ; - } -}; - -template -__global__ void hf_decode_kernel( - COMPRESSED* compressed, - uint8_t* revbook, - MetadataT* par_nbit, - MetadataT* par_entry, - int const revbook_nbyte, - int const sublen, - int const pardeg, - UNCOMPRESSED* out_uncompressed); - -namespace asz { -namespace detail { - -template -__global__ void hf_encode_phase1_fill( - UNCOMPRESSED* in_uncompressed, - size_t const in_uncompressed_len, - ENCODED* in_book, - int const in_booklen, - ENCODED* out_encoded); - -template -__global__ void hf_encode_phase2_deflate( - COMPRESSED* inout_inplace, - size_t const len, - MetadataT* par_nbit, - MetadataT* par_ncell, - int const sublen, - int const pardeg); - -template -__global__ void -hf_encode_phase4_concatenate(Huff* gapped, Meta* par_entry, Meta* par_ncell, int const cfg_sublen, Huff* non_gapped); - -// TODO change size_t to unsigned int -template -__device__ void -hf_decode_single_thread_inflate(COMPRESSED* input, UNCOMPRESSED* out, int const total_bw, BYTE* revbook); - -} // namespace detail -} // namespace asz - -// TODO change size_t to unsigned int -template -__device__ void -asz::detail::hf_decode_single_thread_inflate(COMPRESSED* input, UNCOMPRESSED* out, int const total_bw, BYTE* revbook) -{ - static const auto DTYPE_WIDTH = sizeof(COMPRESSED) * 8; - - int next_bit; - auto idx_bit = 0; - auto idx_byte = 0; - auto idx_out = 0; - - COMPRESSED bufr = input[idx_byte]; - - auto first = reinterpret_cast(revbook); - auto entry = first + DTYPE_WIDTH; - auto keys = reinterpret_cast(revbook + sizeof(COMPRESSED) * (2 * DTYPE_WIDTH)); - COMPRESSED v = (bufr >> (DTYPE_WIDTH - 1)) & 0x1; // get the first bit - auto l = 1; - auto i = 0; - - while (i < total_bw) { - while (v < first[l]) { // append next i_cb bit - ++i; - idx_byte = i / DTYPE_WIDTH; // [1:exclusive] - idx_bit = i % DTYPE_WIDTH; - if (idx_bit == 0) { - // idx_byte += 1; // [1:exclusive] - bufr = input[idx_byte]; - } - - next_bit = ((bufr >> (DTYPE_WIDTH - 1 - idx_bit)) & 0x1); - v = (v << 1) | next_bit; - ++l; - } - out[idx_out++] = keys[entry[l] + v - first[l]]; - { - ++i; - idx_byte = i / DTYPE_WIDTH; // [2:exclusive] - idx_bit = i % DTYPE_WIDTH; - if (idx_bit == 0) { - // idx_byte += 1; // [2:exclusive] - bufr = input[idx_byte]; - } - - next_bit = ((bufr >> (DTYPE_WIDTH - 1 - idx_bit)) & 0x1); - v = 0x0 | next_bit; - } - l = 1; - } -} - -template -__global__ void asz::detail::hf_encode_phase1_fill( - UNCOMPRESSED* in_uncompressed, - size_t const in_uncompressed_len, - ENCODED* in_book, - int const in_booklen, - ENCODED* out_encoded) -{ - auto shmem_cb = reinterpret_cast(__codec_huffman_uninitialized); - - // load from global memory - for (auto idx = __helper::local_tid_1(); // - idx < in_booklen; // - idx += __helper::block_stride_1()) - shmem_cb[idx] = in_book[idx]; - - __syncthreads(); - - for (auto idx = __helper::global_tid_1(); // - idx < in_uncompressed_len; // - idx += __helper::grid_stride_1() // - ) - out_encoded[idx] = shmem_cb[(int)in_uncompressed[idx]]; -} - -template -__global__ void asz::detail::hf_encode_phase2_deflate( - COMPRESSED* inout_inplace, - size_t const len, - MetadataT* par_nbit, - MetadataT* par_ncell, - int const sublen, - int const pardeg) -{ - constexpr int CELL_BITWIDTH = sizeof(COMPRESSED) * 8; - - auto tid = BIX * BDX + TIX; - - if (tid * sublen < len) { - int residue_bits = CELL_BITWIDTH; - int total_bits = 0; - COMPRESSED* ptr = inout_inplace + tid * sublen; - COMPRESSED bufr; - uint8_t word_width; - - auto did = tid * sublen; - for (auto i = 0; i < sublen; i++, did++) { - if (did == len) break; - - COMPRESSED packed_word = inout_inplace[tid * sublen + i]; - auto word_ptr = reinterpret_cast*>(&packed_word); - word_width = word_ptr->bits; - word_ptr->bits = (uint8_t)0x0; - - if (residue_bits == CELL_BITWIDTH) { // a new unit of compact format - bufr = 0x0; - } - //////////////////////////////////////////////////////////////// - - if (word_width <= residue_bits) { - residue_bits -= word_width; - bufr |= packed_word << residue_bits; - - if (residue_bits == 0) { - residue_bits = CELL_BITWIDTH; - *(ptr++) = bufr; - } - } - else { - // example: we have 5-bit code 11111 but 3 bits available in (*ptr) - // 11111 for the residue 3 bits in (*ptr); 11111 for 2 bits of (*(++ptr)), starting with MSB - // ^^^ ^^ - auto l_bits = word_width - residue_bits; - auto r_bits = CELL_BITWIDTH - l_bits; - - bufr |= packed_word >> l_bits; - *(ptr++) = bufr; - bufr = packed_word << r_bits; - - residue_bits = r_bits; - } - total_bits += word_width; - } - *ptr = bufr; // manage the last unit - - par_nbit[tid] = total_bits; - par_ncell[tid] = (total_bits + CELL_BITWIDTH - 1) / CELL_BITWIDTH; - } -} - -template -__global__ void asz::detail::hf_encode_phase4_concatenate( - Huff* gapped, - Meta* par_entry, - Meta* par_ncell, - int const cfg_sublen, - Huff* non_gapped) -{ - auto n = par_ncell[blockIdx.x]; - auto src = gapped + cfg_sublen * blockIdx.x; - auto dst = non_gapped + par_entry[blockIdx.x]; - - for (auto i = threadIdx.x; i < n; i += blockDim.x) { // block-stride - dst[i] = src[i]; - } -} - -template -__global__ void hf_decode_kernel( - COMPRESSED* compressed, - uint8_t* revbook, - MetadataT* par_nbit, - MetadataT* par_entry, - int const revbook_nbyte, - int const sublen, - int const pardeg, - UNCOMPRESSED* out_uncompressed) -{ - extern __shared__ uint8_t shmem[]; - constexpr auto block_dim = HuffmanHelper::BLOCK_DIM_DEFLATE; - - auto R = (revbook_nbyte - 1 + block_dim) / block_dim; - - for (auto i = 0; i < R; i++) { - if (TIX + i * block_dim < revbook_nbyte) shmem[TIX + i * block_dim] = revbook[TIX + i * block_dim]; - } - __syncthreads(); - - auto gid = BIX * BDX + TIX; - - if (gid < pardeg) { - asz::detail::hf_decode_single_thread_inflate( - compressed + par_entry[gid], out_uncompressed + sublen * gid, par_nbit[gid], shmem); - __syncthreads(); - } -} - -#endif +/** + * @file codec_huffman.cuh + * @author Jiannan Tian + * @brief Huffman kernel definitions + * @version 0.2 + * @date 2020-02-13 + * (created) 2020-02-02, (rev1) 2021-02-13, (rev2) 2021-12-29 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef CUSZ_KERNEL_CODEC_HUFFMAN_CUH +#define CUSZ_KERNEL_CODEC_HUFFMAN_CUH + +#include +#include +#include +#include +#include +#include + +#include "common.hh" +#include "hf/hf_bookg.hh" +#include "hf/hf_codecg.hh" +#include "hf/hf_struct.h" +#include "utils/cuda_err.cuh" +#include "utils/timer.h" + +#define TIX threadIdx.x +#define BIX blockIdx.x +#define BDX blockDim.x + +#if __has_include() +// #pragma message __FILE__ ": (CUDA 11 onward), cub from system path" +#include +#else +// #pragma message __FILE__ ": (CUDA 10 or earlier), cub from git submodule" +#include "../../third_party/cub/cub/cub.cuh" +#endif + +using BYTE = uint8_t; + +extern __shared__ char __codec_huffman_uninitialized[]; + +struct __helper { + __device__ __forceinline__ static unsigned int local_tid_1() { return threadIdx.x; } + __device__ __forceinline__ static unsigned int global_tid_1() { return blockIdx.x * blockDim.x + threadIdx.x; } + __device__ __forceinline__ static unsigned int block_stride_1() { return blockDim.x; } + __device__ __forceinline__ static unsigned int grid_stride_1() { return blockDim.x * gridDim.x; } + template + __device__ __forceinline__ static unsigned int global_tid() + { + return blockIdx.x * blockDim.x * SEQ + threadIdx.x; + } + template + __device__ __forceinline__ static unsigned int grid_stride() + { + return blockDim.x * gridDim.x * SEQ; + } +}; + +template +__global__ void hf_decode_kernel( + COMPRESSED* compressed, + uint8_t* revbook, + MetadataT* par_nbit, + MetadataT* par_entry, + int const revbook_nbyte, + int const sublen, + int const pardeg, + UNCOMPRESSED* out_uncompressed); + +namespace asz { +namespace detail { + +template +__global__ void hf_encode_phase1_fill( + UNCOMPRESSED* in_uncompressed, + size_t const in_uncompressed_len, + ENCODED* in_book, + int const in_booklen, + ENCODED* out_encoded); + +template +__global__ void hf_encode_phase2_deflate( + COMPRESSED* inout_inplace, + size_t const len, + MetadataT* par_nbit, + MetadataT* par_ncell, + int const sublen, + int const pardeg); + +template +__global__ void +hf_encode_phase4_concatenate(Huff* gapped, Meta* par_entry, Meta* par_ncell, int const cfg_sublen, Huff* non_gapped); + +// TODO change size_t to unsigned int +template +__device__ void +hf_decode_single_thread_inflate(COMPRESSED* input, UNCOMPRESSED* out, int const total_bw, BYTE* revbook); + +} // namespace detail +} // namespace asz + +// TODO change size_t to unsigned int +template +__device__ void +asz::detail::hf_decode_single_thread_inflate(COMPRESSED* input, UNCOMPRESSED* out, int const total_bw, BYTE* revbook) +{ + static const auto DTYPE_WIDTH = sizeof(COMPRESSED) * 8; + + int next_bit; + auto idx_bit = 0; + auto idx_byte = 0; + auto idx_out = 0; + + COMPRESSED bufr = input[idx_byte]; + + auto first = reinterpret_cast(revbook); + auto entry = first + DTYPE_WIDTH; + auto keys = reinterpret_cast(revbook + sizeof(COMPRESSED) * (2 * DTYPE_WIDTH)); + COMPRESSED v = (bufr >> (DTYPE_WIDTH - 1)) & 0x1; // get the first bit + auto l = 1; + auto i = 0; + + while (i < total_bw) { + while (v < first[l]) { // append next i_cb bit + ++i; + idx_byte = i / DTYPE_WIDTH; // [1:exclusive] + idx_bit = i % DTYPE_WIDTH; + if (idx_bit == 0) { + // idx_byte += 1; // [1:exclusive] + bufr = input[idx_byte]; + } + + next_bit = ((bufr >> (DTYPE_WIDTH - 1 - idx_bit)) & 0x1); + v = (v << 1) | next_bit; + ++l; + } + out[idx_out++] = keys[entry[l] + v - first[l]]; + { + ++i; + idx_byte = i / DTYPE_WIDTH; // [2:exclusive] + idx_bit = i % DTYPE_WIDTH; + if (idx_bit == 0) { + // idx_byte += 1; // [2:exclusive] + bufr = input[idx_byte]; + } + + next_bit = ((bufr >> (DTYPE_WIDTH - 1 - idx_bit)) & 0x1); + v = 0x0 | next_bit; + } + l = 1; + } +} + +template +__global__ void asz::detail::hf_encode_phase1_fill( + UNCOMPRESSED* in_uncompressed, + size_t const in_uncompressed_len, + ENCODED* in_book, + int const in_booklen, + ENCODED* out_encoded) +{ + auto shmem_cb = reinterpret_cast(__codec_huffman_uninitialized); + + // load from global memory + for (auto idx = __helper::local_tid_1(); // + idx < in_booklen; // + idx += __helper::block_stride_1()) + shmem_cb[idx] = in_book[idx]; + + __syncthreads(); + + for (auto idx = __helper::global_tid_1(); // + idx < in_uncompressed_len; // + idx += __helper::grid_stride_1() // + ) + out_encoded[idx] = shmem_cb[(int)in_uncompressed[idx]]; +} + +template +__global__ void asz::detail::hf_encode_phase2_deflate( + COMPRESSED* inout_inplace, + size_t const len, + MetadataT* par_nbit, + MetadataT* par_ncell, + int const sublen, + int const pardeg) +{ + constexpr int CELL_BITWIDTH = sizeof(COMPRESSED) * 8; + + auto tid = BIX * BDX + TIX; + + if (tid * sublen < len) { + int residue_bits = CELL_BITWIDTH; + int total_bits = 0; + COMPRESSED* ptr = inout_inplace + tid * sublen; + COMPRESSED bufr; + uint8_t word_width; + + auto did = tid * sublen; + for (auto i = 0; i < sublen; i++, did++) { + if (did == len) break; + + COMPRESSED packed_word = inout_inplace[tid * sublen + i]; + auto word_ptr = reinterpret_cast*>(&packed_word); + word_width = word_ptr->bits; + word_ptr->bits = (uint8_t)0x0; + + if (residue_bits == CELL_BITWIDTH) { // a new unit of compact format + bufr = 0x0; + } + //////////////////////////////////////////////////////////////// + + if (word_width <= residue_bits) { + residue_bits -= word_width; + bufr |= packed_word << residue_bits; + + if (residue_bits == 0) { + residue_bits = CELL_BITWIDTH; + *(ptr++) = bufr; + } + } + else { + // example: we have 5-bit code 11111 but 3 bits available in (*ptr) + // 11111 for the residue 3 bits in (*ptr); 11111 for 2 bits of (*(++ptr)), starting with MSB + // ^^^ ^^ + auto l_bits = word_width - residue_bits; + auto r_bits = CELL_BITWIDTH - l_bits; + + bufr |= packed_word >> l_bits; + *(ptr++) = bufr; + bufr = packed_word << r_bits; + + residue_bits = r_bits; + } + total_bits += word_width; + } + *ptr = bufr; // manage the last unit + + par_nbit[tid] = total_bits; + par_ncell[tid] = (total_bits + CELL_BITWIDTH - 1) / CELL_BITWIDTH; + } +} + +template +__global__ void asz::detail::hf_encode_phase4_concatenate( + Huff* gapped, + Meta* par_entry, + Meta* par_ncell, + int const cfg_sublen, + Huff* non_gapped) +{ + auto n = par_ncell[blockIdx.x]; + auto src = gapped + cfg_sublen * blockIdx.x; + auto dst = non_gapped + par_entry[blockIdx.x]; + + for (auto i = threadIdx.x; i < n; i += blockDim.x) { // block-stride + dst[i] = src[i]; + } +} + +template +__global__ void hf_decode_kernel( + COMPRESSED* compressed, + uint8_t* revbook, + MetadataT* par_nbit, + MetadataT* par_entry, + int const revbook_nbyte, + int const sublen, + int const pardeg, + UNCOMPRESSED* out_uncompressed) +{ + extern __shared__ uint8_t shmem[]; + constexpr auto block_dim = HuffmanHelper::BLOCK_DIM_DEFLATE; + + auto R = (revbook_nbyte - 1 + block_dim) / block_dim; + + for (auto i = 0; i < R; i++) { + if (TIX + i * block_dim < revbook_nbyte) shmem[TIX + i * block_dim] = revbook[TIX + i * block_dim]; + } + __syncthreads(); + + auto gid = BIX * BDX + TIX; + + if (gid < pardeg) { + asz::detail::hf_decode_single_thread_inflate( + compressed + par_entry[gid], out_uncompressed + sublen * gid, par_nbit[gid], shmem); + __syncthreads(); + } +} + +#endif diff --git a/qtensor/compression/cusz/src/hf/detail/hf_pimpl.inl b/qtensor/compression/cusz/src/hf/detail/hf_pimpl.inl index 7a330ba6..4ed9b580 100644 --- a/qtensor/compression/cusz/src/hf/detail/hf_pimpl.inl +++ b/qtensor/compression/cusz/src/hf/detail/hf_pimpl.inl @@ -1,364 +1,364 @@ -/** - * @file huffman_coarse.cuh - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2021-12-17 - * (created) 2020-04-24 (rev1) 2021-09-05 (rev2) 2021-12-29 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * @copyright (C) 2021 by Washington State University, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#ifndef CUSZ_COMPONENT_HUFFMAN_COARSE_CUH -#define CUSZ_COMPONENT_HUFFMAN_COARSE_CUH - -#include -// #include -// #include -// #include -// #include -#include -#include -// #include - -using std::cout; - -#include "common/definition.hh" -#include "common/type_traits.hh" -#include "utils.hh" - -#include "hf/hf.hh" -#include "hf/hf_bookg.hh" -#include "hf/hf_codecg.hh" - -/****************************************************************************** - macros for shorthand writing - ******************************************************************************/ - -#define EXPORT_NBYTE(FIELD) nbyte[Header::FIELD] = rte.nbyte[RTE::FIELD]; - -#define DEVICE2DEVICE_COPY(VAR, FIELD) \ - { \ - constexpr auto D2D = cudaMemcpyDeviceToDevice; \ - auto dst = d_compressed + header.entry[Header::FIELD]; \ - auto src = reinterpret_cast(d_##VAR); \ - CHECK_CUDA(cudaMemcpyAsync(dst, src, nbyte[Header::FIELD], D2D, stream)); \ - } - -#define ACCESSOR(SYM, TYPE) reinterpret_cast(in_compressed + header.entry[Header::SYM]) - -#define HC_ALLOCHOST(VAR, SYM) \ - cudaMallocHost(&h_##VAR, rte.nbyte[RTE::SYM]); \ - memset(h_##VAR, 0x0, rte.nbyte[RTE::SYM]); - -#define HC_ALLOCDEV(VAR, SYM) \ - cudaMalloc(&d_##VAR, rte.nbyte[RTE::SYM]); \ - cudaMemset(d_##VAR, 0x0, rte.nbyte[RTE::SYM]); - -#define HC_FREEHOST(VAR) \ - if (h_##VAR) { \ - cudaFreeHost(h_##VAR); \ - h_##VAR = nullptr; \ - } - -#define HC_FREEDEV(VAR) \ - if (d_##VAR) { \ - cudaFree(d_##VAR); \ - d_##VAR = nullptr; \ - } - -/****************************************************************************** - class definition - ******************************************************************************/ - -#define TEMPLATE_TYPE template -#define IMPL LosslessCodec::impl - -namespace cusz { - -TEMPLATE_TYPE -IMPL::~impl() -{ - HC_FREEDEV(tmp); - HC_FREEDEV(book); - HC_FREEDEV(revbook); - HC_FREEDEV(par_nbit); - HC_FREEDEV(par_ncell); - HC_FREEDEV(par_entry); - HC_FREEDEV(bitstream); - - HC_FREEHOST(book); - HC_FREEHOST(revbook); - HC_FREEHOST(par_nbit); - HC_FREEHOST(par_ncell); - HC_FREEHOST(par_entry); -} - -TEMPLATE_TYPE -IMPL::impl() = default; - -//------------------------------------------------------------------------------ - -TEMPLATE_TYPE -void IMPL::init(size_t const in_uncompressed_len, int const booklen, int const pardeg, bool dbg_print) -{ - auto max_compressed_bytes = [&]() { return in_uncompressed_len / 2 * sizeof(H); }; - - auto debug = [&]() { - setlocale(LC_NUMERIC, ""); - printf("\nHuffmanCoarse::init() debugging:\n"); - printf("CUdeviceptr nbyte: %d\n", (int)sizeof(CUdeviceptr)); - dbg_println("TMP", d_tmp, RTE::TMP); - dbg_println("BOOK", d_book, RTE::BOOK); - dbg_println("REVBOOK", d_revbook, RTE::REVBOOK); - dbg_println("PAR_NBIT", d_par_nbit, RTE::PAR_NBIT); - dbg_println("PAR_NCELL", d_par_ncell, RTE::PAR_NCELL); - dbg_println("BITSTREAM", d_bitstream, RTE::BITSTREAM); - printf("\n"); - }; - - memset(rte.nbyte, 0, sizeof(uint32_t) * RTE::END); - // memset(rte.entry, 0, sizeof(uint32_t) * (RTE::END + 1)); - - rte.nbyte[RTE::TMP] = sizeof(H) * in_uncompressed_len; - rte.nbyte[RTE::BOOK] = sizeof(H) * booklen; - rte.nbyte[RTE::REVBOOK] = get_revbook_nbyte(booklen); - rte.nbyte[RTE::PAR_NBIT] = sizeof(M) * pardeg; - rte.nbyte[RTE::PAR_NCELL] = sizeof(M) * pardeg; - rte.nbyte[RTE::PAR_ENTRY] = sizeof(M) * pardeg; - rte.nbyte[RTE::BITSTREAM] = max_compressed_bytes(); - - HC_ALLOCDEV(tmp, TMP); - - { - auto total_bytes = rte.nbyte[RTE::BOOK] + rte.nbyte[RTE::REVBOOK]; - cudaMalloc(&d_book, total_bytes); - cudaMemset(d_book, 0x0, total_bytes); - - d_revbook = reinterpret_cast(d_book + booklen); - } - - { - cudaMalloc(&d_par_metadata, rte.nbyte[RTE::PAR_NBIT] * 3); - cudaMemset(d_par_metadata, 0x0, rte.nbyte[RTE::PAR_NBIT] * 3); - - d_par_nbit = d_par_metadata; - d_par_ncell = d_par_metadata + pardeg; - d_par_entry = d_par_metadata + pardeg * 2; - } - - HC_ALLOCDEV(bitstream, BITSTREAM); - - // standalone definition for output - d_compressed = reinterpret_cast(d_tmp); - - HC_ALLOCHOST(book, BOOK); - HC_ALLOCHOST(revbook, REVBOOK); - - { - cudaMallocHost(&h_par_metadata, rte.nbyte[RTE::PAR_NBIT] * 3); - // cudaMemset(h_par_nbit, 0x0, rte.nbyte[RTE::PAR_NBIT] * 3); - - h_par_nbit = h_par_metadata; - h_par_ncell = h_par_metadata + pardeg; - h_par_entry = h_par_metadata + pardeg * 2; - } - - int numSMs; - cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, 0); - - int sublen = (in_uncompressed_len - 1) / pardeg + 1; - - book_desc = new hf_book{nullptr, d_book, booklen}; - chunk_desc_d = new hf_chunk{d_par_nbit, d_par_ncell, d_par_entry}; - chunk_desc_h = new hf_chunk{h_par_nbit, h_par_ncell, h_par_entry}; - bitstream_desc = new hf_bitstream{d_tmp, d_bitstream, chunk_desc_d, chunk_desc_h, sublen, pardeg, numSMs}; - - if (dbg_print) debug(); -} - -TEMPLATE_TYPE -void IMPL::build_codebook(cusz::FREQ* freq, int const booklen, cudaStream_t stream) -{ - book_desc->freq = freq; - asz::hf_buildbook_g(freq, booklen, d_book, d_revbook, get_revbook_nbyte(booklen), &time_book, stream); -} - -TEMPLATE_TYPE -void IMPL::encode( - T* in_uncompressed, - size_t const in_uncompressed_len, - BYTE*& out_compressed, - size_t& out_compressed_len, - cudaStream_t stream) -{ - time_lossless = 0; - - struct Header header; - - asz::hf_encode_coarse_rev1( - in_uncompressed, in_uncompressed_len, // - book_desc, bitstream_desc, // - out_compressed, out_compressed_len, time_lossless, stream); - - header.total_nbit = - std::accumulate((M*)chunk_desc_h->bits, (M*)chunk_desc_h->bits + bitstream_desc->pardeg, (size_t)0); - header.total_ncell = - std::accumulate((M*)chunk_desc_h->cells, (M*)chunk_desc_h->cells + bitstream_desc->pardeg, (size_t)0); - // update with the precise BITSTREAM nbyte - rte.nbyte[RTE::BITSTREAM] = sizeof(H) * header.total_ncell; - - // d_revbook and revbook_nbyte is hidden; need to improve here - subfile_collect( - header, in_uncompressed_len, book_desc->booklen, bitstream_desc->sublen, bitstream_desc->pardeg, stream); - - out_compressed = d_compressed; - out_compressed_len = header.subfile_size(); -} - -TEMPLATE_TYPE -void IMPL::decode(BYTE* in_compressed, T* out_decompressed, cudaStream_t stream, bool header_on_device) -{ - Header header; - if (header_on_device) - CHECK_CUDA(cudaMemcpyAsync(&header, in_compressed, sizeof(header), cudaMemcpyDeviceToHost, stream)); - - auto d_revbook = ACCESSOR(REVBOOK, BYTE); - auto d_par_nbit = ACCESSOR(PAR_NBIT, M); - auto d_par_entry = ACCESSOR(PAR_ENTRY, M); - auto d_bitstream = ACCESSOR(BITSTREAM, H); - - auto const revbook_nbyte = get_revbook_nbyte(header.booklen); - - // launch_coarse_grained_Huffman_decoding( - asz::hf_decode_coarse( - d_bitstream, d_revbook, revbook_nbyte, d_par_nbit, d_par_entry, header.sublen, header.pardeg, out_decompressed, - time_lossless, stream); -} - -TEMPLATE_TYPE -void IMPL::clear_buffer() -{ - cudaMemset(d_tmp, 0x0, rte.nbyte[RTE::TMP]); - cudaMemset(d_book, 0x0, rte.nbyte[RTE::BOOK]); - cudaMemset(d_revbook, 0x0, rte.nbyte[RTE::REVBOOK]); - cudaMemset(d_par_nbit, 0x0, rte.nbyte[RTE::PAR_NBIT]); - cudaMemset(d_par_ncell, 0x0, rte.nbyte[RTE::PAR_NCELL]); - cudaMemset(d_par_entry, 0x0, rte.nbyte[RTE::PAR_ENTRY]); - cudaMemset(d_bitstream, 0x0, rte.nbyte[RTE::BITSTREAM]); -} - -// private helper -TEMPLATE_TYPE -void IMPL::subfile_collect( - Header& header, - size_t const in_uncompressed_len, - int const booklen, - int const sublen, - int const pardeg, - cudaStream_t stream) -{ - auto BARRIER = [&]() { - if (stream) - CHECK_CUDA(cudaStreamSynchronize(stream)); - else - CHECK_CUDA(cudaDeviceSynchronize()); - }; - - header.self_bytes = sizeof(Header); - header.booklen = booklen; - header.sublen = sublen; - header.pardeg = pardeg; - header.uncompressed_len = in_uncompressed_len; - - MetadataT nbyte[Header::END]; - nbyte[Header::HEADER] = sizeof(Header); - - EXPORT_NBYTE(REVBOOK) - EXPORT_NBYTE(PAR_NBIT) - EXPORT_NBYTE(PAR_ENTRY) - EXPORT_NBYTE(BITSTREAM) - - header.entry[0] = 0; - // *.END + 1: need to know the ending position - for (auto i = 1; i < Header::END + 1; i++) { header.entry[i] = nbyte[i - 1]; } - for (auto i = 1; i < Header::END + 1; i++) { header.entry[i] += header.entry[i - 1]; } - - // auto debug_header_entry = [&]() { - // for (auto i = 0; i < Header::END + 1; i++) printf("%d, header entry: %d\n", i, header.entry[i]); - // }; - // debug_header_entry(); - - CHECK_CUDA(cudaMemcpyAsync(d_compressed, &header, sizeof(header), cudaMemcpyHostToDevice, stream)); - - /* debug */ BARRIER(); - - DEVICE2DEVICE_COPY(revbook, REVBOOK) - DEVICE2DEVICE_COPY(par_nbit, PAR_NBIT) - DEVICE2DEVICE_COPY(par_entry, PAR_ENTRY) - DEVICE2DEVICE_COPY(bitstream, BITSTREAM) -} - -// getter -TEMPLATE_TYPE -float IMPL::get_time_elapsed() const { return milliseconds; } - -TEMPLATE_TYPE -float IMPL::get_time_book() const { return time_book; } -TEMPLATE_TYPE -float IMPL::get_time_lossless() const { return time_lossless; } - -TEMPLATE_TYPE -H* IMPL::expose_book() const { return d_book; } - -TEMPLATE_TYPE -BYTE* IMPL::expose_revbook() const { return d_revbook; } - -// TODO this kind of space will be overlapping with quant-codes -TEMPLATE_TYPE -size_t IMPL::get_workspace_nbyte(size_t len) const { return sizeof(H) * len; } - -TEMPLATE_TYPE -size_t IMPL::get_max_output_nbyte(size_t len) const { return sizeof(H) * len / 2; } - -TEMPLATE_TYPE -size_t IMPL::get_revbook_nbyte(int dict_size) { return sizeof(BOOK) * (2 * CELL_BITWIDTH) + sizeof(SYM) * dict_size; } - -TEMPLATE_TYPE -constexpr bool IMPL::can_overlap_input_and_firstphase_encode() { return sizeof(T) == sizeof(H); } - -// auxiliary -TEMPLATE_TYPE -void IMPL::dbg_println(const std::string SYM_name, void* VAR, int SYM) -{ - CUdeviceptr pbase0{0}; - size_t psize0{0}; - - cuMemGetAddressRange(&pbase0, &psize0, (CUdeviceptr)VAR); - printf( - "%s:\n" - "\t(supposed) pointer : %p\n" - "\t(supposed) bytes : %'9lu\n" - "\t(queried) pbase0 : %p\n" - "\t(queried) psize0 : %'9lu\n", - SYM_name.c_str(), (void*)VAR, (size_t)rte.nbyte[SYM], (void*)&pbase0, psize0); - pbase0 = 0, psize0 = 0; -} - -} // namespace cusz - -#undef HC_ALLOCDEV -#undef HC_ALLOCHOST -#undef HC_FREEDEV -#undef HC_FREEHOST -#undef EXPORT_NBYTE -#undef ACCESSOR -#undef DEVICE2DEVICE_COPY - -#undef TEMPLATE_TYPE -#undef IMPL - -#endif +/** + * @file huffman_coarse.cuh + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-12-17 + * (created) 2020-04-24 (rev1) 2021-09-05 (rev2) 2021-12-29 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * @copyright (C) 2021 by Washington State University, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef CUSZ_COMPONENT_HUFFMAN_COARSE_CUH +#define CUSZ_COMPONENT_HUFFMAN_COARSE_CUH + +#include +// #include +// #include +// #include +// #include +#include +#include +// #include + +using std::cout; + +#include "common/definition.hh" +#include "common/type_traits.hh" +#include "utils.hh" + +#include "hf/hf.hh" +#include "hf/hf_bookg.hh" +#include "hf/hf_codecg.hh" + +/****************************************************************************** + macros for shorthand writing + ******************************************************************************/ + +#define EXPORT_NBYTE(FIELD) nbyte[Header::FIELD] = rte.nbyte[RTE::FIELD]; + +#define DEVICE2DEVICE_COPY(VAR, FIELD) \ + { \ + constexpr auto D2D = cudaMemcpyDeviceToDevice; \ + auto dst = d_compressed + header.entry[Header::FIELD]; \ + auto src = reinterpret_cast(d_##VAR); \ + CHECK_CUDA(cudaMemcpyAsync(dst, src, nbyte[Header::FIELD], D2D, stream)); \ + } + +#define ACCESSOR(SYM, TYPE) reinterpret_cast(in_compressed + header.entry[Header::SYM]) + +#define HC_ALLOCHOST(VAR, SYM) \ + cudaMallocHost(&h_##VAR, rte.nbyte[RTE::SYM]); \ + memset(h_##VAR, 0x0, rte.nbyte[RTE::SYM]); + +#define HC_ALLOCDEV(VAR, SYM) \ + cudaMalloc(&d_##VAR, rte.nbyte[RTE::SYM]); \ + cudaMemset(d_##VAR, 0x0, rte.nbyte[RTE::SYM]); + +#define HC_FREEHOST(VAR) \ + if (h_##VAR) { \ + cudaFreeHost(h_##VAR); \ + h_##VAR = nullptr; \ + } + +#define HC_FREEDEV(VAR) \ + if (d_##VAR) { \ + cudaFree(d_##VAR); \ + d_##VAR = nullptr; \ + } + +/****************************************************************************** + class definition + ******************************************************************************/ + +#define TEMPLATE_TYPE template +#define IMPL LosslessCodec::impl + +namespace cusz { + +TEMPLATE_TYPE +IMPL::~impl() +{ + HC_FREEDEV(tmp); + HC_FREEDEV(book); + HC_FREEDEV(revbook); + HC_FREEDEV(par_nbit); + HC_FREEDEV(par_ncell); + HC_FREEDEV(par_entry); + HC_FREEDEV(bitstream); + + HC_FREEHOST(book); + HC_FREEHOST(revbook); + HC_FREEHOST(par_nbit); + HC_FREEHOST(par_ncell); + HC_FREEHOST(par_entry); +} + +TEMPLATE_TYPE +IMPL::impl() = default; + +//------------------------------------------------------------------------------ + +TEMPLATE_TYPE +void IMPL::init(size_t const in_uncompressed_len, int const booklen, int const pardeg, bool dbg_print) +{ + auto max_compressed_bytes = [&]() { return in_uncompressed_len / 2 * sizeof(H); }; + + auto debug = [&]() { + setlocale(LC_NUMERIC, ""); + printf("\nHuffmanCoarse::init() debugging:\n"); + printf("CUdeviceptr nbyte: %d\n", (int)sizeof(CUdeviceptr)); + dbg_println("TMP", d_tmp, RTE::TMP); + dbg_println("BOOK", d_book, RTE::BOOK); + dbg_println("REVBOOK", d_revbook, RTE::REVBOOK); + dbg_println("PAR_NBIT", d_par_nbit, RTE::PAR_NBIT); + dbg_println("PAR_NCELL", d_par_ncell, RTE::PAR_NCELL); + dbg_println("BITSTREAM", d_bitstream, RTE::BITSTREAM); + printf("\n"); + }; + + memset(rte.nbyte, 0, sizeof(uint32_t) * RTE::END); + // memset(rte.entry, 0, sizeof(uint32_t) * (RTE::END + 1)); + + rte.nbyte[RTE::TMP] = sizeof(H) * in_uncompressed_len; + rte.nbyte[RTE::BOOK] = sizeof(H) * booklen; + rte.nbyte[RTE::REVBOOK] = get_revbook_nbyte(booklen); + rte.nbyte[RTE::PAR_NBIT] = sizeof(M) * pardeg; + rte.nbyte[RTE::PAR_NCELL] = sizeof(M) * pardeg; + rte.nbyte[RTE::PAR_ENTRY] = sizeof(M) * pardeg; + rte.nbyte[RTE::BITSTREAM] = max_compressed_bytes(); + + HC_ALLOCDEV(tmp, TMP); + + { + auto total_bytes = rte.nbyte[RTE::BOOK] + rte.nbyte[RTE::REVBOOK]; + cudaMalloc(&d_book, total_bytes); + cudaMemset(d_book, 0x0, total_bytes); + + d_revbook = reinterpret_cast(d_book + booklen); + } + + { + cudaMalloc(&d_par_metadata, rte.nbyte[RTE::PAR_NBIT] * 3); + cudaMemset(d_par_metadata, 0x0, rte.nbyte[RTE::PAR_NBIT] * 3); + + d_par_nbit = d_par_metadata; + d_par_ncell = d_par_metadata + pardeg; + d_par_entry = d_par_metadata + pardeg * 2; + } + + HC_ALLOCDEV(bitstream, BITSTREAM); + + // standalone definition for output + d_compressed = reinterpret_cast(d_tmp); + + HC_ALLOCHOST(book, BOOK); + HC_ALLOCHOST(revbook, REVBOOK); + + { + cudaMallocHost(&h_par_metadata, rte.nbyte[RTE::PAR_NBIT] * 3); + // cudaMemset(h_par_nbit, 0x0, rte.nbyte[RTE::PAR_NBIT] * 3); + + h_par_nbit = h_par_metadata; + h_par_ncell = h_par_metadata + pardeg; + h_par_entry = h_par_metadata + pardeg * 2; + } + + int numSMs; + cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, 0); + + int sublen = (in_uncompressed_len - 1) / pardeg + 1; + + book_desc = new hf_book{nullptr, d_book, booklen}; + chunk_desc_d = new hf_chunk{d_par_nbit, d_par_ncell, d_par_entry}; + chunk_desc_h = new hf_chunk{h_par_nbit, h_par_ncell, h_par_entry}; + bitstream_desc = new hf_bitstream{d_tmp, d_bitstream, chunk_desc_d, chunk_desc_h, sublen, pardeg, numSMs}; + + if (dbg_print) debug(); +} + +TEMPLATE_TYPE +void IMPL::build_codebook(cusz::FREQ* freq, int const booklen, cudaStream_t stream) +{ + book_desc->freq = freq; + asz::hf_buildbook_g(freq, booklen, d_book, d_revbook, get_revbook_nbyte(booklen), &time_book, stream); +} + +TEMPLATE_TYPE +void IMPL::encode( + T* in_uncompressed, + size_t const in_uncompressed_len, + BYTE*& out_compressed, + size_t& out_compressed_len, + cudaStream_t stream) +{ + time_lossless = 0; + + struct Header header; + + asz::hf_encode_coarse_rev1( + in_uncompressed, in_uncompressed_len, // + book_desc, bitstream_desc, // + out_compressed, out_compressed_len, time_lossless, stream); + + header.total_nbit = + std::accumulate((M*)chunk_desc_h->bits, (M*)chunk_desc_h->bits + bitstream_desc->pardeg, (size_t)0); + header.total_ncell = + std::accumulate((M*)chunk_desc_h->cells, (M*)chunk_desc_h->cells + bitstream_desc->pardeg, (size_t)0); + // update with the precise BITSTREAM nbyte + rte.nbyte[RTE::BITSTREAM] = sizeof(H) * header.total_ncell; + + // d_revbook and revbook_nbyte is hidden; need to improve here + subfile_collect( + header, in_uncompressed_len, book_desc->booklen, bitstream_desc->sublen, bitstream_desc->pardeg, stream); + + out_compressed = d_compressed; + out_compressed_len = header.subfile_size(); +} + +TEMPLATE_TYPE +void IMPL::decode(BYTE* in_compressed, T* out_decompressed, cudaStream_t stream, bool header_on_device) +{ + Header header; + if (header_on_device) + CHECK_CUDA(cudaMemcpyAsync(&header, in_compressed, sizeof(header), cudaMemcpyDeviceToHost, stream)); + + auto d_revbook = ACCESSOR(REVBOOK, BYTE); + auto d_par_nbit = ACCESSOR(PAR_NBIT, M); + auto d_par_entry = ACCESSOR(PAR_ENTRY, M); + auto d_bitstream = ACCESSOR(BITSTREAM, H); + + auto const revbook_nbyte = get_revbook_nbyte(header.booklen); + + // launch_coarse_grained_Huffman_decoding( + asz::hf_decode_coarse( + d_bitstream, d_revbook, revbook_nbyte, d_par_nbit, d_par_entry, header.sublen, header.pardeg, out_decompressed, + time_lossless, stream); +} + +TEMPLATE_TYPE +void IMPL::clear_buffer() +{ + cudaMemset(d_tmp, 0x0, rte.nbyte[RTE::TMP]); + cudaMemset(d_book, 0x0, rte.nbyte[RTE::BOOK]); + cudaMemset(d_revbook, 0x0, rte.nbyte[RTE::REVBOOK]); + cudaMemset(d_par_nbit, 0x0, rte.nbyte[RTE::PAR_NBIT]); + cudaMemset(d_par_ncell, 0x0, rte.nbyte[RTE::PAR_NCELL]); + cudaMemset(d_par_entry, 0x0, rte.nbyte[RTE::PAR_ENTRY]); + cudaMemset(d_bitstream, 0x0, rte.nbyte[RTE::BITSTREAM]); +} + +// private helper +TEMPLATE_TYPE +void IMPL::subfile_collect( + Header& header, + size_t const in_uncompressed_len, + int const booklen, + int const sublen, + int const pardeg, + cudaStream_t stream) +{ + auto BARRIER = [&]() { + if (stream) + CHECK_CUDA(cudaStreamSynchronize(stream)); + else + CHECK_CUDA(cudaDeviceSynchronize()); + }; + + header.self_bytes = sizeof(Header); + header.booklen = booklen; + header.sublen = sublen; + header.pardeg = pardeg; + header.uncompressed_len = in_uncompressed_len; + + MetadataT nbyte[Header::END]; + nbyte[Header::HEADER] = sizeof(Header); + + EXPORT_NBYTE(REVBOOK) + EXPORT_NBYTE(PAR_NBIT) + EXPORT_NBYTE(PAR_ENTRY) + EXPORT_NBYTE(BITSTREAM) + + header.entry[0] = 0; + // *.END + 1: need to know the ending position + for (auto i = 1; i < Header::END + 1; i++) { header.entry[i] = nbyte[i - 1]; } + for (auto i = 1; i < Header::END + 1; i++) { header.entry[i] += header.entry[i - 1]; } + + // auto debug_header_entry = [&]() { + // for (auto i = 0; i < Header::END + 1; i++) printf("%d, header entry: %d\n", i, header.entry[i]); + // }; + // debug_header_entry(); + + CHECK_CUDA(cudaMemcpyAsync(d_compressed, &header, sizeof(header), cudaMemcpyHostToDevice, stream)); + + /* debug */ BARRIER(); + + DEVICE2DEVICE_COPY(revbook, REVBOOK) + DEVICE2DEVICE_COPY(par_nbit, PAR_NBIT) + DEVICE2DEVICE_COPY(par_entry, PAR_ENTRY) + DEVICE2DEVICE_COPY(bitstream, BITSTREAM) +} + +// getter +TEMPLATE_TYPE +float IMPL::get_time_elapsed() const { return milliseconds; } + +TEMPLATE_TYPE +float IMPL::get_time_book() const { return time_book; } +TEMPLATE_TYPE +float IMPL::get_time_lossless() const { return time_lossless; } + +TEMPLATE_TYPE +H* IMPL::expose_book() const { return d_book; } + +TEMPLATE_TYPE +BYTE* IMPL::expose_revbook() const { return d_revbook; } + +// TODO this kind of space will be overlapping with quant-codes +TEMPLATE_TYPE +size_t IMPL::get_workspace_nbyte(size_t len) const { return sizeof(H) * len; } + +TEMPLATE_TYPE +size_t IMPL::get_max_output_nbyte(size_t len) const { return sizeof(H) * len / 2; } + +TEMPLATE_TYPE +size_t IMPL::get_revbook_nbyte(int dict_size) { return sizeof(BOOK) * (2 * CELL_BITWIDTH) + sizeof(SYM) * dict_size; } + +TEMPLATE_TYPE +constexpr bool IMPL::can_overlap_input_and_firstphase_encode() { return sizeof(T) == sizeof(H); } + +// auxiliary +TEMPLATE_TYPE +void IMPL::dbg_println(const std::string SYM_name, void* VAR, int SYM) +{ + CUdeviceptr pbase0{0}; + size_t psize0{0}; + + cuMemGetAddressRange(&pbase0, &psize0, (CUdeviceptr)VAR); + printf( + "%s:\n" + "\t(supposed) pointer : %p\n" + "\t(supposed) bytes : %'9lu\n" + "\t(queried) pbase0 : %p\n" + "\t(queried) psize0 : %'9lu\n", + SYM_name.c_str(), (void*)VAR, (size_t)rte.nbyte[SYM], (void*)&pbase0, psize0); + pbase0 = 0, psize0 = 0; +} + +} // namespace cusz + +#undef HC_ALLOCDEV +#undef HC_ALLOCHOST +#undef HC_FREEDEV +#undef HC_FREEHOST +#undef EXPORT_NBYTE +#undef ACCESSOR +#undef DEVICE2DEVICE_COPY + +#undef TEMPLATE_TYPE +#undef IMPL + +#endif diff --git a/qtensor/compression/cusz/src/hf/detail/par_merge.inl b/qtensor/compression/cusz/src/hf/detail/par_merge.inl index 6e934a08..70068967 100644 --- a/qtensor/compression/cusz/src/hf/detail/par_merge.inl +++ b/qtensor/compression/cusz/src/hf/detail/par_merge.inl @@ -1,445 +1,445 @@ -/* - * Authors: - * Oded Green (ogreen@gatech.edu), Rob McColl (robert.c.mccoll@gmail.com) - * High Performance Computing Lab, Georgia Tech - * - * Future Publication: - * GPU MergePath: A GPU Merging Algorithm - * ACM International Conference on Supercomputing 2012 - * June 25-29 2012, San Servolo, Venice, Italy - * - * (C) 2012 Georgia Institute of Technology - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * - Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - Neither the name of the Georgia Institute of Technology nor the names of - * its contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. - */ - -/** - * @file par_merge.h - * @author Oded Green (ogreen@gatech.edu), Rob McColl (robert.c.mccoll@gmail.com)) - * @brief Modified and adapted by Cody Rivera - * @version 0.3 - * @date 2020-10-24 - * (created) 2020-06 (rev) 2021-06-21 - * - */ - -#ifndef CUSZ_KERNEL_PAR_MERGE_CUH -#define CUSZ_KERNEL_PAR_MERGE_CUH - -#include -#include -#include -#include -#include -#include - -#include -namespace cg = cooperative_groups; - -#define MAX(X, Y) (((X) > (Y)) ? (X) : (Y)) -#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y)) -// Mathematically correct modulo -#define MOD(a, b) ((((a) % (b)) + (b)) % (b)) - -/* MERGETYPE - * Performs merges of two sorted pseudorandom arrays of length - * Times the runs and reports on the average time - * Checks the output of each merge for correctness - */ -#define PADDING 1024 - -/******************************************************************************** - * signature - ********************************************************************************/ - -// Partition array -template -__device__ void cudaWorkloadDiagonals( - F* copyFreq, - int* copyIndex, - int* copyIsLeaf, - int cStart, - int cEnd, - F* iNodesFreq, - int iStart, - int iEnd, - int iNodesCap, - uint32_t* diagonal_path_intersections, - /* Shared Memory */ - int32_t& x_top, - int32_t& y_top, - int32_t& x_bottom, - int32_t& y_bottom, - int32_t& found, - int32_t* oneorzero); - -// Merge partitions -template -__device__ void cudaMergeSinglePath( - F* copyFreq, - int* copyIndex, - int* copyIsLeaf, - int cStart, - int cEnd, - F* iNodesFreq, - int iStart, - int iEnd, - int iNodesCap, - uint32_t* diagonal_path_intersections, - F* tempFreq, - int* tempIndex, - int* tempIsLeaf, - int tempLength); - -template -__device__ void parMerge( - F* copyFreq, - int* copyIndex, - int* copyIsLeaf, - int cStart, - int cEnd, - F* iNodesFreq, - int iStart, - int iEnd, - int iNodesCap, - F* tempFreq, - int* tempIndex, - int* tempIsLeaf, - int& tempLength, - uint32_t* diagonal_path_intersections, - int blocks, - int threads, - /* Shared Memory */ - int32_t& x_top, - int32_t& y_top, - int32_t& x_bottom, - int32_t& y_bottom, - int32_t& found, - int32_t* oneorzero); - -template -__device__ void merge( - F* copyFreq, - int* copyIndex, - int* copyIsLeaf, - int cStart, - int cEnd, - F* iNodesFreq, - int iStart, - int iEnd, - int iNodesCap, - F* tempFreq, - int* tempIndex, - int* tempIsLeaf, - int& tempLength); - -/******************************************************************************** - * definition - ********************************************************************************/ - -// clang-format off -template -__device__ void parMerge( - F* copyFreq, int* copyIndex, int* copyIsLeaf, int cStart, int cEnd, - F* iNodesFreq, int iStart, int iEnd, int iNodesCap, - F* tempFreq, int* tempIndex, int* tempIsLeaf, int& tempLength, - uint32_t* diagonal_path_intersections, int blocks, int threads, - /* Shared Memory */ - int32_t& x_top, int32_t& y_top, int32_t& x_bottom, int32_t& y_bottom, - int32_t& found, int32_t* oneorzero) - { - // clang-format on - auto current_grid = cg::this_grid(); - current_grid.sync(); - tempLength = (cEnd - cStart) + MOD(iEnd - iStart, iNodesCap); - - if (tempLength == 0) return; - - // Perform the global diagonal intersection serach to divide work among SMs - cudaWorkloadDiagonals( - copyFreq, copyIndex, copyIsLeaf, cStart, cEnd, // - iNodesFreq, iStart, iEnd, iNodesCap, // - diagonal_path_intersections, // - x_top, y_top, x_bottom, y_bottom, found, oneorzero); - current_grid.sync(); - - // Merge between global diagonals independently on each block - cudaMergeSinglePath( - copyFreq, copyIndex, copyIsLeaf, cStart, cEnd, // - iNodesFreq, iStart, iEnd, iNodesCap, // - diagonal_path_intersections, // - tempFreq, tempIndex, tempIsLeaf, tempLength); - current_grid.sync(); -} - -/* CUDAWORKLOADDIAGONALS - * Performs a 32-wide binary search on one glboal diagonal per block to find the intersection with the path. - * This divides the workload into independent merges for the next step - */ -// clang-format off -template -__device__ void cudaWorkloadDiagonals( - F* copyFreq, int* copyIndex, int* copyIsLeaf, - int cStart, int cEnd, - F* iNodesFreq, - int iStart, int iEnd, int iNodesCap, - uint32_t* diagonal_path_intersections, - /* Shared Memory */ - int32_t& x_top, int32_t& y_top, int32_t& x_bottom, int32_t& y_bottom, - int32_t& found, int32_t* oneorzero) -{ - // clang-format on - uint32_t A_length = cEnd - cStart; - uint32_t B_length = MOD(iEnd - iStart, iNodesCap); - // Calculate combined index around the MergePath "matrix" - int32_t combinedIndex = ((uint64_t)blockIdx.x * ((uint64_t)A_length + (uint64_t)B_length)) / (uint64_t)gridDim.x; - /* - __shared__ int32_t x_top, y_top, x_bottom, y_bottom, found; - __shared__ int32_t oneorzero[32]; - */ - int threadOffset = threadIdx.x - 16; - - if (threadIdx.x < 32) { - // Figure out the coordinates of our diagonal - if (A_length >= B_length) { - x_top = MIN(combinedIndex, A_length); - y_top = combinedIndex > A_length ? combinedIndex - (A_length) : 0; - x_bottom = y_top; - y_bottom = x_top; - } - else { - y_bottom = MIN(combinedIndex, B_length); - x_bottom = combinedIndex > B_length ? combinedIndex - (B_length) : 0; - y_top = x_bottom; - x_top = y_bottom; - } - } - - // if (threadIdx.x == 0) { - // printf("Diagonal block %d: (%d, %d) to (%d, %d)\n", blockIdx.x, x_top, y_top, x_bottom, y_bottom); - //} - - found = 0; - - // Search the diagonal - while (!found) { - // Update our coordinates within the 32-wide section of the diagonal - int32_t current_x = x_top - ((x_top - x_bottom) >> 1) - threadOffset; - int32_t current_y = y_top + ((y_bottom - y_top) >> 1) + threadOffset; - int32_t getfrom_x = current_x + cStart - 1; - // Below statement is a more efficient, divmodless version of the following - // int32_t getfrom_y = MOD(iStart + current_y, iNodesCap); - int32_t getfrom_y = iStart + current_y; - - if (threadIdx.x < 32) { - if (getfrom_y >= iNodesCap) getfrom_y -= iNodesCap; - - // Are we a '1' or '0' with respect to A[x] <= B[x] - if (current_x > (int32_t)A_length or current_y < 0) { oneorzero[threadIdx.x] = 0; } - else if (current_y >= (int32_t)B_length || current_x < 1) { - oneorzero[threadIdx.x] = 1; - } - else { - oneorzero[threadIdx.x] = (copyFreq[getfrom_x] <= iNodesFreq[getfrom_y]) ? 1 : 0; - } - } - - __syncthreads(); - - // If we find the meeting of the '1's and '0's, we found the - // intersection of the path and diagonal - if (threadIdx.x > 0 and // - threadIdx.x < 32 and // - (oneorzero[threadIdx.x] != oneorzero[threadIdx.x - 1]) // - ) { - found = 1; - - diagonal_path_intersections[blockIdx.x] = current_x; - diagonal_path_intersections[blockIdx.x + gridDim.x + 1] = current_y; - } - - __syncthreads(); - - // Adjust the search window on the diagonal - if (threadIdx.x == 16) { - if (oneorzero[31] != 0) { - x_bottom = current_x; - y_bottom = current_y; - } - else { - x_top = current_x; - y_top = current_y; - } - } - __syncthreads(); - } - - // Set the boundary diagonals (through 0,0 and A_length,B_length) - if (threadIdx.x == 0 && blockIdx.x == 0) { - diagonal_path_intersections[0] = 0; - diagonal_path_intersections[gridDim.x + 1] = 0; - diagonal_path_intersections[gridDim.x] = A_length; - diagonal_path_intersections[gridDim.x + gridDim.x + 1] = B_length; - } -} - -// Serial merge -// clang-format off -template -__device__ void merge( - F* copyFreq, int* copyIndex, int* copyIsLeaf, int cStart, int cEnd, - F* iNodesFreq, int iStart, int iEnd, int iNodesCap, - F* tempFreq, int* tempIndex, int* tempIsLeaf, int& tempLength) -{ - // clang-format on - int len = 0; - int iterCopy = cStart, iterINodes = iStart; - - while (iterCopy < cEnd && MOD(iEnd - iterINodes, iNodesCap) > 0) { - if (copyFreq[iterCopy] <= iNodesFreq[iterINodes]) { - tempFreq[len] = copyFreq[iterCopy]; - tempIndex[len] = copyIndex[iterCopy]; - tempIsLeaf[len] = copyIsLeaf[iterCopy]; - ++iterCopy; - } - else { - tempFreq[len] = iNodesFreq[iterINodes]; - tempIndex[len] = iterINodes; - tempIsLeaf[len] = 0; - iterINodes = MOD(iterINodes + 1, iNodesCap); - } - ++len; - } - - while (iterCopy < cEnd) { - tempFreq[len] = copyFreq[iterCopy]; - tempIndex[len] = copyIndex[iterCopy]; - tempIsLeaf[len] = copyIsLeaf[iterCopy]; - ++iterCopy; - ++len; - } - while (MOD(iEnd - iterINodes, iNodesCap) > 0) { - tempFreq[len] = iNodesFreq[iterINodes]; - tempIndex[len] = iterINodes; - tempIsLeaf[len] = 0; - iterINodes = MOD(iterINodes + 1, iNodesCap); - ++len; - } - - tempLength = len; -} - -/* CUDAMERGESINGLEPATH - * Performs merge windows within a thread block from that block's global diagonal - * intersection to the next - */ -#define K 512 -#define PAD_SIZE 0 - -// clang-format off -template -__device__ void cudaMergeSinglePath( - F* copyFreq, int* copyIndex, int* copyIsLeaf, - int cStart, int cEnd, - F* iNodesFreq, - int iStart, int iEnd, int iNodesCap, - uint32_t* diagonal_path_intersections, - F* tempFreq, int* tempIndex, int* tempIsLeaf, - int tempLength) -{ - // clang-format on - // Temporary Code -- Serial Merge Per Block - if (threadIdx.x == 0) { - // Boundaries - int x_block_top = diagonal_path_intersections[blockIdx.x]; - int y_block_top = diagonal_path_intersections[blockIdx.x + gridDim.x + 1]; - int x_block_stop = diagonal_path_intersections[blockIdx.x + 1]; - int y_block_stop = diagonal_path_intersections[blockIdx.x + gridDim.x + 2]; - - // Actual indexes - int x_start = x_block_top + cStart; - int x_end = x_block_stop + cStart; - int y_start = MOD(iStart + y_block_top, iNodesCap); - int y_end = MOD(iStart + y_block_stop, iNodesCap); - - int offset = x_block_top + y_block_top; - - int dummy; // Unused result - // TODO optimize serial merging of each partition - merge( - copyFreq, copyIndex, copyIsLeaf, x_start, x_end, // - iNodesFreq, y_start, y_end, iNodesCap, // - tempFreq + offset, tempIndex + offset, tempIsLeaf + offset, dummy); - if (0) { - printf( - "block: %d x: %d %d, y: %d %d, contrib: %d\n", blockIdx.x, x_block_top, x_block_stop, y_block_top, - y_block_stop, dummy); - } - } -} - -// `unsigned int` instantiations -template __device__ void parMerge( - unsigned int* copyFreq, - int* copyIndex, - int* copyIsLeaf, - int cStart, - int cEnd, - unsigned int* iNodesFreq, - int iStart, - int iEnd, - int iNodesCap, - unsigned int* tempFreq, - int* tempIndex, - int* tempIsLeaf, - int& tempLength, - uint32_t* diagonal_path_intersections, - int blocks, - int threads, - /* Shared Memory */ - int32_t& x_top, - int32_t& y_top, - int32_t& x_bottom, - int32_t& y_bottom, - int32_t& found, - int32_t* oneorzero); - -template __device__ void merge( - unsigned int* copyFreq, - int* copyIndex, - int* copyIsLeaf, - int cStart, - int cEnd, - unsigned int* iNodesFreq, - int iStart, - int iEnd, - int iNodesCap, - unsigned int* tempFreq, - int* tempIndex, - int* tempIsLeaf, - int& tempLength); - +/* + * Authors: + * Oded Green (ogreen@gatech.edu), Rob McColl (robert.c.mccoll@gmail.com) + * High Performance Computing Lab, Georgia Tech + * + * Future Publication: + * GPU MergePath: A GPU Merging Algorithm + * ACM International Conference on Supercomputing 2012 + * June 25-29 2012, San Servolo, Venice, Italy + * + * (C) 2012 Georgia Institute of Technology + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * - Neither the name of the Georgia Institute of Technology nor the names of + * its contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * @file par_merge.h + * @author Oded Green (ogreen@gatech.edu), Rob McColl (robert.c.mccoll@gmail.com)) + * @brief Modified and adapted by Cody Rivera + * @version 0.3 + * @date 2020-10-24 + * (created) 2020-06 (rev) 2021-06-21 + * + */ + +#ifndef CUSZ_KERNEL_PAR_MERGE_CUH +#define CUSZ_KERNEL_PAR_MERGE_CUH + +#include +#include +#include +#include +#include +#include + +#include +namespace cg = cooperative_groups; + +#define MAX(X, Y) (((X) > (Y)) ? (X) : (Y)) +#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y)) +// Mathematically correct modulo +#define MOD(a, b) ((((a) % (b)) + (b)) % (b)) + +/* MERGETYPE + * Performs merges of two sorted pseudorandom arrays of length + * Times the runs and reports on the average time + * Checks the output of each merge for correctness + */ +#define PADDING 1024 + +/******************************************************************************** + * signature + ********************************************************************************/ + +// Partition array +template +__device__ void cudaWorkloadDiagonals( + F* copyFreq, + int* copyIndex, + int* copyIsLeaf, + int cStart, + int cEnd, + F* iNodesFreq, + int iStart, + int iEnd, + int iNodesCap, + uint32_t* diagonal_path_intersections, + /* Shared Memory */ + int32_t& x_top, + int32_t& y_top, + int32_t& x_bottom, + int32_t& y_bottom, + int32_t& found, + int32_t* oneorzero); + +// Merge partitions +template +__device__ void cudaMergeSinglePath( + F* copyFreq, + int* copyIndex, + int* copyIsLeaf, + int cStart, + int cEnd, + F* iNodesFreq, + int iStart, + int iEnd, + int iNodesCap, + uint32_t* diagonal_path_intersections, + F* tempFreq, + int* tempIndex, + int* tempIsLeaf, + int tempLength); + +template +__device__ void parMerge( + F* copyFreq, + int* copyIndex, + int* copyIsLeaf, + int cStart, + int cEnd, + F* iNodesFreq, + int iStart, + int iEnd, + int iNodesCap, + F* tempFreq, + int* tempIndex, + int* tempIsLeaf, + int& tempLength, + uint32_t* diagonal_path_intersections, + int blocks, + int threads, + /* Shared Memory */ + int32_t& x_top, + int32_t& y_top, + int32_t& x_bottom, + int32_t& y_bottom, + int32_t& found, + int32_t* oneorzero); + +template +__device__ void merge( + F* copyFreq, + int* copyIndex, + int* copyIsLeaf, + int cStart, + int cEnd, + F* iNodesFreq, + int iStart, + int iEnd, + int iNodesCap, + F* tempFreq, + int* tempIndex, + int* tempIsLeaf, + int& tempLength); + +/******************************************************************************** + * definition + ********************************************************************************/ + +// clang-format off +template +__device__ void parMerge( + F* copyFreq, int* copyIndex, int* copyIsLeaf, int cStart, int cEnd, + F* iNodesFreq, int iStart, int iEnd, int iNodesCap, + F* tempFreq, int* tempIndex, int* tempIsLeaf, int& tempLength, + uint32_t* diagonal_path_intersections, int blocks, int threads, + /* Shared Memory */ + int32_t& x_top, int32_t& y_top, int32_t& x_bottom, int32_t& y_bottom, + int32_t& found, int32_t* oneorzero) + { + // clang-format on + auto current_grid = cg::this_grid(); + current_grid.sync(); + tempLength = (cEnd - cStart) + MOD(iEnd - iStart, iNodesCap); + + if (tempLength == 0) return; + + // Perform the global diagonal intersection serach to divide work among SMs + cudaWorkloadDiagonals( + copyFreq, copyIndex, copyIsLeaf, cStart, cEnd, // + iNodesFreq, iStart, iEnd, iNodesCap, // + diagonal_path_intersections, // + x_top, y_top, x_bottom, y_bottom, found, oneorzero); + current_grid.sync(); + + // Merge between global diagonals independently on each block + cudaMergeSinglePath( + copyFreq, copyIndex, copyIsLeaf, cStart, cEnd, // + iNodesFreq, iStart, iEnd, iNodesCap, // + diagonal_path_intersections, // + tempFreq, tempIndex, tempIsLeaf, tempLength); + current_grid.sync(); +} + +/* CUDAWORKLOADDIAGONALS + * Performs a 32-wide binary search on one glboal diagonal per block to find the intersection with the path. + * This divides the workload into independent merges for the next step + */ +// clang-format off +template +__device__ void cudaWorkloadDiagonals( + F* copyFreq, int* copyIndex, int* copyIsLeaf, + int cStart, int cEnd, + F* iNodesFreq, + int iStart, int iEnd, int iNodesCap, + uint32_t* diagonal_path_intersections, + /* Shared Memory */ + int32_t& x_top, int32_t& y_top, int32_t& x_bottom, int32_t& y_bottom, + int32_t& found, int32_t* oneorzero) +{ + // clang-format on + uint32_t A_length = cEnd - cStart; + uint32_t B_length = MOD(iEnd - iStart, iNodesCap); + // Calculate combined index around the MergePath "matrix" + int32_t combinedIndex = ((uint64_t)blockIdx.x * ((uint64_t)A_length + (uint64_t)B_length)) / (uint64_t)gridDim.x; + /* + __shared__ int32_t x_top, y_top, x_bottom, y_bottom, found; + __shared__ int32_t oneorzero[32]; + */ + int threadOffset = threadIdx.x - 16; + + if (threadIdx.x < 32) { + // Figure out the coordinates of our diagonal + if (A_length >= B_length) { + x_top = MIN(combinedIndex, A_length); + y_top = combinedIndex > A_length ? combinedIndex - (A_length) : 0; + x_bottom = y_top; + y_bottom = x_top; + } + else { + y_bottom = MIN(combinedIndex, B_length); + x_bottom = combinedIndex > B_length ? combinedIndex - (B_length) : 0; + y_top = x_bottom; + x_top = y_bottom; + } + } + + // if (threadIdx.x == 0) { + // printf("Diagonal block %d: (%d, %d) to (%d, %d)\n", blockIdx.x, x_top, y_top, x_bottom, y_bottom); + //} + + found = 0; + + // Search the diagonal + while (!found) { + // Update our coordinates within the 32-wide section of the diagonal + int32_t current_x = x_top - ((x_top - x_bottom) >> 1) - threadOffset; + int32_t current_y = y_top + ((y_bottom - y_top) >> 1) + threadOffset; + int32_t getfrom_x = current_x + cStart - 1; + // Below statement is a more efficient, divmodless version of the following + // int32_t getfrom_y = MOD(iStart + current_y, iNodesCap); + int32_t getfrom_y = iStart + current_y; + + if (threadIdx.x < 32) { + if (getfrom_y >= iNodesCap) getfrom_y -= iNodesCap; + + // Are we a '1' or '0' with respect to A[x] <= B[x] + if (current_x > (int32_t)A_length or current_y < 0) { oneorzero[threadIdx.x] = 0; } + else if (current_y >= (int32_t)B_length || current_x < 1) { + oneorzero[threadIdx.x] = 1; + } + else { + oneorzero[threadIdx.x] = (copyFreq[getfrom_x] <= iNodesFreq[getfrom_y]) ? 1 : 0; + } + } + + __syncthreads(); + + // If we find the meeting of the '1's and '0's, we found the + // intersection of the path and diagonal + if (threadIdx.x > 0 and // + threadIdx.x < 32 and // + (oneorzero[threadIdx.x] != oneorzero[threadIdx.x - 1]) // + ) { + found = 1; + + diagonal_path_intersections[blockIdx.x] = current_x; + diagonal_path_intersections[blockIdx.x + gridDim.x + 1] = current_y; + } + + __syncthreads(); + + // Adjust the search window on the diagonal + if (threadIdx.x == 16) { + if (oneorzero[31] != 0) { + x_bottom = current_x; + y_bottom = current_y; + } + else { + x_top = current_x; + y_top = current_y; + } + } + __syncthreads(); + } + + // Set the boundary diagonals (through 0,0 and A_length,B_length) + if (threadIdx.x == 0 && blockIdx.x == 0) { + diagonal_path_intersections[0] = 0; + diagonal_path_intersections[gridDim.x + 1] = 0; + diagonal_path_intersections[gridDim.x] = A_length; + diagonal_path_intersections[gridDim.x + gridDim.x + 1] = B_length; + } +} + +// Serial merge +// clang-format off +template +__device__ void merge( + F* copyFreq, int* copyIndex, int* copyIsLeaf, int cStart, int cEnd, + F* iNodesFreq, int iStart, int iEnd, int iNodesCap, + F* tempFreq, int* tempIndex, int* tempIsLeaf, int& tempLength) +{ + // clang-format on + int len = 0; + int iterCopy = cStart, iterINodes = iStart; + + while (iterCopy < cEnd && MOD(iEnd - iterINodes, iNodesCap) > 0) { + if (copyFreq[iterCopy] <= iNodesFreq[iterINodes]) { + tempFreq[len] = copyFreq[iterCopy]; + tempIndex[len] = copyIndex[iterCopy]; + tempIsLeaf[len] = copyIsLeaf[iterCopy]; + ++iterCopy; + } + else { + tempFreq[len] = iNodesFreq[iterINodes]; + tempIndex[len] = iterINodes; + tempIsLeaf[len] = 0; + iterINodes = MOD(iterINodes + 1, iNodesCap); + } + ++len; + } + + while (iterCopy < cEnd) { + tempFreq[len] = copyFreq[iterCopy]; + tempIndex[len] = copyIndex[iterCopy]; + tempIsLeaf[len] = copyIsLeaf[iterCopy]; + ++iterCopy; + ++len; + } + while (MOD(iEnd - iterINodes, iNodesCap) > 0) { + tempFreq[len] = iNodesFreq[iterINodes]; + tempIndex[len] = iterINodes; + tempIsLeaf[len] = 0; + iterINodes = MOD(iterINodes + 1, iNodesCap); + ++len; + } + + tempLength = len; +} + +/* CUDAMERGESINGLEPATH + * Performs merge windows within a thread block from that block's global diagonal + * intersection to the next + */ +#define K 512 +#define PAD_SIZE 0 + +// clang-format off +template +__device__ void cudaMergeSinglePath( + F* copyFreq, int* copyIndex, int* copyIsLeaf, + int cStart, int cEnd, + F* iNodesFreq, + int iStart, int iEnd, int iNodesCap, + uint32_t* diagonal_path_intersections, + F* tempFreq, int* tempIndex, int* tempIsLeaf, + int tempLength) +{ + // clang-format on + // Temporary Code -- Serial Merge Per Block + if (threadIdx.x == 0) { + // Boundaries + int x_block_top = diagonal_path_intersections[blockIdx.x]; + int y_block_top = diagonal_path_intersections[blockIdx.x + gridDim.x + 1]; + int x_block_stop = diagonal_path_intersections[blockIdx.x + 1]; + int y_block_stop = diagonal_path_intersections[blockIdx.x + gridDim.x + 2]; + + // Actual indexes + int x_start = x_block_top + cStart; + int x_end = x_block_stop + cStart; + int y_start = MOD(iStart + y_block_top, iNodesCap); + int y_end = MOD(iStart + y_block_stop, iNodesCap); + + int offset = x_block_top + y_block_top; + + int dummy; // Unused result + // TODO optimize serial merging of each partition + merge( + copyFreq, copyIndex, copyIsLeaf, x_start, x_end, // + iNodesFreq, y_start, y_end, iNodesCap, // + tempFreq + offset, tempIndex + offset, tempIsLeaf + offset, dummy); + if (0) { + printf( + "block: %d x: %d %d, y: %d %d, contrib: %d\n", blockIdx.x, x_block_top, x_block_stop, y_block_top, + y_block_stop, dummy); + } + } +} + +// `unsigned int` instantiations +template __device__ void parMerge( + unsigned int* copyFreq, + int* copyIndex, + int* copyIsLeaf, + int cStart, + int cEnd, + unsigned int* iNodesFreq, + int iStart, + int iEnd, + int iNodesCap, + unsigned int* tempFreq, + int* tempIndex, + int* tempIsLeaf, + int& tempLength, + uint32_t* diagonal_path_intersections, + int blocks, + int threads, + /* Shared Memory */ + int32_t& x_top, + int32_t& y_top, + int32_t& x_bottom, + int32_t& y_bottom, + int32_t& found, + int32_t* oneorzero); + +template __device__ void merge( + unsigned int* copyFreq, + int* copyIndex, + int* copyIsLeaf, + int cStart, + int cEnd, + unsigned int* iNodesFreq, + int iStart, + int iEnd, + int iNodesCap, + unsigned int* tempFreq, + int* tempIndex, + int* tempIsLeaf, + int& tempLength); + #endif \ No newline at end of file diff --git a/qtensor/compression/cusz/src/hf/hf.cc b/qtensor/compression/cusz/src/hf/hf.cc index 19387263..54b95b25 100644 --- a/qtensor/compression/cusz/src/hf/hf.cc +++ b/qtensor/compression/cusz/src/hf/hf.cc @@ -1,109 +1,109 @@ -/** - * @file codec.cc - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-04-23 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#include "common/type_traits.hh" - -#include "hf/hf.hh" -#include "hf/hf_bookg.hh" -#include "hf/hf_codecg.hh" - -namespace cusz { - -#define TEMPLATE_TYPE template -#define HUFFMAN_COARSE LosslessCodec - -TEMPLATE_TYPE -HUFFMAN_COARSE::~LosslessCodec() { pimpl.reset(); } - -TEMPLATE_TYPE -HUFFMAN_COARSE::LosslessCodec() : pimpl{std::make_unique()} {} - -TEMPLATE_TYPE -HUFFMAN_COARSE::LosslessCodec(const HUFFMAN_COARSE& old) : pimpl{std::make_unique(*old.pimpl)} -{ - // TODO allocation/deep copy -} - -TEMPLATE_TYPE -HUFFMAN_COARSE& HUFFMAN_COARSE::operator=(const HUFFMAN_COARSE& old) -{ - *pimpl = *old.pimpl; - // TODO allocation/deep copy - return *this; -} - -TEMPLATE_TYPE -HUFFMAN_COARSE::LosslessCodec(HUFFMAN_COARSE&&) = default; - -TEMPLATE_TYPE -HUFFMAN_COARSE& HUFFMAN_COARSE::operator=(HUFFMAN_COARSE&&) = default; - -//------------------------------------------------------------------------------ - -TEMPLATE_TYPE -void HUFFMAN_COARSE::init(size_t const in_uncompressed_len, int const booklen, int const pardeg, bool dbg_print) -{ - pimpl->init(in_uncompressed_len, booklen, pardeg, dbg_print); -} - -TEMPLATE_TYPE -void HUFFMAN_COARSE::build_codebook(uint32_t* freq, int const booklen, cudaStream_t stream) -{ - pimpl->build_codebook(freq, booklen, stream); -} - -TEMPLATE_TYPE -void HUFFMAN_COARSE::encode( - T* in_uncompressed, - size_t const in_uncompressed_len, - BYTE*& out_compressed, - size_t& out_compressed_len, - cudaStream_t stream) -{ - pimpl->encode(in_uncompressed, in_uncompressed_len, out_compressed, out_compressed_len, stream); -} - -TEMPLATE_TYPE -void HUFFMAN_COARSE::decode(BYTE* in_compressed, T* out_decompressed, cudaStream_t stream, bool header_on_device) -{ - pimpl->decode(in_compressed, out_decompressed, stream, header_on_device); -} - -TEMPLATE_TYPE -void HUFFMAN_COARSE::clear_buffer() { pimpl->clear_buffer(); } - -TEMPLATE_TYPE -float HUFFMAN_COARSE::get_time_elapsed() const { return pimpl->get_time_elapsed(); } - -TEMPLATE_TYPE -float HUFFMAN_COARSE::get_time_book() const { return pimpl->get_time_book(); } -TEMPLATE_TYPE -float HUFFMAN_COARSE::get_time_lossless() const { return pimpl->get_time_lossless(); } - -#undef TEMPLATE_TYPE -#undef HUFFMAN_COARSE - -} // namespace cusz - -#define HUFFCOARSE_CC(E, ETF, H, M) \ - template class cusz::LosslessCodec::type, HuffTrait::type, MetadataTrait::type>; - -HUFFCOARSE_CC(1, false, 4, 4) // uint -HUFFCOARSE_CC(1, false, 8, 4) // -HUFFCOARSE_CC(2, false, 4, 4) // -HUFFCOARSE_CC(2, false, 8, 4) // -HUFFCOARSE_CC(4, false, 4, 4) // -HUFFCOARSE_CC(4, false, 8, 4) // - -HUFFCOARSE_CC(4, true, 4, 4) // float -HUFFCOARSE_CC(4, true, 8, 4) // - -#undef HUFFCOARSE_CC +/** + * @file codec.cc + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-04-23 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#include "common/type_traits.hh" + +#include "hf/hf.hh" +#include "hf/hf_bookg.hh" +#include "hf/hf_codecg.hh" + +namespace cusz { + +#define TEMPLATE_TYPE template +#define HUFFMAN_COARSE LosslessCodec + +TEMPLATE_TYPE +HUFFMAN_COARSE::~LosslessCodec() { pimpl.reset(); } + +TEMPLATE_TYPE +HUFFMAN_COARSE::LosslessCodec() : pimpl{std::make_unique()} {} + +TEMPLATE_TYPE +HUFFMAN_COARSE::LosslessCodec(const HUFFMAN_COARSE& old) : pimpl{std::make_unique(*old.pimpl)} +{ + // TODO allocation/deep copy +} + +TEMPLATE_TYPE +HUFFMAN_COARSE& HUFFMAN_COARSE::operator=(const HUFFMAN_COARSE& old) +{ + *pimpl = *old.pimpl; + // TODO allocation/deep copy + return *this; +} + +TEMPLATE_TYPE +HUFFMAN_COARSE::LosslessCodec(HUFFMAN_COARSE&&) = default; + +TEMPLATE_TYPE +HUFFMAN_COARSE& HUFFMAN_COARSE::operator=(HUFFMAN_COARSE&&) = default; + +//------------------------------------------------------------------------------ + +TEMPLATE_TYPE +void HUFFMAN_COARSE::init(size_t const in_uncompressed_len, int const booklen, int const pardeg, bool dbg_print) +{ + pimpl->init(in_uncompressed_len, booklen, pardeg, dbg_print); +} + +TEMPLATE_TYPE +void HUFFMAN_COARSE::build_codebook(uint32_t* freq, int const booklen, cudaStream_t stream) +{ + pimpl->build_codebook(freq, booklen, stream); +} + +TEMPLATE_TYPE +void HUFFMAN_COARSE::encode( + T* in_uncompressed, + size_t const in_uncompressed_len, + BYTE*& out_compressed, + size_t& out_compressed_len, + cudaStream_t stream) +{ + pimpl->encode(in_uncompressed, in_uncompressed_len, out_compressed, out_compressed_len, stream); +} + +TEMPLATE_TYPE +void HUFFMAN_COARSE::decode(BYTE* in_compressed, T* out_decompressed, cudaStream_t stream, bool header_on_device) +{ + pimpl->decode(in_compressed, out_decompressed, stream, header_on_device); +} + +TEMPLATE_TYPE +void HUFFMAN_COARSE::clear_buffer() { pimpl->clear_buffer(); } + +TEMPLATE_TYPE +float HUFFMAN_COARSE::get_time_elapsed() const { return pimpl->get_time_elapsed(); } + +TEMPLATE_TYPE +float HUFFMAN_COARSE::get_time_book() const { return pimpl->get_time_book(); } +TEMPLATE_TYPE +float HUFFMAN_COARSE::get_time_lossless() const { return pimpl->get_time_lossless(); } + +#undef TEMPLATE_TYPE +#undef HUFFMAN_COARSE + +} // namespace cusz + +#define HUFFCOARSE_CC(E, ETF, H, M) \ + template class cusz::LosslessCodec::type, HuffTrait::type, MetadataTrait::type>; + +HUFFCOARSE_CC(1, false, 4, 4) // uint +HUFFCOARSE_CC(1, false, 8, 4) // +HUFFCOARSE_CC(2, false, 4, 4) // +HUFFCOARSE_CC(2, false, 8, 4) // +HUFFCOARSE_CC(4, false, 4, 4) // +HUFFCOARSE_CC(4, false, 8, 4) // + +HUFFCOARSE_CC(4, true, 4, 4) // float +HUFFCOARSE_CC(4, true, 8, 4) // + +#undef HUFFCOARSE_CC diff --git a/qtensor/compression/cusz/src/hf/hf_bookg.cu b/qtensor/compression/cusz/src/hf/hf_bookg.cu index fc6d3ac9..9bcb37ba 100644 --- a/qtensor/compression/cusz/src/hf/hf_bookg.cu +++ b/qtensor/compression/cusz/src/hf/hf_bookg.cu @@ -1,33 +1,33 @@ -/** - * @file hf_bookg.cu - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-11-03 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "detail/hf_bookg.inl" -#include "hf/hf_bookg.hh" - -#define PAR_BOOK(T, H) \ - template void asz::hf_buildbook_g(uint32_t*, int const, H*, uint8_t*, int const, float*, cudaStream_t); - -PAR_BOOK(uint8_t, uint32_t); -PAR_BOOK(uint16_t, uint32_t); -PAR_BOOK(uint32_t, uint32_t); -PAR_BOOK(float, uint32_t); - -PAR_BOOK(uint8_t, uint64_t); -PAR_BOOK(uint16_t, uint64_t); -PAR_BOOK(uint32_t, uint64_t); -PAR_BOOK(float, uint64_t); - -PAR_BOOK(uint8_t, unsigned long long); -PAR_BOOK(uint16_t, unsigned long long); -PAR_BOOK(uint32_t, unsigned long long); -PAR_BOOK(float, unsigned long long); - -#undef PAR_BOOK +/** + * @file hf_bookg.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "detail/hf_bookg.inl" +#include "hf/hf_bookg.hh" + +#define PAR_BOOK(T, H) \ + template void asz::hf_buildbook_g(uint32_t*, int const, H*, uint8_t*, int const, float*, cudaStream_t); + +PAR_BOOK(uint8_t, uint32_t); +PAR_BOOK(uint16_t, uint32_t); +PAR_BOOK(uint32_t, uint32_t); +PAR_BOOK(float, uint32_t); + +PAR_BOOK(uint8_t, uint64_t); +PAR_BOOK(uint16_t, uint64_t); +PAR_BOOK(uint32_t, uint64_t); +PAR_BOOK(float, uint64_t); + +PAR_BOOK(uint8_t, unsigned long long); +PAR_BOOK(uint16_t, unsigned long long); +PAR_BOOK(uint32_t, unsigned long long); +PAR_BOOK(float, unsigned long long); + +#undef PAR_BOOK diff --git a/qtensor/compression/cusz/src/hf/hf_codecg.cu b/qtensor/compression/cusz/src/hf/hf_codecg.cu index 9b7d9f0b..54da37f0 100644 --- a/qtensor/compression/cusz/src/hf/hf_codecg.cu +++ b/qtensor/compression/cusz/src/hf/hf_codecg.cu @@ -1,269 +1,269 @@ -/** - * @file hf_codecg.cu - * @author Jiannan Tian - * @brief kernel wrappers; launching Huffman kernels - * @version 0.3 - * @date 2022-11-02 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include -#include -#include "detail/hf_codecg.inl" -#include "hf/hf_bookg.hh" -#include "hf/hf_codecg.hh" - -template -void asz::hf_encode_coarse( - T* uncompressed, - H* d_internal_coded, - size_t const len, - uint32_t* d_freq, - H* d_book, - int const booklen, - H* d_bitstream, - M* d_par_metadata, - M* h_par_metadata, - int const sublen, - int const pardeg, - int numSMs, - uint8_t*& out_compressed, - size_t& out_compressed_len, - float& time_lossless, - cudaStream_t stream) -{ - auto d_par_nbit = d_par_metadata; - auto d_par_ncell = d_par_metadata + pardeg; - auto d_par_entry = d_par_metadata + pardeg * 2; - - auto h_par_nbit = h_par_metadata; - auto h_par_ncell = h_par_metadata + pardeg; - auto h_par_entry = h_par_metadata + pardeg * 2; - - CREATE_CUDAEVENT_PAIR; - - /* phase 1 */ - { - auto block_dim = HuffmanHelper::BLOCK_DIM_ENCODE; - auto grid_dim = ConfigHelper::get_npart(len, block_dim); - - START_CUDAEVENT_RECORDING(stream); - - asz::detail::hf_encode_phase1_fill // - <<<8 * numSMs, 256, sizeof(H) * booklen, stream>>> // - (uncompressed, len, d_book, booklen, d_internal_coded); - - STOP_CUDAEVENT_RECORDING(stream); - CHECK_CUDA(cudaStreamSynchronize(stream)); - - float stage_time; - TIME_ELAPSED_CUDAEVENT(&stage_time); - time_lossless += stage_time; - } - - /* phase 2 */ - { - auto block_dim = HuffmanHelper::BLOCK_DIM_DEFLATE; - auto grid_dim = ConfigHelper::get_npart(pardeg, block_dim); - - START_CUDAEVENT_RECORDING(stream); - - asz::detail::hf_encode_phase2_deflate // - <<>> // - (d_internal_coded, len, d_par_nbit, d_par_ncell, sublen, pardeg); - - STOP_CUDAEVENT_RECORDING(stream); - CHECK_CUDA(cudaStreamSynchronize(stream)); - - float stage_time; - TIME_ELAPSED_CUDAEVENT(&stage_time); - time_lossless += stage_time; - } - - /* phase 3 */ - { - CHECK_CUDA(cudaMemcpyAsync(h_par_nbit, d_par_nbit, pardeg * sizeof(M), cudaMemcpyDeviceToHost, stream)); - CHECK_CUDA(cudaMemcpyAsync(h_par_ncell, d_par_ncell, pardeg * sizeof(M), cudaMemcpyDeviceToHost, stream)); - CHECK_CUDA(cudaStreamSynchronize(stream)); - - memcpy(h_par_entry + 1, h_par_ncell, (pardeg - 1) * sizeof(M)); - for (auto i = 1; i < pardeg; i++) h_par_entry[i] += h_par_entry[i - 1]; // inclusive scan - - CHECK_CUDA(cudaMemcpyAsync(d_par_entry, h_par_entry, pardeg * sizeof(M), cudaMemcpyHostToDevice, stream)); - CHECK_CUDA(cudaStreamSynchronize(stream)); - } - - /* phase 4 */ - { - START_CUDAEVENT_RECORDING(stream); - - asz::detail::hf_encode_phase4_concatenate<<>> // - (d_internal_coded, d_par_entry, d_par_ncell, sublen, d_bitstream); - - STOP_CUDAEVENT_RECORDING(stream); - CHECK_CUDA(cudaStreamSynchronize(stream)); - - float stage_time; - TIME_ELAPSED_CUDAEVENT(&stage_time); - time_lossless += stage_time; - } - - DESTROY_CUDAEVENT_PAIR; -} - -template -void asz::hf_encode_coarse_rev1( - T* uncompressed, - size_t const len, - hf_book* book_desc, - hf_bitstream* bitstream_desc, - uint8_t*& out_compressed, // 22-10-12 buggy - size_t& out_compressed_len, // 22-10-12 buggy - float& time_lossless, - cudaStream_t stream) -{ - CREATE_CUDAEVENT_PAIR; - - H* d_buffer = (H*)bitstream_desc->buffer; - H* d_bitstream = (H*)bitstream_desc->bitstream; - H* d_book = (H*)book_desc->book; - int const booklen = book_desc->booklen; - int const sublen = bitstream_desc->sublen; - int const pardeg = bitstream_desc->pardeg; - int const numSMs = bitstream_desc->numSMs; - // uint32_t* d_freq = book_desc->freq; - - auto d_par_nbit = (M*)bitstream_desc->d_metadata->bits; - auto d_par_ncell = (M*)bitstream_desc->d_metadata->cells; - auto d_par_entry = (M*)bitstream_desc->d_metadata->entries; - - auto h_par_nbit = (M*)bitstream_desc->h_metadata->bits; - auto h_par_ncell = (M*)bitstream_desc->h_metadata->cells; - auto h_par_entry = (M*)bitstream_desc->h_metadata->entries; - - /* phase 1 */ - { - auto block_dim = HuffmanHelper::BLOCK_DIM_ENCODE; - auto grid_dim = ConfigHelper::get_npart(len, block_dim); - - START_CUDAEVENT_RECORDING(stream); - - asz::detail::hf_encode_phase1_fill // - <<<8 * numSMs, 256, sizeof(H) * booklen, stream>>> // - (uncompressed, len, d_book, booklen, d_buffer); - - STOP_CUDAEVENT_RECORDING(stream); - CHECK_CUDA(cudaStreamSynchronize(stream)); - - float stage_time; - TIME_ELAPSED_CUDAEVENT(&stage_time); - time_lossless += stage_time; - } - - /* phase 2 */ - { - auto block_dim = HuffmanHelper::BLOCK_DIM_DEFLATE; - auto grid_dim = ConfigHelper::get_npart(pardeg, block_dim); - - START_CUDAEVENT_RECORDING(stream); - - asz::detail::hf_encode_phase2_deflate // - <<>> // - (d_buffer, len, d_par_nbit, d_par_ncell, sublen, pardeg); - - STOP_CUDAEVENT_RECORDING(stream); - CHECK_CUDA(cudaStreamSynchronize(stream)); - - float stage_time; - TIME_ELAPSED_CUDAEVENT(&stage_time); - time_lossless += stage_time; - } - - /* phase 3 */ - { - CHECK_CUDA(cudaMemcpyAsync(h_par_nbit, d_par_nbit, pardeg * sizeof(M), cudaMemcpyDeviceToHost, stream)); - CHECK_CUDA(cudaMemcpyAsync(h_par_ncell, d_par_ncell, pardeg * sizeof(M), cudaMemcpyDeviceToHost, stream)); - CHECK_CUDA(cudaStreamSynchronize(stream)); - - memcpy(h_par_entry + 1, h_par_ncell, (pardeg - 1) * sizeof(M)); - for (auto i = 1; i < pardeg; i++) h_par_entry[i] += h_par_entry[i - 1]; // inclusive scan - - CHECK_CUDA(cudaMemcpyAsync(d_par_entry, h_par_entry, pardeg * sizeof(M), cudaMemcpyHostToDevice, stream)); - CHECK_CUDA(cudaStreamSynchronize(stream)); - } - - /* phase 4 */ - { - START_CUDAEVENT_RECORDING(stream); - - asz::detail::hf_encode_phase4_concatenate<<>> // - (d_buffer, d_par_entry, d_par_ncell, sublen, d_bitstream); - - STOP_CUDAEVENT_RECORDING(stream); - - CHECK_CUDA(cudaStreamSynchronize(stream)); - - float stage_time; - TIME_ELAPSED_CUDAEVENT(&stage_time); - time_lossless += stage_time; - } -} - -template -void asz::hf_decode_coarse( - H* d_bitstream, - uint8_t* d_revbook, - int const revbook_nbyte, - M* d_par_nbit, - M* d_par_entry, - int const sublen, - int const pardeg, - T* out_decompressed, - float& time_lossless, - cudaStream_t stream) -{ - auto const block_dim = HuffmanHelper::BLOCK_DIM_DEFLATE; // = deflating - auto const grid_dim = ConfigHelper::get_npart(pardeg, block_dim); - - CREATE_CUDAEVENT_PAIR; - START_CUDAEVENT_RECORDING(stream) - - hf_decode_kernel // - <<>> // - (d_bitstream, d_revbook, d_par_nbit, d_par_entry, revbook_nbyte, sublen, pardeg, out_decompressed); - - STOP_CUDAEVENT_RECORDING(stream) - cudaStreamSynchronize(stream); - - TIME_ELAPSED_CUDAEVENT(&time_lossless); - DESTROY_CUDAEVENT_PAIR; -} - -#define HF_CODEC_INIT(T, H, M) \ - template void asz::hf_encode_coarse( \ - T*, H*, size_t const, uint32_t*, H*, int const, H*, M*, M*, int const, int const, int, uint8_t*&, size_t&, \ - float&, cudaStream_t); \ - \ - template void asz::hf_encode_coarse_rev1( \ - T*, size_t const, hf_book*, hf_bitstream*, uint8_t*&, size_t&, float&, cudaStream_t); \ - \ - template void asz::hf_decode_coarse( \ - H*, uint8_t*, int const, M*, M*, int const, int const, T*, float&, cudaStream_t); - -HF_CODEC_INIT(uint8_t, uint32_t, uint32_t); -HF_CODEC_INIT(uint16_t, uint32_t, uint32_t); -HF_CODEC_INIT(uint32_t, uint32_t, uint32_t); -HF_CODEC_INIT(float, uint32_t, uint32_t); -HF_CODEC_INIT(uint8_t, uint64_t, uint32_t); -HF_CODEC_INIT(uint16_t, uint64_t, uint32_t); -HF_CODEC_INIT(uint32_t, uint64_t, uint32_t); -HF_CODEC_INIT(float, uint64_t, uint32_t); -HF_CODEC_INIT(uint8_t, unsigned long long, uint32_t); -HF_CODEC_INIT(uint16_t, unsigned long long, uint32_t); -HF_CODEC_INIT(uint32_t, unsigned long long, uint32_t); -HF_CODEC_INIT(float, unsigned long long, uint32_t); - -#undef HFBOOK_INIT -#undef HF_CODEC_INIT +/** + * @file hf_codecg.cu + * @author Jiannan Tian + * @brief kernel wrappers; launching Huffman kernels + * @version 0.3 + * @date 2022-11-02 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include +#include +#include "detail/hf_codecg.inl" +#include "hf/hf_bookg.hh" +#include "hf/hf_codecg.hh" + +template +void asz::hf_encode_coarse( + T* uncompressed, + H* d_internal_coded, + size_t const len, + uint32_t* d_freq, + H* d_book, + int const booklen, + H* d_bitstream, + M* d_par_metadata, + M* h_par_metadata, + int const sublen, + int const pardeg, + int numSMs, + uint8_t*& out_compressed, + size_t& out_compressed_len, + float& time_lossless, + cudaStream_t stream) +{ + auto d_par_nbit = d_par_metadata; + auto d_par_ncell = d_par_metadata + pardeg; + auto d_par_entry = d_par_metadata + pardeg * 2; + + auto h_par_nbit = h_par_metadata; + auto h_par_ncell = h_par_metadata + pardeg; + auto h_par_entry = h_par_metadata + pardeg * 2; + + CREATE_CUDAEVENT_PAIR; + + /* phase 1 */ + { + auto block_dim = HuffmanHelper::BLOCK_DIM_ENCODE; + auto grid_dim = ConfigHelper::get_npart(len, block_dim); + + START_CUDAEVENT_RECORDING(stream); + + asz::detail::hf_encode_phase1_fill // + <<<8 * numSMs, 256, sizeof(H) * booklen, stream>>> // + (uncompressed, len, d_book, booklen, d_internal_coded); + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + float stage_time; + TIME_ELAPSED_CUDAEVENT(&stage_time); + time_lossless += stage_time; + } + + /* phase 2 */ + { + auto block_dim = HuffmanHelper::BLOCK_DIM_DEFLATE; + auto grid_dim = ConfigHelper::get_npart(pardeg, block_dim); + + START_CUDAEVENT_RECORDING(stream); + + asz::detail::hf_encode_phase2_deflate // + <<>> // + (d_internal_coded, len, d_par_nbit, d_par_ncell, sublen, pardeg); + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + float stage_time; + TIME_ELAPSED_CUDAEVENT(&stage_time); + time_lossless += stage_time; + } + + /* phase 3 */ + { + CHECK_CUDA(cudaMemcpyAsync(h_par_nbit, d_par_nbit, pardeg * sizeof(M), cudaMemcpyDeviceToHost, stream)); + CHECK_CUDA(cudaMemcpyAsync(h_par_ncell, d_par_ncell, pardeg * sizeof(M), cudaMemcpyDeviceToHost, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + memcpy(h_par_entry + 1, h_par_ncell, (pardeg - 1) * sizeof(M)); + for (auto i = 1; i < pardeg; i++) h_par_entry[i] += h_par_entry[i - 1]; // inclusive scan + + CHECK_CUDA(cudaMemcpyAsync(d_par_entry, h_par_entry, pardeg * sizeof(M), cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + } + + /* phase 4 */ + { + START_CUDAEVENT_RECORDING(stream); + + asz::detail::hf_encode_phase4_concatenate<<>> // + (d_internal_coded, d_par_entry, d_par_ncell, sublen, d_bitstream); + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + float stage_time; + TIME_ELAPSED_CUDAEVENT(&stage_time); + time_lossless += stage_time; + } + + DESTROY_CUDAEVENT_PAIR; +} + +template +void asz::hf_encode_coarse_rev1( + T* uncompressed, + size_t const len, + hf_book* book_desc, + hf_bitstream* bitstream_desc, + uint8_t*& out_compressed, // 22-10-12 buggy + size_t& out_compressed_len, // 22-10-12 buggy + float& time_lossless, + cudaStream_t stream) +{ + CREATE_CUDAEVENT_PAIR; + + H* d_buffer = (H*)bitstream_desc->buffer; + H* d_bitstream = (H*)bitstream_desc->bitstream; + H* d_book = (H*)book_desc->book; + int const booklen = book_desc->booklen; + int const sublen = bitstream_desc->sublen; + int const pardeg = bitstream_desc->pardeg; + int const numSMs = bitstream_desc->numSMs; + // uint32_t* d_freq = book_desc->freq; + + auto d_par_nbit = (M*)bitstream_desc->d_metadata->bits; + auto d_par_ncell = (M*)bitstream_desc->d_metadata->cells; + auto d_par_entry = (M*)bitstream_desc->d_metadata->entries; + + auto h_par_nbit = (M*)bitstream_desc->h_metadata->bits; + auto h_par_ncell = (M*)bitstream_desc->h_metadata->cells; + auto h_par_entry = (M*)bitstream_desc->h_metadata->entries; + + /* phase 1 */ + { + auto block_dim = HuffmanHelper::BLOCK_DIM_ENCODE; + auto grid_dim = ConfigHelper::get_npart(len, block_dim); + + START_CUDAEVENT_RECORDING(stream); + + asz::detail::hf_encode_phase1_fill // + <<<8 * numSMs, 256, sizeof(H) * booklen, stream>>> // + (uncompressed, len, d_book, booklen, d_buffer); + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + float stage_time; + TIME_ELAPSED_CUDAEVENT(&stage_time); + time_lossless += stage_time; + } + + /* phase 2 */ + { + auto block_dim = HuffmanHelper::BLOCK_DIM_DEFLATE; + auto grid_dim = ConfigHelper::get_npart(pardeg, block_dim); + + START_CUDAEVENT_RECORDING(stream); + + asz::detail::hf_encode_phase2_deflate // + <<>> // + (d_buffer, len, d_par_nbit, d_par_ncell, sublen, pardeg); + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + float stage_time; + TIME_ELAPSED_CUDAEVENT(&stage_time); + time_lossless += stage_time; + } + + /* phase 3 */ + { + CHECK_CUDA(cudaMemcpyAsync(h_par_nbit, d_par_nbit, pardeg * sizeof(M), cudaMemcpyDeviceToHost, stream)); + CHECK_CUDA(cudaMemcpyAsync(h_par_ncell, d_par_ncell, pardeg * sizeof(M), cudaMemcpyDeviceToHost, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + memcpy(h_par_entry + 1, h_par_ncell, (pardeg - 1) * sizeof(M)); + for (auto i = 1; i < pardeg; i++) h_par_entry[i] += h_par_entry[i - 1]; // inclusive scan + + CHECK_CUDA(cudaMemcpyAsync(d_par_entry, h_par_entry, pardeg * sizeof(M), cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + } + + /* phase 4 */ + { + START_CUDAEVENT_RECORDING(stream); + + asz::detail::hf_encode_phase4_concatenate<<>> // + (d_buffer, d_par_entry, d_par_ncell, sublen, d_bitstream); + + STOP_CUDAEVENT_RECORDING(stream); + + CHECK_CUDA(cudaStreamSynchronize(stream)); + + float stage_time; + TIME_ELAPSED_CUDAEVENT(&stage_time); + time_lossless += stage_time; + } +} + +template +void asz::hf_decode_coarse( + H* d_bitstream, + uint8_t* d_revbook, + int const revbook_nbyte, + M* d_par_nbit, + M* d_par_entry, + int const sublen, + int const pardeg, + T* out_decompressed, + float& time_lossless, + cudaStream_t stream) +{ + auto const block_dim = HuffmanHelper::BLOCK_DIM_DEFLATE; // = deflating + auto const grid_dim = ConfigHelper::get_npart(pardeg, block_dim); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream) + + hf_decode_kernel // + <<>> // + (d_bitstream, d_revbook, d_par_nbit, d_par_entry, revbook_nbyte, sublen, pardeg, out_decompressed); + + STOP_CUDAEVENT_RECORDING(stream) + cudaStreamSynchronize(stream); + + TIME_ELAPSED_CUDAEVENT(&time_lossless); + DESTROY_CUDAEVENT_PAIR; +} + +#define HF_CODEC_INIT(T, H, M) \ + template void asz::hf_encode_coarse( \ + T*, H*, size_t const, uint32_t*, H*, int const, H*, M*, M*, int const, int const, int, uint8_t*&, size_t&, \ + float&, cudaStream_t); \ + \ + template void asz::hf_encode_coarse_rev1( \ + T*, size_t const, hf_book*, hf_bitstream*, uint8_t*&, size_t&, float&, cudaStream_t); \ + \ + template void asz::hf_decode_coarse( \ + H*, uint8_t*, int const, M*, M*, int const, int const, T*, float&, cudaStream_t); + +HF_CODEC_INIT(uint8_t, uint32_t, uint32_t); +HF_CODEC_INIT(uint16_t, uint32_t, uint32_t); +HF_CODEC_INIT(uint32_t, uint32_t, uint32_t); +HF_CODEC_INIT(float, uint32_t, uint32_t); +HF_CODEC_INIT(uint8_t, uint64_t, uint32_t); +HF_CODEC_INIT(uint16_t, uint64_t, uint32_t); +HF_CODEC_INIT(uint32_t, uint64_t, uint32_t); +HF_CODEC_INIT(float, uint64_t, uint32_t); +HF_CODEC_INIT(uint8_t, unsigned long long, uint32_t); +HF_CODEC_INIT(uint16_t, unsigned long long, uint32_t); +HF_CODEC_INIT(uint32_t, unsigned long long, uint32_t); +HF_CODEC_INIT(float, unsigned long long, uint32_t); + +#undef HFBOOK_INIT +#undef HF_CODEC_INIT diff --git a/qtensor/compression/cusz/src/hf/hf_pimpl.cu b/qtensor/compression/cusz/src/hf/hf_pimpl.cu index 595ccea4..08a35282 100644 --- a/qtensor/compression/cusz/src/hf/hf_pimpl.cu +++ b/qtensor/compression/cusz/src/hf/hf_pimpl.cu @@ -1,31 +1,31 @@ -/** - * @file huffman_coarse.cu - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2021-12-17 - * (created) 2020-04-24 (rev1) 2021-09-05 (rev2) 2021-12-29 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * @copyright (C) 2021 by Washington State University, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#include "detail/hf_pimpl.inl" -#include "hf/hf.hh" - -#define HUFFCOARSE(E, ETF, H, M) \ - template class cusz::LosslessCodec::type, HuffTrait::type, MetadataTrait::type>::impl; - -HUFFCOARSE(1, false, 4, 4) // uint -HUFFCOARSE(1, false, 8, 4) // -HUFFCOARSE(2, false, 4, 4) // -HUFFCOARSE(2, false, 8, 4) // -HUFFCOARSE(4, false, 4, 4) // -HUFFCOARSE(4, false, 8, 4) // - -HUFFCOARSE(4, true, 4, 4) // float -HUFFCOARSE(4, true, 8, 4) // - -#undef HUFFCOARSE +/** + * @file huffman_coarse.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2021-12-17 + * (created) 2020-04-24 (rev1) 2021-09-05 (rev2) 2021-12-29 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * @copyright (C) 2021 by Washington State University, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include "detail/hf_pimpl.inl" +#include "hf/hf.hh" + +#define HUFFCOARSE(E, ETF, H, M) \ + template class cusz::LosslessCodec::type, HuffTrait::type, MetadataTrait::type>::impl; + +HUFFCOARSE(1, false, 4, 4) // uint +HUFFCOARSE(1, false, 8, 4) // +HUFFCOARSE(2, false, 4, 4) // +HUFFCOARSE(2, false, 8, 4) // +HUFFCOARSE(4, false, 4, 4) // +HUFFCOARSE(4, false, 8, 4) // + +HUFFCOARSE(4, true, 4, 4) // float +HUFFCOARSE(4, true, 8, 4) // + +#undef HUFFCOARSE diff --git a/qtensor/compression/cusz/src/kernel/claunch_cuda.cu b/qtensor/compression/cusz/src/kernel/claunch_cuda.cu index 5433d7d8..146a8cd1 100644 --- a/qtensor/compression/cusz/src/kernel/claunch_cuda.cu +++ b/qtensor/compression/cusz/src/kernel/claunch_cuda.cu @@ -1,76 +1,76 @@ -/** - * @file kernel_cuda.cc - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-07-24 - * - * (C) 2022 by Washington State University, Argonne National Laboratory - * - */ - -#include "detail/hist.inl" -#include "detail/spline3.inl" -// #include "hf/hf_codecg.hh" -// #include "hf/hf_struct.h" -#include "kernel/claunch_cuda.h" -#include "kernel/cpplaunch_cuda.hh" -#include "utils/cuda_err.cuh" - -#define C_SPLINE3(Tliteral, Eliteral, FPliteral, T, E, FP) \ - cusz_error_status claunch_construct_Spline3_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ - bool NO_R_SEPARATE, T* data, dim3 const len3, T* anchor, dim3 const an_len3, E* errctrl, dim3 const ec_len3, \ - double const eb, int const radius, float* time_elapsed, cudaStream_t stream) \ - { \ - if (NO_R_SEPARATE) \ - launch_construct_Spline3( \ - data, len3, anchor, an_len3, errctrl, ec_len3, eb, radius, *time_elapsed, stream); \ - else \ - launch_construct_Spline3( \ - data, len3, anchor, an_len3, errctrl, ec_len3, eb, radius, *time_elapsed, stream); \ - return CUSZ_SUCCESS; \ - } \ - cusz_error_status claunch_reconstruct_Spline3_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ - T* xdata, dim3 const len3, T* anchor, dim3 const an_len3, E* errctrl, dim3 const ec_len3, double const eb, \ - int const radius, float* time_elapsed, cudaStream_t stream) \ - { \ - launch_reconstruct_Spline3( \ - xdata, len3, anchor, an_len3, errctrl, ec_len3, eb, radius, *time_elapsed, stream); \ - return CUSZ_SUCCESS; \ - } - -C_SPLINE3(fp32, ui8, fp32, float, uint8_t, float); -C_SPLINE3(fp32, ui16, fp32, float, uint16_t, float); -C_SPLINE3(fp32, ui32, fp32, float, uint32_t, float); -C_SPLINE3(fp32, fp32, fp32, float, float, float); - -#undef C_SPLINE3 - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#define CPP_SPLINE3(Tliteral, Eliteral, FPliteral, T, E, FP) \ - template <> \ - cusz_error_status cusz::cpplaunch_construct_Spline3( \ - bool NO_R_SEPARATE, T* data, dim3 const len3, T* anchor, dim3 const an_len3, E* eq, dim3 const ec_len3, \ - double const eb, int const radius, float* time_elapsed, cudaStream_t stream) \ - { \ - return claunch_construct_Spline3_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ - NO_R_SEPARATE, data, len3, anchor, an_len3, eq, ec_len3, eb, radius, time_elapsed, stream); \ - } \ - \ - template <> \ - cusz_error_status cusz::cpplaunch_reconstruct_Spline3( \ - T * xdata, dim3 const len3, T* anchor, dim3 const an_len3, E* eq, dim3 const ec_len3, double const eb, \ - int const radius, float* time_elapsed, cudaStream_t stream) \ - { \ - return claunch_reconstruct_Spline3_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ - xdata, len3, anchor, an_len3, eq, ec_len3, eb, radius, time_elapsed, stream); \ - } - -CPP_SPLINE3(fp32, ui8, fp32, float, uint8_t, float); -CPP_SPLINE3(fp32, ui16, fp32, float, uint16_t, float); -CPP_SPLINE3(fp32, ui32, fp32, float, uint32_t, float); -CPP_SPLINE3(fp32, fp32, fp32, float, float, float); - -#undef CPP_SPLINE3 +/** + * @file kernel_cuda.cc + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-07-24 + * + * (C) 2022 by Washington State University, Argonne National Laboratory + * + */ + +#include "detail/hist.inl" +#include "detail/spline3.inl" +// #include "hf/hf_codecg.hh" +// #include "hf/hf_struct.h" +#include "kernel/claunch_cuda.h" +#include "kernel/cpplaunch_cuda.hh" +#include "utils/cuda_err.cuh" + +#define C_SPLINE3(Tliteral, Eliteral, FPliteral, T, E, FP) \ + cusz_error_status claunch_construct_Spline3_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + bool NO_R_SEPARATE, T* data, dim3 const len3, T* anchor, dim3 const an_len3, E* errctrl, dim3 const ec_len3, \ + double const eb, int const radius, float* time_elapsed, cudaStream_t stream) \ + { \ + if (NO_R_SEPARATE) \ + launch_construct_Spline3( \ + data, len3, anchor, an_len3, errctrl, ec_len3, eb, radius, *time_elapsed, stream); \ + else \ + launch_construct_Spline3( \ + data, len3, anchor, an_len3, errctrl, ec_len3, eb, radius, *time_elapsed, stream); \ + return CUSZ_SUCCESS; \ + } \ + cusz_error_status claunch_reconstruct_Spline3_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + T* xdata, dim3 const len3, T* anchor, dim3 const an_len3, E* errctrl, dim3 const ec_len3, double const eb, \ + int const radius, float* time_elapsed, cudaStream_t stream) \ + { \ + launch_reconstruct_Spline3( \ + xdata, len3, anchor, an_len3, errctrl, ec_len3, eb, radius, *time_elapsed, stream); \ + return CUSZ_SUCCESS; \ + } + +C_SPLINE3(fp32, ui8, fp32, float, uint8_t, float); +C_SPLINE3(fp32, ui16, fp32, float, uint16_t, float); +C_SPLINE3(fp32, ui32, fp32, float, uint32_t, float); +C_SPLINE3(fp32, fp32, fp32, float, float, float); + +#undef C_SPLINE3 + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#define CPP_SPLINE3(Tliteral, Eliteral, FPliteral, T, E, FP) \ + template <> \ + cusz_error_status cusz::cpplaunch_construct_Spline3( \ + bool NO_R_SEPARATE, T* data, dim3 const len3, T* anchor, dim3 const an_len3, E* eq, dim3 const ec_len3, \ + double const eb, int const radius, float* time_elapsed, cudaStream_t stream) \ + { \ + return claunch_construct_Spline3_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + NO_R_SEPARATE, data, len3, anchor, an_len3, eq, ec_len3, eb, radius, time_elapsed, stream); \ + } \ + \ + template <> \ + cusz_error_status cusz::cpplaunch_reconstruct_Spline3( \ + T * xdata, dim3 const len3, T* anchor, dim3 const an_len3, E* eq, dim3 const ec_len3, double const eb, \ + int const radius, float* time_elapsed, cudaStream_t stream) \ + { \ + return claunch_reconstruct_Spline3_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + xdata, len3, anchor, an_len3, eq, ec_len3, eb, radius, time_elapsed, stream); \ + } + +CPP_SPLINE3(fp32, ui8, fp32, float, uint8_t, float); +CPP_SPLINE3(fp32, ui16, fp32, float, uint16_t, float); +CPP_SPLINE3(fp32, ui32, fp32, float, uint32_t, float); +CPP_SPLINE3(fp32, fp32, fp32, float, float, float); + +#undef CPP_SPLINE3 diff --git a/qtensor/compression/cusz/src/kernel/detail/hist.inl b/qtensor/compression/cusz/src/kernel/detail/hist.inl index a3781eb6..1950970d 100644 --- a/qtensor/compression/cusz/src/kernel/detail/hist.inl +++ b/qtensor/compression/cusz/src/kernel/detail/hist.inl @@ -1,100 +1,100 @@ -/** - * @file hist.inl - * @author Cody Rivera (cjrivera1@crimson.ua.edu), Megan Hickman Fulp (mlhickm@g.clemson.edu) - * @brief Fast histogramming from [Gómez-Luna et al. 2013] - * @version 0.1 - * @date 2020-09-20 - * Created on 2020-02-16 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#ifndef CUSZ_KERNEL_HIST_CUH -#define CUSZ_KERNEL_HIST_CUH - -#include -#include -#include - -#include "common.hh" -#include "utils/timer.h" - -#define MIN(a, b) ((a) < (b)) ? (a) : (b) -const static unsigned int WARP_SIZE = 32; - -#define tix threadIdx.x -#define tiy threadIdx.y -#define tiz threadIdx.z -#define bix blockIdx.x -#define biy blockIdx.y -#define biz blockIdx.z -#define bdx blockDim.x -#define bdy blockDim.y -#define bdz blockDim.z - -namespace kernel { - -template -__global__ void NaiveHistogram(Input in_data[], int out_freq[], int N, int symbols_per_thread); - -/* Copied from J. Gomez-Luna et al */ -template -__global__ void p2013Histogram(T*, FREQ*, size_t, int, int); - -} // namespace kernel - -template -__global__ void kernel::NaiveHistogram(T in_data[], int out_freq[], int N, int symbols_per_thread) -{ - unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; - unsigned int j; - if (i * symbols_per_thread < N) { // if there is a symbol to count, - for (j = i * symbols_per_thread; j < (i + 1) * symbols_per_thread; j++) { - if (j < N) { - unsigned int item = in_data[j]; // Symbol to count - atomicAdd(&out_freq[item], 1); // update bin count by 1 - } - } - } -} - -template -__global__ void kernel::p2013Histogram(T* in_data, FREQ* out_freq, size_t N, int nbin, int R) -{ - // static_assert( - // std::numeric_limits::is_integer and (not std::numeric_limits::is_signed), - // "T must be `unsigned integer` type of {1,2,4} bytes"); - - extern __shared__ int Hs[/*(nbin + 1) * R*/]; - - const unsigned int warp_id = (int)(tix / WARP_SIZE); - const unsigned int lane = tix % WARP_SIZE; - const unsigned int warps_block = bdx / WARP_SIZE; - const unsigned int off_rep = (nbin + 1) * (tix % R); - const unsigned int begin = (N / warps_block) * warp_id + WARP_SIZE * blockIdx.x + lane; - unsigned int end = (N / warps_block) * (warp_id + 1); - const unsigned int step = WARP_SIZE * gridDim.x; - - // final warp handles data outside of the warps_block partitions - if (warp_id >= warps_block - 1) end = N; - - for (unsigned int pos = tix; pos < (nbin + 1) * R; pos += bdx) Hs[pos] = 0; - __syncthreads(); - - for (unsigned int i = begin; i < end; i += step) { - int d = in_data[i]; - d = d <= 0 and d >= nbin ? nbin / 2 : d; - atomicAdd(&Hs[off_rep + d], 1); - } - __syncthreads(); - - for (unsigned int pos = tix; pos < nbin; pos += bdx) { - int sum = 0; - for (int base = 0; base < (nbin + 1) * R; base += nbin + 1) { sum += Hs[base + pos]; } - atomicAdd(out_freq + pos, sum); - } -} - -#endif +/** + * @file hist.inl + * @author Cody Rivera (cjrivera1@crimson.ua.edu), Megan Hickman Fulp (mlhickm@g.clemson.edu) + * @brief Fast histogramming from [Gómez-Luna et al. 2013] + * @version 0.1 + * @date 2020-09-20 + * Created on 2020-02-16 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef CUSZ_KERNEL_HIST_CUH +#define CUSZ_KERNEL_HIST_CUH + +#include +#include +#include + +#include "common.hh" +#include "utils/timer.h" + +#define MIN(a, b) ((a) < (b)) ? (a) : (b) +const static unsigned int WARP_SIZE = 32; + +#define tix threadIdx.x +#define tiy threadIdx.y +#define tiz threadIdx.z +#define bix blockIdx.x +#define biy blockIdx.y +#define biz blockIdx.z +#define bdx blockDim.x +#define bdy blockDim.y +#define bdz blockDim.z + +namespace kernel { + +template +__global__ void NaiveHistogram(Input in_data[], int out_freq[], int N, int symbols_per_thread); + +/* Copied from J. Gomez-Luna et al */ +template +__global__ void p2013Histogram(T*, FREQ*, size_t, int, int); + +} // namespace kernel + +template +__global__ void kernel::NaiveHistogram(T in_data[], int out_freq[], int N, int symbols_per_thread) +{ + unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; + unsigned int j; + if (i * symbols_per_thread < N) { // if there is a symbol to count, + for (j = i * symbols_per_thread; j < (i + 1) * symbols_per_thread; j++) { + if (j < N) { + unsigned int item = in_data[j]; // Symbol to count + atomicAdd(&out_freq[item], 1); // update bin count by 1 + } + } + } +} + +template +__global__ void kernel::p2013Histogram(T* in_data, FREQ* out_freq, size_t N, int nbin, int R) +{ + // static_assert( + // std::numeric_limits::is_integer and (not std::numeric_limits::is_signed), + // "T must be `unsigned integer` type of {1,2,4} bytes"); + + extern __shared__ int Hs[/*(nbin + 1) * R*/]; + + const unsigned int warp_id = (int)(tix / WARP_SIZE); + const unsigned int lane = tix % WARP_SIZE; + const unsigned int warps_block = bdx / WARP_SIZE; + const unsigned int off_rep = (nbin + 1) * (tix % R); + const unsigned int begin = (N / warps_block) * warp_id + WARP_SIZE * blockIdx.x + lane; + unsigned int end = (N / warps_block) * (warp_id + 1); + const unsigned int step = WARP_SIZE * gridDim.x; + + // final warp handles data outside of the warps_block partitions + if (warp_id >= warps_block - 1) end = N; + + for (unsigned int pos = tix; pos < (nbin + 1) * R; pos += bdx) Hs[pos] = 0; + __syncthreads(); + + for (unsigned int i = begin; i < end; i += step) { + int d = in_data[i]; + d = d <= 0 and d >= nbin ? nbin / 2 : d; + atomicAdd(&Hs[off_rep + d], 1); + } + __syncthreads(); + + for (unsigned int pos = tix; pos < nbin; pos += bdx) { + int sum = 0; + for (int base = 0; base < (nbin + 1) * R; base += nbin + 1) { sum += Hs[base + pos]; } + atomicAdd(out_freq + pos, sum); + } +} + +#endif diff --git a/qtensor/compression/cusz/src/kernel/detail/lorenzo.inl b/qtensor/compression/cusz/src/kernel/detail/lorenzo.inl index 0e1f9acd..28fd3bdc 100644 --- a/qtensor/compression/cusz/src/kernel/detail/lorenzo.inl +++ b/qtensor/compression/cusz/src/kernel/detail/lorenzo.inl @@ -1,816 +1,816 @@ -/** - * @file lorenzo.inl - * @author Jiannan Tian - * @brief Dual-ErrCtrl Lorenzo method. - * @version 0.2 - * @date 2021-01-16 - * (create) 2019-09-23; (release) 2020-09-20; (rev1) 2021-01-16; (rev2) 2021-02-20; (rev3) 2021-04-11 - * (rev4) 2021-04-30 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#ifndef CUSZ_KERNEL_LORENZO_CUH -#define CUSZ_KERNEL_LORENZO_CUH - -#include -// #include "utils/cuda_err.cuh" -// #include "utils/timer.h" - -#if __has_include() -// #pragma message __FILE__ ": (CUDA 11 onward), cub from system path" -#include -#else -// #pragma message __FILE__ ": (CUDA 10 or earlier), cub from git submodule" -#include "../../third_party/cub/cub/cub.cuh" -#endif - -#if __cplusplus >= 201703L -#define CONSTEXPR constexpr -#else -#define CONSTEXPR -#endif - -#define TIX threadIdx.x -#define TIY threadIdx.y -#define TIZ threadIdx.z -#define BIX blockIdx.x -#define BIY blockIdx.y -#define BIZ blockIdx.z -#define BDX blockDim.x -#define BDY blockDim.y -#define BDZ blockDim.z - -using DIM = unsigned int; -using STRIDE = unsigned int; - -namespace cusz { - -/** - * @brief compress-time 1D Lorenzo pred-quant kernel - * - * @tparam Data type of input data - * @tparam ErrCtrl type of error-control code - * @tparam FP type for internal floating-point processing - * @tparam BLOCK block size - * @tparam SEQ degree of sequentiality - * @param data input - * @param errctrl output 1 - * @param outlier output 2 - * @param len3 data length in 3D - * @param stride3 data stride in 3D - * @param radius quant-code radius - * @param ebx2_r precalculated reciprocal of eb*2 - */ -template -__global__ void -c_lorenzo_1d1l(Data* data, ErrCtrl* errctrl, Data* outlier, dim3 len3, dim3 stride3, int radius, FP ebx2_r); - -/** - * @brief compress-time 2D Lorenzo pred-quant kernel - * - * @tparam Data type of input data - * @tparam ErrCtrl type of error-control code - * @tparam FP type for internal floating-point processing - * @tparam BLOCK block size - * @tparam SEQ degree of sequentiality - * @param data input - * @param errctrl output 1 - * @param outlier output 2 - * @param len3 data length in 3D - * @param stride3 data stride in 3D - * @param radius quant-code radius - * @param ebx2_r precalculated reciprocal of eb*2 - */ -template -__global__ void c_lorenzo_2d1l_16x16data_mapto16x2( - Data* data, - ErrCtrl* errctrl, - Data* outlier, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2_r); - -/** - * @brief compress-time 3D Lorenzo pred-quant kernel - * - * @tparam Data type of input data - * @tparam ErrCtrl type of error-control code - * @tparam FP type for internal floating-point processing - * @tparam BLOCK block size - * @tparam SEQ degree of sequentiality - * @param data input - * @param errctrl output 1 - * @param outlier output 2 - * @param len3 data length in 3D - * @param stride3 data stride in 3D - * @param radius quant-code radius - * @param ebx2_r precalculated reciprocal of eb*2 - */ -template -__global__ void c_lorenzo_3d1l_32x8x8data_mapto32x1x8( - Data* data, - ErrCtrl* errctrl, - Data* outlier, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2_r); - -/** - * @brief decompress-time 1D Lorenzo pred-quant kernel - * - * @tparam Data type of input data - * @tparam ErrCtrl type of error-control code - * @tparam FP type for internal floating-point processing - * @tparam BLOCK block size - * @tparam SEQ degree of sequentiality - * @param outlier input 1 - * @param quant input 2 - * @param xdata output - * @param len3 data length in 3D - * @param stride3 data stride in 3D - * @param radius quant-code radius - * @param ebx2 precalculated eb*2 - */ -template < - typename Data, - typename ErrCtrl, - typename FP = float, - int BLOCK = 256, - int SEQ = 8> -__global__ void x_lorenzo_1d1l( - Data* outlier, // - ErrCtrl* quant, - Data* xdata, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2); - -/** - * @brief decompress-time 2D Lorenzo pred-quant kernel - * - * @tparam Data type of input data - * @tparam ErrCtrl type of error-control code - * @tparam FP type for internal floating-point processing - * @tparam BLOCK block size - * @tparam SEQ degree of sequentiality - * @param outlier input 1 - * @param quant input 2 - * @param xdata output - * @param len3 data length in 3D - * @param stride3 data stride in 3D - * @param radius quant-code radius - * @param ebx2 precalculated eb*2 - */ -template -__global__ void x_lorenzo_2d1l_16x16data_mapto16x2( - Data* outlier, - ErrCtrl* quant, - Data* xdata, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2); - -/** - * @brief decompress-time 3D Lorenzo pred-quant kernel - * - * @tparam Data type of input data - * @tparam ErrCtrl type of error-control code - * @tparam FP type for internal floating-point processing - * @tparam BLOCK block size - * @tparam SEQ degree of sequentiality - * @param outlier input 1 - * @param quant input 2 - * @param xdata output - * @param len3 data length in 3D - * @param stride3 data stride in 3D - * @param radius quant-code radius - * @param ebx2 precalculated eb*2 - */ -template -__global__ void x_lorenzo_3d1l_32x8x8data_mapto32x1x8( - Data* outlier, - ErrCtrl* quant, - Data* xdata, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2); - -/** - * @brief decompress-time 3D Lorenzo pred-quant kernel (variant) - * - * @tparam Data type of input data - * @tparam ErrCtrl type of error-control code - * @tparam FP type for internal floating-point processing - * @tparam BLOCK block size - * @tparam SEQ degree of sequentiality - * @param outlier input 1 - * @param quant input 2 - * @param xdata output - * @param len3 data length in 3D - * @param stride3 data stride in 3D - * @param radius quant-code radius - * @param ebx2 precalculated eb*2 - */ -template -__global__ void x_lorenzo_3d1lvar_32x8x8data_mapto32x1x8( - Data* outlier, - ErrCtrl* quant, - Data* xdata, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2); - -} // namespace cusz - -namespace { - -/** - * @brief (Original SZ/cuSZ design) 1D: separate delta by radius in to quant-code and outlier - */ -template -__forceinline__ __device__ void pred1d_radius_separate( - Data thread_scope[SEQ], - volatile Data* shmem_data, - volatile ErrCtrl* shmem_quant, - int radius, - Data from_last_stripe = 0) -{ - if CONSTEXPR (FIRST_POINT) { // i == 0 - Data delta = thread_scope[0] - from_last_stripe; - bool quantizable = fabs(delta) < radius; - Data candidate = delta + radius; - shmem_data[0 + TIX * SEQ] = (1 - quantizable) * candidate; // output; reuse data for outlier - shmem_quant[0 + TIX * SEQ] = quantizable * static_cast(candidate); - } - else { -#pragma unroll - for (auto i = 1; i < SEQ; i++) { - Data delta = thread_scope[i] - thread_scope[i - 1]; - bool quantizable = fabs(delta) < radius; - Data candidate = delta + radius; - shmem_data[i + TIX * SEQ] = (1 - quantizable) * candidate; // output; reuse data for outlier - shmem_quant[i + TIX * SEQ] = quantizable * static_cast(candidate); - } - __syncthreads(); - } -} - -template -__forceinline__ __device__ void load1d( - Data* data, - unsigned int dimx, - unsigned int id_base, - volatile Data* shmem_data, - Data thread_scope[SEQ], - Data& from_last_stripe, - FP ebx2_r) -{ -#pragma unroll - for (auto i = 0; i < SEQ; i++) { - auto id = id_base + TIX + i * NTHREAD; - if (id < dimx) { shmem_data[TIX + i * NTHREAD] = round(data[id] * ebx2_r); } - } - __syncthreads(); - - for (auto i = 0; i < SEQ; i++) thread_scope[i] = shmem_data[TIX * SEQ + i]; - - if (TIX > 0) from_last_stripe = shmem_data[TIX * SEQ - 1]; - __syncthreads(); -} - -template -__forceinline__ __device__ void write1d( - volatile Data* shmem_data, - Data* data, - unsigned int dimx, - unsigned int id_base, - volatile ErrCtrl* shmem_quant = nullptr, - ErrCtrl* quant = nullptr) -{ -#pragma unroll - for (auto i = 0; i < SEQ; i++) { - auto id = id_base + TIX + i * NTHREAD; - if (id < dimx) { - if CONSTEXPR (NO_R_SEPARATE) { // TODO no-radius-separate uses shmem_data - quant[id] = shmem_data[TIX + i * NTHREAD]; - } - else { - data[id] = shmem_data[TIX + i * NTHREAD]; - quant[id] = shmem_quant[TIX + i * NTHREAD]; - } - } - } -} - -template -__forceinline__ __device__ void load2d_prequant( - Data* data, - Data center[YSEQ + 1], - unsigned int dimx, - unsigned int dimy, - unsigned int stridey, - unsigned int gix, - unsigned int giy_base, - FP ebx2_r) -{ - auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; - -#pragma unroll - for (auto i = 0; i < YSEQ; i++) { - if (gix < dimx and giy_base + i < dimy) center[i + 1] = round(data[get_gid(i)] * ebx2_r); - } - auto tmp = __shfl_up_sync(0xffffffff, center[YSEQ], 16); // same-warp, next-16 - if (TIY == 1) center[0] = tmp; -} - -template -__forceinline__ __device__ void pred2d(Data center[YSEQ + 1]) -{ - /* prediction - original form: Data delta = center[i] - center[i - 1] + west[i] - west[i - 1]; - short form: Data delta = center[i] - west[i]; - */ -#pragma unroll - for (auto i = YSEQ; i > 0; i--) { - center[i] -= center[i - 1]; - auto west = __shfl_up_sync(0xffffffff, center[i], 1, 16); - if (TIX > 0) center[i] -= west; - } - __syncthreads(); -} - -template -__forceinline__ __device__ void postquant_write2d( - Data center[YSEQ + 1], - ErrCtrl* quant, - Data* outlier, - unsigned int dimx, - unsigned int dimy, - unsigned int stridey, - int radius, - unsigned int gix, - unsigned int giy_base) -{ - auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; - -#pragma unroll - for (auto i = 1; i < YSEQ + 1; i++) { - auto gid = get_gid(i - 1); - - if (gix < dimx and giy_base + i - 1 < dimy) { - bool quantizable = fabs(center[i]) < radius; - Data candidate = center[i] + radius; - outlier[gid] = (1 - quantizable) * candidate; // output; reuse data for outlier - quant[gid] = quantizable * static_cast(candidate); - } - } -} - -} // namespace - -template < - typename Data, - typename ErrCtrl, - typename FP, - int BLOCK, - int SEQ> -__global__ void cusz::c_lorenzo_1d1l( // - Data* data, - ErrCtrl* quant, - Data* outlier, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2_r) -{ - constexpr auto NTHREAD = BLOCK / SEQ; - - __shared__ struct { - union { - uint8_t uninitialized[BLOCK * sizeof(Data) + BLOCK * sizeof(ErrCtrl)]; - Data data[BLOCK]; - } space; - } shmem; - - auto id_base = BIX * BLOCK; - - Data thread_scope[SEQ]; - Data from_last_stripe{0}; - - /******************************************************************************** - * load from DRAM using striped layout, perform prequant - ********************************************************************************/ - load1d(data, len3.x, id_base, shmem.space.data, thread_scope, from_last_stripe, ebx2_r); - - // the original SZ/cuSZ design - auto shmem_quant = reinterpret_cast(shmem.space.uninitialized + sizeof(Data) * BLOCK); - pred1d_radius_separate( - thread_scope, shmem.space.data, shmem_quant, radius, from_last_stripe); - pred1d_radius_separate(thread_scope, shmem.space.data, shmem_quant, radius); - write1d(shmem.space.data, outlier, len3.x, id_base, shmem_quant, quant); -} - -template -__global__ void cusz::c_lorenzo_2d1l_16x16data_mapto16x2( - Data* data, - ErrCtrl* quant, - Data* outlier, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2_r) -{ - constexpr auto BLOCK = 16; - constexpr auto YSEQ = 8; - - Data center[YSEQ + 1] = {0}; // nw n - // w center - - auto gix = BIX * BDX + TIX; // BDX == 16 - auto giy_base = BIY * BLOCK + TIY * YSEQ; // BDY * YSEQ = BLOCK == 16 - - load2d_prequant(data, center, len3.x, len3.y, stride3.y, gix, giy_base, ebx2_r); - pred2d(center); - postquant_write2d(center, quant, outlier, len3.x, len3.y, stride3.y, radius, gix, giy_base); -} - -template -__global__ void cusz::c_lorenzo_3d1l_32x8x8data_mapto32x1x8( - Data* data, - ErrCtrl* quant, - Data* outlier, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2_r) -{ - constexpr auto BLOCK = 8; - __shared__ Data shmem[8][8][32]; - - auto z = TIZ; - - auto gix = BIX * (BLOCK * 4) + TIX; - auto giy_base = BIY * BLOCK; - auto giz = BIZ * BLOCK + z; - auto base_id = gix + giy_base * stride3.y + giz * stride3.z; - - /******************************************************************************** - * load from DRAM, perform prequant - ********************************************************************************/ - if (gix < len3.x and giz < len3.z) { - for (auto y = 0; y < BLOCK; y++) { - if (giy_base + y < len3.y) { - shmem[z][y][TIX] = round(data[base_id + y * stride3.y] * ebx2_r); // prequant (fp presence) - } - } - } - __syncthreads(); // necessary to ensure correctness - - auto x = TIX % 8; - - for (auto y = 0; y < BLOCK; y++) { - Data delta; - - /******************************************************************************** - * prediction - ********************************************************************************/ - delta = shmem[z][y][TIX] - ((z > 0 and y > 0 and x > 0 ? shmem[z - 1][y - 1][TIX - 1] : 0) // dist=3 - - (y > 0 and x > 0 ? shmem[z][y - 1][TIX - 1] : 0) // dist=2 - - (z > 0 and x > 0 ? shmem[z - 1][y][TIX - 1] : 0) // - - (z > 0 and y > 0 ? shmem[z - 1][y - 1][TIX] : 0) // - + (x > 0 ? shmem[z][y][TIX - 1] : 0) // dist=1 - + (y > 0 ? shmem[z][y - 1][TIX] : 0) // - + (z > 0 ? shmem[z - 1][y][TIX] : 0)); // - - auto id = base_id + (y * stride3.y); - - bool quantizable = fabs(delta) < radius; - Data candidate = delta + radius; - if (gix < len3.x and (giy_base + y) < len3.y and giz < len3.z) { - outlier[id] = (1 - quantizable) * candidate; // output; reuse data for outlier - quant[id] = quantizable * static_cast(candidate); - } - } - /* EOF */ -} - -template -__global__ void cusz::x_lorenzo_1d1l( // - Data* outlier, - ErrCtrl* quant, - Data* xdata, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2) -{ - constexpr auto block_dim = BLOCK / SEQ; // dividable - - // coalesce-load (warp-striped) and transpose in shmem (similar for store) - typedef cub::BlockLoad BlockLoadT_outlier; - typedef cub::BlockLoad BlockLoadT_quant; - typedef cub::BlockStore BlockStoreT_xdata; - typedef cub::BlockScan - BlockScanT_xdata; // TODO autoselect algorithm - - __shared__ union TempStorage { // overlap shared memory space - typename BlockLoadT_outlier::TempStorage load_outlier; - typename BlockLoadT_quant::TempStorage load_quant; - typename BlockStoreT_xdata::TempStorage store_xdata; - typename BlockScanT_xdata::TempStorage scan_xdata; - } temp_storage; - - // thread-scope tiled data - union ThreadData { - Data xdata[SEQ]; - Data outlier[SEQ]; - } thread_scope; - ErrCtrl thread_scope_quant[SEQ]; - - /******************************************************************************** - * load to thread-private array (fuse at the same time) - * (BIX * BDX * SEQ) denotes the start of the data chunk that belongs to this thread block - ********************************************************************************/ - BlockLoadT_quant(temp_storage.load_quant).Load(quant + (BIX * BDX) * SEQ, thread_scope_quant); - __syncthreads(); // barrier for shmem reuse - BlockLoadT_outlier(temp_storage.load_outlier).Load(outlier + (BIX * BDX) * SEQ, thread_scope.outlier); - __syncthreads(); // barrier for shmem reuse - -#pragma unroll - for (auto i = 0; i < SEQ; i++) { - auto id = (BIX * BDX + TIX) * SEQ + i; - thread_scope.xdata[i] = - id < len3.x ? thread_scope.outlier[i] + static_cast(thread_scope_quant[i]) - radius : 0; - } - __syncthreads(); - - /******************************************************************************** - * perform partial-sum using cub::InclusiveSum - ********************************************************************************/ - BlockScanT_xdata(temp_storage.scan_xdata).InclusiveSum(thread_scope.xdata, thread_scope.xdata); - __syncthreads(); // barrier for shmem reuse - - /******************************************************************************** - * scale by ebx2 and write to DRAM - ********************************************************************************/ -#pragma unroll - for (auto i = 0; i < SEQ; i++) thread_scope.xdata[i] *= ebx2; - __syncthreads(); // barrier for shmem reuse - - BlockStoreT_xdata(temp_storage.store_xdata).Store(xdata + (BIX * BDX) * SEQ, thread_scope.xdata); -} - -template -__global__ void cusz::x_lorenzo_2d1l_16x16data_mapto16x2( - Data* outlier, - ErrCtrl* quant, - Data* xdata, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2) -{ - constexpr auto BLOCK = 16; - constexpr auto YSEQ = BLOCK / 2; // sequentiality in y direction - static_assert(BLOCK == 16, "In one case, we need BLOCK for 2D == 16"); - - __shared__ Data intermediate[BLOCK]; // TODO use warp shuffle to eliminate this - Data thread_scope[YSEQ]; - /* - . ------> gix (x) - | t00 t01 t02 t03 ... t0f - | ts00_0 ts00_0 ts00_0 ts00_0 - giy ts00_1 ts00_1 ts00_1 ts00_1 - (y) | | | | - ts00_7 ts00_7 ts00_7 ts00_7 - - | t10 t11 t12 t13 ... t1f - | ts00_0 ts00_0 ts00_0 ts00_0 - giy ts00_1 ts00_1 ts00_1 ts00_1 - (y) | | | | - ts00_7 ts00_7 ts00_7 ts00_7 - */ - - auto gix = BIX * BLOCK + TIX; - auto giy_base = BIY * BLOCK + TIY * YSEQ; // BDY * YSEQ = BLOCK == 16 - auto get_gid = [&](auto i) { return (giy_base + i) * stride3.y + gix; }; - - /******************************************************************************** - * load to thread-private array (fuse at the same time) - ********************************************************************************/ -#pragma unroll - for (auto i = 0; i < YSEQ; i++) { - auto gid = get_gid(i); - // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously - if (gix < len3.x and giy_base + i < len3.y) - thread_scope[i] = outlier[gid] + static_cast(quant[gid]) - radius; // fuse - else - thread_scope[i] = 0; // TODO set as init state? - } - - /******************************************************************************** - * partial-sum along y-axis, sequantially - ********************************************************************************/ - for (auto i = 1; i < YSEQ; i++) thread_scope[i] += thread_scope[i - 1]; - // two-pass: store for cross-threadscope update - if (TIY == 0) intermediate[TIX] = thread_scope[YSEQ - 1]; - __syncthreads(); - // two-pass: load and update - if (TIY == 1) { - auto tmp = intermediate[TIX]; -#pragma unroll - for (auto& i : thread_scope) i += tmp; - } - - /******************************************************************************** - * in-warp partial-sum along x-axis - ********************************************************************************/ -#pragma unroll - for (auto& i : thread_scope) { - for (auto d = 1; d < BLOCK; d *= 2) { - Data n = __shfl_up_sync(0xffffffff, i, d, 16); - if (TIX >= d) i += n; - } - i *= ebx2; - } - - /******************************************************************************** - * write to DRAM - ********************************************************************************/ -#pragma unroll - for (auto i = 0; i < YSEQ; i++) { - auto gid = get_gid(i); - if (gix < len3.x and giy_base + i < len3.y) xdata[gid] = thread_scope[i]; - } -} - -template -__global__ void cusz::x_lorenzo_3d1l_32x8x8data_mapto32x1x8( - Data* outlier, - ErrCtrl* quant, - Data* xdata, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2) -{ - constexpr auto BLOCK = 8; - constexpr auto YSEQ = BLOCK; - static_assert(BLOCK == 8, "In one case, we need BLOCK for 3D == 8"); - - __shared__ Data intermediate[BLOCK][4][8]; - Data thread_scope[YSEQ]; - - auto seg_id = TIX / 8; - auto seg_tix = TIX % 8; - - auto gix = BIX * (4 * BLOCK) + TIX, giy_base = BIY * BLOCK, giz = BIZ * BLOCK + TIZ; - auto get_gid = [&](auto y) { return giz * stride3.z + (giy_base + y) * stride3.y + gix; }; - - /******************************************************************************** - * load to thread-private array (fuse at the same time) - ********************************************************************************/ -#pragma unroll - for (auto y = 0; y < YSEQ; y++) { - auto gid = get_gid(y); - if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) - thread_scope[y] = outlier[gid] + static_cast(quant[gid]) - static_cast(radius); // fuse - else - thread_scope[y] = 0; - } - - /******************************************************************************** - * partial-sum along y-axis, sequantially - ********************************************************************************/ - for (auto y = 1; y < YSEQ; y++) thread_scope[y] += thread_scope[y - 1]; - - /******************************************************************************** - * ND partial-sums along x- and z-axis - * in-warp shuffle used: in order to perform, it's transposed after X-partial sum - ********************************************************************************/ - auto dist = 1; - Data addend; - -#pragma unroll - for (auto i = 0; i < BLOCK; i++) { - Data val = thread_scope[i]; - - for (dist = 1; dist < BLOCK; dist *= 2) { - addend = __shfl_up_sync(0xffffffff, val, dist, 8); - if (seg_tix >= dist) val += addend; - } - - // x-z transpose - intermediate[TIZ][seg_id][seg_tix] = val; - __syncthreads(); - val = intermediate[seg_tix][seg_id][TIZ]; - __syncthreads(); - - for (dist = 1; dist < BLOCK; dist *= 2) { - addend = __shfl_up_sync(0xffffffff, val, dist, 8); - if (seg_tix >= dist) val += addend; - } - - intermediate[TIZ][seg_id][seg_tix] = val; - __syncthreads(); - val = intermediate[seg_tix][seg_id][TIZ]; - __syncthreads(); - - thread_scope[i] = val; - } - - /******************************************************************************** - * write to DRAM - ********************************************************************************/ -#pragma unroll - for (auto y = 0; y < YSEQ; y++) { - if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) { xdata[get_gid(y)] = thread_scope[y] * ebx2; } - } - /* EOF */ -} - -/******************************************************************************** - * experimental prototype toward further optmization - ********************************************************************************/ -template -__global__ void cusz::x_lorenzo_3d1lvar_32x8x8data_mapto32x1x8( - Data* outlier, - ErrCtrl* quant, - Data* xdata, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2) -{ - constexpr auto BLOCK = 8; - constexpr auto YSEQ = BLOCK; - static_assert(BLOCK == 8, "In one case, we need BLOCK for 3D == 8"); - - __shared__ Data intermediate[BLOCK][4][8]; - Data thread_scope = 0; - - auto seg_id = TIX / 8; - auto seg_tix = TIX % 8; - - auto gix = BIX * (4 * BLOCK) + TIX, giy_base = BIY * BLOCK, giz = BIZ * BLOCK + TIZ; - auto get_gid = [&](auto y) { return giz * stride3.z + (giy_base + y) * stride3.y + gix; }; - - auto y = 0; - - // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously -#pragma unroll - for (y = 0; y < YSEQ; y++) { - auto gid = get_gid(y); - if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) - thread_scope += outlier[gid] + static_cast(quant[gid]) - static_cast(radius); // fuse - - Data val = thread_scope; - - // shuffle, ND partial-sums - for (auto dist = 1; dist < BLOCK; dist *= 2) { - Data addend = __shfl_up_sync(0xffffffff, val, dist, 8); - if (seg_tix >= dist) val += addend; - } - - // x-z transpose - intermediate[TIZ][seg_id][seg_tix] = val; - __syncthreads(); - val = intermediate[seg_tix][seg_id][TIZ]; - __syncthreads(); - - for (auto dist = 1; dist < BLOCK; dist *= 2) { - Data addend = __shfl_up_sync(0xffffffff, val, dist, 8); - if (seg_tix >= dist) val += addend; - } - - intermediate[TIZ][seg_id][seg_tix] = val; - __syncthreads(); - val = intermediate[seg_tix][seg_id][TIZ]; - __syncthreads(); - - // thread_scope += val; - - if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) { xdata[get_gid(y)] = val * ebx2; } - } -} - -#undef TIX -#undef TIY -#undef TIZ -#undef BIX -#undef BIY -#undef BIZ -#undef BDX -#undef BDY -#undef BDZ - -#endif +/** + * @file lorenzo.inl + * @author Jiannan Tian + * @brief Dual-ErrCtrl Lorenzo method. + * @version 0.2 + * @date 2021-01-16 + * (create) 2019-09-23; (release) 2020-09-20; (rev1) 2021-01-16; (rev2) 2021-02-20; (rev3) 2021-04-11 + * (rev4) 2021-04-30 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef CUSZ_KERNEL_LORENZO_CUH +#define CUSZ_KERNEL_LORENZO_CUH + +#include +// #include "utils/cuda_err.cuh" +// #include "utils/timer.h" + +#if __has_include() +// #pragma message __FILE__ ": (CUDA 11 onward), cub from system path" +#include +#else +// #pragma message __FILE__ ": (CUDA 10 or earlier), cub from git submodule" +#include "../../third_party/cub/cub/cub.cuh" +#endif + +#if __cplusplus >= 201703L +#define CONSTEXPR constexpr +#else +#define CONSTEXPR +#endif + +#define TIX threadIdx.x +#define TIY threadIdx.y +#define TIZ threadIdx.z +#define BIX blockIdx.x +#define BIY blockIdx.y +#define BIZ blockIdx.z +#define BDX blockDim.x +#define BDY blockDim.y +#define BDZ blockDim.z + +using DIM = unsigned int; +using STRIDE = unsigned int; + +namespace cusz { + +/** + * @brief compress-time 1D Lorenzo pred-quant kernel + * + * @tparam Data type of input data + * @tparam ErrCtrl type of error-control code + * @tparam FP type for internal floating-point processing + * @tparam BLOCK block size + * @tparam SEQ degree of sequentiality + * @param data input + * @param errctrl output 1 + * @param outlier output 2 + * @param len3 data length in 3D + * @param stride3 data stride in 3D + * @param radius quant-code radius + * @param ebx2_r precalculated reciprocal of eb*2 + */ +template +__global__ void +c_lorenzo_1d1l(Data* data, ErrCtrl* errctrl, Data* outlier, dim3 len3, dim3 stride3, int radius, FP ebx2_r); + +/** + * @brief compress-time 2D Lorenzo pred-quant kernel + * + * @tparam Data type of input data + * @tparam ErrCtrl type of error-control code + * @tparam FP type for internal floating-point processing + * @tparam BLOCK block size + * @tparam SEQ degree of sequentiality + * @param data input + * @param errctrl output 1 + * @param outlier output 2 + * @param len3 data length in 3D + * @param stride3 data stride in 3D + * @param radius quant-code radius + * @param ebx2_r precalculated reciprocal of eb*2 + */ +template +__global__ void c_lorenzo_2d1l_16x16data_mapto16x2( + Data* data, + ErrCtrl* errctrl, + Data* outlier, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r); + +/** + * @brief compress-time 3D Lorenzo pred-quant kernel + * + * @tparam Data type of input data + * @tparam ErrCtrl type of error-control code + * @tparam FP type for internal floating-point processing + * @tparam BLOCK block size + * @tparam SEQ degree of sequentiality + * @param data input + * @param errctrl output 1 + * @param outlier output 2 + * @param len3 data length in 3D + * @param stride3 data stride in 3D + * @param radius quant-code radius + * @param ebx2_r precalculated reciprocal of eb*2 + */ +template +__global__ void c_lorenzo_3d1l_32x8x8data_mapto32x1x8( + Data* data, + ErrCtrl* errctrl, + Data* outlier, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r); + +/** + * @brief decompress-time 1D Lorenzo pred-quant kernel + * + * @tparam Data type of input data + * @tparam ErrCtrl type of error-control code + * @tparam FP type for internal floating-point processing + * @tparam BLOCK block size + * @tparam SEQ degree of sequentiality + * @param outlier input 1 + * @param quant input 2 + * @param xdata output + * @param len3 data length in 3D + * @param stride3 data stride in 3D + * @param radius quant-code radius + * @param ebx2 precalculated eb*2 + */ +template < + typename Data, + typename ErrCtrl, + typename FP = float, + int BLOCK = 256, + int SEQ = 8> +__global__ void x_lorenzo_1d1l( + Data* outlier, // + ErrCtrl* quant, + Data* xdata, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2); + +/** + * @brief decompress-time 2D Lorenzo pred-quant kernel + * + * @tparam Data type of input data + * @tparam ErrCtrl type of error-control code + * @tparam FP type for internal floating-point processing + * @tparam BLOCK block size + * @tparam SEQ degree of sequentiality + * @param outlier input 1 + * @param quant input 2 + * @param xdata output + * @param len3 data length in 3D + * @param stride3 data stride in 3D + * @param radius quant-code radius + * @param ebx2 precalculated eb*2 + */ +template +__global__ void x_lorenzo_2d1l_16x16data_mapto16x2( + Data* outlier, + ErrCtrl* quant, + Data* xdata, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2); + +/** + * @brief decompress-time 3D Lorenzo pred-quant kernel + * + * @tparam Data type of input data + * @tparam ErrCtrl type of error-control code + * @tparam FP type for internal floating-point processing + * @tparam BLOCK block size + * @tparam SEQ degree of sequentiality + * @param outlier input 1 + * @param quant input 2 + * @param xdata output + * @param len3 data length in 3D + * @param stride3 data stride in 3D + * @param radius quant-code radius + * @param ebx2 precalculated eb*2 + */ +template +__global__ void x_lorenzo_3d1l_32x8x8data_mapto32x1x8( + Data* outlier, + ErrCtrl* quant, + Data* xdata, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2); + +/** + * @brief decompress-time 3D Lorenzo pred-quant kernel (variant) + * + * @tparam Data type of input data + * @tparam ErrCtrl type of error-control code + * @tparam FP type for internal floating-point processing + * @tparam BLOCK block size + * @tparam SEQ degree of sequentiality + * @param outlier input 1 + * @param quant input 2 + * @param xdata output + * @param len3 data length in 3D + * @param stride3 data stride in 3D + * @param radius quant-code radius + * @param ebx2 precalculated eb*2 + */ +template +__global__ void x_lorenzo_3d1lvar_32x8x8data_mapto32x1x8( + Data* outlier, + ErrCtrl* quant, + Data* xdata, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2); + +} // namespace cusz + +namespace { + +/** + * @brief (Original SZ/cuSZ design) 1D: separate delta by radius in to quant-code and outlier + */ +template +__forceinline__ __device__ void pred1d_radius_separate( + Data thread_scope[SEQ], + volatile Data* shmem_data, + volatile ErrCtrl* shmem_quant, + int radius, + Data from_last_stripe = 0) +{ + if CONSTEXPR (FIRST_POINT) { // i == 0 + Data delta = thread_scope[0] - from_last_stripe; + bool quantizable = fabs(delta) < radius; + Data candidate = delta + radius; + shmem_data[0 + TIX * SEQ] = (1 - quantizable) * candidate; // output; reuse data for outlier + shmem_quant[0 + TIX * SEQ] = quantizable * static_cast(candidate); + } + else { +#pragma unroll + for (auto i = 1; i < SEQ; i++) { + Data delta = thread_scope[i] - thread_scope[i - 1]; + bool quantizable = fabs(delta) < radius; + Data candidate = delta + radius; + shmem_data[i + TIX * SEQ] = (1 - quantizable) * candidate; // output; reuse data for outlier + shmem_quant[i + TIX * SEQ] = quantizable * static_cast(candidate); + } + __syncthreads(); + } +} + +template +__forceinline__ __device__ void load1d( + Data* data, + unsigned int dimx, + unsigned int id_base, + volatile Data* shmem_data, + Data thread_scope[SEQ], + Data& from_last_stripe, + FP ebx2_r) +{ +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto id = id_base + TIX + i * NTHREAD; + if (id < dimx) { shmem_data[TIX + i * NTHREAD] = round(data[id] * ebx2_r); } + } + __syncthreads(); + + for (auto i = 0; i < SEQ; i++) thread_scope[i] = shmem_data[TIX * SEQ + i]; + + if (TIX > 0) from_last_stripe = shmem_data[TIX * SEQ - 1]; + __syncthreads(); +} + +template +__forceinline__ __device__ void write1d( + volatile Data* shmem_data, + Data* data, + unsigned int dimx, + unsigned int id_base, + volatile ErrCtrl* shmem_quant = nullptr, + ErrCtrl* quant = nullptr) +{ +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto id = id_base + TIX + i * NTHREAD; + if (id < dimx) { + if CONSTEXPR (NO_R_SEPARATE) { // TODO no-radius-separate uses shmem_data + quant[id] = shmem_data[TIX + i * NTHREAD]; + } + else { + data[id] = shmem_data[TIX + i * NTHREAD]; + quant[id] = shmem_quant[TIX + i * NTHREAD]; + } + } + } +} + +template +__forceinline__ __device__ void load2d_prequant( + Data* data, + Data center[YSEQ + 1], + unsigned int dimx, + unsigned int dimy, + unsigned int stridey, + unsigned int gix, + unsigned int giy_base, + FP ebx2_r) +{ + auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; + +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + if (gix < dimx and giy_base + i < dimy) center[i + 1] = round(data[get_gid(i)] * ebx2_r); + } + auto tmp = __shfl_up_sync(0xffffffff, center[YSEQ], 16); // same-warp, next-16 + if (TIY == 1) center[0] = tmp; +} + +template +__forceinline__ __device__ void pred2d(Data center[YSEQ + 1]) +{ + /* prediction + original form: Data delta = center[i] - center[i - 1] + west[i] - west[i - 1]; + short form: Data delta = center[i] - west[i]; + */ +#pragma unroll + for (auto i = YSEQ; i > 0; i--) { + center[i] -= center[i - 1]; + auto west = __shfl_up_sync(0xffffffff, center[i], 1, 16); + if (TIX > 0) center[i] -= west; + } + __syncthreads(); +} + +template +__forceinline__ __device__ void postquant_write2d( + Data center[YSEQ + 1], + ErrCtrl* quant, + Data* outlier, + unsigned int dimx, + unsigned int dimy, + unsigned int stridey, + int radius, + unsigned int gix, + unsigned int giy_base) +{ + auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; + +#pragma unroll + for (auto i = 1; i < YSEQ + 1; i++) { + auto gid = get_gid(i - 1); + + if (gix < dimx and giy_base + i - 1 < dimy) { + bool quantizable = fabs(center[i]) < radius; + Data candidate = center[i] + radius; + outlier[gid] = (1 - quantizable) * candidate; // output; reuse data for outlier + quant[gid] = quantizable * static_cast(candidate); + } + } +} + +} // namespace + +template < + typename Data, + typename ErrCtrl, + typename FP, + int BLOCK, + int SEQ> +__global__ void cusz::c_lorenzo_1d1l( // + Data* data, + ErrCtrl* quant, + Data* outlier, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r) +{ + constexpr auto NTHREAD = BLOCK / SEQ; + + __shared__ struct { + union { + uint8_t uninitialized[BLOCK * sizeof(Data) + BLOCK * sizeof(ErrCtrl)]; + Data data[BLOCK]; + } space; + } shmem; + + auto id_base = BIX * BLOCK; + + Data thread_scope[SEQ]; + Data from_last_stripe{0}; + + /******************************************************************************** + * load from DRAM using striped layout, perform prequant + ********************************************************************************/ + load1d(data, len3.x, id_base, shmem.space.data, thread_scope, from_last_stripe, ebx2_r); + + // the original SZ/cuSZ design + auto shmem_quant = reinterpret_cast(shmem.space.uninitialized + sizeof(Data) * BLOCK); + pred1d_radius_separate( + thread_scope, shmem.space.data, shmem_quant, radius, from_last_stripe); + pred1d_radius_separate(thread_scope, shmem.space.data, shmem_quant, radius); + write1d(shmem.space.data, outlier, len3.x, id_base, shmem_quant, quant); +} + +template +__global__ void cusz::c_lorenzo_2d1l_16x16data_mapto16x2( + Data* data, + ErrCtrl* quant, + Data* outlier, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r) +{ + constexpr auto BLOCK = 16; + constexpr auto YSEQ = 8; + + Data center[YSEQ + 1] = {0}; // nw n + // w center + + auto gix = BIX * BDX + TIX; // BDX == 16 + auto giy_base = BIY * BLOCK + TIY * YSEQ; // BDY * YSEQ = BLOCK == 16 + + load2d_prequant(data, center, len3.x, len3.y, stride3.y, gix, giy_base, ebx2_r); + pred2d(center); + postquant_write2d(center, quant, outlier, len3.x, len3.y, stride3.y, radius, gix, giy_base); +} + +template +__global__ void cusz::c_lorenzo_3d1l_32x8x8data_mapto32x1x8( + Data* data, + ErrCtrl* quant, + Data* outlier, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r) +{ + constexpr auto BLOCK = 8; + __shared__ Data shmem[8][8][32]; + + auto z = TIZ; + + auto gix = BIX * (BLOCK * 4) + TIX; + auto giy_base = BIY * BLOCK; + auto giz = BIZ * BLOCK + z; + auto base_id = gix + giy_base * stride3.y + giz * stride3.z; + + /******************************************************************************** + * load from DRAM, perform prequant + ********************************************************************************/ + if (gix < len3.x and giz < len3.z) { + for (auto y = 0; y < BLOCK; y++) { + if (giy_base + y < len3.y) { + shmem[z][y][TIX] = round(data[base_id + y * stride3.y] * ebx2_r); // prequant (fp presence) + } + } + } + __syncthreads(); // necessary to ensure correctness + + auto x = TIX % 8; + + for (auto y = 0; y < BLOCK; y++) { + Data delta; + + /******************************************************************************** + * prediction + ********************************************************************************/ + delta = shmem[z][y][TIX] - ((z > 0 and y > 0 and x > 0 ? shmem[z - 1][y - 1][TIX - 1] : 0) // dist=3 + - (y > 0 and x > 0 ? shmem[z][y - 1][TIX - 1] : 0) // dist=2 + - (z > 0 and x > 0 ? shmem[z - 1][y][TIX - 1] : 0) // + - (z > 0 and y > 0 ? shmem[z - 1][y - 1][TIX] : 0) // + + (x > 0 ? shmem[z][y][TIX - 1] : 0) // dist=1 + + (y > 0 ? shmem[z][y - 1][TIX] : 0) // + + (z > 0 ? shmem[z - 1][y][TIX] : 0)); // + + auto id = base_id + (y * stride3.y); + + bool quantizable = fabs(delta) < radius; + Data candidate = delta + radius; + if (gix < len3.x and (giy_base + y) < len3.y and giz < len3.z) { + outlier[id] = (1 - quantizable) * candidate; // output; reuse data for outlier + quant[id] = quantizable * static_cast(candidate); + } + } + /* EOF */ +} + +template +__global__ void cusz::x_lorenzo_1d1l( // + Data* outlier, + ErrCtrl* quant, + Data* xdata, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2) +{ + constexpr auto block_dim = BLOCK / SEQ; // dividable + + // coalesce-load (warp-striped) and transpose in shmem (similar for store) + typedef cub::BlockLoad BlockLoadT_outlier; + typedef cub::BlockLoad BlockLoadT_quant; + typedef cub::BlockStore BlockStoreT_xdata; + typedef cub::BlockScan + BlockScanT_xdata; // TODO autoselect algorithm + + __shared__ union TempStorage { // overlap shared memory space + typename BlockLoadT_outlier::TempStorage load_outlier; + typename BlockLoadT_quant::TempStorage load_quant; + typename BlockStoreT_xdata::TempStorage store_xdata; + typename BlockScanT_xdata::TempStorage scan_xdata; + } temp_storage; + + // thread-scope tiled data + union ThreadData { + Data xdata[SEQ]; + Data outlier[SEQ]; + } thread_scope; + ErrCtrl thread_scope_quant[SEQ]; + + /******************************************************************************** + * load to thread-private array (fuse at the same time) + * (BIX * BDX * SEQ) denotes the start of the data chunk that belongs to this thread block + ********************************************************************************/ + BlockLoadT_quant(temp_storage.load_quant).Load(quant + (BIX * BDX) * SEQ, thread_scope_quant); + __syncthreads(); // barrier for shmem reuse + BlockLoadT_outlier(temp_storage.load_outlier).Load(outlier + (BIX * BDX) * SEQ, thread_scope.outlier); + __syncthreads(); // barrier for shmem reuse + +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto id = (BIX * BDX + TIX) * SEQ + i; + thread_scope.xdata[i] = + id < len3.x ? thread_scope.outlier[i] + static_cast(thread_scope_quant[i]) - radius : 0; + } + __syncthreads(); + + /******************************************************************************** + * perform partial-sum using cub::InclusiveSum + ********************************************************************************/ + BlockScanT_xdata(temp_storage.scan_xdata).InclusiveSum(thread_scope.xdata, thread_scope.xdata); + __syncthreads(); // barrier for shmem reuse + + /******************************************************************************** + * scale by ebx2 and write to DRAM + ********************************************************************************/ +#pragma unroll + for (auto i = 0; i < SEQ; i++) thread_scope.xdata[i] *= ebx2; + __syncthreads(); // barrier for shmem reuse + + BlockStoreT_xdata(temp_storage.store_xdata).Store(xdata + (BIX * BDX) * SEQ, thread_scope.xdata); +} + +template +__global__ void cusz::x_lorenzo_2d1l_16x16data_mapto16x2( + Data* outlier, + ErrCtrl* quant, + Data* xdata, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2) +{ + constexpr auto BLOCK = 16; + constexpr auto YSEQ = BLOCK / 2; // sequentiality in y direction + static_assert(BLOCK == 16, "In one case, we need BLOCK for 2D == 16"); + + __shared__ Data intermediate[BLOCK]; // TODO use warp shuffle to eliminate this + Data thread_scope[YSEQ]; + /* + . ------> gix (x) + | t00 t01 t02 t03 ... t0f + | ts00_0 ts00_0 ts00_0 ts00_0 + giy ts00_1 ts00_1 ts00_1 ts00_1 + (y) | | | | + ts00_7 ts00_7 ts00_7 ts00_7 + + | t10 t11 t12 t13 ... t1f + | ts00_0 ts00_0 ts00_0 ts00_0 + giy ts00_1 ts00_1 ts00_1 ts00_1 + (y) | | | | + ts00_7 ts00_7 ts00_7 ts00_7 + */ + + auto gix = BIX * BLOCK + TIX; + auto giy_base = BIY * BLOCK + TIY * YSEQ; // BDY * YSEQ = BLOCK == 16 + auto get_gid = [&](auto i) { return (giy_base + i) * stride3.y + gix; }; + + /******************************************************************************** + * load to thread-private array (fuse at the same time) + ********************************************************************************/ +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + auto gid = get_gid(i); + // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously + if (gix < len3.x and giy_base + i < len3.y) + thread_scope[i] = outlier[gid] + static_cast(quant[gid]) - radius; // fuse + else + thread_scope[i] = 0; // TODO set as init state? + } + + /******************************************************************************** + * partial-sum along y-axis, sequantially + ********************************************************************************/ + for (auto i = 1; i < YSEQ; i++) thread_scope[i] += thread_scope[i - 1]; + // two-pass: store for cross-threadscope update + if (TIY == 0) intermediate[TIX] = thread_scope[YSEQ - 1]; + __syncthreads(); + // two-pass: load and update + if (TIY == 1) { + auto tmp = intermediate[TIX]; +#pragma unroll + for (auto& i : thread_scope) i += tmp; + } + + /******************************************************************************** + * in-warp partial-sum along x-axis + ********************************************************************************/ +#pragma unroll + for (auto& i : thread_scope) { + for (auto d = 1; d < BLOCK; d *= 2) { + Data n = __shfl_up_sync(0xffffffff, i, d, 16); + if (TIX >= d) i += n; + } + i *= ebx2; + } + + /******************************************************************************** + * write to DRAM + ********************************************************************************/ +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + auto gid = get_gid(i); + if (gix < len3.x and giy_base + i < len3.y) xdata[gid] = thread_scope[i]; + } +} + +template +__global__ void cusz::x_lorenzo_3d1l_32x8x8data_mapto32x1x8( + Data* outlier, + ErrCtrl* quant, + Data* xdata, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2) +{ + constexpr auto BLOCK = 8; + constexpr auto YSEQ = BLOCK; + static_assert(BLOCK == 8, "In one case, we need BLOCK for 3D == 8"); + + __shared__ Data intermediate[BLOCK][4][8]; + Data thread_scope[YSEQ]; + + auto seg_id = TIX / 8; + auto seg_tix = TIX % 8; + + auto gix = BIX * (4 * BLOCK) + TIX, giy_base = BIY * BLOCK, giz = BIZ * BLOCK + TIZ; + auto get_gid = [&](auto y) { return giz * stride3.z + (giy_base + y) * stride3.y + gix; }; + + /******************************************************************************** + * load to thread-private array (fuse at the same time) + ********************************************************************************/ +#pragma unroll + for (auto y = 0; y < YSEQ; y++) { + auto gid = get_gid(y); + if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) + thread_scope[y] = outlier[gid] + static_cast(quant[gid]) - static_cast(radius); // fuse + else + thread_scope[y] = 0; + } + + /******************************************************************************** + * partial-sum along y-axis, sequantially + ********************************************************************************/ + for (auto y = 1; y < YSEQ; y++) thread_scope[y] += thread_scope[y - 1]; + + /******************************************************************************** + * ND partial-sums along x- and z-axis + * in-warp shuffle used: in order to perform, it's transposed after X-partial sum + ********************************************************************************/ + auto dist = 1; + Data addend; + +#pragma unroll + for (auto i = 0; i < BLOCK; i++) { + Data val = thread_scope[i]; + + for (dist = 1; dist < BLOCK; dist *= 2) { + addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + // x-z transpose + intermediate[TIZ][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][TIZ]; + __syncthreads(); + + for (dist = 1; dist < BLOCK; dist *= 2) { + addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + intermediate[TIZ][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][TIZ]; + __syncthreads(); + + thread_scope[i] = val; + } + + /******************************************************************************** + * write to DRAM + ********************************************************************************/ +#pragma unroll + for (auto y = 0; y < YSEQ; y++) { + if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) { xdata[get_gid(y)] = thread_scope[y] * ebx2; } + } + /* EOF */ +} + +/******************************************************************************** + * experimental prototype toward further optmization + ********************************************************************************/ +template +__global__ void cusz::x_lorenzo_3d1lvar_32x8x8data_mapto32x1x8( + Data* outlier, + ErrCtrl* quant, + Data* xdata, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2) +{ + constexpr auto BLOCK = 8; + constexpr auto YSEQ = BLOCK; + static_assert(BLOCK == 8, "In one case, we need BLOCK for 3D == 8"); + + __shared__ Data intermediate[BLOCK][4][8]; + Data thread_scope = 0; + + auto seg_id = TIX / 8; + auto seg_tix = TIX % 8; + + auto gix = BIX * (4 * BLOCK) + TIX, giy_base = BIY * BLOCK, giz = BIZ * BLOCK + TIZ; + auto get_gid = [&](auto y) { return giz * stride3.z + (giy_base + y) * stride3.y + gix; }; + + auto y = 0; + + // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously +#pragma unroll + for (y = 0; y < YSEQ; y++) { + auto gid = get_gid(y); + if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) + thread_scope += outlier[gid] + static_cast(quant[gid]) - static_cast(radius); // fuse + + Data val = thread_scope; + + // shuffle, ND partial-sums + for (auto dist = 1; dist < BLOCK; dist *= 2) { + Data addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + // x-z transpose + intermediate[TIZ][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][TIZ]; + __syncthreads(); + + for (auto dist = 1; dist < BLOCK; dist *= 2) { + Data addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + intermediate[TIZ][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][TIZ]; + __syncthreads(); + + // thread_scope += val; + + if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) { xdata[get_gid(y)] = val * ebx2; } + } +} + +#undef TIX +#undef TIY +#undef TIZ +#undef BIX +#undef BIY +#undef BIZ +#undef BDX +#undef BDY +#undef BDZ + +#endif diff --git a/qtensor/compression/cusz/src/kernel/detail/lorenzo23.inl b/qtensor/compression/cusz/src/kernel/detail/lorenzo23.inl index 764f44ec..83a52b4b 100644 --- a/qtensor/compression/cusz/src/kernel/detail/lorenzo23.inl +++ b/qtensor/compression/cusz/src/kernel/detail/lorenzo23.inl @@ -1,1237 +1,1237 @@ -/** - * @file lorenzo23.inl - * @author Jiannan Tian - * @brief - * @version 0.4 - * @date 2022-12-22 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "subroutine.inl" - -namespace subr = psz::cuda::__device; - -namespace psz { -namespace cuda { -namespace __kernel { - -//////////////////////////////////////////////////////////////////////////////// -// 1D - -namespace v0 { - -template -__global__ void c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier); - -template -__global__ void x_lorenzo_1d1l(EQ* quant, T* outlier, dim3 len3, dim3 stride3, int radius, FP ebx2, T* xdata); - -namespace compaction { - -template > -__global__ void c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, Compaction outlier); - -} - -namespace delta_only { - -template -__global__ void c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* delta); - -template -__global__ void x_lorenzo_1d1l(EQ* delta, dim3 len3, dim3 stride3, FP ebx2, T* xdata); - -} // namespace delta_only - -} // namespace v0 - -namespace v1_pn { - -namespace compaction { - -template > -__global__ void c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, Compaction outlier); - -} // namespace compaction - -template -__global__ void x_lorenzo_1d1l(EQ* quant, T* outlier, dim3 len3, dim3 stride3, FP ebx2, T* xdata); - -namespace delta_only { - -template -__global__ void c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* delta); - -template -__global__ void x_lorenzo_1d1l(EQ* delta, dim3 len3, dim3 stride3, FP ebx2, T* xdata); - -} // namespace delta_only - -} // namespace v1_pn - -//////////////////////////////////////////////////////////////////////////////// -// 2D - -namespace v0 { - -template -__global__ void c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier); - -template -__global__ void x_lorenzo_2d1l(EQ* quant, T* outlier, dim3 len3, dim3 stride3, int radius, FP ebx2, T* xdata); - -namespace delta_only { - -template -__global__ void c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* delta); - -template -__global__ void x_lorenzo_2d1l(EQ* delta, dim3 len3, dim3 stride3, FP ebx2, T* xdata); - -} // namespace delta_only - -namespace compaction { - -template > -__global__ void c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, Compaction outlier); - -} // namespace compaction - -} // namespace v0 - -namespace v1_pn { - -namespace compaction { - -template > -__global__ void c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, Compaction outlier); - -} // namespace compaction - -template -__global__ void x_lorenzo_2d1l(EQ* quant, T* outlier, dim3 len3, dim3 stride3, FP ebx2, T* xdata); - -namespace delta_only { - -template -__global__ void c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* delta); - -template -__global__ void x_lorenzo_2d1l(EQ* delta, dim3 len3, dim3 stride3, FP ebx2, T* xdata); - -} // namespace delta_only - -} // namespace v1_pn - -//////////////////////////////////////////////////////////////////////////////// -// 3D - -namespace v0 { - -// TODO -> `legacy` -namespace legacy { -template -__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier); - -} - -template -__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier); - -template -__global__ void x_lorenzo_3d1l(EQ* quant, T* outlier, dim3 len3, dim3 stride3, int radius, FP ebx2, T* xdata); - -namespace delta_only { - -template -__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* quant); - -template -__global__ void x_lorenzo_3d1l(EQ* quant, dim3 len3, dim3 stride3, FP ebx2, T* xdata); - -} // namespace delta_only - -namespace compaction { - -template > -__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, Compaction outlier); - -} - -} // namespace v0 - -namespace v1_pn { - -namespace compaction { - -template > -__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, Compaction outlier); - -} - -template -__global__ void x_lorenzo_3d1l(EQ* quant, T* outlier, dim3 len3, dim3 stride3, FP ebx2, T* xdata); - -namespace delta_only { - -template -__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* quant); - -template -__global__ void x_lorenzo_3d1l(EQ* quant, dim3 len3, dim3 stride3, FP ebx2, T* xdata); - -} // namespace delta_only - -} // namespace v1_pn - -} // namespace __kernel -} // namespace cuda -} // namespace psz - -//////////////////////////////////////////////////////////////////////////////// -// 1D definition - -template -__global__ void -psz::cuda::__kernel::v0::c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier) -{ - namespace subr_v0 = psz::cuda::__device::v0; - - constexpr auto NTHREAD = BLOCK / SEQ; - - __shared__ struct { - union { - T data[BLOCK]; - T outlier[BLOCK]; - }; - EQ quant[BLOCK]; - } s; - - T prev{0}; - T thp_data[SEQ]; - - auto id_base = blockIdx.x * BLOCK; - - subr_v0::load_prequant_1d(data, len3.x, id_base, s.data, thp_data, prev, ebx2_r); - subr_v0::predict_quantize_1d(thp_data, s.quant, s.outlier, radius, prev); - subr_v0::predict_quantize_1d(thp_data, s.quant, s.outlier, radius); - subr_v0::write_1d(s.quant, s.outlier, len3.x, id_base, quant, outlier); -} - -template -__global__ void -psz::cuda::__kernel::v0::delta_only::c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* quant) -{ - namespace subr_v0 = psz::cuda::__device::v0; - - constexpr auto NTHREAD = BLOCK / SEQ; - - __shared__ struct { - union { - T data[BLOCK]; - T outlier[BLOCK]; - }; - EQ quant[BLOCK]; - } s; - - T prev{0}; - T thp_data[SEQ]; - - auto id_base = blockIdx.x * BLOCK; - - subr_v0::load_prequant_1d(data, len3.x, id_base, s.data, thp_data, prev, ebx2_r); - subr_v0::predict_quantize__no_outlier_1d(thp_data, s.quant, prev); - subr_v0::predict_quantize__no_outlier_1d(thp_data, s.quant); - subr_v0::write_1d(s.quant, nullptr, len3.x, id_base, quant, nullptr); -} - -template -__global__ void psz::cuda::__kernel::v0::compaction::c_lorenzo_1d1l( - T* data, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2_r, - EQ* quant, - Compaction outlier_desc) -{ - namespace subr_v0 = psz::cuda::__device::v0; - namespace subr_v0c = psz::cuda::__device::v0::compaction; - - constexpr auto NTHREAD = BLOCK / SEQ; - - __shared__ struct { - union { - T data[BLOCK]; - T outlier[BLOCK]; - }; - EQ quant[BLOCK]; - } s; - - T prev{0}; - T thp_data[SEQ]; - - auto id_base = blockIdx.x * BLOCK; - - subr_v0::load_prequant_1d(data, len3.x, id_base, s.data, thp_data, prev, ebx2_r); - subr_v0c::predict_quantize_1d(thp_data, s.quant, len3.x, radius, id_base, outlier_desc, prev); - subr_v0c::predict_quantize_1d(thp_data, s.quant, len3.x, radius, id_base, outlier_desc); - subr_v0::write_1d(s.quant, nullptr, len3.x, id_base, quant, nullptr); -} - -template -__global__ void psz::cuda::__kernel::v1_pn::compaction::c_lorenzo_1d1l( // - T* data, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2_r, - EQ* quant, - Compaction outlier) -{ - namespace subr_v0 = psz::cuda::__device::v0; - namespace subr_v1c = psz::cuda::__device::v1_pn::compaction; - - constexpr auto NTHREAD = BLOCK / SEQ; - - __shared__ struct { - union { - T data[BLOCK]; - T outlier[BLOCK]; - }; - EQ quant[BLOCK]; - } s; - - T prev{0}; - T thp_data[SEQ]; - - auto id_base = blockIdx.x * BLOCK; - - subr_v0::load_prequant_1d(data, len3.x, id_base, s.data, thp_data, prev, ebx2_r); - subr_v1c::predict_quantize_1d(thp_data, s.quant, s.outlier, radius, prev); - subr_v1c::predict_quantize_1d(thp_data, s.quant, s.outlier, radius); - subr_v0::write_1d(s.quant, s.outlier, len3.x, id_base, quant, outlier); -} - -template -__global__ void psz::cuda::__kernel::v0::x_lorenzo_1d1l( // - EQ* quant, - T* outlier, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2, - T* xdata) -{ - namespace subr_v0 = psz::cuda::__device::v0; - namespace wave32 = psz::cuda::__device::wave32; - - constexpr auto NTHREAD = BLOCK / SEQ; // equiv. to blockDim.x - - __shared__ struct { - union { - T outlier[BLOCK]; - T xdata[BLOCK]; - }; - // even if it's wave64, "/32" works - T exchange_in[NTHREAD / 32]; - T exchange_out[NTHREAD / 32]; - } s; - - T thp_data[SEQ]; - - auto id_base = blockIdx.x * BLOCK; - - subr_v0::load_fuse_1d(quant, outlier, len3.x, id_base, radius, s.xdata, thp_data); - subr_v0::block_scan_1d(thp_data, ebx2, s.exchange_in, s.exchange_out, s.xdata); - subr_v0::write_1d(s.xdata, nullptr, len3.x, id_base, xdata, nullptr); -} - -template -__global__ void psz::cuda::__kernel::v0::delta_only::x_lorenzo_1d1l( // - EQ* quant, - dim3 len3, - dim3 stride3, - FP ebx2, - T* xdata) -{ - namespace subr_v0 = psz::cuda::__device::v0; - - constexpr auto NTHREAD = BLOCK / SEQ; // equiv. to blockDim.x - - __shared__ struct { - T xdata[BLOCK]; - // even if it's wave64, "/32" works - T exchange_in[NTHREAD / 32]; - T exchange_out[NTHREAD / 32]; - } s; - - T thp_data[SEQ]; - - auto id_base = blockIdx.x * BLOCK; - - subr_v0::delta_only::load_1d(quant, len3.x, id_base, s.xdata, thp_data); - subr_v0::block_scan_1d(thp_data, ebx2, s.exchange_in, s.exchange_out, s.xdata); - subr_v0::write_1d(s.xdata, nullptr, len3.x, id_base, xdata, nullptr); -} - -//////////////////////////////////////////////////////////////////////////////// -// 2D definition - -template -__global__ void -psz::cuda::__kernel::v0::c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier) -{ - namespace subr_v0 = psz::cuda::__device::v0; - - constexpr auto BLOCK = 16; - constexpr auto YSEQ = 8; - - T center[YSEQ + 1] = {0}; // NW N first element <- 0 - // W center - - auto gix = blockIdx.x * BLOCK + threadIdx.x; // BDX == BLOCK == 16 - auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 - - subr_v0::load_prequant_2d(data, len3.x, gix, len3.y, giy_base, stride3.y, ebx2_r, center); - subr_v0::predict_2d(center); - subr_v0::quantize_write_2d(center, len3.x, gix, len3.y, giy_base, stride3.y, radius, quant, outlier); -} - -template -__global__ void -psz::cuda::__kernel::v0::delta_only::c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* quant) -{ - namespace subr_v0 = psz::cuda::__device::v0; - - constexpr auto BLOCK = 16; - constexpr auto YSEQ = 8; - - T center[YSEQ + 1] = {0}; // NW N first element <- 0 - // W center - - auto gix = blockIdx.x * BLOCK + threadIdx.x; // BDX == BLOCK == 16 - auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 - - subr_v0::load_prequant_2d(data, len3.x, gix, len3.y, giy_base, stride3.y, ebx2_r, center); - subr_v0::predict_2d(center); - subr_v0::delta_only::quantize_write_2d(center, len3.x, gix, len3.y, giy_base, stride3.y, quant); -} - -template -__global__ void -psz::cuda::__kernel::v1_pn::delta_only::c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* quant) -{ - namespace subr_v0 = psz::cuda::__device::v0; - namespace subr_v1d = psz::cuda::__device::v1_pn::delta_only; - - constexpr auto BLOCK = 16; - constexpr auto YSEQ = 8; - - T center[YSEQ + 1] = {0}; // NW N first element <- 0 - // W center - - auto gix = blockIdx.x * BLOCK + threadIdx.x; // BDX == BLOCK == 16 - auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 - - subr_v0::load_prequant_2d(data, len3.x, gix, len3.y, giy_base, stride3.y, ebx2_r, center); - subr_v0::predict_2d(center); - subr_v1d::quantize_write_2d(center, len3.x, gix, len3.y, giy_base, stride3.y, quant); -} - -template -__global__ void psz::cuda::__kernel::v0::compaction::c_lorenzo_2d1l( - T* data, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2_r, - EQ* quant, - Compaction outlier) -{ - namespace subr_v0 = psz::cuda::__device::v0; - - constexpr auto BLOCK = 16; - constexpr auto YSEQ = 8; - - T center[YSEQ + 1] = {0}; // NW N first element <- 0 - // W center - - auto gix = blockIdx.x * BLOCK + threadIdx.x; // BDX == BLOCK == 16 - auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 - - subr_v0::load_prequant_2d(data, len3.x, gix, len3.y, giy_base, stride3.y, ebx2_r, center); - subr_v0::predict_2d(center); - subr_v0::compaction::quantize_write_2d( - center, len3.x, gix, len3.y, giy_base, stride3.y, radius, quant, outlier); -} - -// 16x16 data block maps to 16x2 (one warp) thread block -template -__global__ void psz::cuda::__kernel::v0::x_lorenzo_2d1l( // - EQ* quant, - T* outlier, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2, - T* xdata) -{ - namespace subr_v0 = psz::cuda::__device::v0; - - constexpr auto BLOCK = 16; - constexpr auto YSEQ = BLOCK / 2; // sequentiality in y direction - static_assert(BLOCK == 16, "In one case, we need BLOCK for 2D == 16"); - - __shared__ T intermediate[BLOCK]; // TODO use warp shuffle to eliminate this - T thread_private[YSEQ]; - - auto gix = blockIdx.x * BLOCK + threadIdx.x; - auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 - - auto get_gid = [&](auto i) { return (giy_base + i) * stride3.y + gix; }; - - subr_v0::load_fuse_2d( - quant, outlier, len3.x, gix, len3.y, giy_base, stride3.y, radius, thread_private); - subr_v0::block_scan_2d(thread_private, intermediate, ebx2); - subr_v0::decomp_write_2d(thread_private, len3.x, gix, len3.y, giy_base, stride3.y, xdata); -} - -// 16x16 data block maps to 16x2 (one warp) thread block -template -__global__ void psz::cuda::__kernel::v1_pn::x_lorenzo_2d1l( // - EQ* quant, - T* outlier, - dim3 len3, - dim3 stride3, - FP ebx2, - T* xdata) -{ - namespace subr_v0 = psz::cuda::__device::v0; - namespace subr_v1_pn = psz::cuda::__device::v1_pn; - - constexpr auto BLOCK = 16; - constexpr auto YSEQ = BLOCK / 2; // sequentiality in y direction - static_assert(BLOCK == 16, "In one case, we need BLOCK for 2D == 16"); - - __shared__ T intermediate[BLOCK]; // TODO use warp shuffle to eliminate this - T thread_private[YSEQ]; - - auto gix = blockIdx.x * BLOCK + threadIdx.x; - auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 - - auto get_gid = [&](auto i) { return (giy_base + i) * stride3.y + gix; }; - - subr_v1_pn::load_fuse_2d(quant, outlier, len3.x, gix, len3.y, giy_base, stride3.y, thread_private); - subr_v0::block_scan_2d(thread_private, intermediate, ebx2); - subr_v0::decomp_write_2d(thread_private, len3.x, gix, len3.y, giy_base, stride3.y, xdata); -} - -// 16x16 data block maps to 16x2 (one warp) thread block -template -__global__ void psz::cuda::__kernel::v0::delta_only::x_lorenzo_2d1l( // - EQ* quant, - dim3 len3, - dim3 stride3, - FP ebx2, - T* xdata) -{ - namespace subr_v0 = psz::cuda::__device::v0; - - constexpr auto BLOCK = 16; - constexpr auto YSEQ = BLOCK / 2; // sequentiality in y direction - static_assert(BLOCK == 16, "In one case, we need BLOCK for 2D == 16"); - - __shared__ T intermediate[BLOCK]; // TODO use warp shuffle to eliminate this - T thread_private[YSEQ]; - - auto gix = blockIdx.x * BLOCK + threadIdx.x; - auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 - - auto get_gid = [&](auto i) { return (giy_base + i) * stride3.y + gix; }; - - subr_v0::delta_only::load_2d(quant, len3.x, gix, len3.y, giy_base, stride3.y, thread_private); - subr_v0::block_scan_2d(thread_private, intermediate, ebx2); - subr_v0::decomp_write_2d(thread_private, len3.x, gix, len3.y, giy_base, stride3.y, xdata); -} - -// 16x16 data block maps to 16x2 (one warp) thread block -template -__global__ void psz::cuda::__kernel::v1_pn::delta_only::x_lorenzo_2d1l( // - EQ* quant, - dim3 len3, - dim3 stride3, - FP ebx2, - T* xdata) -{ - namespace subr_v0 = psz::cuda::__device::v0; - namespace subr_v1_pn = psz::cuda::__device::v1_pn; - - constexpr auto BLOCK = 16; - constexpr auto YSEQ = BLOCK / 2; // sequentiality in y direction - static_assert(BLOCK == 16, "In one case, we need BLOCK for 2D == 16"); - - __shared__ T intermediate[BLOCK]; // TODO use warp shuffle to eliminate this - T thread_private[YSEQ]; - - auto gix = blockIdx.x * BLOCK + threadIdx.x; - auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 - - auto get_gid = [&](auto i) { return (giy_base + i) * stride3.y + gix; }; - - subr_v1_pn::delta_only::load_2d(quant, len3.x, gix, len3.y, giy_base, stride3.y, thread_private); - subr_v0::block_scan_2d(thread_private, intermediate, ebx2); - subr_v0::decomp_write_2d(thread_private, len3.x, gix, len3.y, giy_base, stride3.y, xdata); -} - -template -__global__ void psz::cuda::__kernel::v0::legacy::c_lorenzo_3d1l( - T* data, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2_r, - EQ* quant, - T* outlier) -{ - constexpr auto BLOCK = 8; - __shared__ T s[8][8][32]; - - auto z = threadIdx.z; - - auto gix = blockIdx.x * (BLOCK * 4) + threadIdx.x; - auto giy_base = blockIdx.y * BLOCK; - auto giz = blockIdx.z * BLOCK + z; - auto base_id = gix + giy_base * stride3.y + giz * stride3.z; - - auto giy = [&](auto y) { return giy_base + y; }; - auto gid = [&](auto y) { return base_id + y * stride3.y; }; - - auto load_prequant_3d = [&]() { - if (gix < len3.x and giz < len3.z) { - for (auto y = 0; y < BLOCK; y++) - if (giy(y) < len3.y) s[z][y][threadIdx.x] = round(data[gid(y)] * ebx2_r); // prequant (fp presence) - } - __syncthreads(); - }; - - auto quantize_write = [&](T delta, auto x, auto y, auto z, auto gid) { - bool quantizable = fabs(delta) < radius; - T candidate = delta + radius; - if (x < len3.x and y < len3.y and z < len3.z) { - quant[gid] = quantizable * static_cast(candidate); - outlier[gid] = (not quantizable) * candidate; - } - }; - - auto x = threadIdx.x % 8; - - auto predict_3d = [&](auto y) { - T delta = s[z][y][threadIdx.x] - // - ((z > 0 and y > 0 and x > 0 ? s[z - 1][y - 1][threadIdx.x - 1] : 0) // dist=3 - - (y > 0 and x > 0 ? s[z][y - 1][threadIdx.x - 1] : 0) // dist=2 - - (z > 0 and x > 0 ? s[z - 1][y][threadIdx.x - 1] : 0) // - - (z > 0 and y > 0 ? s[z - 1][y - 1][threadIdx.x] : 0) // - + (x > 0 ? s[z][y][threadIdx.x - 1] : 0) // dist=1 - + (y > 0 ? s[z][y - 1][threadIdx.x] : 0) // - + (z > 0 ? s[z - 1][y][threadIdx.x] : 0)); // - return delta; - }; - - //////////////////////////////////////////////////////////////////////////// - - load_prequant_3d(); - for (auto y = 0; y < BLOCK; y++) { - auto delta = predict_3d(y); - quantize_write(delta, gix, giy(y), giz, gid(y)); - } -} - -template -__global__ void -psz::cuda::__kernel::v0::c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier) -{ - constexpr auto BLOCK = 8; - __shared__ T s[9][33]; - T delta[BLOCK + 1] = {0}; // first el = 0 - - const auto gix = blockIdx.x * (BLOCK * 4) + threadIdx.x; - const auto giy = blockIdx.y * BLOCK + threadIdx.y; - const auto giz_base = blockIdx.z * BLOCK; - const auto base_id = gix + giy * stride3.y + giz_base * stride3.z; - - auto giz = [&](auto z) { return giz_base + z; }; - auto gid = [&](auto z) { return base_id + z * stride3.z; }; - - auto load_prequant_3d = [&]() { - if (gix < len3.x and giy < len3.y) { - for (auto z = 0; z < BLOCK; z++) - if (giz(z) < len3.z) delta[z + 1] = round(data[gid(z)] * ebx2_r); // prequant (fp presence) - } - __syncthreads(); - }; - - auto quantize_write = [&](T delta, auto x, auto y, auto z, auto gid) { - bool quantizable = fabs(delta) < radius; - T candidate = delta + radius; - if (x < len3.x and y < len3.y and z < len3.z) { - quant[gid] = quantizable * static_cast(candidate); - outlier[gid] = (not quantizable) * candidate; - } - }; - - //////////////////////////////////////////////////////////////////////////// - - /* z-direction, sequential in private buffer - delta = + (s[z][y][x] - s[z-1][y][x]) - - (s[z][y][x-1] - s[z-1][y][x-1]) - + (s[z][y-1][x-1] - s[z-1][y-1][x-1]) - - (s[z][y-1][x] - s[z-1][y-1][x]) - - x-direction, shuffle - delta = + (s[z][y][x] - s[z][y][x-1]) - - (s[z][y-1][x] - s[z][y-1][x-1]) - - y-direction, shmem - delta = s[z][y][x] - s[z][y-1][x] - */ - - load_prequant_3d(); - - for (auto z = BLOCK; z > 0; z--) { - // z-direction - delta[z] -= delta[z - 1]; - - // x-direction - auto prev_x = __shfl_up_sync(0xffffffff, delta[z], 1, 8); - if (threadIdx.x % BLOCK > 0) delta[z] -= prev_x; - - // y-direction, exchange via shmem - // ghost padding along y - s[threadIdx.y + 1][threadIdx.x] = delta[z]; - __syncthreads(); - - delta[z] -= (threadIdx.y > 0) * s[threadIdx.y][threadIdx.x]; - - // now delta[z] is delta - quantize_write(delta[z], gix, giy, giz(z - 1), gid(z - 1)); - } -} - -template -__global__ void psz::cuda::__kernel::v0::delta_only::c_lorenzo_3d1l( // - T* data, - dim3 len3, - dim3 stride3, - FP ebx2_r, - EQ* quant) -{ - constexpr auto BLOCK = 8; - __shared__ T s[9][33]; - T delta[BLOCK + 1] = {0}; // first el = 0 - - const auto gix = blockIdx.x * (BLOCK * 4) + threadIdx.x; - const auto giy = blockIdx.y * BLOCK + threadIdx.y; - const auto giz_base = blockIdx.z * BLOCK; - const auto base_id = gix + giy * stride3.y + giz_base * stride3.z; - - auto giz = [&](auto z) { return giz_base + z; }; - auto gid = [&](auto z) { return base_id + z * stride3.z; }; - - auto load_prequant_3d = [&]() { - if (gix < len3.x and giy < len3.y) { - for (auto z = 0; z < BLOCK; z++) - if (giz(z) < len3.z) delta[z + 1] = round(data[gid(z)] * ebx2_r); // prequant (fp presence) - } - __syncthreads(); - }; - - auto quantize_write = [&](T delta, auto x, auto y, auto z, auto gid) { - if (x < len3.x and y < len3.y and z < len3.z) quant[gid] = static_cast(delta); - }; - - //////////////////////////////////////////////////////////////////////////// - - load_prequant_3d(); - - for (auto z = BLOCK; z > 0; z--) { - // z-direction - delta[z] -= delta[z - 1]; - - // x-direction - auto prev_x = __shfl_up_sync(0xffffffff, delta[z], 1, 8); - if (threadIdx.x % BLOCK > 0) delta[z] -= prev_x; - - // y-direction, exchange via shmem - // ghost padding along y - s[threadIdx.y + 1][threadIdx.x] = delta[z]; - __syncthreads(); - - delta[z] -= (threadIdx.y > 0) * s[threadIdx.y][threadIdx.x]; - - // now delta[z] is delta - quantize_write(delta[z], gix, giy, giz(z - 1), gid(z - 1)); - } -} - -template -__global__ void psz::cuda::__kernel::v1_pn::delta_only::c_lorenzo_3d1l( // - T* data, - dim3 len3, - dim3 stride3, - FP ebx2_r, - EQ* quant) -{ - constexpr auto BYTEWIDTH = sizeof(EQ); - - using UI = EQ; - using I = typename psz::typing::Int::T; - - constexpr auto BLOCK = 8; - __shared__ T s[9][33]; - T delta[BLOCK + 1] = {0}; // first el = 0 - - const auto gix = blockIdx.x * (BLOCK * 4) + threadIdx.x; - const auto giy = blockIdx.y * BLOCK + threadIdx.y; - const auto giz_base = blockIdx.z * BLOCK; - const auto base_id = gix + giy * stride3.y + giz_base * stride3.z; - - auto giz = [&](auto z) { return giz_base + z; }; - auto gid = [&](auto z) { return base_id + z * stride3.z; }; - - auto load_prequant_3d = [&]() { - if (gix < len3.x and giy < len3.y) { - for (auto z = 0; z < BLOCK; z++) - if (giz(z) < len3.z) delta[z + 1] = round(data[gid(z)] * ebx2_r); // prequant (fp presence) - } - __syncthreads(); - }; - - auto quantize_write = [&](T delta, auto x, auto y, auto z, auto gid) { - if (x < len3.x and y < len3.y and z < len3.z) quant[gid] = PN::encode(static_cast(delta)); - }; - - //////////////////////////////////////////////////////////////////////////// - - load_prequant_3d(); - - for (auto z = BLOCK; z > 0; z--) { - // z-direction - delta[z] -= delta[z - 1]; - - // x-direction - auto prev_x = __shfl_up_sync(0xffffffff, delta[z], 1, 8); - if (threadIdx.x % BLOCK > 0) delta[z] -= prev_x; - - // y-direction, exchange via shmem - // ghost padding along y - s[threadIdx.y + 1][threadIdx.x] = delta[z]; - __syncthreads(); - - delta[z] -= (threadIdx.y > 0) * s[threadIdx.y][threadIdx.x]; - - // now delta[z] is delta - quantize_write(delta[z], gix, giy, giz(z - 1), gid(z - 1)); - } -} - -template -__global__ void psz::cuda::__kernel::v0::compaction::c_lorenzo_3d1l( - T* data, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2_r, - EQ* quant, - Compaction outlier) -{ - constexpr auto BLOCK = 8; - __shared__ T s[9][33]; - T delta[BLOCK + 1] = {0}; // first el = 0 - - const auto gix = blockIdx.x * (BLOCK * 4) + threadIdx.x; - const auto giy = blockIdx.y * BLOCK + threadIdx.y; - const auto giz_base = blockIdx.z * BLOCK; - const auto base_id = gix + giy * stride3.y + giz_base * stride3.z; - - auto giz = [&](auto z) { return giz_base + z; }; - auto gid = [&](auto z) { return base_id + z * stride3.z; }; - - auto load_prequant_3d = [&]() { - if (gix < len3.x and giy < len3.y) { - for (auto z = 0; z < BLOCK; z++) - if (giz(z) < len3.z) delta[z + 1] = round(data[gid(z)] * ebx2_r); // prequant (fp presence) - } - __syncthreads(); - }; - - auto quantize_compact_write = [&](T delta, auto x, auto y, auto z, auto gid) { - bool quantizable = fabs(delta) < radius; - T candidate = delta + radius; - if (x < len3.x and y < len3.y and z < len3.z) { - quant[gid] = quantizable * static_cast(candidate); - if (not quantizable) { - auto cur_idx = atomicAdd(outlier.count, 1); - outlier.idx[cur_idx] = gid; - outlier.val[cur_idx] = candidate; - } - } - }; - - //////////////////////////////////////////////////////////////////////////// - - load_prequant_3d(); - - for (auto z = BLOCK; z > 0; z--) { - // z-direction - delta[z] -= delta[z - 1]; - - // x-direction - auto prev_x = __shfl_up_sync(0xffffffff, delta[z], 1, 8); - if (threadIdx.x % BLOCK > 0) delta[z] -= prev_x; - - // y-direction, exchange via shmem - // ghost padding along y - s[threadIdx.y + 1][threadIdx.x] = delta[z]; - __syncthreads(); - - delta[z] -= (threadIdx.y > 0) * s[threadIdx.y][threadIdx.x]; - - // now delta[z] is delta - quantize_compact_write(delta[z], gix, giy, giz(z - 1), gid(z - 1)); - } -} - -template -__global__ void psz::cuda::__kernel::v1_pn::compaction::c_lorenzo_3d1l( - T* data, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2_r, - EQ* quant, - Compaction outlier) -{ - constexpr auto BYTEWIDTH = sizeof(EQ); - - using UI = EQ; - using I = typename psz::typing::Int::T; - - constexpr auto BLOCK = 8; - __shared__ T s[9][33]; - T delta[BLOCK + 1] = {0}; // first el = 0 - - const auto gix = blockIdx.x * (BLOCK * 4) + threadIdx.x; - const auto giy = blockIdx.y * BLOCK + threadIdx.y; - const auto giz_base = blockIdx.z * BLOCK; - const auto base_id = gix + giy * stride3.y + giz_base * stride3.z; - - auto giz = [&](auto z) { return giz_base + z; }; - auto gid = [&](auto z) { return base_id + z * stride3.z; }; - - // TODO move to subroutine.inl - auto load_prequant_3d = [&]() { - if (gix < len3.x and giy < len3.y) { - for (auto z = 0; z < BLOCK; z++) - if (giz(z) < len3.z) delta[z + 1] = round(data[gid(z)] * ebx2_r); // prequant (fp presence) - } - __syncthreads(); - }; - - auto quantize_compact_write = [&](T delta, auto x, auto y, auto z, auto gid) { - bool quantizable = fabs(delta) < radius; - UI UI_delta = PN::encode(static_cast(delta)); - - T candidate = delta + radius; - if (x < len3.x and y < len3.y and z < len3.z) { - quant[gid] = quantizable * UI_delta; - if (not quantizable) { - auto cur_idx = atomicAdd(outlier.count, 1); - outlier.idx[cur_idx] = gid; - outlier.val[cur_idx] = UI_delta; - } - } - }; - - //////////////////////////////////////////////////////////////////////////// - - load_prequant_3d(); - - for (auto z = BLOCK; z > 0; z--) { - // z-direction - delta[z] -= delta[z - 1]; - - // x-direction - auto prev_x = __shfl_up_sync(0xffffffff, delta[z], 1, 8); - if (threadIdx.x % BLOCK > 0) delta[z] -= prev_x; - - // y-direction, exchange via shmem - // ghost padding along y - s[threadIdx.y + 1][threadIdx.x] = delta[z]; - __syncthreads(); - - delta[z] -= (threadIdx.y > 0) * s[threadIdx.y][threadIdx.x]; - - // now delta[z] is delta - quantize_compact_write(delta[z], gix, giy, giz(z - 1), gid(z - 1)); - } -} - -// 32x8x8 data block maps to 32x1x8 thread block -template -__global__ void psz::cuda::__kernel::v0::x_lorenzo_3d1l( // - EQ* quant, - T* outlier, - dim3 len3, - dim3 stride3, - int radius, - FP ebx2, - T* xdata) -{ - constexpr auto BLOCK = 8; - constexpr auto YSEQ = BLOCK; - static_assert(BLOCK == 8, "In one case, we need BLOCK for 3D == 8"); - - __shared__ T intermediate[BLOCK][4][8]; - T thread_private[YSEQ]; - - auto seg_id = threadIdx.x / 8; - auto seg_tix = threadIdx.x % 8; - - auto gix = blockIdx.x * (4 * BLOCK) + threadIdx.x; - auto giy_base = blockIdx.y * BLOCK; - auto giy = [&](auto y) { return giy_base + y; }; - auto giz = blockIdx.z * BLOCK + threadIdx.z; - auto gid = [&](auto y) { return giz * stride3.z + (giy_base + y) * stride3.y + gix; }; - - auto load_fuse_3d = [&]() { - // load to thread-private array (fuse at the same time) -#pragma unroll - for (auto y = 0; y < YSEQ; y++) { - if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) - thread_private[y] = outlier[gid(y)] + static_cast(quant[gid(y)]) - radius; // fuse - else - thread_private[y] = 0; - } - }; - - auto block_scan_3d = [&]() { - // partial-sum along y-axis, sequentially - for (auto y = 1; y < YSEQ; y++) thread_private[y] += thread_private[y - 1]; - -#pragma unroll - for (auto i = 0; i < BLOCK; i++) { - // ND partial-sums along x- and z-axis - // in-warp shuffle used: in order to perform, it's transposed after X-partial sum - T val = thread_private[i]; - - for (auto dist = 1; dist < BLOCK; dist *= 2) { - auto addend = __shfl_up_sync(0xffffffff, val, dist, 8); - if (seg_tix >= dist) val += addend; - } - - // x-z transpose - intermediate[threadIdx.z][seg_id][seg_tix] = val; - __syncthreads(); - val = intermediate[seg_tix][seg_id][threadIdx.z]; - __syncthreads(); - - for (auto dist = 1; dist < BLOCK; dist *= 2) { - auto addend = __shfl_up_sync(0xffffffff, val, dist, 8); - if (seg_tix >= dist) val += addend; - } - - intermediate[threadIdx.z][seg_id][seg_tix] = val; - __syncthreads(); - val = intermediate[seg_tix][seg_id][threadIdx.z]; - __syncthreads(); - - thread_private[i] = val; - } - }; - - auto decomp_write_3d = [&]() { -#pragma unroll - for (auto y = 0; y < YSEQ; y++) - if (gix < len3.x and giy(y) < len3.y and giz < len3.z) xdata[gid(y)] = thread_private[y] * ebx2; - }; - - //////////////////////////////////////////////////////////////////////////// - load_fuse_3d(); - block_scan_3d(); - decomp_write_3d(); -} - -// 32x8x8 data block maps to 32x1x8 thread block -template -__global__ void psz::cuda::__kernel::v1_pn::x_lorenzo_3d1l( // - EQ* quant, - T* outlier, - dim3 len3, - dim3 stride3, - FP ebx2, - T* xdata) -{ - constexpr auto BYTEWIDTH = sizeof(EQ); - - using UI = EQ; - using I = typename psz::typing::Int::T; - - constexpr auto BLOCK = 8; - constexpr auto YSEQ = BLOCK; - static_assert(BLOCK == 8, "In one case, we need BLOCK for 3D == 8"); - - __shared__ T intermediate[BLOCK][4][8]; - T thread_private[YSEQ]; - - auto seg_id = threadIdx.x / 8; - auto seg_tix = threadIdx.x % 8; - - auto gix = blockIdx.x * (4 * BLOCK) + threadIdx.x; - auto giy_base = blockIdx.y * BLOCK; - auto giy = [&](auto y) { return giy_base + y; }; - auto giz = blockIdx.z * BLOCK + threadIdx.z; - auto gid = [&](auto y) { return giz * stride3.z + (giy_base + y) * stride3.y + gix; }; - - auto load_fuse_3d = [&]() { - // load to thread-private array (fuse at the same time) -#pragma unroll - for (auto y = 0; y < YSEQ; y++) { - if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) - thread_private[y] = outlier[gid(y)] + PN::decode(quant[gid(y)]); // fuse - else - thread_private[y] = 0; - } - }; - - auto block_scan_3d = [&]() { - // partial-sum along y-axis, sequentially - for (auto y = 1; y < YSEQ; y++) thread_private[y] += thread_private[y - 1]; - -#pragma unroll - for (auto i = 0; i < BLOCK; i++) { - // ND partial-sums along x- and z-axis - // in-warp shuffle used: in order to perform, it's transposed after X-partial sum - T val = thread_private[i]; - - for (auto dist = 1; dist < BLOCK; dist *= 2) { - auto addend = __shfl_up_sync(0xffffffff, val, dist, 8); - if (seg_tix >= dist) val += addend; - } - - // x-z transpose - intermediate[threadIdx.z][seg_id][seg_tix] = val; - __syncthreads(); - val = intermediate[seg_tix][seg_id][threadIdx.z]; - __syncthreads(); - - for (auto dist = 1; dist < BLOCK; dist *= 2) { - auto addend = __shfl_up_sync(0xffffffff, val, dist, 8); - if (seg_tix >= dist) val += addend; - } - - intermediate[threadIdx.z][seg_id][seg_tix] = val; - __syncthreads(); - val = intermediate[seg_tix][seg_id][threadIdx.z]; - __syncthreads(); - - thread_private[i] = val; - } - }; - - auto decomp_write_3d = [&]() { -#pragma unroll - for (auto y = 0; y < YSEQ; y++) - if (gix < len3.x and giy(y) < len3.y and giz < len3.z) xdata[gid(y)] = thread_private[y] * ebx2; - }; - - //////////////////////////////////////////////////////////////////////////// - load_fuse_3d(); - block_scan_3d(); - decomp_write_3d(); -} - -// 32x8x8 data block maps to 32x1x8 thread block -template -__global__ void psz::cuda::__kernel::v0::delta_only::x_lorenzo_3d1l( // - EQ* quant, - dim3 len3, - dim3 stride3, - FP ebx2, - T* xdata) -{ - constexpr auto BLOCK = 8; - constexpr auto YSEQ = BLOCK; - static_assert(BLOCK == 8, "In one case, we need BLOCK for 3D == 8"); - - __shared__ T intermediate[BLOCK][4][8]; - T thread_private[YSEQ]; - - auto seg_id = threadIdx.x / 8; - auto seg_tix = threadIdx.x % 8; - - auto gix = blockIdx.x * (4 * BLOCK) + threadIdx.x; - auto giy_base = blockIdx.y * BLOCK; - auto giy = [&](auto y) { return giy_base + y; }; - auto giz = blockIdx.z * BLOCK + threadIdx.z; - auto gid = [&](auto y) { return giz * stride3.z + (giy_base + y) * stride3.y + gix; }; - - auto load_3d = [&]() { - // load to thread-private array (fuse at the same time) -#pragma unroll - for (auto y = 0; y < YSEQ; y++) { - if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) - thread_private[y] = static_cast(quant[gid(y)]); // fuse - else - thread_private[y] = 0; - } - }; - - auto block_scan_3d = [&]() { - // partial-sum along y-axis, sequentially - for (auto y = 1; y < YSEQ; y++) thread_private[y] += thread_private[y - 1]; - -#pragma unroll - for (auto i = 0; i < BLOCK; i++) { - // ND partial-sums along x- and z-axis - // in-warp shuffle used: in order to perform, it's transposed after X-partial sum - T val = thread_private[i]; - - for (auto dist = 1; dist < BLOCK; dist *= 2) { - auto addend = __shfl_up_sync(0xffffffff, val, dist, 8); - if (seg_tix >= dist) val += addend; - } - - // x-z transpose - intermediate[threadIdx.z][seg_id][seg_tix] = val; - __syncthreads(); - val = intermediate[seg_tix][seg_id][threadIdx.z]; - __syncthreads(); - - for (auto dist = 1; dist < BLOCK; dist *= 2) { - auto addend = __shfl_up_sync(0xffffffff, val, dist, 8); - if (seg_tix >= dist) val += addend; - } - - intermediate[threadIdx.z][seg_id][seg_tix] = val; - __syncthreads(); - val = intermediate[seg_tix][seg_id][threadIdx.z]; - __syncthreads(); - - thread_private[i] = val; - } - }; - - auto decomp_write_3d = [&]() { -#pragma unroll - for (auto y = 0; y < YSEQ; y++) - if (gix < len3.x and giy(y) < len3.y and giz < len3.z) xdata[gid(y)] = thread_private[y] * ebx2; - }; - - //////////////////////////////////////////////////////////////////////////// - load_3d(); - block_scan_3d(); - decomp_write_3d(); -} +/** + * @file lorenzo23.inl + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2022-12-22 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "subroutine.inl" + +namespace subr = psz::cuda::__device; + +namespace psz { +namespace cuda { +namespace __kernel { + +//////////////////////////////////////////////////////////////////////////////// +// 1D + +namespace v0 { + +template +__global__ void c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier); + +template +__global__ void x_lorenzo_1d1l(EQ* quant, T* outlier, dim3 len3, dim3 stride3, int radius, FP ebx2, T* xdata); + +namespace compaction { + +template > +__global__ void c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, Compaction outlier); + +} + +namespace delta_only { + +template +__global__ void c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* delta); + +template +__global__ void x_lorenzo_1d1l(EQ* delta, dim3 len3, dim3 stride3, FP ebx2, T* xdata); + +} // namespace delta_only + +} // namespace v0 + +namespace v1_pn { + +namespace compaction { + +template > +__global__ void c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, Compaction outlier); + +} // namespace compaction + +template +__global__ void x_lorenzo_1d1l(EQ* quant, T* outlier, dim3 len3, dim3 stride3, FP ebx2, T* xdata); + +namespace delta_only { + +template +__global__ void c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* delta); + +template +__global__ void x_lorenzo_1d1l(EQ* delta, dim3 len3, dim3 stride3, FP ebx2, T* xdata); + +} // namespace delta_only + +} // namespace v1_pn + +//////////////////////////////////////////////////////////////////////////////// +// 2D + +namespace v0 { + +template +__global__ void c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier); + +template +__global__ void x_lorenzo_2d1l(EQ* quant, T* outlier, dim3 len3, dim3 stride3, int radius, FP ebx2, T* xdata); + +namespace delta_only { + +template +__global__ void c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* delta); + +template +__global__ void x_lorenzo_2d1l(EQ* delta, dim3 len3, dim3 stride3, FP ebx2, T* xdata); + +} // namespace delta_only + +namespace compaction { + +template > +__global__ void c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, Compaction outlier); + +} // namespace compaction + +} // namespace v0 + +namespace v1_pn { + +namespace compaction { + +template > +__global__ void c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, Compaction outlier); + +} // namespace compaction + +template +__global__ void x_lorenzo_2d1l(EQ* quant, T* outlier, dim3 len3, dim3 stride3, FP ebx2, T* xdata); + +namespace delta_only { + +template +__global__ void c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* delta); + +template +__global__ void x_lorenzo_2d1l(EQ* delta, dim3 len3, dim3 stride3, FP ebx2, T* xdata); + +} // namespace delta_only + +} // namespace v1_pn + +//////////////////////////////////////////////////////////////////////////////// +// 3D + +namespace v0 { + +// TODO -> `legacy` +namespace legacy { +template +__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier); + +} + +template +__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier); + +template +__global__ void x_lorenzo_3d1l(EQ* quant, T* outlier, dim3 len3, dim3 stride3, int radius, FP ebx2, T* xdata); + +namespace delta_only { + +template +__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* quant); + +template +__global__ void x_lorenzo_3d1l(EQ* quant, dim3 len3, dim3 stride3, FP ebx2, T* xdata); + +} // namespace delta_only + +namespace compaction { + +template > +__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, Compaction outlier); + +} + +} // namespace v0 + +namespace v1_pn { + +namespace compaction { + +template > +__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, Compaction outlier); + +} + +template +__global__ void x_lorenzo_3d1l(EQ* quant, T* outlier, dim3 len3, dim3 stride3, FP ebx2, T* xdata); + +namespace delta_only { + +template +__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* quant); + +template +__global__ void x_lorenzo_3d1l(EQ* quant, dim3 len3, dim3 stride3, FP ebx2, T* xdata); + +} // namespace delta_only + +} // namespace v1_pn + +} // namespace __kernel +} // namespace cuda +} // namespace psz + +//////////////////////////////////////////////////////////////////////////////// +// 1D definition + +template +__global__ void +psz::cuda::__kernel::v0::c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier) +{ + namespace subr_v0 = psz::cuda::__device::v0; + + constexpr auto NTHREAD = BLOCK / SEQ; + + __shared__ struct { + union { + T data[BLOCK]; + T outlier[BLOCK]; + }; + EQ quant[BLOCK]; + } s; + + T prev{0}; + T thp_data[SEQ]; + + auto id_base = blockIdx.x * BLOCK; + + subr_v0::load_prequant_1d(data, len3.x, id_base, s.data, thp_data, prev, ebx2_r); + subr_v0::predict_quantize_1d(thp_data, s.quant, s.outlier, radius, prev); + subr_v0::predict_quantize_1d(thp_data, s.quant, s.outlier, radius); + subr_v0::write_1d(s.quant, s.outlier, len3.x, id_base, quant, outlier); +} + +template +__global__ void +psz::cuda::__kernel::v0::delta_only::c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* quant) +{ + namespace subr_v0 = psz::cuda::__device::v0; + + constexpr auto NTHREAD = BLOCK / SEQ; + + __shared__ struct { + union { + T data[BLOCK]; + T outlier[BLOCK]; + }; + EQ quant[BLOCK]; + } s; + + T prev{0}; + T thp_data[SEQ]; + + auto id_base = blockIdx.x * BLOCK; + + subr_v0::load_prequant_1d(data, len3.x, id_base, s.data, thp_data, prev, ebx2_r); + subr_v0::predict_quantize__no_outlier_1d(thp_data, s.quant, prev); + subr_v0::predict_quantize__no_outlier_1d(thp_data, s.quant); + subr_v0::write_1d(s.quant, nullptr, len3.x, id_base, quant, nullptr); +} + +template +__global__ void psz::cuda::__kernel::v0::compaction::c_lorenzo_1d1l( + T* data, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r, + EQ* quant, + Compaction outlier_desc) +{ + namespace subr_v0 = psz::cuda::__device::v0; + namespace subr_v0c = psz::cuda::__device::v0::compaction; + + constexpr auto NTHREAD = BLOCK / SEQ; + + __shared__ struct { + union { + T data[BLOCK]; + T outlier[BLOCK]; + }; + EQ quant[BLOCK]; + } s; + + T prev{0}; + T thp_data[SEQ]; + + auto id_base = blockIdx.x * BLOCK; + + subr_v0::load_prequant_1d(data, len3.x, id_base, s.data, thp_data, prev, ebx2_r); + subr_v0c::predict_quantize_1d(thp_data, s.quant, len3.x, radius, id_base, outlier_desc, prev); + subr_v0c::predict_quantize_1d(thp_data, s.quant, len3.x, radius, id_base, outlier_desc); + subr_v0::write_1d(s.quant, nullptr, len3.x, id_base, quant, nullptr); +} + +template +__global__ void psz::cuda::__kernel::v1_pn::compaction::c_lorenzo_1d1l( // + T* data, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r, + EQ* quant, + Compaction outlier) +{ + namespace subr_v0 = psz::cuda::__device::v0; + namespace subr_v1c = psz::cuda::__device::v1_pn::compaction; + + constexpr auto NTHREAD = BLOCK / SEQ; + + __shared__ struct { + union { + T data[BLOCK]; + T outlier[BLOCK]; + }; + EQ quant[BLOCK]; + } s; + + T prev{0}; + T thp_data[SEQ]; + + auto id_base = blockIdx.x * BLOCK; + + subr_v0::load_prequant_1d(data, len3.x, id_base, s.data, thp_data, prev, ebx2_r); + subr_v1c::predict_quantize_1d(thp_data, s.quant, s.outlier, radius, prev); + subr_v1c::predict_quantize_1d(thp_data, s.quant, s.outlier, radius); + subr_v0::write_1d(s.quant, s.outlier, len3.x, id_base, quant, outlier); +} + +template +__global__ void psz::cuda::__kernel::v0::x_lorenzo_1d1l( // + EQ* quant, + T* outlier, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2, + T* xdata) +{ + namespace subr_v0 = psz::cuda::__device::v0; + namespace wave32 = psz::cuda::__device::wave32; + + constexpr auto NTHREAD = BLOCK / SEQ; // equiv. to blockDim.x + + __shared__ struct { + union { + T outlier[BLOCK]; + T xdata[BLOCK]; + }; + // even if it's wave64, "/32" works + T exchange_in[NTHREAD / 32]; + T exchange_out[NTHREAD / 32]; + } s; + + T thp_data[SEQ]; + + auto id_base = blockIdx.x * BLOCK; + + subr_v0::load_fuse_1d(quant, outlier, len3.x, id_base, radius, s.xdata, thp_data); + subr_v0::block_scan_1d(thp_data, ebx2, s.exchange_in, s.exchange_out, s.xdata); + subr_v0::write_1d(s.xdata, nullptr, len3.x, id_base, xdata, nullptr); +} + +template +__global__ void psz::cuda::__kernel::v0::delta_only::x_lorenzo_1d1l( // + EQ* quant, + dim3 len3, + dim3 stride3, + FP ebx2, + T* xdata) +{ + namespace subr_v0 = psz::cuda::__device::v0; + + constexpr auto NTHREAD = BLOCK / SEQ; // equiv. to blockDim.x + + __shared__ struct { + T xdata[BLOCK]; + // even if it's wave64, "/32" works + T exchange_in[NTHREAD / 32]; + T exchange_out[NTHREAD / 32]; + } s; + + T thp_data[SEQ]; + + auto id_base = blockIdx.x * BLOCK; + + subr_v0::delta_only::load_1d(quant, len3.x, id_base, s.xdata, thp_data); + subr_v0::block_scan_1d(thp_data, ebx2, s.exchange_in, s.exchange_out, s.xdata); + subr_v0::write_1d(s.xdata, nullptr, len3.x, id_base, xdata, nullptr); +} + +//////////////////////////////////////////////////////////////////////////////// +// 2D definition + +template +__global__ void +psz::cuda::__kernel::v0::c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier) +{ + namespace subr_v0 = psz::cuda::__device::v0; + + constexpr auto BLOCK = 16; + constexpr auto YSEQ = 8; + + T center[YSEQ + 1] = {0}; // NW N first element <- 0 + // W center + + auto gix = blockIdx.x * BLOCK + threadIdx.x; // BDX == BLOCK == 16 + auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 + + subr_v0::load_prequant_2d(data, len3.x, gix, len3.y, giy_base, stride3.y, ebx2_r, center); + subr_v0::predict_2d(center); + subr_v0::quantize_write_2d(center, len3.x, gix, len3.y, giy_base, stride3.y, radius, quant, outlier); +} + +template +__global__ void +psz::cuda::__kernel::v0::delta_only::c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* quant) +{ + namespace subr_v0 = psz::cuda::__device::v0; + + constexpr auto BLOCK = 16; + constexpr auto YSEQ = 8; + + T center[YSEQ + 1] = {0}; // NW N first element <- 0 + // W center + + auto gix = blockIdx.x * BLOCK + threadIdx.x; // BDX == BLOCK == 16 + auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 + + subr_v0::load_prequant_2d(data, len3.x, gix, len3.y, giy_base, stride3.y, ebx2_r, center); + subr_v0::predict_2d(center); + subr_v0::delta_only::quantize_write_2d(center, len3.x, gix, len3.y, giy_base, stride3.y, quant); +} + +template +__global__ void +psz::cuda::__kernel::v1_pn::delta_only::c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, FP ebx2_r, EQ* quant) +{ + namespace subr_v0 = psz::cuda::__device::v0; + namespace subr_v1d = psz::cuda::__device::v1_pn::delta_only; + + constexpr auto BLOCK = 16; + constexpr auto YSEQ = 8; + + T center[YSEQ + 1] = {0}; // NW N first element <- 0 + // W center + + auto gix = blockIdx.x * BLOCK + threadIdx.x; // BDX == BLOCK == 16 + auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 + + subr_v0::load_prequant_2d(data, len3.x, gix, len3.y, giy_base, stride3.y, ebx2_r, center); + subr_v0::predict_2d(center); + subr_v1d::quantize_write_2d(center, len3.x, gix, len3.y, giy_base, stride3.y, quant); +} + +template +__global__ void psz::cuda::__kernel::v0::compaction::c_lorenzo_2d1l( + T* data, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r, + EQ* quant, + Compaction outlier) +{ + namespace subr_v0 = psz::cuda::__device::v0; + + constexpr auto BLOCK = 16; + constexpr auto YSEQ = 8; + + T center[YSEQ + 1] = {0}; // NW N first element <- 0 + // W center + + auto gix = blockIdx.x * BLOCK + threadIdx.x; // BDX == BLOCK == 16 + auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 + + subr_v0::load_prequant_2d(data, len3.x, gix, len3.y, giy_base, stride3.y, ebx2_r, center); + subr_v0::predict_2d(center); + subr_v0::compaction::quantize_write_2d( + center, len3.x, gix, len3.y, giy_base, stride3.y, radius, quant, outlier); +} + +// 16x16 data block maps to 16x2 (one warp) thread block +template +__global__ void psz::cuda::__kernel::v0::x_lorenzo_2d1l( // + EQ* quant, + T* outlier, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2, + T* xdata) +{ + namespace subr_v0 = psz::cuda::__device::v0; + + constexpr auto BLOCK = 16; + constexpr auto YSEQ = BLOCK / 2; // sequentiality in y direction + static_assert(BLOCK == 16, "In one case, we need BLOCK for 2D == 16"); + + __shared__ T intermediate[BLOCK]; // TODO use warp shuffle to eliminate this + T thread_private[YSEQ]; + + auto gix = blockIdx.x * BLOCK + threadIdx.x; + auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 + + auto get_gid = [&](auto i) { return (giy_base + i) * stride3.y + gix; }; + + subr_v0::load_fuse_2d( + quant, outlier, len3.x, gix, len3.y, giy_base, stride3.y, radius, thread_private); + subr_v0::block_scan_2d(thread_private, intermediate, ebx2); + subr_v0::decomp_write_2d(thread_private, len3.x, gix, len3.y, giy_base, stride3.y, xdata); +} + +// 16x16 data block maps to 16x2 (one warp) thread block +template +__global__ void psz::cuda::__kernel::v1_pn::x_lorenzo_2d1l( // + EQ* quant, + T* outlier, + dim3 len3, + dim3 stride3, + FP ebx2, + T* xdata) +{ + namespace subr_v0 = psz::cuda::__device::v0; + namespace subr_v1_pn = psz::cuda::__device::v1_pn; + + constexpr auto BLOCK = 16; + constexpr auto YSEQ = BLOCK / 2; // sequentiality in y direction + static_assert(BLOCK == 16, "In one case, we need BLOCK for 2D == 16"); + + __shared__ T intermediate[BLOCK]; // TODO use warp shuffle to eliminate this + T thread_private[YSEQ]; + + auto gix = blockIdx.x * BLOCK + threadIdx.x; + auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 + + auto get_gid = [&](auto i) { return (giy_base + i) * stride3.y + gix; }; + + subr_v1_pn::load_fuse_2d(quant, outlier, len3.x, gix, len3.y, giy_base, stride3.y, thread_private); + subr_v0::block_scan_2d(thread_private, intermediate, ebx2); + subr_v0::decomp_write_2d(thread_private, len3.x, gix, len3.y, giy_base, stride3.y, xdata); +} + +// 16x16 data block maps to 16x2 (one warp) thread block +template +__global__ void psz::cuda::__kernel::v0::delta_only::x_lorenzo_2d1l( // + EQ* quant, + dim3 len3, + dim3 stride3, + FP ebx2, + T* xdata) +{ + namespace subr_v0 = psz::cuda::__device::v0; + + constexpr auto BLOCK = 16; + constexpr auto YSEQ = BLOCK / 2; // sequentiality in y direction + static_assert(BLOCK == 16, "In one case, we need BLOCK for 2D == 16"); + + __shared__ T intermediate[BLOCK]; // TODO use warp shuffle to eliminate this + T thread_private[YSEQ]; + + auto gix = blockIdx.x * BLOCK + threadIdx.x; + auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 + + auto get_gid = [&](auto i) { return (giy_base + i) * stride3.y + gix; }; + + subr_v0::delta_only::load_2d(quant, len3.x, gix, len3.y, giy_base, stride3.y, thread_private); + subr_v0::block_scan_2d(thread_private, intermediate, ebx2); + subr_v0::decomp_write_2d(thread_private, len3.x, gix, len3.y, giy_base, stride3.y, xdata); +} + +// 16x16 data block maps to 16x2 (one warp) thread block +template +__global__ void psz::cuda::__kernel::v1_pn::delta_only::x_lorenzo_2d1l( // + EQ* quant, + dim3 len3, + dim3 stride3, + FP ebx2, + T* xdata) +{ + namespace subr_v0 = psz::cuda::__device::v0; + namespace subr_v1_pn = psz::cuda::__device::v1_pn; + + constexpr auto BLOCK = 16; + constexpr auto YSEQ = BLOCK / 2; // sequentiality in y direction + static_assert(BLOCK == 16, "In one case, we need BLOCK for 2D == 16"); + + __shared__ T intermediate[BLOCK]; // TODO use warp shuffle to eliminate this + T thread_private[YSEQ]; + + auto gix = blockIdx.x * BLOCK + threadIdx.x; + auto giy_base = blockIdx.y * BLOCK + threadIdx.y * YSEQ; // BDY * YSEQ = BLOCK == 16 + + auto get_gid = [&](auto i) { return (giy_base + i) * stride3.y + gix; }; + + subr_v1_pn::delta_only::load_2d(quant, len3.x, gix, len3.y, giy_base, stride3.y, thread_private); + subr_v0::block_scan_2d(thread_private, intermediate, ebx2); + subr_v0::decomp_write_2d(thread_private, len3.x, gix, len3.y, giy_base, stride3.y, xdata); +} + +template +__global__ void psz::cuda::__kernel::v0::legacy::c_lorenzo_3d1l( + T* data, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r, + EQ* quant, + T* outlier) +{ + constexpr auto BLOCK = 8; + __shared__ T s[8][8][32]; + + auto z = threadIdx.z; + + auto gix = blockIdx.x * (BLOCK * 4) + threadIdx.x; + auto giy_base = blockIdx.y * BLOCK; + auto giz = blockIdx.z * BLOCK + z; + auto base_id = gix + giy_base * stride3.y + giz * stride3.z; + + auto giy = [&](auto y) { return giy_base + y; }; + auto gid = [&](auto y) { return base_id + y * stride3.y; }; + + auto load_prequant_3d = [&]() { + if (gix < len3.x and giz < len3.z) { + for (auto y = 0; y < BLOCK; y++) + if (giy(y) < len3.y) s[z][y][threadIdx.x] = round(data[gid(y)] * ebx2_r); // prequant (fp presence) + } + __syncthreads(); + }; + + auto quantize_write = [&](T delta, auto x, auto y, auto z, auto gid) { + bool quantizable = fabs(delta) < radius; + T candidate = delta + radius; + if (x < len3.x and y < len3.y and z < len3.z) { + quant[gid] = quantizable * static_cast(candidate); + outlier[gid] = (not quantizable) * candidate; + } + }; + + auto x = threadIdx.x % 8; + + auto predict_3d = [&](auto y) { + T delta = s[z][y][threadIdx.x] - // + ((z > 0 and y > 0 and x > 0 ? s[z - 1][y - 1][threadIdx.x - 1] : 0) // dist=3 + - (y > 0 and x > 0 ? s[z][y - 1][threadIdx.x - 1] : 0) // dist=2 + - (z > 0 and x > 0 ? s[z - 1][y][threadIdx.x - 1] : 0) // + - (z > 0 and y > 0 ? s[z - 1][y - 1][threadIdx.x] : 0) // + + (x > 0 ? s[z][y][threadIdx.x - 1] : 0) // dist=1 + + (y > 0 ? s[z][y - 1][threadIdx.x] : 0) // + + (z > 0 ? s[z - 1][y][threadIdx.x] : 0)); // + return delta; + }; + + //////////////////////////////////////////////////////////////////////////// + + load_prequant_3d(); + for (auto y = 0; y < BLOCK; y++) { + auto delta = predict_3d(y); + quantize_write(delta, gix, giy(y), giz, gid(y)); + } +} + +template +__global__ void +psz::cuda::__kernel::v0::c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* quant, T* outlier) +{ + constexpr auto BLOCK = 8; + __shared__ T s[9][33]; + T delta[BLOCK + 1] = {0}; // first el = 0 + + const auto gix = blockIdx.x * (BLOCK * 4) + threadIdx.x; + const auto giy = blockIdx.y * BLOCK + threadIdx.y; + const auto giz_base = blockIdx.z * BLOCK; + const auto base_id = gix + giy * stride3.y + giz_base * stride3.z; + + auto giz = [&](auto z) { return giz_base + z; }; + auto gid = [&](auto z) { return base_id + z * stride3.z; }; + + auto load_prequant_3d = [&]() { + if (gix < len3.x and giy < len3.y) { + for (auto z = 0; z < BLOCK; z++) + if (giz(z) < len3.z) delta[z + 1] = round(data[gid(z)] * ebx2_r); // prequant (fp presence) + } + __syncthreads(); + }; + + auto quantize_write = [&](T delta, auto x, auto y, auto z, auto gid) { + bool quantizable = fabs(delta) < radius; + T candidate = delta + radius; + if (x < len3.x and y < len3.y and z < len3.z) { + quant[gid] = quantizable * static_cast(candidate); + outlier[gid] = (not quantizable) * candidate; + } + }; + + //////////////////////////////////////////////////////////////////////////// + + /* z-direction, sequential in private buffer + delta = + (s[z][y][x] - s[z-1][y][x]) + - (s[z][y][x-1] - s[z-1][y][x-1]) + + (s[z][y-1][x-1] - s[z-1][y-1][x-1]) + - (s[z][y-1][x] - s[z-1][y-1][x]) + + x-direction, shuffle + delta = + (s[z][y][x] - s[z][y][x-1]) + - (s[z][y-1][x] - s[z][y-1][x-1]) + + y-direction, shmem + delta = s[z][y][x] - s[z][y-1][x] + */ + + load_prequant_3d(); + + for (auto z = BLOCK; z > 0; z--) { + // z-direction + delta[z] -= delta[z - 1]; + + // x-direction + auto prev_x = __shfl_up_sync(0xffffffff, delta[z], 1, 8); + if (threadIdx.x % BLOCK > 0) delta[z] -= prev_x; + + // y-direction, exchange via shmem + // ghost padding along y + s[threadIdx.y + 1][threadIdx.x] = delta[z]; + __syncthreads(); + + delta[z] -= (threadIdx.y > 0) * s[threadIdx.y][threadIdx.x]; + + // now delta[z] is delta + quantize_write(delta[z], gix, giy, giz(z - 1), gid(z - 1)); + } +} + +template +__global__ void psz::cuda::__kernel::v0::delta_only::c_lorenzo_3d1l( // + T* data, + dim3 len3, + dim3 stride3, + FP ebx2_r, + EQ* quant) +{ + constexpr auto BLOCK = 8; + __shared__ T s[9][33]; + T delta[BLOCK + 1] = {0}; // first el = 0 + + const auto gix = blockIdx.x * (BLOCK * 4) + threadIdx.x; + const auto giy = blockIdx.y * BLOCK + threadIdx.y; + const auto giz_base = blockIdx.z * BLOCK; + const auto base_id = gix + giy * stride3.y + giz_base * stride3.z; + + auto giz = [&](auto z) { return giz_base + z; }; + auto gid = [&](auto z) { return base_id + z * stride3.z; }; + + auto load_prequant_3d = [&]() { + if (gix < len3.x and giy < len3.y) { + for (auto z = 0; z < BLOCK; z++) + if (giz(z) < len3.z) delta[z + 1] = round(data[gid(z)] * ebx2_r); // prequant (fp presence) + } + __syncthreads(); + }; + + auto quantize_write = [&](T delta, auto x, auto y, auto z, auto gid) { + if (x < len3.x and y < len3.y and z < len3.z) quant[gid] = static_cast(delta); + }; + + //////////////////////////////////////////////////////////////////////////// + + load_prequant_3d(); + + for (auto z = BLOCK; z > 0; z--) { + // z-direction + delta[z] -= delta[z - 1]; + + // x-direction + auto prev_x = __shfl_up_sync(0xffffffff, delta[z], 1, 8); + if (threadIdx.x % BLOCK > 0) delta[z] -= prev_x; + + // y-direction, exchange via shmem + // ghost padding along y + s[threadIdx.y + 1][threadIdx.x] = delta[z]; + __syncthreads(); + + delta[z] -= (threadIdx.y > 0) * s[threadIdx.y][threadIdx.x]; + + // now delta[z] is delta + quantize_write(delta[z], gix, giy, giz(z - 1), gid(z - 1)); + } +} + +template +__global__ void psz::cuda::__kernel::v1_pn::delta_only::c_lorenzo_3d1l( // + T* data, + dim3 len3, + dim3 stride3, + FP ebx2_r, + EQ* quant) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + + constexpr auto BLOCK = 8; + __shared__ T s[9][33]; + T delta[BLOCK + 1] = {0}; // first el = 0 + + const auto gix = blockIdx.x * (BLOCK * 4) + threadIdx.x; + const auto giy = blockIdx.y * BLOCK + threadIdx.y; + const auto giz_base = blockIdx.z * BLOCK; + const auto base_id = gix + giy * stride3.y + giz_base * stride3.z; + + auto giz = [&](auto z) { return giz_base + z; }; + auto gid = [&](auto z) { return base_id + z * stride3.z; }; + + auto load_prequant_3d = [&]() { + if (gix < len3.x and giy < len3.y) { + for (auto z = 0; z < BLOCK; z++) + if (giz(z) < len3.z) delta[z + 1] = round(data[gid(z)] * ebx2_r); // prequant (fp presence) + } + __syncthreads(); + }; + + auto quantize_write = [&](T delta, auto x, auto y, auto z, auto gid) { + if (x < len3.x and y < len3.y and z < len3.z) quant[gid] = PN::encode(static_cast(delta)); + }; + + //////////////////////////////////////////////////////////////////////////// + + load_prequant_3d(); + + for (auto z = BLOCK; z > 0; z--) { + // z-direction + delta[z] -= delta[z - 1]; + + // x-direction + auto prev_x = __shfl_up_sync(0xffffffff, delta[z], 1, 8); + if (threadIdx.x % BLOCK > 0) delta[z] -= prev_x; + + // y-direction, exchange via shmem + // ghost padding along y + s[threadIdx.y + 1][threadIdx.x] = delta[z]; + __syncthreads(); + + delta[z] -= (threadIdx.y > 0) * s[threadIdx.y][threadIdx.x]; + + // now delta[z] is delta + quantize_write(delta[z], gix, giy, giz(z - 1), gid(z - 1)); + } +} + +template +__global__ void psz::cuda::__kernel::v0::compaction::c_lorenzo_3d1l( + T* data, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r, + EQ* quant, + Compaction outlier) +{ + constexpr auto BLOCK = 8; + __shared__ T s[9][33]; + T delta[BLOCK + 1] = {0}; // first el = 0 + + const auto gix = blockIdx.x * (BLOCK * 4) + threadIdx.x; + const auto giy = blockIdx.y * BLOCK + threadIdx.y; + const auto giz_base = blockIdx.z * BLOCK; + const auto base_id = gix + giy * stride3.y + giz_base * stride3.z; + + auto giz = [&](auto z) { return giz_base + z; }; + auto gid = [&](auto z) { return base_id + z * stride3.z; }; + + auto load_prequant_3d = [&]() { + if (gix < len3.x and giy < len3.y) { + for (auto z = 0; z < BLOCK; z++) + if (giz(z) < len3.z) delta[z + 1] = round(data[gid(z)] * ebx2_r); // prequant (fp presence) + } + __syncthreads(); + }; + + auto quantize_compact_write = [&](T delta, auto x, auto y, auto z, auto gid) { + bool quantizable = fabs(delta) < radius; + T candidate = delta + radius; + if (x < len3.x and y < len3.y and z < len3.z) { + quant[gid] = quantizable * static_cast(candidate); + if (not quantizable) { + auto cur_idx = atomicAdd(outlier.count, 1); + outlier.idx[cur_idx] = gid; + outlier.val[cur_idx] = candidate; + } + } + }; + + //////////////////////////////////////////////////////////////////////////// + + load_prequant_3d(); + + for (auto z = BLOCK; z > 0; z--) { + // z-direction + delta[z] -= delta[z - 1]; + + // x-direction + auto prev_x = __shfl_up_sync(0xffffffff, delta[z], 1, 8); + if (threadIdx.x % BLOCK > 0) delta[z] -= prev_x; + + // y-direction, exchange via shmem + // ghost padding along y + s[threadIdx.y + 1][threadIdx.x] = delta[z]; + __syncthreads(); + + delta[z] -= (threadIdx.y > 0) * s[threadIdx.y][threadIdx.x]; + + // now delta[z] is delta + quantize_compact_write(delta[z], gix, giy, giz(z - 1), gid(z - 1)); + } +} + +template +__global__ void psz::cuda::__kernel::v1_pn::compaction::c_lorenzo_3d1l( + T* data, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2_r, + EQ* quant, + Compaction outlier) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + + constexpr auto BLOCK = 8; + __shared__ T s[9][33]; + T delta[BLOCK + 1] = {0}; // first el = 0 + + const auto gix = blockIdx.x * (BLOCK * 4) + threadIdx.x; + const auto giy = blockIdx.y * BLOCK + threadIdx.y; + const auto giz_base = blockIdx.z * BLOCK; + const auto base_id = gix + giy * stride3.y + giz_base * stride3.z; + + auto giz = [&](auto z) { return giz_base + z; }; + auto gid = [&](auto z) { return base_id + z * stride3.z; }; + + // TODO move to subroutine.inl + auto load_prequant_3d = [&]() { + if (gix < len3.x and giy < len3.y) { + for (auto z = 0; z < BLOCK; z++) + if (giz(z) < len3.z) delta[z + 1] = round(data[gid(z)] * ebx2_r); // prequant (fp presence) + } + __syncthreads(); + }; + + auto quantize_compact_write = [&](T delta, auto x, auto y, auto z, auto gid) { + bool quantizable = fabs(delta) < radius; + UI UI_delta = PN::encode(static_cast(delta)); + + T candidate = delta + radius; + if (x < len3.x and y < len3.y and z < len3.z) { + quant[gid] = quantizable * UI_delta; + if (not quantizable) { + auto cur_idx = atomicAdd(outlier.count, 1); + outlier.idx[cur_idx] = gid; + outlier.val[cur_idx] = UI_delta; + } + } + }; + + //////////////////////////////////////////////////////////////////////////// + + load_prequant_3d(); + + for (auto z = BLOCK; z > 0; z--) { + // z-direction + delta[z] -= delta[z - 1]; + + // x-direction + auto prev_x = __shfl_up_sync(0xffffffff, delta[z], 1, 8); + if (threadIdx.x % BLOCK > 0) delta[z] -= prev_x; + + // y-direction, exchange via shmem + // ghost padding along y + s[threadIdx.y + 1][threadIdx.x] = delta[z]; + __syncthreads(); + + delta[z] -= (threadIdx.y > 0) * s[threadIdx.y][threadIdx.x]; + + // now delta[z] is delta + quantize_compact_write(delta[z], gix, giy, giz(z - 1), gid(z - 1)); + } +} + +// 32x8x8 data block maps to 32x1x8 thread block +template +__global__ void psz::cuda::__kernel::v0::x_lorenzo_3d1l( // + EQ* quant, + T* outlier, + dim3 len3, + dim3 stride3, + int radius, + FP ebx2, + T* xdata) +{ + constexpr auto BLOCK = 8; + constexpr auto YSEQ = BLOCK; + static_assert(BLOCK == 8, "In one case, we need BLOCK for 3D == 8"); + + __shared__ T intermediate[BLOCK][4][8]; + T thread_private[YSEQ]; + + auto seg_id = threadIdx.x / 8; + auto seg_tix = threadIdx.x % 8; + + auto gix = blockIdx.x * (4 * BLOCK) + threadIdx.x; + auto giy_base = blockIdx.y * BLOCK; + auto giy = [&](auto y) { return giy_base + y; }; + auto giz = blockIdx.z * BLOCK + threadIdx.z; + auto gid = [&](auto y) { return giz * stride3.z + (giy_base + y) * stride3.y + gix; }; + + auto load_fuse_3d = [&]() { + // load to thread-private array (fuse at the same time) +#pragma unroll + for (auto y = 0; y < YSEQ; y++) { + if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) + thread_private[y] = outlier[gid(y)] + static_cast(quant[gid(y)]) - radius; // fuse + else + thread_private[y] = 0; + } + }; + + auto block_scan_3d = [&]() { + // partial-sum along y-axis, sequentially + for (auto y = 1; y < YSEQ; y++) thread_private[y] += thread_private[y - 1]; + +#pragma unroll + for (auto i = 0; i < BLOCK; i++) { + // ND partial-sums along x- and z-axis + // in-warp shuffle used: in order to perform, it's transposed after X-partial sum + T val = thread_private[i]; + + for (auto dist = 1; dist < BLOCK; dist *= 2) { + auto addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + // x-z transpose + intermediate[threadIdx.z][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][threadIdx.z]; + __syncthreads(); + + for (auto dist = 1; dist < BLOCK; dist *= 2) { + auto addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + intermediate[threadIdx.z][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][threadIdx.z]; + __syncthreads(); + + thread_private[i] = val; + } + }; + + auto decomp_write_3d = [&]() { +#pragma unroll + for (auto y = 0; y < YSEQ; y++) + if (gix < len3.x and giy(y) < len3.y and giz < len3.z) xdata[gid(y)] = thread_private[y] * ebx2; + }; + + //////////////////////////////////////////////////////////////////////////// + load_fuse_3d(); + block_scan_3d(); + decomp_write_3d(); +} + +// 32x8x8 data block maps to 32x1x8 thread block +template +__global__ void psz::cuda::__kernel::v1_pn::x_lorenzo_3d1l( // + EQ* quant, + T* outlier, + dim3 len3, + dim3 stride3, + FP ebx2, + T* xdata) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + + constexpr auto BLOCK = 8; + constexpr auto YSEQ = BLOCK; + static_assert(BLOCK == 8, "In one case, we need BLOCK for 3D == 8"); + + __shared__ T intermediate[BLOCK][4][8]; + T thread_private[YSEQ]; + + auto seg_id = threadIdx.x / 8; + auto seg_tix = threadIdx.x % 8; + + auto gix = blockIdx.x * (4 * BLOCK) + threadIdx.x; + auto giy_base = blockIdx.y * BLOCK; + auto giy = [&](auto y) { return giy_base + y; }; + auto giz = blockIdx.z * BLOCK + threadIdx.z; + auto gid = [&](auto y) { return giz * stride3.z + (giy_base + y) * stride3.y + gix; }; + + auto load_fuse_3d = [&]() { + // load to thread-private array (fuse at the same time) +#pragma unroll + for (auto y = 0; y < YSEQ; y++) { + if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) + thread_private[y] = outlier[gid(y)] + PN::decode(quant[gid(y)]); // fuse + else + thread_private[y] = 0; + } + }; + + auto block_scan_3d = [&]() { + // partial-sum along y-axis, sequentially + for (auto y = 1; y < YSEQ; y++) thread_private[y] += thread_private[y - 1]; + +#pragma unroll + for (auto i = 0; i < BLOCK; i++) { + // ND partial-sums along x- and z-axis + // in-warp shuffle used: in order to perform, it's transposed after X-partial sum + T val = thread_private[i]; + + for (auto dist = 1; dist < BLOCK; dist *= 2) { + auto addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + // x-z transpose + intermediate[threadIdx.z][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][threadIdx.z]; + __syncthreads(); + + for (auto dist = 1; dist < BLOCK; dist *= 2) { + auto addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + intermediate[threadIdx.z][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][threadIdx.z]; + __syncthreads(); + + thread_private[i] = val; + } + }; + + auto decomp_write_3d = [&]() { +#pragma unroll + for (auto y = 0; y < YSEQ; y++) + if (gix < len3.x and giy(y) < len3.y and giz < len3.z) xdata[gid(y)] = thread_private[y] * ebx2; + }; + + //////////////////////////////////////////////////////////////////////////// + load_fuse_3d(); + block_scan_3d(); + decomp_write_3d(); +} + +// 32x8x8 data block maps to 32x1x8 thread block +template +__global__ void psz::cuda::__kernel::v0::delta_only::x_lorenzo_3d1l( // + EQ* quant, + dim3 len3, + dim3 stride3, + FP ebx2, + T* xdata) +{ + constexpr auto BLOCK = 8; + constexpr auto YSEQ = BLOCK; + static_assert(BLOCK == 8, "In one case, we need BLOCK for 3D == 8"); + + __shared__ T intermediate[BLOCK][4][8]; + T thread_private[YSEQ]; + + auto seg_id = threadIdx.x / 8; + auto seg_tix = threadIdx.x % 8; + + auto gix = blockIdx.x * (4 * BLOCK) + threadIdx.x; + auto giy_base = blockIdx.y * BLOCK; + auto giy = [&](auto y) { return giy_base + y; }; + auto giz = blockIdx.z * BLOCK + threadIdx.z; + auto gid = [&](auto y) { return giz * stride3.z + (giy_base + y) * stride3.y + gix; }; + + auto load_3d = [&]() { + // load to thread-private array (fuse at the same time) +#pragma unroll + for (auto y = 0; y < YSEQ; y++) { + if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) + thread_private[y] = static_cast(quant[gid(y)]); // fuse + else + thread_private[y] = 0; + } + }; + + auto block_scan_3d = [&]() { + // partial-sum along y-axis, sequentially + for (auto y = 1; y < YSEQ; y++) thread_private[y] += thread_private[y - 1]; + +#pragma unroll + for (auto i = 0; i < BLOCK; i++) { + // ND partial-sums along x- and z-axis + // in-warp shuffle used: in order to perform, it's transposed after X-partial sum + T val = thread_private[i]; + + for (auto dist = 1; dist < BLOCK; dist *= 2) { + auto addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + // x-z transpose + intermediate[threadIdx.z][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][threadIdx.z]; + __syncthreads(); + + for (auto dist = 1; dist < BLOCK; dist *= 2) { + auto addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + intermediate[threadIdx.z][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][threadIdx.z]; + __syncthreads(); + + thread_private[i] = val; + } + }; + + auto decomp_write_3d = [&]() { +#pragma unroll + for (auto y = 0; y < YSEQ; y++) + if (gix < len3.x and giy(y) < len3.y and giz < len3.z) xdata[gid(y)] = thread_private[y] * ebx2; + }; + + //////////////////////////////////////////////////////////////////////////// + load_3d(); + block_scan_3d(); + decomp_write_3d(); +} diff --git a/qtensor/compression/cusz/src/kernel/detail/lorenzo_proto.inl b/qtensor/compression/cusz/src/kernel/detail/lorenzo_proto.inl index 2ed25984..5a317a60 100644 --- a/qtensor/compression/cusz/src/kernel/detail/lorenzo_proto.inl +++ b/qtensor/compression/cusz/src/kernel/detail/lorenzo_proto.inl @@ -1,214 +1,214 @@ -/** - * @file lorenzo_proto.inl - * @author Jiannan Tian - * @brief (prototype) Dual-EQ Lorenzo method. - * @version 0.2 - * @date 2021-01-16 - * (create) 2019-09-23; (release) 2020-09-20; (rev1) 2021-01-16; (rev2) 2021-02-20; (rev3) 2021-04-11 - * (rev4) 2021-04-30 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#ifndef CUSZ_KERNEL_LORENZO_PROTOTYPE_CUH -#define CUSZ_KERNEL_LORENZO_PROTOTYPE_CUH - -#include -#include - -#include "utils/cuda_err.cuh" -#include "utils/timer.h" - -namespace psz { - -namespace cuda { -namespace __kernel { - -namespace prototype { // easy algorithmic description - -template -__global__ void c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* eq, T* outlier) -{ - __shared__ T buf[BLK]; - - auto id = blockIdx.x * blockDim.x + threadIdx.x; - if (id < len3.x) { - buf[threadIdx.x] = round(data[id] * ebx2_r); // prequant (fp presence) - } - __syncthreads(); // necessary to ensure correctness - - T delta = buf[threadIdx.x] - (threadIdx.x == 0 ? 0 : buf[threadIdx.x - 1]); - - bool quantizable = fabs(delta) < radius; - T candidate = delta + radius; - if (id < len3.x) { // postquant - data[id] = (1 - quantizable) * candidate; // output; reuse data for outlier - eq[id] = quantizable * static_cast(candidate); - } -} - -template -__global__ void c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* eq, T* outlier) -{ - __shared__ T buf[BLK][BLK + 1]; - - auto y = threadIdx.y, x = threadIdx.x; - auto giy = blockIdx.y * blockDim.y + y, gix = blockIdx.x * blockDim.x + x; - - auto id = gix + giy * stride3.y; // low to high dim, inner to outer - if (gix < len3.x and giy < len3.y) { - buf[y][x] = round(data[id] * ebx2_r); // prequant (fp presence) - } - __syncthreads(); // necessary to ensure correctness - - T delta = buf[y][x] - ((x > 0 ? buf[y][x - 1] : 0) + // dist=1 - (y > 0 ? buf[y - 1][x] : 0) - // dist=1 - (x > 0 and y > 0 ? buf[y - 1][x - 1] : 0)); // dist=2 - - bool quantizable = fabs(delta) < radius; - T candidate = delta + radius; - if (gix < len3.x and giy < len3.y) { - data[id] = (1 - quantizable) * candidate; // output; reuse data for outlier - eq[id] = quantizable * static_cast(candidate); - } -} - -template -__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* eq, T* outlier) -{ - __shared__ T buf[BLK][BLK][BLK + 1]; - - auto z = threadIdx.z, y = threadIdx.y, x = threadIdx.x; - auto giz = blockIdx.z * blockDim.z + z, giy = blockIdx.y * blockDim.y + y, gix = blockIdx.x * blockDim.x + x; - - auto id = gix + giy * stride3.y + giz * stride3.z; // low to high in dim, inner to outer - if (gix < len3.x and giy < len3.y and giz < len3.z) { - buf[z][y][x] = round(data[id] * ebx2_r); // prequant (fp presence) - } - __syncthreads(); // necessary to ensure correctness - - T delta = buf[z][y][x] - ((z > 0 and y > 0 and x > 0 ? buf[z - 1][y - 1][x - 1] : 0) // dist=3 - - (y > 0 and x > 0 ? buf[z][y - 1][x - 1] : 0) // dist=2 - - (z > 0 and x > 0 ? buf[z - 1][y][x - 1] : 0) // - - (z > 0 and y > 0 ? buf[z - 1][y - 1][x] : 0) // - + (x > 0 ? buf[z][y][x - 1] : 0) // dist=1 - + (y > 0 ? buf[z][y - 1][x] : 0) // - + (z > 0 ? buf[z - 1][y][x] : 0)); // - - bool quantizable = fabs(delta) < radius; - T candidate = delta + radius; - if (gix < len3.x and giy < len3.y and giz < len3.z) { - data[id] = (1 - quantizable) * candidate; // output; reuse data for outlier - eq[id] = quantizable * static_cast(candidate); - } -} - -template -__global__ void x_lorenzo_1d1l(EQ* eq, T* scattered_outlier, dim3 len3, dim3 stride3, int radius, FP ebx2, T* xdata) -{ - __shared__ T buf[BLK]; - - auto id = blockIdx.x * blockDim.x + threadIdx.x; - - if (id < len3.x) - buf[threadIdx.x] = scattered_outlier[id] + static_cast(eq[id]) - radius; // fuse - else - buf[threadIdx.x] = 0; - __syncthreads(); - - for (auto d = 1; d < BLK; d *= 2) { - T n = 0; - if (threadIdx.x >= d) n = buf[threadIdx.x - d]; // like __shfl_up_sync(0x1f, var, d); warp_sync - __syncthreads(); - if (threadIdx.x >= d) buf[threadIdx.x] += n; - __syncthreads(); - } - - if (id < len3.x) { xdata[id] = buf[threadIdx.x] * ebx2; } -} - -template -__global__ void x_lorenzo_2d1l(EQ* eq, T* scattered_outlier, dim3 len3, dim3 stride3, int radius, FP ebx2, T* xdata) -{ - __shared__ T buf[BLK][BLK + 1]; - - auto giy = blockIdx.y * blockDim.y + threadIdx.y, gix = blockIdx.x * blockDim.x + threadIdx.x; - size_t id = gix + giy * stride3.y; - - if (gix < len3.x and giy < len3.y) - buf[threadIdx.y][threadIdx.x] = scattered_outlier[id] + static_cast(eq[id]) - radius; // fuse - else - buf[threadIdx.y][threadIdx.x] = 0; - __syncthreads(); - - for (auto d = 1; d < BLK; d *= 2) { - T n = 0; - if (threadIdx.x >= d) n = buf[threadIdx.y][threadIdx.x - d]; - __syncthreads(); - if (threadIdx.x >= d) buf[threadIdx.y][threadIdx.x] += n; - __syncthreads(); - } - - for (auto d = 1; d < BLK; d *= 2) { - T n = 0; - if (threadIdx.y >= d) n = buf[threadIdx.y - d][threadIdx.x]; - __syncthreads(); - if (threadIdx.y >= d) buf[threadIdx.y][threadIdx.x] += n; - __syncthreads(); - } - - if (gix < len3.x and giy < len3.y) { xdata[id] = buf[threadIdx.y][threadIdx.x] * ebx2; } -} - -template -__global__ void x_lorenzo_3d1l(EQ* eq, T* scattered_outlier, dim3 len3, dim3 stride3, int radius, FP ebx2, T* xdata) -{ - __shared__ T buf[BLK][BLK][BLK + 1]; - - auto giz = blockIdx.z * BLK + threadIdx.z, giy = blockIdx.y * BLK + threadIdx.y, - gix = blockIdx.x * BLK + threadIdx.x; - size_t id = gix + giy * stride3.y + giz * stride3.z; // low to high in dim, inner to outer - - if (gix < len3.x and giy < len3.y and giz < len3.z) - buf[threadIdx.z][threadIdx.y][threadIdx.x] = scattered_outlier[id] + static_cast(eq[id]) - radius; // id - else - buf[threadIdx.z][threadIdx.y][threadIdx.x] = 0; - __syncthreads(); - - for (auto dist = 1; dist < BLK; dist *= 2) { - T addend = 0; - if (threadIdx.x >= dist) addend = buf[threadIdx.z][threadIdx.y][threadIdx.x - dist]; - __syncthreads(); - if (threadIdx.x >= dist) buf[threadIdx.z][threadIdx.y][threadIdx.x] += addend; - __syncthreads(); - } - - for (auto dist = 1; dist < BLK; dist *= 2) { - T addend = 0; - if (threadIdx.y >= dist) addend = buf[threadIdx.z][threadIdx.y - dist][threadIdx.x]; - __syncthreads(); - if (threadIdx.y >= dist) buf[threadIdx.z][threadIdx.y][threadIdx.x] += addend; - __syncthreads(); - } - - for (auto dist = 1; dist < BLK; dist *= 2) { - T addend = 0; - if (threadIdx.z >= dist) addend = buf[threadIdx.z - dist][threadIdx.y][threadIdx.x]; - __syncthreads(); - if (threadIdx.z >= dist) buf[threadIdx.z][threadIdx.y][threadIdx.x] += addend; - __syncthreads(); - } - - if (gix < len3.x and giy < len3.y and giz < len3.z) { - xdata[id] = buf[threadIdx.z][threadIdx.y][threadIdx.x] * ebx2; - } -} - -} // namespace prototype -} // namespace __kernel -} // namespace cuda -} // namespace psz - -#endif +/** + * @file lorenzo_proto.inl + * @author Jiannan Tian + * @brief (prototype) Dual-EQ Lorenzo method. + * @version 0.2 + * @date 2021-01-16 + * (create) 2019-09-23; (release) 2020-09-20; (rev1) 2021-01-16; (rev2) 2021-02-20; (rev3) 2021-04-11 + * (rev4) 2021-04-30 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef CUSZ_KERNEL_LORENZO_PROTOTYPE_CUH +#define CUSZ_KERNEL_LORENZO_PROTOTYPE_CUH + +#include +#include + +#include "utils/cuda_err.cuh" +#include "utils/timer.h" + +namespace psz { + +namespace cuda { +namespace __kernel { + +namespace prototype { // easy algorithmic description + +template +__global__ void c_lorenzo_1d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* eq, T* outlier) +{ + __shared__ T buf[BLK]; + + auto id = blockIdx.x * blockDim.x + threadIdx.x; + if (id < len3.x) { + buf[threadIdx.x] = round(data[id] * ebx2_r); // prequant (fp presence) + } + __syncthreads(); // necessary to ensure correctness + + T delta = buf[threadIdx.x] - (threadIdx.x == 0 ? 0 : buf[threadIdx.x - 1]); + + bool quantizable = fabs(delta) < radius; + T candidate = delta + radius; + if (id < len3.x) { // postquant + data[id] = (1 - quantizable) * candidate; // output; reuse data for outlier + eq[id] = quantizable * static_cast(candidate); + } +} + +template +__global__ void c_lorenzo_2d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* eq, T* outlier) +{ + __shared__ T buf[BLK][BLK + 1]; + + auto y = threadIdx.y, x = threadIdx.x; + auto giy = blockIdx.y * blockDim.y + y, gix = blockIdx.x * blockDim.x + x; + + auto id = gix + giy * stride3.y; // low to high dim, inner to outer + if (gix < len3.x and giy < len3.y) { + buf[y][x] = round(data[id] * ebx2_r); // prequant (fp presence) + } + __syncthreads(); // necessary to ensure correctness + + T delta = buf[y][x] - ((x > 0 ? buf[y][x - 1] : 0) + // dist=1 + (y > 0 ? buf[y - 1][x] : 0) - // dist=1 + (x > 0 and y > 0 ? buf[y - 1][x - 1] : 0)); // dist=2 + + bool quantizable = fabs(delta) < radius; + T candidate = delta + radius; + if (gix < len3.x and giy < len3.y) { + data[id] = (1 - quantizable) * candidate; // output; reuse data for outlier + eq[id] = quantizable * static_cast(candidate); + } +} + +template +__global__ void c_lorenzo_3d1l(T* data, dim3 len3, dim3 stride3, int radius, FP ebx2_r, EQ* eq, T* outlier) +{ + __shared__ T buf[BLK][BLK][BLK + 1]; + + auto z = threadIdx.z, y = threadIdx.y, x = threadIdx.x; + auto giz = blockIdx.z * blockDim.z + z, giy = blockIdx.y * blockDim.y + y, gix = blockIdx.x * blockDim.x + x; + + auto id = gix + giy * stride3.y + giz * stride3.z; // low to high in dim, inner to outer + if (gix < len3.x and giy < len3.y and giz < len3.z) { + buf[z][y][x] = round(data[id] * ebx2_r); // prequant (fp presence) + } + __syncthreads(); // necessary to ensure correctness + + T delta = buf[z][y][x] - ((z > 0 and y > 0 and x > 0 ? buf[z - 1][y - 1][x - 1] : 0) // dist=3 + - (y > 0 and x > 0 ? buf[z][y - 1][x - 1] : 0) // dist=2 + - (z > 0 and x > 0 ? buf[z - 1][y][x - 1] : 0) // + - (z > 0 and y > 0 ? buf[z - 1][y - 1][x] : 0) // + + (x > 0 ? buf[z][y][x - 1] : 0) // dist=1 + + (y > 0 ? buf[z][y - 1][x] : 0) // + + (z > 0 ? buf[z - 1][y][x] : 0)); // + + bool quantizable = fabs(delta) < radius; + T candidate = delta + radius; + if (gix < len3.x and giy < len3.y and giz < len3.z) { + data[id] = (1 - quantizable) * candidate; // output; reuse data for outlier + eq[id] = quantizable * static_cast(candidate); + } +} + +template +__global__ void x_lorenzo_1d1l(EQ* eq, T* scattered_outlier, dim3 len3, dim3 stride3, int radius, FP ebx2, T* xdata) +{ + __shared__ T buf[BLK]; + + auto id = blockIdx.x * blockDim.x + threadIdx.x; + + if (id < len3.x) + buf[threadIdx.x] = scattered_outlier[id] + static_cast(eq[id]) - radius; // fuse + else + buf[threadIdx.x] = 0; + __syncthreads(); + + for (auto d = 1; d < BLK; d *= 2) { + T n = 0; + if (threadIdx.x >= d) n = buf[threadIdx.x - d]; // like __shfl_up_sync(0x1f, var, d); warp_sync + __syncthreads(); + if (threadIdx.x >= d) buf[threadIdx.x] += n; + __syncthreads(); + } + + if (id < len3.x) { xdata[id] = buf[threadIdx.x] * ebx2; } +} + +template +__global__ void x_lorenzo_2d1l(EQ* eq, T* scattered_outlier, dim3 len3, dim3 stride3, int radius, FP ebx2, T* xdata) +{ + __shared__ T buf[BLK][BLK + 1]; + + auto giy = blockIdx.y * blockDim.y + threadIdx.y, gix = blockIdx.x * blockDim.x + threadIdx.x; + size_t id = gix + giy * stride3.y; + + if (gix < len3.x and giy < len3.y) + buf[threadIdx.y][threadIdx.x] = scattered_outlier[id] + static_cast(eq[id]) - radius; // fuse + else + buf[threadIdx.y][threadIdx.x] = 0; + __syncthreads(); + + for (auto d = 1; d < BLK; d *= 2) { + T n = 0; + if (threadIdx.x >= d) n = buf[threadIdx.y][threadIdx.x - d]; + __syncthreads(); + if (threadIdx.x >= d) buf[threadIdx.y][threadIdx.x] += n; + __syncthreads(); + } + + for (auto d = 1; d < BLK; d *= 2) { + T n = 0; + if (threadIdx.y >= d) n = buf[threadIdx.y - d][threadIdx.x]; + __syncthreads(); + if (threadIdx.y >= d) buf[threadIdx.y][threadIdx.x] += n; + __syncthreads(); + } + + if (gix < len3.x and giy < len3.y) { xdata[id] = buf[threadIdx.y][threadIdx.x] * ebx2; } +} + +template +__global__ void x_lorenzo_3d1l(EQ* eq, T* scattered_outlier, dim3 len3, dim3 stride3, int radius, FP ebx2, T* xdata) +{ + __shared__ T buf[BLK][BLK][BLK + 1]; + + auto giz = blockIdx.z * BLK + threadIdx.z, giy = blockIdx.y * BLK + threadIdx.y, + gix = blockIdx.x * BLK + threadIdx.x; + size_t id = gix + giy * stride3.y + giz * stride3.z; // low to high in dim, inner to outer + + if (gix < len3.x and giy < len3.y and giz < len3.z) + buf[threadIdx.z][threadIdx.y][threadIdx.x] = scattered_outlier[id] + static_cast(eq[id]) - radius; // id + else + buf[threadIdx.z][threadIdx.y][threadIdx.x] = 0; + __syncthreads(); + + for (auto dist = 1; dist < BLK; dist *= 2) { + T addend = 0; + if (threadIdx.x >= dist) addend = buf[threadIdx.z][threadIdx.y][threadIdx.x - dist]; + __syncthreads(); + if (threadIdx.x >= dist) buf[threadIdx.z][threadIdx.y][threadIdx.x] += addend; + __syncthreads(); + } + + for (auto dist = 1; dist < BLK; dist *= 2) { + T addend = 0; + if (threadIdx.y >= dist) addend = buf[threadIdx.z][threadIdx.y - dist][threadIdx.x]; + __syncthreads(); + if (threadIdx.y >= dist) buf[threadIdx.z][threadIdx.y][threadIdx.x] += addend; + __syncthreads(); + } + + for (auto dist = 1; dist < BLK; dist *= 2) { + T addend = 0; + if (threadIdx.z >= dist) addend = buf[threadIdx.z - dist][threadIdx.y][threadIdx.x]; + __syncthreads(); + if (threadIdx.z >= dist) buf[threadIdx.z][threadIdx.y][threadIdx.x] += addend; + __syncthreads(); + } + + if (gix < len3.x and giy < len3.y and giz < len3.z) { + xdata[id] = buf[threadIdx.z][threadIdx.y][threadIdx.x] * ebx2; + } +} + +} // namespace prototype +} // namespace __kernel +} // namespace cuda +} // namespace psz + +#endif diff --git a/qtensor/compression/cusz/src/kernel/detail/lorenzo_serial.inl b/qtensor/compression/cusz/src/kernel/detail/lorenzo_serial.inl index b00ec690..e82013d5 100644 --- a/qtensor/compression/cusz/src/kernel/detail/lorenzo_serial.inl +++ b/qtensor/compression/cusz/src/kernel/detail/lorenzo_serial.inl @@ -1,326 +1,326 @@ -/** - * @file lorenzo_serial.inl - * @author Jiannan Tian - * @brief - * @version 0.4 - * @date 2023-03-13 - * - * (C) 2023 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef E0B87BA8_BEDC_4CBE_B5EE_C0C5875E07D6 -#define E0B87BA8_BEDC_4CBE_B5EE_C0C5875E07D6 - -#include -#include "cusz/it.hh" -#include "cusz/nd.h" - -using std::cout; -using std::endl; - -#define SETUP_1D_BASIC \ - psz_dim3 grid_dim, block_idx, thread_idx; \ - auto gx = [&]() -> uint32_t { return block_idx.x * BLK + thread_idx.x; }; \ - auto gidx = [&]() -> uint32_t { return block_idx.x * BLK + thread_idx.x; }; \ - auto check_boundary = [&]() { return gx() < len3.x; }; \ - grid_dim.x = (len3.x - 1) / BLK + 1; -#define SETUP_1D_DATABUF \ - constexpr auto PADDING = 1; \ - auto _buf1 = new psz_buf(); \ - auto& buf1 = *_buf1; \ - auto databuf_it = [&](auto x) -> T& { return buf1(thread_idx.x + x + PADDING); }; -#define SETUP_1D_EQBUF \ - auto _buf2 = new psz_buf(); \ - auto& buf2 = *_buf2; \ - auto eqbuf_it = [&](auto dx) -> EQ& { return buf2(thread_idx.x + dx); }; -#define PFOR_GRID_1D() for (block_idx.x = 0; block_idx.x < grid_dim.x; block_idx.x++) -#define PFOR_BLOCK_1D() for (thread_idx.x = 0; thread_idx.x < BLK; thread_idx.x++) - -#define SETUP_2D_BASIC \ - psz_dim3 grid_dim, block_idx, thread_idx; \ - auto gx = [&]() -> uint32_t { return block_idx.x * BLK + thread_idx.x; }; \ - auto gy = [&]() -> uint32_t { return block_idx.y * BLK + thread_idx.y; }; \ - auto gidx = [&]() -> uint32_t { return gy() * stride3.y + gx(); }; \ - auto check_boundary = [&]() { return gx() < len3.x and gy() < len3.y; }; \ - grid_dim.x = (len3.x - 1) / BLK + 1; \ - grid_dim.y = (len3.y - 1) / BLK + 1; -#define SETUP_2D_DATABUF \ - constexpr auto PADDING = 1; \ - auto _buf1 = new psz_buf(); \ - auto& buf1 = *_buf1; \ - auto databuf_it = [&](auto dx, auto dy) -> T& { \ - return buf1(thread_idx.x + dx + PADDING, thread_idx.y + dy + PADDING); \ - }; -#define SETUP_2D_EQBUF \ - auto _buf2 = new psz_buf(); \ - auto& buf2 = *_buf2; \ - auto eqbuf_it = [&](auto dx, auto dy) -> EQ& { return buf2(thread_idx.x + dx, thread_idx.y + dy); }; -#define PFOR_GRID_2D() \ - for (block_idx.y = 0; block_idx.y < grid_dim.y; block_idx.y++) \ - for (block_idx.x = 0; block_idx.x < grid_dim.x; block_idx.x++) -#define PFOR_BLOCK_2D() \ - for (thread_idx.y = 0; thread_idx.y < BLK; thread_idx.y++) \ - for (thread_idx.x = 0; thread_idx.x < BLK; thread_idx.x++) - -#define SETUP_3D_BASIC \ - psz_dim3 grid_dim, block_idx, thread_idx; \ - auto gx = [&]() -> uint32_t { return block_idx.x * BLK + thread_idx.x; }; \ - auto gy = [&]() -> uint32_t { return block_idx.y * BLK + thread_idx.y; }; \ - auto gz = [&]() -> uint32_t { return block_idx.z * BLK + thread_idx.z; }; \ - auto gidx = [&]() -> uint32_t { return gz() * stride3.z + gy() * stride3.y + gx(); }; \ - auto check_boundary = [&]() { return gx() < len3.x and gy() < len3.y and gz() < len3.z; }; \ - grid_dim.x = (len3.x - 1) / BLK + 1; \ - grid_dim.y = (len3.y - 1) / BLK + 1; \ - grid_dim.z = (len3.z - 1) / BLK + 1; -#define SETUP_3D_DATABUF \ - constexpr auto PADDING = 1; \ - auto _buf1 = new psz_buf(); \ - auto& buf1 = *_buf1; \ - auto databuf_it = [&](auto dx, auto dy, auto dz) -> T& { \ - return buf1(thread_idx.x + dx + PADDING, thread_idx.y + dy + PADDING, thread_idx.z + dz + PADDING); \ - }; -#define SETUP_3D_EQBUF \ - auto _buf2 = new psz_buf(); \ - auto& buf2 = *_buf2; \ - auto eqbuf_it = [&](auto dx, auto dy, auto dz) -> EQ& { \ - return buf2(thread_idx.x + dx, thread_idx.y + dy, thread_idx.z + dz); \ - }; -#define PFOR_GRID_3D() \ - for (block_idx.z = 0; block_idx.z < grid_dim.z; block_idx.z++) \ - for (block_idx.y = 0; block_idx.y < grid_dim.y; block_idx.y++) \ - for (block_idx.x = 0; block_idx.x < grid_dim.x; block_idx.x++) -#define PFOR_BLOCK_3D() \ - for (thread_idx.z = 0; thread_idx.z < BLK; thread_idx.z++) \ - for (thread_idx.y = 0; thread_idx.y < BLK; thread_idx.y++) \ - for (thread_idx.x = 0; thread_idx.x < BLK; thread_idx.x++) - -namespace psz { -namespace serial { -namespace __kernel { - -template < - typename T, - typename EQ = int32_t, - typename FP = T, - int BLK = 256, - typename OUTLIER = struct psz_outlier_serial> -void c_lorenzo_1d1l(T* data, psz_dim3 len3, psz_dim3 stride3, int radius, FP ebx2_r, EQ* eq, OUTLIER* outlier) { - SETUP_1D_BASIC; - SETUP_1D_DATABUF; - SETUP_1D_EQBUF; - - // per-thread ("real" kernel) - auto threadview_load = [&]() { - if (check_boundary()) databuf_it(0) = data[gidx()] * ebx2_r; - }; - auto threadview_process = [&]() { - auto delta = databuf_it(0) - databuf_it(-1); - if (delta > radius) { - outlier->record(delta, gidx()); - eqbuf_it(0) = 0; - } - else { - eqbuf_it(0) = delta; - } - }; - auto threadview_store = [&]() { - if (check_boundary()) eq[gidx()] = eqbuf_it(0); - }; - - //////////////////////////////////////// - PFOR_GRID_1D() { PFOR_BLOCK_1D() threadview_load(); } - PFOR_GRID_1D() { PFOR_BLOCK_1D() threadview_process(); } - PFOR_GRID_1D() { PFOR_BLOCK_1D() threadview_store(); } - - delete _buf1; - delete _buf2; - -} - -template -void x_lorenzo_1d1l(EQ* eq, T* scattered_outlier, psz_dim3 len3, psz_dim3 stride3, int radius, FP ebx2, T* xdata) -{ - SETUP_1D_BASIC; - SETUP_1D_DATABUF; - - // per-thread ("real" kernel) - auto threadview_load = [&]() { - if (check_boundary()) databuf_it(0) = eq[gidx()] + scattered_outlier[gidx()]; - }; - auto threadview_partial_sum = [&]() { - if (thread_idx.x > 0) databuf_it(0) += databuf_it(-1); - }; - auto threadview_store = [&]() { - if (check_boundary()) xdata[gidx()] = databuf_it(0) * ebx2; - }; - - //////////////////////////////////////// - PFOR_GRID_1D() { PFOR_BLOCK_1D() threadview_load(); } - PFOR_GRID_1D() { PFOR_BLOCK_1D() threadview_partial_sum(); } - PFOR_GRID_1D() { PFOR_BLOCK_1D() threadview_store(); } - - delete _buf1; -} - -template < - typename T, - typename EQ = int32_t, - typename FP = T, - int BLK = 16, - typename OUTLIER = struct psz_outlier_serial> -void c_lorenzo_2d1l(T* data, psz_dim3 len3, psz_dim3 stride3, int radius, FP ebx2_r, EQ* eq, OUTLIER* outlier) { - SETUP_2D_BASIC; - SETUP_2D_DATABUF; - SETUP_2D_EQBUF; - - // per-thread ("real" kernel) - auto threadview_load = [&]() { - if (check_boundary()) databuf_it(0, 0) = data[gidx()] * ebx2_r; - }; - auto threadview_process = [&]() { - auto delta = databuf_it(0, 0) - (databuf_it(-1, 0) + databuf_it(0, -1) - databuf_it(-1, -1)); - if (delta > radius) { - outlier->record(delta, gidx()); - eqbuf_it(0, 0) = 0; - } - else { - eqbuf_it(0, 0) = delta; - } - }; - auto threadview_store = [&]() { - if (check_boundary()) eq[gidx()] = eqbuf_it(0, 0); - }; - - //////////////////////////////////////// - PFOR_GRID_2D() { PFOR_BLOCK_2D() threadview_load(); } - PFOR_GRID_2D() { PFOR_BLOCK_2D() threadview_process(); } - PFOR_GRID_2D() { PFOR_BLOCK_2D() threadview_store(); } - - delete _buf1; - delete _buf2; -} - -template -void x_lorenzo_2d1l(EQ* eq, T* scattered_outlier, psz_dim3 len3, psz_dim3 stride3, int radius, FP ebx2, T* xdata) -{ - SETUP_2D_BASIC; - SETUP_2D_DATABUF; - - // per-thread ("real" kernel) - auto threadview_load = [&]() { - if (check_boundary()) databuf_it(0, 0) = eq[gidx()] + scattered_outlier[gidx()]; - }; - auto threadview_partial_sum_x = [&]() { - if (thread_idx.x > 0) databuf_it(0, 0) += databuf_it(-1, 0); - }; - auto threadview_partial_sum_y = [&]() { - if (thread_idx.y > 0) databuf_it(0, 0) += databuf_it(0, -1); - }; - auto threadview_store = [&]() { - if (check_boundary()) xdata[gidx()] = databuf_it(0, 0) * ebx2; - }; - - //////////////////////////////////////// - PFOR_GRID_2D() { PFOR_BLOCK_2D() threadview_load(); } - PFOR_GRID_2D() - { - PFOR_BLOCK_2D() threadview_partial_sum_x(); - PFOR_BLOCK_2D() threadview_partial_sum_y(); - } - PFOR_GRID_2D() { PFOR_BLOCK_2D() threadview_store(); } - - delete _buf1; -} - -template < - typename T, - typename EQ = int32_t, - typename FP = T, - int BLK = 8, - typename OUTLIER = struct psz_outlier_serial> -void c_lorenzo_3d1l(T* data, psz_dim3 len3, psz_dim3 stride3, int radius, FP ebx2_r, EQ* eq, OUTLIER* outlier) { - SETUP_3D_BASIC; - SETUP_3D_DATABUF; - SETUP_3D_EQBUF; - - // per-thread ("real" kernel) - auto threadview_load = [&]() { - if (check_boundary()) databuf_it(0, 0, 0) = data[gidx()] * ebx2_r; - }; - auto threadview_process = [&]() { - auto delta = databuf_it(0, 0, 0) - - (databuf_it(-1, -1, -1) - databuf_it(0, -1, -1) - databuf_it(-1, 0, -1) - databuf_it(-1, -1, 0) + - databuf_it(0, 0, -1) + databuf_it(0, -1, 0) + databuf_it(-1, 0, 0)); - if (delta > radius) { - outlier->record(delta, gidx()); - eqbuf_it(0, 0, 0) = 0; - } - else { - eqbuf_it(0, 0, 0) = delta; - } - }; - auto threadview_store = [&]() { - if (check_boundary()) eq[gidx()] = eqbuf_it(0, 0, 0); - }; - - //////////////////////////////////////// - PFOR_GRID_3D() { PFOR_BLOCK_3D() threadview_load(); } - PFOR_GRID_3D() { PFOR_BLOCK_3D() threadview_process(); } - PFOR_GRID_3D() { PFOR_BLOCK_3D() threadview_store(); } - - delete _buf1; - delete _buf2; -} - -template -void x_lorenzo_3d1l(EQ* eq, T* scattered_outlier, psz_dim3 len3, psz_dim3 stride3, int radius, FP ebx2, T* xdata) -{ - SETUP_3D_BASIC; - SETUP_3D_DATABUF; - - // per-thread ("real" kernel) - auto threadview_load = [&]() { - if (check_boundary()) databuf_it(0, 0, 0) = eq[gidx()] + scattered_outlier[gidx()]; - }; - auto threadview_partial_sum_x = [&]() { - if (thread_idx.x > 0) databuf_it(0, 0, 0) += databuf_it(-1, 0, 0); - }; - auto threadview_partial_sum_y = [&]() { - if (thread_idx.y > 0) databuf_it(0, 0, 0) += databuf_it(0, -1, 0); - }; - auto threadview_partial_sum_z = [&]() { - if (thread_idx.z > 0) databuf_it(0, 0, 0) += databuf_it(0, 0, -1); - }; - auto threadview_store = [&]() { - if (check_boundary()) xdata[gidx()] = databuf_it(0, 0, 0) * ebx2; - }; - - //////////////////////////////////////// - PFOR_GRID_3D() { PFOR_BLOCK_3D() threadview_load(); } - PFOR_GRID_3D() - { - PFOR_BLOCK_3D() threadview_partial_sum_x(); - PFOR_BLOCK_3D() threadview_partial_sum_y(); - PFOR_BLOCK_3D() threadview_partial_sum_z(); - } - PFOR_GRID_3D() { PFOR_BLOCK_3D() threadview_store(); } - - delete _buf1; -} - -} // namespace __kernel -} // namespace serial -} // namespace psz - -#undef SETUP_1D -#undef PFOR_GRID_1D -#undef PFOR_BLOCK_1D -#undef SETUP_2D_BASIC -#undef PFOR_GRID_2D -#undef PFOR_BLOCK_2D -#undef SETUP_3D -#undef PFOR_GRID_3D -#undef PFOR_BLOCK_3D - -#endif /* E0B87BA8_BEDC_4CBE_B5EE_C0C5875E07D6 */ +/** + * @file lorenzo_serial.inl + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-03-13 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef E0B87BA8_BEDC_4CBE_B5EE_C0C5875E07D6 +#define E0B87BA8_BEDC_4CBE_B5EE_C0C5875E07D6 + +#include +#include "cusz/it.hh" +#include "cusz/nd.h" + +using std::cout; +using std::endl; + +#define SETUP_1D_BASIC \ + psz_dim3 grid_dim, block_idx, thread_idx; \ + auto gx = [&]() -> uint32_t { return block_idx.x * BLK + thread_idx.x; }; \ + auto gidx = [&]() -> uint32_t { return block_idx.x * BLK + thread_idx.x; }; \ + auto check_boundary = [&]() { return gx() < len3.x; }; \ + grid_dim.x = (len3.x - 1) / BLK + 1; +#define SETUP_1D_DATABUF \ + constexpr auto PADDING = 1; \ + auto _buf1 = new psz_buf(); \ + auto& buf1 = *_buf1; \ + auto databuf_it = [&](auto x) -> T& { return buf1(thread_idx.x + x + PADDING); }; +#define SETUP_1D_EQBUF \ + auto _buf2 = new psz_buf(); \ + auto& buf2 = *_buf2; \ + auto eqbuf_it = [&](auto dx) -> EQ& { return buf2(thread_idx.x + dx); }; +#define PFOR_GRID_1D() for (block_idx.x = 0; block_idx.x < grid_dim.x; block_idx.x++) +#define PFOR_BLOCK_1D() for (thread_idx.x = 0; thread_idx.x < BLK; thread_idx.x++) + +#define SETUP_2D_BASIC \ + psz_dim3 grid_dim, block_idx, thread_idx; \ + auto gx = [&]() -> uint32_t { return block_idx.x * BLK + thread_idx.x; }; \ + auto gy = [&]() -> uint32_t { return block_idx.y * BLK + thread_idx.y; }; \ + auto gidx = [&]() -> uint32_t { return gy() * stride3.y + gx(); }; \ + auto check_boundary = [&]() { return gx() < len3.x and gy() < len3.y; }; \ + grid_dim.x = (len3.x - 1) / BLK + 1; \ + grid_dim.y = (len3.y - 1) / BLK + 1; +#define SETUP_2D_DATABUF \ + constexpr auto PADDING = 1; \ + auto _buf1 = new psz_buf(); \ + auto& buf1 = *_buf1; \ + auto databuf_it = [&](auto dx, auto dy) -> T& { \ + return buf1(thread_idx.x + dx + PADDING, thread_idx.y + dy + PADDING); \ + }; +#define SETUP_2D_EQBUF \ + auto _buf2 = new psz_buf(); \ + auto& buf2 = *_buf2; \ + auto eqbuf_it = [&](auto dx, auto dy) -> EQ& { return buf2(thread_idx.x + dx, thread_idx.y + dy); }; +#define PFOR_GRID_2D() \ + for (block_idx.y = 0; block_idx.y < grid_dim.y; block_idx.y++) \ + for (block_idx.x = 0; block_idx.x < grid_dim.x; block_idx.x++) +#define PFOR_BLOCK_2D() \ + for (thread_idx.y = 0; thread_idx.y < BLK; thread_idx.y++) \ + for (thread_idx.x = 0; thread_idx.x < BLK; thread_idx.x++) + +#define SETUP_3D_BASIC \ + psz_dim3 grid_dim, block_idx, thread_idx; \ + auto gx = [&]() -> uint32_t { return block_idx.x * BLK + thread_idx.x; }; \ + auto gy = [&]() -> uint32_t { return block_idx.y * BLK + thread_idx.y; }; \ + auto gz = [&]() -> uint32_t { return block_idx.z * BLK + thread_idx.z; }; \ + auto gidx = [&]() -> uint32_t { return gz() * stride3.z + gy() * stride3.y + gx(); }; \ + auto check_boundary = [&]() { return gx() < len3.x and gy() < len3.y and gz() < len3.z; }; \ + grid_dim.x = (len3.x - 1) / BLK + 1; \ + grid_dim.y = (len3.y - 1) / BLK + 1; \ + grid_dim.z = (len3.z - 1) / BLK + 1; +#define SETUP_3D_DATABUF \ + constexpr auto PADDING = 1; \ + auto _buf1 = new psz_buf(); \ + auto& buf1 = *_buf1; \ + auto databuf_it = [&](auto dx, auto dy, auto dz) -> T& { \ + return buf1(thread_idx.x + dx + PADDING, thread_idx.y + dy + PADDING, thread_idx.z + dz + PADDING); \ + }; +#define SETUP_3D_EQBUF \ + auto _buf2 = new psz_buf(); \ + auto& buf2 = *_buf2; \ + auto eqbuf_it = [&](auto dx, auto dy, auto dz) -> EQ& { \ + return buf2(thread_idx.x + dx, thread_idx.y + dy, thread_idx.z + dz); \ + }; +#define PFOR_GRID_3D() \ + for (block_idx.z = 0; block_idx.z < grid_dim.z; block_idx.z++) \ + for (block_idx.y = 0; block_idx.y < grid_dim.y; block_idx.y++) \ + for (block_idx.x = 0; block_idx.x < grid_dim.x; block_idx.x++) +#define PFOR_BLOCK_3D() \ + for (thread_idx.z = 0; thread_idx.z < BLK; thread_idx.z++) \ + for (thread_idx.y = 0; thread_idx.y < BLK; thread_idx.y++) \ + for (thread_idx.x = 0; thread_idx.x < BLK; thread_idx.x++) + +namespace psz { +namespace serial { +namespace __kernel { + +template < + typename T, + typename EQ = int32_t, + typename FP = T, + int BLK = 256, + typename OUTLIER = struct psz_outlier_serial> +void c_lorenzo_1d1l(T* data, psz_dim3 len3, psz_dim3 stride3, int radius, FP ebx2_r, EQ* eq, OUTLIER* outlier) { + SETUP_1D_BASIC; + SETUP_1D_DATABUF; + SETUP_1D_EQBUF; + + // per-thread ("real" kernel) + auto threadview_load = [&]() { + if (check_boundary()) databuf_it(0) = data[gidx()] * ebx2_r; + }; + auto threadview_process = [&]() { + auto delta = databuf_it(0) - databuf_it(-1); + if (delta > radius) { + outlier->record(delta, gidx()); + eqbuf_it(0) = 0; + } + else { + eqbuf_it(0) = delta; + } + }; + auto threadview_store = [&]() { + if (check_boundary()) eq[gidx()] = eqbuf_it(0); + }; + + //////////////////////////////////////// + PFOR_GRID_1D() { PFOR_BLOCK_1D() threadview_load(); } + PFOR_GRID_1D() { PFOR_BLOCK_1D() threadview_process(); } + PFOR_GRID_1D() { PFOR_BLOCK_1D() threadview_store(); } + + delete _buf1; + delete _buf2; + +} + +template +void x_lorenzo_1d1l(EQ* eq, T* scattered_outlier, psz_dim3 len3, psz_dim3 stride3, int radius, FP ebx2, T* xdata) +{ + SETUP_1D_BASIC; + SETUP_1D_DATABUF; + + // per-thread ("real" kernel) + auto threadview_load = [&]() { + if (check_boundary()) databuf_it(0) = eq[gidx()] + scattered_outlier[gidx()]; + }; + auto threadview_partial_sum = [&]() { + if (thread_idx.x > 0) databuf_it(0) += databuf_it(-1); + }; + auto threadview_store = [&]() { + if (check_boundary()) xdata[gidx()] = databuf_it(0) * ebx2; + }; + + //////////////////////////////////////// + PFOR_GRID_1D() { PFOR_BLOCK_1D() threadview_load(); } + PFOR_GRID_1D() { PFOR_BLOCK_1D() threadview_partial_sum(); } + PFOR_GRID_1D() { PFOR_BLOCK_1D() threadview_store(); } + + delete _buf1; +} + +template < + typename T, + typename EQ = int32_t, + typename FP = T, + int BLK = 16, + typename OUTLIER = struct psz_outlier_serial> +void c_lorenzo_2d1l(T* data, psz_dim3 len3, psz_dim3 stride3, int radius, FP ebx2_r, EQ* eq, OUTLIER* outlier) { + SETUP_2D_BASIC; + SETUP_2D_DATABUF; + SETUP_2D_EQBUF; + + // per-thread ("real" kernel) + auto threadview_load = [&]() { + if (check_boundary()) databuf_it(0, 0) = data[gidx()] * ebx2_r; + }; + auto threadview_process = [&]() { + auto delta = databuf_it(0, 0) - (databuf_it(-1, 0) + databuf_it(0, -1) - databuf_it(-1, -1)); + if (delta > radius) { + outlier->record(delta, gidx()); + eqbuf_it(0, 0) = 0; + } + else { + eqbuf_it(0, 0) = delta; + } + }; + auto threadview_store = [&]() { + if (check_boundary()) eq[gidx()] = eqbuf_it(0, 0); + }; + + //////////////////////////////////////// + PFOR_GRID_2D() { PFOR_BLOCK_2D() threadview_load(); } + PFOR_GRID_2D() { PFOR_BLOCK_2D() threadview_process(); } + PFOR_GRID_2D() { PFOR_BLOCK_2D() threadview_store(); } + + delete _buf1; + delete _buf2; +} + +template +void x_lorenzo_2d1l(EQ* eq, T* scattered_outlier, psz_dim3 len3, psz_dim3 stride3, int radius, FP ebx2, T* xdata) +{ + SETUP_2D_BASIC; + SETUP_2D_DATABUF; + + // per-thread ("real" kernel) + auto threadview_load = [&]() { + if (check_boundary()) databuf_it(0, 0) = eq[gidx()] + scattered_outlier[gidx()]; + }; + auto threadview_partial_sum_x = [&]() { + if (thread_idx.x > 0) databuf_it(0, 0) += databuf_it(-1, 0); + }; + auto threadview_partial_sum_y = [&]() { + if (thread_idx.y > 0) databuf_it(0, 0) += databuf_it(0, -1); + }; + auto threadview_store = [&]() { + if (check_boundary()) xdata[gidx()] = databuf_it(0, 0) * ebx2; + }; + + //////////////////////////////////////// + PFOR_GRID_2D() { PFOR_BLOCK_2D() threadview_load(); } + PFOR_GRID_2D() + { + PFOR_BLOCK_2D() threadview_partial_sum_x(); + PFOR_BLOCK_2D() threadview_partial_sum_y(); + } + PFOR_GRID_2D() { PFOR_BLOCK_2D() threadview_store(); } + + delete _buf1; +} + +template < + typename T, + typename EQ = int32_t, + typename FP = T, + int BLK = 8, + typename OUTLIER = struct psz_outlier_serial> +void c_lorenzo_3d1l(T* data, psz_dim3 len3, psz_dim3 stride3, int radius, FP ebx2_r, EQ* eq, OUTLIER* outlier) { + SETUP_3D_BASIC; + SETUP_3D_DATABUF; + SETUP_3D_EQBUF; + + // per-thread ("real" kernel) + auto threadview_load = [&]() { + if (check_boundary()) databuf_it(0, 0, 0) = data[gidx()] * ebx2_r; + }; + auto threadview_process = [&]() { + auto delta = databuf_it(0, 0, 0) - + (databuf_it(-1, -1, -1) - databuf_it(0, -1, -1) - databuf_it(-1, 0, -1) - databuf_it(-1, -1, 0) + + databuf_it(0, 0, -1) + databuf_it(0, -1, 0) + databuf_it(-1, 0, 0)); + if (delta > radius) { + outlier->record(delta, gidx()); + eqbuf_it(0, 0, 0) = 0; + } + else { + eqbuf_it(0, 0, 0) = delta; + } + }; + auto threadview_store = [&]() { + if (check_boundary()) eq[gidx()] = eqbuf_it(0, 0, 0); + }; + + //////////////////////////////////////// + PFOR_GRID_3D() { PFOR_BLOCK_3D() threadview_load(); } + PFOR_GRID_3D() { PFOR_BLOCK_3D() threadview_process(); } + PFOR_GRID_3D() { PFOR_BLOCK_3D() threadview_store(); } + + delete _buf1; + delete _buf2; +} + +template +void x_lorenzo_3d1l(EQ* eq, T* scattered_outlier, psz_dim3 len3, psz_dim3 stride3, int radius, FP ebx2, T* xdata) +{ + SETUP_3D_BASIC; + SETUP_3D_DATABUF; + + // per-thread ("real" kernel) + auto threadview_load = [&]() { + if (check_boundary()) databuf_it(0, 0, 0) = eq[gidx()] + scattered_outlier[gidx()]; + }; + auto threadview_partial_sum_x = [&]() { + if (thread_idx.x > 0) databuf_it(0, 0, 0) += databuf_it(-1, 0, 0); + }; + auto threadview_partial_sum_y = [&]() { + if (thread_idx.y > 0) databuf_it(0, 0, 0) += databuf_it(0, -1, 0); + }; + auto threadview_partial_sum_z = [&]() { + if (thread_idx.z > 0) databuf_it(0, 0, 0) += databuf_it(0, 0, -1); + }; + auto threadview_store = [&]() { + if (check_boundary()) xdata[gidx()] = databuf_it(0, 0, 0) * ebx2; + }; + + //////////////////////////////////////// + PFOR_GRID_3D() { PFOR_BLOCK_3D() threadview_load(); } + PFOR_GRID_3D() + { + PFOR_BLOCK_3D() threadview_partial_sum_x(); + PFOR_BLOCK_3D() threadview_partial_sum_y(); + PFOR_BLOCK_3D() threadview_partial_sum_z(); + } + PFOR_GRID_3D() { PFOR_BLOCK_3D() threadview_store(); } + + delete _buf1; +} + +} // namespace __kernel +} // namespace serial +} // namespace psz + +#undef SETUP_1D +#undef PFOR_GRID_1D +#undef PFOR_BLOCK_1D +#undef SETUP_2D_BASIC +#undef PFOR_GRID_2D +#undef PFOR_BLOCK_2D +#undef SETUP_3D +#undef PFOR_GRID_3D +#undef PFOR_BLOCK_3D + +#endif /* E0B87BA8_BEDC_4CBE_B5EE_C0C5875E07D6 */ diff --git a/qtensor/compression/cusz/src/kernel/detail/lorenzo_var.inl b/qtensor/compression/cusz/src/kernel/detail/lorenzo_var.inl index b5563275..2f58d1ad 100644 --- a/qtensor/compression/cusz/src/kernel/detail/lorenzo_var.inl +++ b/qtensor/compression/cusz/src/kernel/detail/lorenzo_var.inl @@ -1,530 +1,530 @@ -/** - * @file lorenzo_var.inl - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-09-29 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef E2BEA52A_4D2E_4966_9135_6CE8B8E05762 -#define E2BEA52A_4D2E_4966_9135_6CE8B8E05762 - -#include - -#if __has_include() -// #pragma message __FILE__ ": (CUDA 11 onward), cub from system path" -#include -#else -// #pragma message __FILE__ ": (CUDA 10 or earlier), cub from git submodule" -#include "../../third_party/cub/cub/cub.cuh" -#endif - -#if __cplusplus >= 201703L -#define CONSTEXPR constexpr -#else -#define CONSTEXPR -#endif - -#define TIX threadIdx.x -#define TIY threadIdx.y -#define TIZ threadIdx.z -#define BIX blockIdx.x -#define BIY blockIdx.y -#define BIZ blockIdx.z -#define BDX blockDim.x -#define BDY blockDim.y -#define BDZ blockDim.z - -#include "utils/cuda_err.cuh" -#include "utils/timer.h" - -namespace cusz { -namespace experimental { - -template -__forceinline__ __device__ void -pred1d(Data thread_scope[SEQ], volatile bool* shmem_signum, volatile ErrCtrl* shmem_delta, Data from_last_stripe = 0) -{ - if CONSTEXPR (FIRST_POINT) { // i == 0 - Data delta = thread_scope[0] - from_last_stripe; - shmem_signum[0 + TIX * SEQ] = delta < 0; // signnum - shmem_delta[0 + TIX * SEQ] = static_cast(fabs(delta)); - } - else { -#pragma unroll - for (auto i = 1; i < SEQ; i++) { - Data delta = thread_scope[i] - thread_scope[i - 1]; - shmem_signum[i + TIX * SEQ] = delta < 0; // signum - shmem_delta[i + TIX * SEQ] = static_cast(fabs(delta)); - } - __syncthreads(); - } -} - -template -__forceinline__ __device__ void load1d( - Data* data, - unsigned int dimx, - unsigned int id_base, - volatile Data* shmem_data, - Data thread_scope[SEQ], - Data& from_last_stripe, - FP ebx2_r) -{ -#pragma unroll - for (auto i = 0; i < SEQ; i++) { - auto id = id_base + TIX + i * NTHREAD; - if (id < dimx) { shmem_data[TIX + i * NTHREAD] = round(data[id] * ebx2_r); } - } - __syncthreads(); - - for (auto i = 0; i < SEQ; i++) thread_scope[i] = shmem_data[TIX * SEQ + i]; - - if (TIX > 0) from_last_stripe = shmem_data[TIX * SEQ - 1]; - __syncthreads(); -} - -template -__forceinline__ __device__ void write1d( - volatile bool* shmem_signum, - bool* signum, - unsigned int dimx, - unsigned int id_base, - volatile ErrCtrl* shmem_delta = nullptr, - ErrCtrl* delta = nullptr) -{ -#pragma unroll - for (auto i = 0; i < SEQ; i++) { - auto id = id_base + TIX + i * NTHREAD; - if (id < dimx) { - signum[id] = shmem_signum[TIX + i * NTHREAD]; - delta[id] = shmem_delta[TIX + i * NTHREAD]; - } - } -} - -template -__forceinline__ __device__ void load2d_prequant( - Data* data, - Data center[YSEQ + 1], - unsigned int dimx, - unsigned int dimy, - unsigned int stridey, - unsigned int gix, - unsigned int giy_base, - FP ebx2_r) -{ - auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; - -#pragma unroll - for (auto i = 0; i < YSEQ; i++) { - if (gix < dimx and giy_base + i < dimy) center[i + 1] = round(data[get_gid(i)] * ebx2_r); - } - auto tmp = __shfl_up_sync(0xffffffff, center[YSEQ], 16); // same-warp, next-16 - if (TIY == 1) center[0] = tmp; -} - -template -__forceinline__ __device__ void pred2d(Data center[YSEQ + 1]) -{ - /* prediction - original form: Data delta = center[i] - center[i - 1] + west[i] - west[i - 1]; - short form: Data delta = center[i] - west[i]; - */ -#pragma unroll - for (auto i = YSEQ; i > 0; i--) { - center[i] -= center[i - 1]; - auto west = __shfl_up_sync(0xffffffff, center[i], 1, 16); - if (TIX > 0) center[i] -= west; - } - __syncthreads(); -} - -template -__forceinline__ __device__ void postquant_write2d( - Data center[YSEQ + 1], - ErrCtrl* delta, - bool* signum, - unsigned int dimx, - unsigned int dimy, - unsigned int stridey, - unsigned int gix, - unsigned int giy_base) -{ - /******************************************************************************** - * Depending on whether postquant is delayed in compression, deside separating - * data-type signum and uint-type quantcode when writing to DRAM (or not). - ********************************************************************************/ - auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; - -#pragma unroll - for (auto i = 1; i < YSEQ + 1; i++) { - auto gid = get_gid(i - 1); - - if (gix < dimx and giy_base + i - 1 < dimy) { - signum[gid] = center[i] < 0; // output; reuse data for signum - delta[gid] = static_cast(fabs(center[i])); - } - } -} - -template < - typename Data, - typename ErrCtrl, - typename FP, - int BLOCK, - int SEQ> -__global__ void c_lorenzo_1d1l( // - Data* data, - ErrCtrl* delta, - bool* signum, - dim3 len3, - dim3 stride3, - FP ebx2_r) -{ - constexpr auto NTHREAD = BLOCK / SEQ; - - __shared__ struct { - Data data[BLOCK]; - ErrCtrl delta[BLOCK]; - bool signum[BLOCK]; - } shmem; - - auto id_base = BIX * BLOCK; - - Data thread_scope[SEQ]; - Data from_last_stripe{0}; - - /******************************************************************************** - * load from DRAM using striped layout, perform prequant - ********************************************************************************/ - load1d(data, len3.x, id_base, shmem.data, thread_scope, from_last_stripe, ebx2_r); - - /******************************************************************************** - * delta and signum - ********************************************************************************/ - pred1d(thread_scope, shmem.signum, shmem.delta, from_last_stripe); - pred1d(thread_scope, shmem.signum, shmem.delta); - write1d(shmem.signum, signum, len3.x, id_base, shmem.delta, delta); -} - -template -__global__ void c_lorenzo_2d1l_16x16data_mapto16x2( - Data* data, // input - ErrCtrl* delta, // output - bool* signum, // output - dim3 len3, - dim3 stride3, - FP ebx2_r) -{ - constexpr auto BLOCK = 16; - constexpr auto YSEQ = 8; - - Data center[YSEQ + 1] = {0}; // nw n - // w center - - auto gix = BIX * BDX + TIX; // BDX == 16 - auto giy_base = BIY * BLOCK + TIY * YSEQ; // BDY * YSEQ = BLOCK == 16 - // clang-format off - load2d_prequant(data, center, len3.x, len3.y, stride3.y, gix, giy_base, ebx2_r); - pred2d(center); - postquant_write2d(center, delta, signum, len3.x, len3.y, stride3.y, gix, giy_base); - // clang-format on -} - -template -__global__ void c_lorenzo_3d1l_32x8x8data_mapto32x1x8( - Data* data, // input - ErrCtrl* delta, // output - bool* signum, // output - dim3 len3, - dim3 stride3, - FP ebx2_r) -{ - constexpr auto BLOCK = 8; - __shared__ Data shmem[8][8][32]; - - auto z = TIZ; - - auto gix = BIX * (BLOCK * 4) + TIX; - auto giy_base = BIY * BLOCK; - auto giz = BIZ * BLOCK + z; - auto base_id = gix + giy_base * stride3.y + giz * stride3.z; - - /******************************************************************************** - * load from DRAM, perform prequant - ********************************************************************************/ - if (gix < len3.x and giz < len3.z) { - for (auto y = 0; y < BLOCK; y++) { - if (giy_base + y < len3.y) { - shmem[z][y][TIX] = round(data[base_id + y * stride3.y] * ebx2_r); // prequant (fp presence) - } - } - } - __syncthreads(); // necessary to ensure correctness - - auto x = TIX % 8; - - for (auto y = 0; y < BLOCK; y++) { - Data delta_val; - - // prediction - delta_val = shmem[z][y][TIX] - ((z > 0 and y > 0 and x > 0 ? shmem[z - 1][y - 1][TIX - 1] : 0) // dist=3 - - (y > 0 and x > 0 ? shmem[z][y - 1][TIX - 1] : 0) // dist=2 - - (z > 0 and x > 0 ? shmem[z - 1][y][TIX - 1] : 0) // - - (z > 0 and y > 0 ? shmem[z - 1][y - 1][TIX] : 0) // - + (x > 0 ? shmem[z][y][TIX - 1] : 0) // dist=1 - + (y > 0 ? shmem[z][y - 1][TIX] : 0) // - + (z > 0 ? shmem[z - 1][y][TIX] : 0)); // - - auto id = base_id + (y * stride3.y); - - // delta and signum - if (gix < len3.x and (giy_base + y) < len3.y and giz < len3.z) { - signum[id] = delta_val < 0; - delta[id] = static_cast(fabs(delta_val)); - } - } - /* EOF */ -} - -template -__global__ void x_lorenzo_1d1l( // - bool* signum, - ErrCtrl* delta, - Data* xdata, - dim3 len3, - dim3 stride3, - FP ebx2) -{ - constexpr auto block_dim = BLOCK / SEQ; // dividable - - // coalesce-load (warp-striped) and transpose in shmem (similar for store) - typedef cub::BlockLoad BlockLoadT_signum; - typedef cub::BlockLoad BlockLoadT_delta; - typedef cub::BlockStore BlockStoreT_xdata; - typedef cub::BlockScan - BlockScanT_xdata; // TODO autoselect algorithm - - __shared__ union TempStorage { // overlap shared memory space - typename BlockLoadT_signum::TempStorage load_signum; - typename BlockLoadT_delta::TempStorage load_delta; - typename BlockStoreT_xdata::TempStorage store_xdata; - typename BlockScanT_xdata::TempStorage scan_xdata; - } temp_storage; - - // thread-scope tiled data - struct ThreadData { - Data xdata[SEQ]; - bool signum[SEQ]; - } thread_scope; - ErrCtrl thread_scope_delta[SEQ]; - - /******************************************************************************** - * load to thread-private array (fuse at the same time) - * (BIX * BDX * SEQ) denotes the start of the data chunk that belongs to this thread block - ********************************************************************************/ - BlockLoadT_delta(temp_storage.load_delta).Load(delta + (BIX * BDX) * SEQ, thread_scope_delta); - __syncthreads(); // barrier for shmem reuse - BlockLoadT_signum(temp_storage.load_signum).Load(signum + (BIX * BDX) * SEQ, thread_scope.signum); - __syncthreads(); // barrier for shmem reuse - -#pragma unroll - for (auto i = 0; i < SEQ; i++) { - auto id = (BIX * BDX + TIX) * SEQ + i; - thread_scope.xdata[i] = id < len3.x // - ? (thread_scope.signum[i] ? -1 : 1) * static_cast(thread_scope_delta[i]) - : 0; - } - __syncthreads(); - - /******************************************************************************** - * perform partial-sum using cub::InclusiveSum - ********************************************************************************/ - BlockScanT_xdata(temp_storage.scan_xdata).InclusiveSum(thread_scope.xdata, thread_scope.xdata); - __syncthreads(); // barrier for shmem reuse - - /******************************************************************************** - * scale by ebx2 and write to DRAM - ********************************************************************************/ -#pragma unroll - for (auto i = 0; i < SEQ; i++) thread_scope.xdata[i] *= ebx2; - __syncthreads(); // barrier for shmem reuse - - BlockStoreT_xdata(temp_storage.store_xdata).Store(xdata + (BIX * BDX) * SEQ, thread_scope.xdata); -} - -template -__global__ void -x_lorenzo_2d1l_16x16data_mapto16x2(bool* signum, ErrCtrl* delta, Data* xdata, dim3 len3, dim3 stride3, FP ebx2) -{ - constexpr auto BLOCK = 16; - constexpr auto YSEQ = BLOCK / 2; // sequentiality in y direction - static_assert(BLOCK == 16, "In one case, we need BLOCK for 2D == 16"); - - __shared__ Data intermediate[BLOCK]; // TODO use warp shuffle to eliminate this - Data thread_scope[YSEQ]; - /* - . ------> gix (x) - | t00 t01 t02 t03 ... t0f - | ts00_0 ts00_0 ts00_0 ts00_0 - giy ts00_1 ts00_1 ts00_1 ts00_1 - (y) | | | | - ts00_7 ts00_7 ts00_7 ts00_7 - - | t10 t11 t12 t13 ... t1f - | ts00_0 ts00_0 ts00_0 ts00_0 - giy ts00_1 ts00_1 ts00_1 ts00_1 - (y) | | | | - ts00_7 ts00_7 ts00_7 ts00_7 - */ - - auto gix = BIX * BLOCK + TIX; - auto giy_base = BIY * BLOCK + TIY * YSEQ; // BDY * YSEQ = BLOCK == 16 - auto get_gid = [&](auto i) { return (giy_base + i) * stride3.y + gix; }; - - /******************************************************************************** - * load to thread-private array (fuse at the same time) - ********************************************************************************/ -#pragma unroll - for (auto i = 0; i < YSEQ; i++) { - auto gid = get_gid(i); - // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously - if (gix < len3.x and giy_base + i < len3.y) - thread_scope[i] = (signum[gid] ? -1 : 1) * static_cast(delta[gid]); // fuse - else - thread_scope[i] = 0; // TODO set as init state? - } - - /******************************************************************************** - * partial-sum along y-axis, sequantially - ********************************************************************************/ - for (auto i = 1; i < YSEQ; i++) thread_scope[i] += thread_scope[i - 1]; - // two-pass: store for cross-threadscope update - if (TIY == 0) intermediate[TIX] = thread_scope[YSEQ - 1]; - __syncthreads(); - // two-pass: load and update - if (TIY == 1) { - auto tmp = intermediate[TIX]; -#pragma unroll - for (auto& i : thread_scope) i += tmp; - } - - /******************************************************************************** - * in-warp partial-sum along x-axis - ********************************************************************************/ -#pragma unroll - for (auto& i : thread_scope) { - for (auto d = 1; d < BLOCK; d *= 2) { - Data n = __shfl_up_sync(0xffffffff, i, d, 16); - if (TIX >= d) i += n; - } - i *= ebx2; - } - - /******************************************************************************** - * write to DRAM - ********************************************************************************/ -#pragma unroll - for (auto i = 0; i < YSEQ; i++) { - auto gid = get_gid(i); - if (gix < len3.x and giy_base + i < len3.y) xdata[gid] = thread_scope[i]; - } -} - -template -__global__ void -x_lorenzo_3d1l_32x8x8data_mapto32x1x8(bool* signum, ErrCtrl* delta, Data* xdata, dim3 len3, dim3 stride3, FP ebx2) -{ - constexpr auto BLOCK = 8; - constexpr auto YSEQ = BLOCK; - static_assert(BLOCK == 8, "In one case, we need BLOCK for 3D == 8"); - - __shared__ Data intermediate[BLOCK][4][8]; - Data thread_scope[YSEQ]; - - auto seg_id = TIX / 8; - auto seg_tix = TIX % 8; - - auto gix = BIX * (4 * BLOCK) + TIX, giy_base = BIY * BLOCK, giz = BIZ * BLOCK + TIZ; - auto get_gid = [&](auto y) { return giz * stride3.z + (giy_base + y) * stride3.y + gix; }; - - /******************************************************************************** - * load to thread-private array (fuse at the same time) - ********************************************************************************/ -#pragma unroll - for (auto y = 0; y < YSEQ; y++) { - auto gid = get_gid(y); - if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) - thread_scope[y] = (signum[gid] ? -1 : 1) * static_cast(delta[gid]); - else - thread_scope[y] = 0; - } - - /******************************************************************************** - * partial-sum along y-axis, sequantially - ********************************************************************************/ - for (auto y = 1; y < YSEQ; y++) thread_scope[y] += thread_scope[y - 1]; - - /******************************************************************************** - * ND partial-sums along x- and z-axis - * in-warp shuffle used: in order to perform, it's transposed after X-partial sum - ********************************************************************************/ - auto dist = 1; - Data addend; - -#pragma unroll - for (auto i = 0; i < BLOCK; i++) { - Data val = thread_scope[i]; - - for (dist = 1; dist < BLOCK; dist *= 2) { - addend = __shfl_up_sync(0xffffffff, val, dist, 8); - if (seg_tix >= dist) val += addend; - } - - // x-z transpose - intermediate[TIZ][seg_id][seg_tix] = val; - __syncthreads(); - val = intermediate[seg_tix][seg_id][TIZ]; - __syncthreads(); - - for (dist = 1; dist < BLOCK; dist *= 2) { - addend = __shfl_up_sync(0xffffffff, val, dist, 8); - if (seg_tix >= dist) val += addend; - } - - intermediate[TIZ][seg_id][seg_tix] = val; - __syncthreads(); - val = intermediate[seg_tix][seg_id][TIZ]; - __syncthreads(); - - thread_scope[i] = val; - } - - /******************************************************************************** - * write to DRAM - ********************************************************************************/ -#pragma unroll - for (auto y = 0; y < YSEQ; y++) { - if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) { xdata[get_gid(y)] = thread_scope[y] * ebx2; } - } - /* EOF */ -} - -} // namespace experimental -} // namespace cusz - -#undef TIX -#undef TIY -#undef TIZ -#undef BIX -#undef BIY -#undef BIZ -#undef BDX -#undef BDY -#undef BDZ - -#endif /* E2BEA52A_4D2E_4966_9135_6CE8B8E05762 */ +/** + * @file lorenzo_var.inl + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-09-29 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef E2BEA52A_4D2E_4966_9135_6CE8B8E05762 +#define E2BEA52A_4D2E_4966_9135_6CE8B8E05762 + +#include + +#if __has_include() +// #pragma message __FILE__ ": (CUDA 11 onward), cub from system path" +#include +#else +// #pragma message __FILE__ ": (CUDA 10 or earlier), cub from git submodule" +#include "../../third_party/cub/cub/cub.cuh" +#endif + +#if __cplusplus >= 201703L +#define CONSTEXPR constexpr +#else +#define CONSTEXPR +#endif + +#define TIX threadIdx.x +#define TIY threadIdx.y +#define TIZ threadIdx.z +#define BIX blockIdx.x +#define BIY blockIdx.y +#define BIZ blockIdx.z +#define BDX blockDim.x +#define BDY blockDim.y +#define BDZ blockDim.z + +#include "utils/cuda_err.cuh" +#include "utils/timer.h" + +namespace cusz { +namespace experimental { + +template +__forceinline__ __device__ void +pred1d(Data thread_scope[SEQ], volatile bool* shmem_signum, volatile ErrCtrl* shmem_delta, Data from_last_stripe = 0) +{ + if CONSTEXPR (FIRST_POINT) { // i == 0 + Data delta = thread_scope[0] - from_last_stripe; + shmem_signum[0 + TIX * SEQ] = delta < 0; // signnum + shmem_delta[0 + TIX * SEQ] = static_cast(fabs(delta)); + } + else { +#pragma unroll + for (auto i = 1; i < SEQ; i++) { + Data delta = thread_scope[i] - thread_scope[i - 1]; + shmem_signum[i + TIX * SEQ] = delta < 0; // signum + shmem_delta[i + TIX * SEQ] = static_cast(fabs(delta)); + } + __syncthreads(); + } +} + +template +__forceinline__ __device__ void load1d( + Data* data, + unsigned int dimx, + unsigned int id_base, + volatile Data* shmem_data, + Data thread_scope[SEQ], + Data& from_last_stripe, + FP ebx2_r) +{ +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto id = id_base + TIX + i * NTHREAD; + if (id < dimx) { shmem_data[TIX + i * NTHREAD] = round(data[id] * ebx2_r); } + } + __syncthreads(); + + for (auto i = 0; i < SEQ; i++) thread_scope[i] = shmem_data[TIX * SEQ + i]; + + if (TIX > 0) from_last_stripe = shmem_data[TIX * SEQ - 1]; + __syncthreads(); +} + +template +__forceinline__ __device__ void write1d( + volatile bool* shmem_signum, + bool* signum, + unsigned int dimx, + unsigned int id_base, + volatile ErrCtrl* shmem_delta = nullptr, + ErrCtrl* delta = nullptr) +{ +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto id = id_base + TIX + i * NTHREAD; + if (id < dimx) { + signum[id] = shmem_signum[TIX + i * NTHREAD]; + delta[id] = shmem_delta[TIX + i * NTHREAD]; + } + } +} + +template +__forceinline__ __device__ void load2d_prequant( + Data* data, + Data center[YSEQ + 1], + unsigned int dimx, + unsigned int dimy, + unsigned int stridey, + unsigned int gix, + unsigned int giy_base, + FP ebx2_r) +{ + auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; + +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + if (gix < dimx and giy_base + i < dimy) center[i + 1] = round(data[get_gid(i)] * ebx2_r); + } + auto tmp = __shfl_up_sync(0xffffffff, center[YSEQ], 16); // same-warp, next-16 + if (TIY == 1) center[0] = tmp; +} + +template +__forceinline__ __device__ void pred2d(Data center[YSEQ + 1]) +{ + /* prediction + original form: Data delta = center[i] - center[i - 1] + west[i] - west[i - 1]; + short form: Data delta = center[i] - west[i]; + */ +#pragma unroll + for (auto i = YSEQ; i > 0; i--) { + center[i] -= center[i - 1]; + auto west = __shfl_up_sync(0xffffffff, center[i], 1, 16); + if (TIX > 0) center[i] -= west; + } + __syncthreads(); +} + +template +__forceinline__ __device__ void postquant_write2d( + Data center[YSEQ + 1], + ErrCtrl* delta, + bool* signum, + unsigned int dimx, + unsigned int dimy, + unsigned int stridey, + unsigned int gix, + unsigned int giy_base) +{ + /******************************************************************************** + * Depending on whether postquant is delayed in compression, deside separating + * data-type signum and uint-type quantcode when writing to DRAM (or not). + ********************************************************************************/ + auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; + +#pragma unroll + for (auto i = 1; i < YSEQ + 1; i++) { + auto gid = get_gid(i - 1); + + if (gix < dimx and giy_base + i - 1 < dimy) { + signum[gid] = center[i] < 0; // output; reuse data for signum + delta[gid] = static_cast(fabs(center[i])); + } + } +} + +template < + typename Data, + typename ErrCtrl, + typename FP, + int BLOCK, + int SEQ> +__global__ void c_lorenzo_1d1l( // + Data* data, + ErrCtrl* delta, + bool* signum, + dim3 len3, + dim3 stride3, + FP ebx2_r) +{ + constexpr auto NTHREAD = BLOCK / SEQ; + + __shared__ struct { + Data data[BLOCK]; + ErrCtrl delta[BLOCK]; + bool signum[BLOCK]; + } shmem; + + auto id_base = BIX * BLOCK; + + Data thread_scope[SEQ]; + Data from_last_stripe{0}; + + /******************************************************************************** + * load from DRAM using striped layout, perform prequant + ********************************************************************************/ + load1d(data, len3.x, id_base, shmem.data, thread_scope, from_last_stripe, ebx2_r); + + /******************************************************************************** + * delta and signum + ********************************************************************************/ + pred1d(thread_scope, shmem.signum, shmem.delta, from_last_stripe); + pred1d(thread_scope, shmem.signum, shmem.delta); + write1d(shmem.signum, signum, len3.x, id_base, shmem.delta, delta); +} + +template +__global__ void c_lorenzo_2d1l_16x16data_mapto16x2( + Data* data, // input + ErrCtrl* delta, // output + bool* signum, // output + dim3 len3, + dim3 stride3, + FP ebx2_r) +{ + constexpr auto BLOCK = 16; + constexpr auto YSEQ = 8; + + Data center[YSEQ + 1] = {0}; // nw n + // w center + + auto gix = BIX * BDX + TIX; // BDX == 16 + auto giy_base = BIY * BLOCK + TIY * YSEQ; // BDY * YSEQ = BLOCK == 16 + // clang-format off + load2d_prequant(data, center, len3.x, len3.y, stride3.y, gix, giy_base, ebx2_r); + pred2d(center); + postquant_write2d(center, delta, signum, len3.x, len3.y, stride3.y, gix, giy_base); + // clang-format on +} + +template +__global__ void c_lorenzo_3d1l_32x8x8data_mapto32x1x8( + Data* data, // input + ErrCtrl* delta, // output + bool* signum, // output + dim3 len3, + dim3 stride3, + FP ebx2_r) +{ + constexpr auto BLOCK = 8; + __shared__ Data shmem[8][8][32]; + + auto z = TIZ; + + auto gix = BIX * (BLOCK * 4) + TIX; + auto giy_base = BIY * BLOCK; + auto giz = BIZ * BLOCK + z; + auto base_id = gix + giy_base * stride3.y + giz * stride3.z; + + /******************************************************************************** + * load from DRAM, perform prequant + ********************************************************************************/ + if (gix < len3.x and giz < len3.z) { + for (auto y = 0; y < BLOCK; y++) { + if (giy_base + y < len3.y) { + shmem[z][y][TIX] = round(data[base_id + y * stride3.y] * ebx2_r); // prequant (fp presence) + } + } + } + __syncthreads(); // necessary to ensure correctness + + auto x = TIX % 8; + + for (auto y = 0; y < BLOCK; y++) { + Data delta_val; + + // prediction + delta_val = shmem[z][y][TIX] - ((z > 0 and y > 0 and x > 0 ? shmem[z - 1][y - 1][TIX - 1] : 0) // dist=3 + - (y > 0 and x > 0 ? shmem[z][y - 1][TIX - 1] : 0) // dist=2 + - (z > 0 and x > 0 ? shmem[z - 1][y][TIX - 1] : 0) // + - (z > 0 and y > 0 ? shmem[z - 1][y - 1][TIX] : 0) // + + (x > 0 ? shmem[z][y][TIX - 1] : 0) // dist=1 + + (y > 0 ? shmem[z][y - 1][TIX] : 0) // + + (z > 0 ? shmem[z - 1][y][TIX] : 0)); // + + auto id = base_id + (y * stride3.y); + + // delta and signum + if (gix < len3.x and (giy_base + y) < len3.y and giz < len3.z) { + signum[id] = delta_val < 0; + delta[id] = static_cast(fabs(delta_val)); + } + } + /* EOF */ +} + +template +__global__ void x_lorenzo_1d1l( // + bool* signum, + ErrCtrl* delta, + Data* xdata, + dim3 len3, + dim3 stride3, + FP ebx2) +{ + constexpr auto block_dim = BLOCK / SEQ; // dividable + + // coalesce-load (warp-striped) and transpose in shmem (similar for store) + typedef cub::BlockLoad BlockLoadT_signum; + typedef cub::BlockLoad BlockLoadT_delta; + typedef cub::BlockStore BlockStoreT_xdata; + typedef cub::BlockScan + BlockScanT_xdata; // TODO autoselect algorithm + + __shared__ union TempStorage { // overlap shared memory space + typename BlockLoadT_signum::TempStorage load_signum; + typename BlockLoadT_delta::TempStorage load_delta; + typename BlockStoreT_xdata::TempStorage store_xdata; + typename BlockScanT_xdata::TempStorage scan_xdata; + } temp_storage; + + // thread-scope tiled data + struct ThreadData { + Data xdata[SEQ]; + bool signum[SEQ]; + } thread_scope; + ErrCtrl thread_scope_delta[SEQ]; + + /******************************************************************************** + * load to thread-private array (fuse at the same time) + * (BIX * BDX * SEQ) denotes the start of the data chunk that belongs to this thread block + ********************************************************************************/ + BlockLoadT_delta(temp_storage.load_delta).Load(delta + (BIX * BDX) * SEQ, thread_scope_delta); + __syncthreads(); // barrier for shmem reuse + BlockLoadT_signum(temp_storage.load_signum).Load(signum + (BIX * BDX) * SEQ, thread_scope.signum); + __syncthreads(); // barrier for shmem reuse + +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto id = (BIX * BDX + TIX) * SEQ + i; + thread_scope.xdata[i] = id < len3.x // + ? (thread_scope.signum[i] ? -1 : 1) * static_cast(thread_scope_delta[i]) + : 0; + } + __syncthreads(); + + /******************************************************************************** + * perform partial-sum using cub::InclusiveSum + ********************************************************************************/ + BlockScanT_xdata(temp_storage.scan_xdata).InclusiveSum(thread_scope.xdata, thread_scope.xdata); + __syncthreads(); // barrier for shmem reuse + + /******************************************************************************** + * scale by ebx2 and write to DRAM + ********************************************************************************/ +#pragma unroll + for (auto i = 0; i < SEQ; i++) thread_scope.xdata[i] *= ebx2; + __syncthreads(); // barrier for shmem reuse + + BlockStoreT_xdata(temp_storage.store_xdata).Store(xdata + (BIX * BDX) * SEQ, thread_scope.xdata); +} + +template +__global__ void +x_lorenzo_2d1l_16x16data_mapto16x2(bool* signum, ErrCtrl* delta, Data* xdata, dim3 len3, dim3 stride3, FP ebx2) +{ + constexpr auto BLOCK = 16; + constexpr auto YSEQ = BLOCK / 2; // sequentiality in y direction + static_assert(BLOCK == 16, "In one case, we need BLOCK for 2D == 16"); + + __shared__ Data intermediate[BLOCK]; // TODO use warp shuffle to eliminate this + Data thread_scope[YSEQ]; + /* + . ------> gix (x) + | t00 t01 t02 t03 ... t0f + | ts00_0 ts00_0 ts00_0 ts00_0 + giy ts00_1 ts00_1 ts00_1 ts00_1 + (y) | | | | + ts00_7 ts00_7 ts00_7 ts00_7 + + | t10 t11 t12 t13 ... t1f + | ts00_0 ts00_0 ts00_0 ts00_0 + giy ts00_1 ts00_1 ts00_1 ts00_1 + (y) | | | | + ts00_7 ts00_7 ts00_7 ts00_7 + */ + + auto gix = BIX * BLOCK + TIX; + auto giy_base = BIY * BLOCK + TIY * YSEQ; // BDY * YSEQ = BLOCK == 16 + auto get_gid = [&](auto i) { return (giy_base + i) * stride3.y + gix; }; + + /******************************************************************************** + * load to thread-private array (fuse at the same time) + ********************************************************************************/ +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + auto gid = get_gid(i); + // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously + if (gix < len3.x and giy_base + i < len3.y) + thread_scope[i] = (signum[gid] ? -1 : 1) * static_cast(delta[gid]); // fuse + else + thread_scope[i] = 0; // TODO set as init state? + } + + /******************************************************************************** + * partial-sum along y-axis, sequantially + ********************************************************************************/ + for (auto i = 1; i < YSEQ; i++) thread_scope[i] += thread_scope[i - 1]; + // two-pass: store for cross-threadscope update + if (TIY == 0) intermediate[TIX] = thread_scope[YSEQ - 1]; + __syncthreads(); + // two-pass: load and update + if (TIY == 1) { + auto tmp = intermediate[TIX]; +#pragma unroll + for (auto& i : thread_scope) i += tmp; + } + + /******************************************************************************** + * in-warp partial-sum along x-axis + ********************************************************************************/ +#pragma unroll + for (auto& i : thread_scope) { + for (auto d = 1; d < BLOCK; d *= 2) { + Data n = __shfl_up_sync(0xffffffff, i, d, 16); + if (TIX >= d) i += n; + } + i *= ebx2; + } + + /******************************************************************************** + * write to DRAM + ********************************************************************************/ +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + auto gid = get_gid(i); + if (gix < len3.x and giy_base + i < len3.y) xdata[gid] = thread_scope[i]; + } +} + +template +__global__ void +x_lorenzo_3d1l_32x8x8data_mapto32x1x8(bool* signum, ErrCtrl* delta, Data* xdata, dim3 len3, dim3 stride3, FP ebx2) +{ + constexpr auto BLOCK = 8; + constexpr auto YSEQ = BLOCK; + static_assert(BLOCK == 8, "In one case, we need BLOCK for 3D == 8"); + + __shared__ Data intermediate[BLOCK][4][8]; + Data thread_scope[YSEQ]; + + auto seg_id = TIX / 8; + auto seg_tix = TIX % 8; + + auto gix = BIX * (4 * BLOCK) + TIX, giy_base = BIY * BLOCK, giz = BIZ * BLOCK + TIZ; + auto get_gid = [&](auto y) { return giz * stride3.z + (giy_base + y) * stride3.y + gix; }; + + /******************************************************************************** + * load to thread-private array (fuse at the same time) + ********************************************************************************/ +#pragma unroll + for (auto y = 0; y < YSEQ; y++) { + auto gid = get_gid(y); + if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) + thread_scope[y] = (signum[gid] ? -1 : 1) * static_cast(delta[gid]); + else + thread_scope[y] = 0; + } + + /******************************************************************************** + * partial-sum along y-axis, sequantially + ********************************************************************************/ + for (auto y = 1; y < YSEQ; y++) thread_scope[y] += thread_scope[y - 1]; + + /******************************************************************************** + * ND partial-sums along x- and z-axis + * in-warp shuffle used: in order to perform, it's transposed after X-partial sum + ********************************************************************************/ + auto dist = 1; + Data addend; + +#pragma unroll + for (auto i = 0; i < BLOCK; i++) { + Data val = thread_scope[i]; + + for (dist = 1; dist < BLOCK; dist *= 2) { + addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + // x-z transpose + intermediate[TIZ][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][TIZ]; + __syncthreads(); + + for (dist = 1; dist < BLOCK; dist *= 2) { + addend = __shfl_up_sync(0xffffffff, val, dist, 8); + if (seg_tix >= dist) val += addend; + } + + intermediate[TIZ][seg_id][seg_tix] = val; + __syncthreads(); + val = intermediate[seg_tix][seg_id][TIZ]; + __syncthreads(); + + thread_scope[i] = val; + } + + /******************************************************************************** + * write to DRAM + ********************************************************************************/ +#pragma unroll + for (auto y = 0; y < YSEQ; y++) { + if (gix < len3.x and giy_base + y < len3.y and giz < len3.z) { xdata[get_gid(y)] = thread_scope[y] * ebx2; } + } + /* EOF */ +} + +} // namespace experimental +} // namespace cusz + +#undef TIX +#undef TIY +#undef TIZ +#undef BIX +#undef BIY +#undef BIZ +#undef BDX +#undef BDY +#undef BDZ + +#endif /* E2BEA52A_4D2E_4966_9135_6CE8B8E05762 */ diff --git a/qtensor/compression/cusz/src/kernel/detail/spline3.inl b/qtensor/compression/cusz/src/kernel/detail/spline3.inl index 2c4f1213..5e3526bd 100644 --- a/qtensor/compression/cusz/src/kernel/detail/spline3.inl +++ b/qtensor/compression/cusz/src/kernel/detail/spline3.inl @@ -1,746 +1,746 @@ -/** - * @file spline3.inl - * @author Jiannan Tian - * @brief - * @version 0.2 - * @date 2021-05-15 - * - * (C) 2021 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef CUSZ_KERNEL_SPLINE3_CUH -#define CUSZ_KERNEL_SPLINE3_CUH - -#include -#include -#include -#include "utils/cuda_err.cuh" - -#define SPLINE3_COMPR true -#define SPLINE3_DECOMPR false - -#if __cplusplus >= 201703L -#define CONSTEXPR constexpr -#else -#define CONSTEXPR -#endif - -#define TIX threadIdx.x -#define TIY threadIdx.y -#define TIZ threadIdx.z -#define BIX blockIdx.x -#define BIY blockIdx.y -#define BIZ blockIdx.z -#define BDX blockDim.x -#define BDY blockDim.y -#define BDZ blockDim. - -using DIM = unsigned int; -using STRIDE = unsigned int; -using DIM3 = dim3; -using STRIDE3 = dim3; - -constexpr int BLOCK8 = 8; -constexpr int BLOCK32 = 32; - -#define SHM_ERROR shm_errctrl - -namespace cusz { - -/******************************************************************************** - * host API - ********************************************************************************/ - -template < - typename TITER, - typename EITER, - typename FP = float, - int LINEAR_BLOCK_SIZE = 256, - bool PROBE_PRED_ERROR = false> -__global__ void c_spline3d_infprecis_32x8x8data( - TITER data, - DIM3 data_size, - STRIDE3 data_leap, - EITER errctrl, - DIM3 errctrl_size, - STRIDE3 errctrl_leap, - TITER anchor, - STRIDE3 anchor_leap, - FP eb_r, - FP ebx2, - int radius, - TITER pred_error = nullptr, - TITER compress_error = nullptr); - -template < - typename EITER, - typename TITER, - typename FP = float, - int LINEAR_BLOCK_SIZE = 256> -__global__ void x_spline3d_infprecis_32x8x8data( - EITER errctrl, // input 1 - DIM3 errctrl_size, // - STRIDE3 errctrl_leap, // - TITER anchor, // input 2 - DIM3 anchor_size, // - STRIDE3 anchor_leap, // - TITER data, // output - DIM3 data_size, // - STRIDE3 data_leap, // - FP eb_r, - FP ebx2, - int radius); - -namespace device_api { -/******************************************************************************** - * device API - ********************************************************************************/ -template < - typename T1, - typename T2, - typename FP, - int LINEAR_BLOCK_SIZE, - bool WORKFLOW = SPLINE3_COMPR, - bool PROBE_PRED_ERROR = false> -__device__ void spline3d_layout2_interpolate( - volatile T1 shm_data[9][9][33], - volatile T2 shm_errctrl[9][9][33], - FP eb_r, - FP ebx2, - int radius); -} // namespace device_api - -} // namespace cusz - -/******************************************************************************** - * helper function - ********************************************************************************/ - -namespace { - -template -__forceinline__ __device__ bool xyz33x9x9_predicate(unsigned int x, unsigned int y, unsigned int z) -{ - if CONSTEXPR (INCLUSIVE) { // - return x <= 32 and y <= 8 and z <= 8; - } - else { - return x < 32 and y < 8 and z < 8; - } -} - -// control block_id3 in function call -template -__device__ void -spline3d_print_block_from_GPU(T volatile a[9][9][33], int radius = 512, bool compress = true, bool print_errctrl = true) -{ - for (auto z = 0; z < ZEND; z++) { - printf("\nprint from GPU, z=%d\n", z); - printf(" "); - for (auto i = 0; i < 33; i++) printf("%3d", i); - printf("\n"); - - for (auto y = 0; y < YEND; y++) { - printf("y=%d ", y); - for (auto x = 0; x < XEND; x++) { // - if CONSTEXPR (PRINT_FP) { printf("%.2e\t", (float)a[z][y][x]); } - else { - T c = print_errctrl ? a[z][y][x] - radius : a[z][y][x]; - if (compress) { - if (c == 0) { printf("%3c", '.'); } - else { - if (abs(c) >= 10) { printf("%3c", '*'); } - else { - if (print_errctrl) { printf("%3d", c); } - else { - printf("%4.2f", c); - } - } - } - } - else { - if (print_errctrl) { printf("%3d", c); } - else { - printf("%4.2f", c); - } - } - } - } - printf("\n"); - } - } - printf("\nGPU print end\n\n"); -} - -template -__device__ void -c_reset_scratch_33x9x9data(volatile T1 shm_data[9][9][33], volatile T2 shm_errctrl[9][9][33], int radius) -{ - // alternatively, reinterprete cast volatile T?[][][] to 1D - for (auto _tix = TIX; _tix < 33 * 9 * 9; _tix += LINEAR_BLOCK_SIZE) { - auto x = (_tix % 33); - auto y = (_tix / 33) % 9; - auto z = (_tix / 33) / 9; - - shm_data[z][y][x] = 0; - /***************************************************************************** - okay to use - ******************************************************************************/ - if (x % 8 == 0 and y % 8 == 0 and z % 8 == 0) shm_errctrl[z][y][x] = radius; - /***************************************************************************** - alternatively - ******************************************************************************/ - // shm_errctrl[z][y][x] = radius; - } - __syncthreads(); -} - -template -__device__ void c_gather_anchor(T1* data, DIM3 data_size, STRIDE3 data_leap, T1* anchor, STRIDE3 anchor_leap) -{ - auto x = (TIX % 32) + BIX * 32; - auto y = (TIX / 32) % 8 + BIY * 8; - auto z = (TIX / 32) / 8 + BIZ * 8; - - bool pred1 = x % 8 == 0 and y % 8 == 0 and z % 8 == 0; - bool pred2 = x < data_size.x and y < data_size.y and z < data_size.z; - - if (pred1 and pred2) { - auto data_id = x + y * data_leap.y + z * data_leap.z; - auto anchor_id = (x / 8) + (y / 8) * anchor_leap.y + (z / 8) * anchor_leap.z; - anchor[anchor_id] = data[data_id]; - } - __syncthreads(); -} - -/* - * use shmem, erroneous -template -__device__ void c_gather_anchor(volatile T1 shm_data[9][9][33], T1* anchor, STRIDE3 anchor_leap) -{ - constexpr auto NUM_ITERS = 33 * 9 * 9 / LINEAR_BLOCK_SIZE + 1; // 11 iterations - for (auto i = 0; i < NUM_ITERS; i++) { - auto _tix = i * LINEAR_BLOCK_SIZE + TIX; - - if (_tix < 33 * 9 * 9) { - auto x = (_tix % 33); - auto y = (_tix / 33) % 9; - auto z = (_tix / 33) / 9; - - if (x % 8 == 0 and y % 8 == 0 and z % 8 == 0) { - auto aid = ((x / 8) + BIX * 4) + // - ((y / 8) + BIY) * anchor_leap.y + // - ((z / 8) + BIZ) * anchor_leap.z; // - anchor[aid] = shm_data[z][y][x]; - } - } - } - __syncthreads(); -} -*/ - -template -__device__ void x_reset_scratch_33x9x9data( - volatile T1 shm_xdata[9][9][33], - volatile T2 shm_errctrl[9][9][33], - T1* anchor, // - DIM3 anchor_size, // - STRIDE3 anchor_leap) -{ - for (auto _tix = TIX; _tix < 33 * 9 * 9; _tix += LINEAR_BLOCK_SIZE) { - auto x = (_tix % 33); - auto y = (_tix / 33) % 9; - auto z = (_tix / 33) / 9; - - shm_errctrl[z][y][x] = 0; // TODO explicitly handle zero-padding - /***************************************************************************** - okay to use - ******************************************************************************/ - if (x % 8 == 0 and y % 8 == 0 and z % 8 == 0) { - shm_xdata[z][y][x] = 0; - - auto ax = ((x / 8) + BIX * 4); - auto ay = ((y / 8) + BIY); - auto az = ((z / 8) + BIZ); - - if (ax < anchor_size.x and ay < anchor_size.y and az < anchor_size.z) - shm_xdata[z][y][x] = anchor[ax + ay * anchor_leap.y + az * anchor_leap.z]; - } - /***************************************************************************** - alternatively - ******************************************************************************/ - // shm_errctrl[z][y][x] = radius; - } - - __syncthreads(); -} - -template -__device__ void -global2shmem_33x9x9data(Input* data, DIM3 data_size, STRIDE3 data_leap, volatile Input shm_data[9][9][33]) -{ - constexpr auto TOTAL = 33 * 9 * 9; - - for (auto _tix = TIX; _tix < TOTAL; _tix += LINEAR_BLOCK_SIZE) { - auto x = (_tix % 33); - auto y = (_tix / 33) % 9; - auto z = (_tix / 33) / 9; - auto gx = (x + BIX * BLOCK32); - auto gy = (y + BIY * BLOCK8); - auto gz = (z + BIZ * BLOCK8); - auto gid = gx + gy * data_leap.y + gz * data_leap.z; - - if (gx < data_size.x and gy < data_size.y and gz < data_size.z) shm_data[z][y][x] = data[gid]; - } - __syncthreads(); -} - -template -__device__ void -shmem2global_32x8x8data(volatile Output shm_data[9][9][33], Output* data, DIM3 data_size, STRIDE3 data_leap) -{ - constexpr auto TOTAL = 32 * 8 * 8; - - for (auto _tix = TIX; _tix < TOTAL; _tix += LINEAR_BLOCK_SIZE) { - auto x = (_tix % 32); - auto y = (_tix / 32) % 8; - auto z = (_tix / 32) / 8; - auto gx = (x + BIX * BLOCK32); - auto gy = (y + BIY * BLOCK8); - auto gz = (z + BIZ * BLOCK8); - auto gid = gx + gy * data_leap.y + gz * data_leap.z; - - if (gx < data_size.x and gy < data_size.y and gz < data_size.z) data[gid] = shm_data[z][y][x]; - } - __syncthreads(); -} - -template < - typename T1, - typename T2, - typename FP, - typename LAMBDAX, - typename LAMBDAY, - typename LAMBDAZ, - bool BLUE, - bool YELLOW, - bool HOLLOW, - int LINEAR_BLOCK_SIZE, - int BLOCK_DIMX, - int BLOCK_DIMY, - bool COARSEN, - int BLOCK_DIMZ, - bool BORDER_INCLUSIVE, - bool WORKFLOW> -__forceinline__ __device__ void interpolate_stage( - volatile T1 shm_data[9][9][33], - volatile T2 shm_errctrl[9][9][33], - LAMBDAX xmap, - LAMBDAY ymap, - LAMBDAZ zmap, - int unit, - FP eb_r, - FP ebx2, - int radius) -{ - static_assert(BLOCK_DIMX * BLOCK_DIMY * (COARSEN ? 1 : BLOCK_DIMZ) <= LINEAR_BLOCK_SIZE, "block oversized"); - static_assert((BLUE or YELLOW or HOLLOW) == true, "must be one hot"); - static_assert((BLUE and YELLOW) == false, "must be only one hot (1)"); - static_assert((BLUE and YELLOW) == false, "must be only one hot (2)"); - static_assert((YELLOW and HOLLOW) == false, "must be only one hot (3)"); - - auto run = [&](auto x, auto y, auto z) { - if (xyz33x9x9_predicate(x, y, z)) { - T1 pred = 0; - - if CONSTEXPR (BLUE) { // - pred = (shm_data[z - unit][y][x] + shm_data[z + unit][y][x]) / 2; - } - if CONSTEXPR (YELLOW) { // - pred = (shm_data[z][y][x - unit] + shm_data[z][y][x + unit]) / 2; - } - if CONSTEXPR (HOLLOW) { // - pred = (shm_data[z][y - unit][x] + shm_data[z][y + unit][x]) / 2; - } - - if CONSTEXPR (WORKFLOW == SPLINE3_COMPR) { - auto err = shm_data[z][y][x] - pred; - decltype(err) code; - // TODO unsafe, did not deal with the out-of-cap case - { - code = fabs(err) * eb_r + 1; - code = err < 0 ? -code : code; - code = int(code / 2) + radius; - } - shm_errctrl[z][y][x] = code; // TODO double check if unsigned type works - shm_data[z][y][x] = pred + (code - radius) * ebx2; - } - else { // TODO == DECOMPRESSS and static_assert - auto code = shm_errctrl[z][y][x]; - shm_data[z][y][x] = pred + (code - radius) * ebx2; - } - } - }; - // -------------------------------------------------------------------------------- // - - if CONSTEXPR (COARSEN) { - constexpr auto TOTAL = BLOCK_DIMX * BLOCK_DIMY * BLOCK_DIMZ; - for (auto _tix = TIX; _tix < TOTAL; _tix += LINEAR_BLOCK_SIZE) { - auto itix = (_tix % BLOCK_DIMX); - auto itiy = (_tix / BLOCK_DIMX) % BLOCK_DIMY; - auto itiz = (_tix / BLOCK_DIMX) / BLOCK_DIMY; - auto x = xmap(itix, unit); - auto y = ymap(itiy, unit); - auto z = zmap(itiz, unit); - run(x, y, z); - } - } - else { - auto itix = (TIX % BLOCK_DIMX); - auto itiy = (TIX / BLOCK_DIMX) % BLOCK_DIMY; - auto itiz = (TIX / BLOCK_DIMX) / BLOCK_DIMY; - auto x = xmap(itix, unit); - auto y = ymap(itiy, unit); - auto z = zmap(itiz, unit); - run(x, y, z); - } - __syncthreads(); -} - -} // namespace - -/********************************************************************************/ - -template -__device__ void cusz::device_api::spline3d_layout2_interpolate( - volatile T1 shm_data[9][9][33], - volatile T2 shm_errctrl[9][9][33], - FP eb_r, - FP ebx2, - int radius) -{ - auto xblue = [] __device__(int _tix, int unit) -> int { return unit * (_tix * 2); }; - auto yblue = [] __device__(int _tiy, int unit) -> int { return unit * (_tiy * 2); }; - auto zblue = [] __device__(int _tiz, int unit) -> int { return unit * (_tiz * 2 + 1); }; - - auto xyellow = [] __device__(int _tix, int unit) -> int { return unit * (_tix * 2 + 1); }; - auto yyellow = [] __device__(int _tiy, int unit) -> int { return unit * (_tiy * 2); }; - auto zyellow = [] __device__(int _tiz, int unit) -> int { return unit * (_tiz); }; - - auto xhollow = [] __device__(int _tix, int unit) -> int { return unit * (_tix); }; - auto yhollow = [] __device__(int _tiy, int unit) -> int { return unit * (_tiy * 2 + 1); }; - auto zhollow = [] __device__(int _tiz, int unit) -> int { return unit * (_tiz); }; - - constexpr auto COARSEN = true; - constexpr auto NO_COARSEN = false; - constexpr auto BORDER_INCLUSIVE = true; - constexpr auto BORDER_EXCLUSIVE = false; - - int unit = 4; - - // iteration 1 - interpolate_stage< - T1, T2, FP, decltype(xblue), decltype(yblue), decltype(zblue), // - true, false, false, LINEAR_BLOCK_SIZE, 5, 2, NO_COARSEN, 1, BORDER_INCLUSIVE, WORKFLOW>( - shm_data, shm_errctrl, xblue, yblue, zblue, unit, eb_r, ebx2, radius); - interpolate_stage< - T1, T2, FP, decltype(xyellow), decltype(yyellow), decltype(zyellow), // - false, true, false, LINEAR_BLOCK_SIZE, 4, 2, NO_COARSEN, 3, BORDER_INCLUSIVE, WORKFLOW>( - shm_data, shm_errctrl, xyellow, yyellow, zyellow, unit, eb_r, ebx2, radius); - interpolate_stage< - T1, T2, FP, decltype(xhollow), decltype(yhollow), decltype(zhollow), // - false, false, true, LINEAR_BLOCK_SIZE, 9, 1, NO_COARSEN, 3, BORDER_INCLUSIVE, WORKFLOW>( - shm_data, shm_errctrl, xhollow, yhollow, zhollow, unit, eb_r, ebx2, radius); - - unit = 2; - - // iteration 2, TODO switch y-z order - interpolate_stage< - T1, T2, FP, decltype(xblue), decltype(yblue), decltype(zblue), // - true, false, false, LINEAR_BLOCK_SIZE, 9, 3, NO_COARSEN, 2, BORDER_INCLUSIVE, WORKFLOW>( - shm_data, shm_errctrl, xblue, yblue, zblue, unit, eb_r, ebx2, radius); - interpolate_stage< - T1, T2, FP, decltype(xyellow), decltype(yyellow), decltype(zyellow), // - false, true, false, LINEAR_BLOCK_SIZE, 8, 3, NO_COARSEN, 5, BORDER_INCLUSIVE, WORKFLOW>( - shm_data, shm_errctrl, xyellow, yyellow, zyellow, unit, eb_r, ebx2, radius); - interpolate_stage< - T1, T2, FP, decltype(xhollow), decltype(yhollow), decltype(zhollow), // - false, false, true, LINEAR_BLOCK_SIZE, 17, 2, NO_COARSEN, 5, BORDER_INCLUSIVE, WORKFLOW>( - shm_data, shm_errctrl, xhollow, yhollow, zhollow, unit, eb_r, ebx2, radius); - - unit = 1; - - // iteration 3 - interpolate_stage< - T1, T2, FP, decltype(xblue), decltype(yblue), decltype(zblue), // - true, false, false, LINEAR_BLOCK_SIZE, 17, 5, COARSEN, 4, BORDER_INCLUSIVE, WORKFLOW>( - shm_data, shm_errctrl, xblue, yblue, zblue, unit, eb_r, ebx2, radius); - interpolate_stage< - T1, T2, FP, decltype(xyellow), decltype(yyellow), decltype(zyellow), // - false, true, false, LINEAR_BLOCK_SIZE, 16, 5, COARSEN, 9, BORDER_INCLUSIVE, WORKFLOW>( - shm_data, shm_errctrl, xyellow, yyellow, zyellow, unit, eb_r, ebx2, radius); - /****************************************************************************** - test only: last step inclusive - ******************************************************************************/ - // interpolate_stage< - // T1, T2, FP, decltype(xhollow), decltype(yhollow), decltype(zhollow), // - // false, false, true, LINEAR_BLOCK_SIZE, 33, 4, COARSEN, 9, BORDER_INCLUSIVE, WORKFLOW>( - // shm_data, shm_errctrl, xhollow, yhollow, zhollow, unit, eb_r, ebx2, radius); - /****************************************************************************** - production - ******************************************************************************/ - interpolate_stage< - T1, T2, FP, decltype(xhollow), decltype(yhollow), decltype(zhollow), // - false, false, true, LINEAR_BLOCK_SIZE, 32, 4, COARSEN, 8, BORDER_EXCLUSIVE, WORKFLOW>( - shm_data, shm_errctrl, xhollow, yhollow, zhollow, unit, eb_r, ebx2, radius); - - /****************************************************************************** - test only: print a block - ******************************************************************************/ - // if (TIX == 0 and BIX == 0 and BIY == 0 and BIZ == 0) { spline3d_print_block_from_GPU(shm_errctrl); } - // if (TIX == 0 and BIX == 0 and BIY == 0 and BIZ == 0) { spline3d_print_block_from_GPU(shm_data); } -} - -/******************************************************************************** - * host API/kernel - ********************************************************************************/ - -template -__global__ void cusz::c_spline3d_infprecis_32x8x8data( - TITER data, - DIM3 data_size, - STRIDE3 data_leap, - EITER errctrl, - DIM3 errctrl_size, - STRIDE3 errctrl_leap, - TITER anchor, - STRIDE3 anchor_leap, - FP eb_r, - FP ebx2, - int radius, - TITER pred_error, - TITER compress_error) -{ - // compile time variables - using T = typename std::remove_pointer::type; - using E = typename std::remove_pointer::type; - - if CONSTEXPR (PROBE_PRED_ERROR) { - // TODO - } - else { - __shared__ struct { - T data[9][9][33]; - E errctrl[9][9][33]; - } shmem; - - c_reset_scratch_33x9x9data(shmem.data, shmem.errctrl, radius); - global2shmem_33x9x9data(data, data_size, data_leap, shmem.data); - - // version 1, use shmem, erroneous - // c_gather_anchor(shmem.data, anchor, anchor_leap); - // version 2, use global mem, correct - c_gather_anchor(data, data_size, data_leap, anchor, anchor_leap); - - cusz::device_api::spline3d_layout2_interpolate( - shmem.data, shmem.errctrl, eb_r, ebx2, radius); - shmem2global_32x8x8data(shmem.errctrl, errctrl, errctrl_size, errctrl_leap); - } -} - -template < - typename EITER, - typename TITER, - typename FP, - int LINEAR_BLOCK_SIZE> -__global__ void cusz::x_spline3d_infprecis_32x8x8data( - EITER errctrl, // input 1 - DIM3 errctrl_size, // - STRIDE3 errctrl_leap, // - TITER anchor, // input 2 - DIM3 anchor_size, // - STRIDE3 anchor_leap, // - TITER data, // output - DIM3 data_size, // - STRIDE3 data_leap, // - FP eb_r, - FP ebx2, - int radius) -{ - // compile time variables - using E = typename std::remove_pointer::type; - using T = typename std::remove_pointer::type; - - __shared__ struct { - E errctrl[9][9][33]; - T data[9][9][33]; - } shmem; - - x_reset_scratch_33x9x9data(shmem.data, shmem.errctrl, anchor, anchor_size, anchor_leap); - global2shmem_33x9x9data(errctrl, errctrl_size, errctrl_leap, shmem.errctrl); - cusz::device_api::spline3d_layout2_interpolate( - shmem.data, shmem.errctrl, eb_r, ebx2, radius); - shmem2global_32x8x8data(shmem.data, data, data_size, data_leap); -} - -#undef TIX -#undef TIY -#undef TIZ -#undef BIX -#undef BIY -#undef BIZ -#undef BDX -#undef BDY -#undef BDZ - -template -void launch_construct_Spline3( - T* data, - dim3 const len3, - T* anchor, - dim3 const an_len3, - E* errctrl, - dim3 const ec_len3, - double const eb, - int const radius, - float& time_elapsed, - cudaStream_t stream) -{ - auto divide3 = [](dim3 len, dim3 sublen) { - return dim3( - (len.x - 1) / sublen.x + 1, // - (len.y - 1) / sublen.y + 1, // - (len.z - 1) / sublen.z + 1); - }; - - auto ndim = [&]() { - if (len3.z == 1 and len3.y == 1) - return 1; - else if (len3.z == 1 and len3.y != 1) - return 2; - else - return 3; - }; - - constexpr auto SUBLEN_3D = dim3(32, 8, 8); - constexpr auto SEQ_3D = dim3(1, 8, 1); - constexpr auto BLOCK_3D = dim3(256, 1, 1); - auto GRID_3D = divide3(len3, SUBLEN_3D); - - { - constexpr auto SUBLEN_TOTAL = SUBLEN_3D.x * SUBLEN_3D.y * SUBLEN_3D.z; - constexpr auto SEQ_TOTAL = SEQ_3D.x * SEQ_3D.y * SEQ_3D.z; - constexpr auto BLOCK_TOTAL = BLOCK_3D.x * BLOCK_3D.y * BLOCK_3D.z; - - // static_assert(SUBLEN_TOTAL / SEQ_TOTAL == BLOCK_TOTAL, "parallelism does not match!"); - if (SUBLEN_TOTAL / SEQ_TOTAL != BLOCK_TOTAL) throw std::runtime_error("parallelism does not match!"); - } - - //////////////////////////////////////// - - auto ebx2 = eb * 2; - auto eb_r = 1 / eb; - auto leap3 = dim3(1, len3.x, len3.x * len3.y); - auto ec_leap3 = dim3(1, ec_len3.x, ec_len3.x * ec_len3.y); - auto an_leap3 = dim3(1, an_len3.x, an_len3.x * an_len3.y); - - CREATE_CUDAEVENT_PAIR; - START_CUDAEVENT_RECORDING(stream); - - auto d = ndim(); - - if (d == 1) { // - throw std::runtime_error("Spline1 not implemented"); - } - else if (d == 2) { - throw std::runtime_error("Spline2 not implemented"); - } - else if (d == 3) { - cusz::c_spline3d_infprecis_32x8x8data // - <<>> // - (data, len3, leap3, // - errctrl, ec_len3, ec_leap3, // - anchor, an_leap3, // - eb_r, ebx2, radius); - } - - STOP_CUDAEVENT_RECORDING(stream); - CHECK_CUDA(cudaStreamSynchronize(stream)); - TIME_ELAPSED_CUDAEVENT(&time_elapsed); - - DESTROY_CUDAEVENT_PAIR; -} - -template -void launch_reconstruct_Spline3( - T* xdata, - dim3 const len3, - T* anchor, - dim3 const an_len3, - E* errctrl, - dim3 const ec_len3, - double const eb, - int const radius, - float& time_elapsed, - cudaStream_t stream) -{ - auto divide3 = [](dim3 len, dim3 sublen) { - return dim3( - (len.x - 1) / sublen.x + 1, // - (len.y - 1) / sublen.y + 1, // - (len.z - 1) / sublen.z + 1); - }; - - /* - auto ndim = [&]() { - if (len3.z == 1 and len3.y == 1) - return 1; - else if (len3.z == 1 and len3.y != 1) - return 2; - else - return 3; - }; - */ - - constexpr auto SUBLEN_3D = dim3(32, 8, 8); - constexpr auto SEQ_3D = dim3(1, 8, 1); - constexpr auto BLOCK_3D = dim3(256, 1, 1); - auto GRID_3D = divide3(len3, SUBLEN_3D); - - { - constexpr auto SUBLEN_TOTAL = SUBLEN_3D.x * SUBLEN_3D.y * SUBLEN_3D.z; - constexpr auto SEQ_TOTAL = SEQ_3D.x * SEQ_3D.y * SEQ_3D.z; - constexpr auto BLOCK_TOTAL = BLOCK_3D.x * BLOCK_3D.y * BLOCK_3D.z; - - // static_assert(SUBLEN_TOTAL / SEQ_TOTAL == BLOCK_TOTAL, "parallelism does not match!"); - if (SUBLEN_TOTAL / SEQ_TOTAL != BLOCK_TOTAL) throw std::runtime_error("parallelism does not match!"); - } - - //////////////////////////////////////// - - auto ebx2 = eb * 2; - auto eb_r = 1 / eb; - auto leap3 = dim3(1, len3.x, len3.x * len3.y); - auto ec_leap3 = dim3(1, ec_len3.x, ec_len3.x * ec_len3.y); - auto an_leap3 = dim3(1, an_len3.x, an_len3.x * an_len3.y); - - CREATE_CUDAEVENT_PAIR; - START_CUDAEVENT_RECORDING(stream); - - cusz::x_spline3d_infprecis_32x8x8data // - <<>> // - (errctrl, ec_len3, ec_leap3, // - anchor, an_len3, an_leap3, // - xdata, len3, leap3, // - eb_r, ebx2, radius); - - STOP_CUDAEVENT_RECORDING(stream); - - CHECK_CUDA(cudaStreamSynchronize(stream)); - - TIME_ELAPSED_CUDAEVENT(&time_elapsed); - DESTROY_CUDAEVENT_PAIR; -} - -#endif +/** + * @file spline3.inl + * @author Jiannan Tian + * @brief + * @version 0.2 + * @date 2021-05-15 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef CUSZ_KERNEL_SPLINE3_CUH +#define CUSZ_KERNEL_SPLINE3_CUH + +#include +#include +#include +#include "utils/cuda_err.cuh" + +#define SPLINE3_COMPR true +#define SPLINE3_DECOMPR false + +#if __cplusplus >= 201703L +#define CONSTEXPR constexpr +#else +#define CONSTEXPR +#endif + +#define TIX threadIdx.x +#define TIY threadIdx.y +#define TIZ threadIdx.z +#define BIX blockIdx.x +#define BIY blockIdx.y +#define BIZ blockIdx.z +#define BDX blockDim.x +#define BDY blockDim.y +#define BDZ blockDim. + +using DIM = unsigned int; +using STRIDE = unsigned int; +using DIM3 = dim3; +using STRIDE3 = dim3; + +constexpr int BLOCK8 = 8; +constexpr int BLOCK32 = 32; + +#define SHM_ERROR shm_errctrl + +namespace cusz { + +/******************************************************************************** + * host API + ********************************************************************************/ + +template < + typename TITER, + typename EITER, + typename FP = float, + int LINEAR_BLOCK_SIZE = 256, + bool PROBE_PRED_ERROR = false> +__global__ void c_spline3d_infprecis_32x8x8data( + TITER data, + DIM3 data_size, + STRIDE3 data_leap, + EITER errctrl, + DIM3 errctrl_size, + STRIDE3 errctrl_leap, + TITER anchor, + STRIDE3 anchor_leap, + FP eb_r, + FP ebx2, + int radius, + TITER pred_error = nullptr, + TITER compress_error = nullptr); + +template < + typename EITER, + typename TITER, + typename FP = float, + int LINEAR_BLOCK_SIZE = 256> +__global__ void x_spline3d_infprecis_32x8x8data( + EITER errctrl, // input 1 + DIM3 errctrl_size, // + STRIDE3 errctrl_leap, // + TITER anchor, // input 2 + DIM3 anchor_size, // + STRIDE3 anchor_leap, // + TITER data, // output + DIM3 data_size, // + STRIDE3 data_leap, // + FP eb_r, + FP ebx2, + int radius); + +namespace device_api { +/******************************************************************************** + * device API + ********************************************************************************/ +template < + typename T1, + typename T2, + typename FP, + int LINEAR_BLOCK_SIZE, + bool WORKFLOW = SPLINE3_COMPR, + bool PROBE_PRED_ERROR = false> +__device__ void spline3d_layout2_interpolate( + volatile T1 shm_data[9][9][33], + volatile T2 shm_errctrl[9][9][33], + FP eb_r, + FP ebx2, + int radius); +} // namespace device_api + +} // namespace cusz + +/******************************************************************************** + * helper function + ********************************************************************************/ + +namespace { + +template +__forceinline__ __device__ bool xyz33x9x9_predicate(unsigned int x, unsigned int y, unsigned int z) +{ + if CONSTEXPR (INCLUSIVE) { // + return x <= 32 and y <= 8 and z <= 8; + } + else { + return x < 32 and y < 8 and z < 8; + } +} + +// control block_id3 in function call +template +__device__ void +spline3d_print_block_from_GPU(T volatile a[9][9][33], int radius = 512, bool compress = true, bool print_errctrl = true) +{ + for (auto z = 0; z < ZEND; z++) { + printf("\nprint from GPU, z=%d\n", z); + printf(" "); + for (auto i = 0; i < 33; i++) printf("%3d", i); + printf("\n"); + + for (auto y = 0; y < YEND; y++) { + printf("y=%d ", y); + for (auto x = 0; x < XEND; x++) { // + if CONSTEXPR (PRINT_FP) { printf("%.2e\t", (float)a[z][y][x]); } + else { + T c = print_errctrl ? a[z][y][x] - radius : a[z][y][x]; + if (compress) { + if (c == 0) { printf("%3c", '.'); } + else { + if (abs(c) >= 10) { printf("%3c", '*'); } + else { + if (print_errctrl) { printf("%3d", c); } + else { + printf("%4.2f", c); + } + } + } + } + else { + if (print_errctrl) { printf("%3d", c); } + else { + printf("%4.2f", c); + } + } + } + } + printf("\n"); + } + } + printf("\nGPU print end\n\n"); +} + +template +__device__ void +c_reset_scratch_33x9x9data(volatile T1 shm_data[9][9][33], volatile T2 shm_errctrl[9][9][33], int radius) +{ + // alternatively, reinterprete cast volatile T?[][][] to 1D + for (auto _tix = TIX; _tix < 33 * 9 * 9; _tix += LINEAR_BLOCK_SIZE) { + auto x = (_tix % 33); + auto y = (_tix / 33) % 9; + auto z = (_tix / 33) / 9; + + shm_data[z][y][x] = 0; + /***************************************************************************** + okay to use + ******************************************************************************/ + if (x % 8 == 0 and y % 8 == 0 and z % 8 == 0) shm_errctrl[z][y][x] = radius; + /***************************************************************************** + alternatively + ******************************************************************************/ + // shm_errctrl[z][y][x] = radius; + } + __syncthreads(); +} + +template +__device__ void c_gather_anchor(T1* data, DIM3 data_size, STRIDE3 data_leap, T1* anchor, STRIDE3 anchor_leap) +{ + auto x = (TIX % 32) + BIX * 32; + auto y = (TIX / 32) % 8 + BIY * 8; + auto z = (TIX / 32) / 8 + BIZ * 8; + + bool pred1 = x % 8 == 0 and y % 8 == 0 and z % 8 == 0; + bool pred2 = x < data_size.x and y < data_size.y and z < data_size.z; + + if (pred1 and pred2) { + auto data_id = x + y * data_leap.y + z * data_leap.z; + auto anchor_id = (x / 8) + (y / 8) * anchor_leap.y + (z / 8) * anchor_leap.z; + anchor[anchor_id] = data[data_id]; + } + __syncthreads(); +} + +/* + * use shmem, erroneous +template +__device__ void c_gather_anchor(volatile T1 shm_data[9][9][33], T1* anchor, STRIDE3 anchor_leap) +{ + constexpr auto NUM_ITERS = 33 * 9 * 9 / LINEAR_BLOCK_SIZE + 1; // 11 iterations + for (auto i = 0; i < NUM_ITERS; i++) { + auto _tix = i * LINEAR_BLOCK_SIZE + TIX; + + if (_tix < 33 * 9 * 9) { + auto x = (_tix % 33); + auto y = (_tix / 33) % 9; + auto z = (_tix / 33) / 9; + + if (x % 8 == 0 and y % 8 == 0 and z % 8 == 0) { + auto aid = ((x / 8) + BIX * 4) + // + ((y / 8) + BIY) * anchor_leap.y + // + ((z / 8) + BIZ) * anchor_leap.z; // + anchor[aid] = shm_data[z][y][x]; + } + } + } + __syncthreads(); +} +*/ + +template +__device__ void x_reset_scratch_33x9x9data( + volatile T1 shm_xdata[9][9][33], + volatile T2 shm_errctrl[9][9][33], + T1* anchor, // + DIM3 anchor_size, // + STRIDE3 anchor_leap) +{ + for (auto _tix = TIX; _tix < 33 * 9 * 9; _tix += LINEAR_BLOCK_SIZE) { + auto x = (_tix % 33); + auto y = (_tix / 33) % 9; + auto z = (_tix / 33) / 9; + + shm_errctrl[z][y][x] = 0; // TODO explicitly handle zero-padding + /***************************************************************************** + okay to use + ******************************************************************************/ + if (x % 8 == 0 and y % 8 == 0 and z % 8 == 0) { + shm_xdata[z][y][x] = 0; + + auto ax = ((x / 8) + BIX * 4); + auto ay = ((y / 8) + BIY); + auto az = ((z / 8) + BIZ); + + if (ax < anchor_size.x and ay < anchor_size.y and az < anchor_size.z) + shm_xdata[z][y][x] = anchor[ax + ay * anchor_leap.y + az * anchor_leap.z]; + } + /***************************************************************************** + alternatively + ******************************************************************************/ + // shm_errctrl[z][y][x] = radius; + } + + __syncthreads(); +} + +template +__device__ void +global2shmem_33x9x9data(Input* data, DIM3 data_size, STRIDE3 data_leap, volatile Input shm_data[9][9][33]) +{ + constexpr auto TOTAL = 33 * 9 * 9; + + for (auto _tix = TIX; _tix < TOTAL; _tix += LINEAR_BLOCK_SIZE) { + auto x = (_tix % 33); + auto y = (_tix / 33) % 9; + auto z = (_tix / 33) / 9; + auto gx = (x + BIX * BLOCK32); + auto gy = (y + BIY * BLOCK8); + auto gz = (z + BIZ * BLOCK8); + auto gid = gx + gy * data_leap.y + gz * data_leap.z; + + if (gx < data_size.x and gy < data_size.y and gz < data_size.z) shm_data[z][y][x] = data[gid]; + } + __syncthreads(); +} + +template +__device__ void +shmem2global_32x8x8data(volatile Output shm_data[9][9][33], Output* data, DIM3 data_size, STRIDE3 data_leap) +{ + constexpr auto TOTAL = 32 * 8 * 8; + + for (auto _tix = TIX; _tix < TOTAL; _tix += LINEAR_BLOCK_SIZE) { + auto x = (_tix % 32); + auto y = (_tix / 32) % 8; + auto z = (_tix / 32) / 8; + auto gx = (x + BIX * BLOCK32); + auto gy = (y + BIY * BLOCK8); + auto gz = (z + BIZ * BLOCK8); + auto gid = gx + gy * data_leap.y + gz * data_leap.z; + + if (gx < data_size.x and gy < data_size.y and gz < data_size.z) data[gid] = shm_data[z][y][x]; + } + __syncthreads(); +} + +template < + typename T1, + typename T2, + typename FP, + typename LAMBDAX, + typename LAMBDAY, + typename LAMBDAZ, + bool BLUE, + bool YELLOW, + bool HOLLOW, + int LINEAR_BLOCK_SIZE, + int BLOCK_DIMX, + int BLOCK_DIMY, + bool COARSEN, + int BLOCK_DIMZ, + bool BORDER_INCLUSIVE, + bool WORKFLOW> +__forceinline__ __device__ void interpolate_stage( + volatile T1 shm_data[9][9][33], + volatile T2 shm_errctrl[9][9][33], + LAMBDAX xmap, + LAMBDAY ymap, + LAMBDAZ zmap, + int unit, + FP eb_r, + FP ebx2, + int radius) +{ + static_assert(BLOCK_DIMX * BLOCK_DIMY * (COARSEN ? 1 : BLOCK_DIMZ) <= LINEAR_BLOCK_SIZE, "block oversized"); + static_assert((BLUE or YELLOW or HOLLOW) == true, "must be one hot"); + static_assert((BLUE and YELLOW) == false, "must be only one hot (1)"); + static_assert((BLUE and YELLOW) == false, "must be only one hot (2)"); + static_assert((YELLOW and HOLLOW) == false, "must be only one hot (3)"); + + auto run = [&](auto x, auto y, auto z) { + if (xyz33x9x9_predicate(x, y, z)) { + T1 pred = 0; + + if CONSTEXPR (BLUE) { // + pred = (shm_data[z - unit][y][x] + shm_data[z + unit][y][x]) / 2; + } + if CONSTEXPR (YELLOW) { // + pred = (shm_data[z][y][x - unit] + shm_data[z][y][x + unit]) / 2; + } + if CONSTEXPR (HOLLOW) { // + pred = (shm_data[z][y - unit][x] + shm_data[z][y + unit][x]) / 2; + } + + if CONSTEXPR (WORKFLOW == SPLINE3_COMPR) { + auto err = shm_data[z][y][x] - pred; + decltype(err) code; + // TODO unsafe, did not deal with the out-of-cap case + { + code = fabs(err) * eb_r + 1; + code = err < 0 ? -code : code; + code = int(code / 2) + radius; + } + shm_errctrl[z][y][x] = code; // TODO double check if unsigned type works + shm_data[z][y][x] = pred + (code - radius) * ebx2; + } + else { // TODO == DECOMPRESSS and static_assert + auto code = shm_errctrl[z][y][x]; + shm_data[z][y][x] = pred + (code - radius) * ebx2; + } + } + }; + // -------------------------------------------------------------------------------- // + + if CONSTEXPR (COARSEN) { + constexpr auto TOTAL = BLOCK_DIMX * BLOCK_DIMY * BLOCK_DIMZ; + for (auto _tix = TIX; _tix < TOTAL; _tix += LINEAR_BLOCK_SIZE) { + auto itix = (_tix % BLOCK_DIMX); + auto itiy = (_tix / BLOCK_DIMX) % BLOCK_DIMY; + auto itiz = (_tix / BLOCK_DIMX) / BLOCK_DIMY; + auto x = xmap(itix, unit); + auto y = ymap(itiy, unit); + auto z = zmap(itiz, unit); + run(x, y, z); + } + } + else { + auto itix = (TIX % BLOCK_DIMX); + auto itiy = (TIX / BLOCK_DIMX) % BLOCK_DIMY; + auto itiz = (TIX / BLOCK_DIMX) / BLOCK_DIMY; + auto x = xmap(itix, unit); + auto y = ymap(itiy, unit); + auto z = zmap(itiz, unit); + run(x, y, z); + } + __syncthreads(); +} + +} // namespace + +/********************************************************************************/ + +template +__device__ void cusz::device_api::spline3d_layout2_interpolate( + volatile T1 shm_data[9][9][33], + volatile T2 shm_errctrl[9][9][33], + FP eb_r, + FP ebx2, + int radius) +{ + auto xblue = [] __device__(int _tix, int unit) -> int { return unit * (_tix * 2); }; + auto yblue = [] __device__(int _tiy, int unit) -> int { return unit * (_tiy * 2); }; + auto zblue = [] __device__(int _tiz, int unit) -> int { return unit * (_tiz * 2 + 1); }; + + auto xyellow = [] __device__(int _tix, int unit) -> int { return unit * (_tix * 2 + 1); }; + auto yyellow = [] __device__(int _tiy, int unit) -> int { return unit * (_tiy * 2); }; + auto zyellow = [] __device__(int _tiz, int unit) -> int { return unit * (_tiz); }; + + auto xhollow = [] __device__(int _tix, int unit) -> int { return unit * (_tix); }; + auto yhollow = [] __device__(int _tiy, int unit) -> int { return unit * (_tiy * 2 + 1); }; + auto zhollow = [] __device__(int _tiz, int unit) -> int { return unit * (_tiz); }; + + constexpr auto COARSEN = true; + constexpr auto NO_COARSEN = false; + constexpr auto BORDER_INCLUSIVE = true; + constexpr auto BORDER_EXCLUSIVE = false; + + int unit = 4; + + // iteration 1 + interpolate_stage< + T1, T2, FP, decltype(xblue), decltype(yblue), decltype(zblue), // + true, false, false, LINEAR_BLOCK_SIZE, 5, 2, NO_COARSEN, 1, BORDER_INCLUSIVE, WORKFLOW>( + shm_data, shm_errctrl, xblue, yblue, zblue, unit, eb_r, ebx2, radius); + interpolate_stage< + T1, T2, FP, decltype(xyellow), decltype(yyellow), decltype(zyellow), // + false, true, false, LINEAR_BLOCK_SIZE, 4, 2, NO_COARSEN, 3, BORDER_INCLUSIVE, WORKFLOW>( + shm_data, shm_errctrl, xyellow, yyellow, zyellow, unit, eb_r, ebx2, radius); + interpolate_stage< + T1, T2, FP, decltype(xhollow), decltype(yhollow), decltype(zhollow), // + false, false, true, LINEAR_BLOCK_SIZE, 9, 1, NO_COARSEN, 3, BORDER_INCLUSIVE, WORKFLOW>( + shm_data, shm_errctrl, xhollow, yhollow, zhollow, unit, eb_r, ebx2, radius); + + unit = 2; + + // iteration 2, TODO switch y-z order + interpolate_stage< + T1, T2, FP, decltype(xblue), decltype(yblue), decltype(zblue), // + true, false, false, LINEAR_BLOCK_SIZE, 9, 3, NO_COARSEN, 2, BORDER_INCLUSIVE, WORKFLOW>( + shm_data, shm_errctrl, xblue, yblue, zblue, unit, eb_r, ebx2, radius); + interpolate_stage< + T1, T2, FP, decltype(xyellow), decltype(yyellow), decltype(zyellow), // + false, true, false, LINEAR_BLOCK_SIZE, 8, 3, NO_COARSEN, 5, BORDER_INCLUSIVE, WORKFLOW>( + shm_data, shm_errctrl, xyellow, yyellow, zyellow, unit, eb_r, ebx2, radius); + interpolate_stage< + T1, T2, FP, decltype(xhollow), decltype(yhollow), decltype(zhollow), // + false, false, true, LINEAR_BLOCK_SIZE, 17, 2, NO_COARSEN, 5, BORDER_INCLUSIVE, WORKFLOW>( + shm_data, shm_errctrl, xhollow, yhollow, zhollow, unit, eb_r, ebx2, radius); + + unit = 1; + + // iteration 3 + interpolate_stage< + T1, T2, FP, decltype(xblue), decltype(yblue), decltype(zblue), // + true, false, false, LINEAR_BLOCK_SIZE, 17, 5, COARSEN, 4, BORDER_INCLUSIVE, WORKFLOW>( + shm_data, shm_errctrl, xblue, yblue, zblue, unit, eb_r, ebx2, radius); + interpolate_stage< + T1, T2, FP, decltype(xyellow), decltype(yyellow), decltype(zyellow), // + false, true, false, LINEAR_BLOCK_SIZE, 16, 5, COARSEN, 9, BORDER_INCLUSIVE, WORKFLOW>( + shm_data, shm_errctrl, xyellow, yyellow, zyellow, unit, eb_r, ebx2, radius); + /****************************************************************************** + test only: last step inclusive + ******************************************************************************/ + // interpolate_stage< + // T1, T2, FP, decltype(xhollow), decltype(yhollow), decltype(zhollow), // + // false, false, true, LINEAR_BLOCK_SIZE, 33, 4, COARSEN, 9, BORDER_INCLUSIVE, WORKFLOW>( + // shm_data, shm_errctrl, xhollow, yhollow, zhollow, unit, eb_r, ebx2, radius); + /****************************************************************************** + production + ******************************************************************************/ + interpolate_stage< + T1, T2, FP, decltype(xhollow), decltype(yhollow), decltype(zhollow), // + false, false, true, LINEAR_BLOCK_SIZE, 32, 4, COARSEN, 8, BORDER_EXCLUSIVE, WORKFLOW>( + shm_data, shm_errctrl, xhollow, yhollow, zhollow, unit, eb_r, ebx2, radius); + + /****************************************************************************** + test only: print a block + ******************************************************************************/ + // if (TIX == 0 and BIX == 0 and BIY == 0 and BIZ == 0) { spline3d_print_block_from_GPU(shm_errctrl); } + // if (TIX == 0 and BIX == 0 and BIY == 0 and BIZ == 0) { spline3d_print_block_from_GPU(shm_data); } +} + +/******************************************************************************** + * host API/kernel + ********************************************************************************/ + +template +__global__ void cusz::c_spline3d_infprecis_32x8x8data( + TITER data, + DIM3 data_size, + STRIDE3 data_leap, + EITER errctrl, + DIM3 errctrl_size, + STRIDE3 errctrl_leap, + TITER anchor, + STRIDE3 anchor_leap, + FP eb_r, + FP ebx2, + int radius, + TITER pred_error, + TITER compress_error) +{ + // compile time variables + using T = typename std::remove_pointer::type; + using E = typename std::remove_pointer::type; + + if CONSTEXPR (PROBE_PRED_ERROR) { + // TODO + } + else { + __shared__ struct { + T data[9][9][33]; + E errctrl[9][9][33]; + } shmem; + + c_reset_scratch_33x9x9data(shmem.data, shmem.errctrl, radius); + global2shmem_33x9x9data(data, data_size, data_leap, shmem.data); + + // version 1, use shmem, erroneous + // c_gather_anchor(shmem.data, anchor, anchor_leap); + // version 2, use global mem, correct + c_gather_anchor(data, data_size, data_leap, anchor, anchor_leap); + + cusz::device_api::spline3d_layout2_interpolate( + shmem.data, shmem.errctrl, eb_r, ebx2, radius); + shmem2global_32x8x8data(shmem.errctrl, errctrl, errctrl_size, errctrl_leap); + } +} + +template < + typename EITER, + typename TITER, + typename FP, + int LINEAR_BLOCK_SIZE> +__global__ void cusz::x_spline3d_infprecis_32x8x8data( + EITER errctrl, // input 1 + DIM3 errctrl_size, // + STRIDE3 errctrl_leap, // + TITER anchor, // input 2 + DIM3 anchor_size, // + STRIDE3 anchor_leap, // + TITER data, // output + DIM3 data_size, // + STRIDE3 data_leap, // + FP eb_r, + FP ebx2, + int radius) +{ + // compile time variables + using E = typename std::remove_pointer::type; + using T = typename std::remove_pointer::type; + + __shared__ struct { + E errctrl[9][9][33]; + T data[9][9][33]; + } shmem; + + x_reset_scratch_33x9x9data(shmem.data, shmem.errctrl, anchor, anchor_size, anchor_leap); + global2shmem_33x9x9data(errctrl, errctrl_size, errctrl_leap, shmem.errctrl); + cusz::device_api::spline3d_layout2_interpolate( + shmem.data, shmem.errctrl, eb_r, ebx2, radius); + shmem2global_32x8x8data(shmem.data, data, data_size, data_leap); +} + +#undef TIX +#undef TIY +#undef TIZ +#undef BIX +#undef BIY +#undef BIZ +#undef BDX +#undef BDY +#undef BDZ + +template +void launch_construct_Spline3( + T* data, + dim3 const len3, + T* anchor, + dim3 const an_len3, + E* errctrl, + dim3 const ec_len3, + double const eb, + int const radius, + float& time_elapsed, + cudaStream_t stream) +{ + auto divide3 = [](dim3 len, dim3 sublen) { + return dim3( + (len.x - 1) / sublen.x + 1, // + (len.y - 1) / sublen.y + 1, // + (len.z - 1) / sublen.z + 1); + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + constexpr auto SUBLEN_3D = dim3(32, 8, 8); + constexpr auto SEQ_3D = dim3(1, 8, 1); + constexpr auto BLOCK_3D = dim3(256, 1, 1); + auto GRID_3D = divide3(len3, SUBLEN_3D); + + { + constexpr auto SUBLEN_TOTAL = SUBLEN_3D.x * SUBLEN_3D.y * SUBLEN_3D.z; + constexpr auto SEQ_TOTAL = SEQ_3D.x * SEQ_3D.y * SEQ_3D.z; + constexpr auto BLOCK_TOTAL = BLOCK_3D.x * BLOCK_3D.y * BLOCK_3D.z; + + // static_assert(SUBLEN_TOTAL / SEQ_TOTAL == BLOCK_TOTAL, "parallelism does not match!"); + if (SUBLEN_TOTAL / SEQ_TOTAL != BLOCK_TOTAL) throw std::runtime_error("parallelism does not match!"); + } + + //////////////////////////////////////// + + auto ebx2 = eb * 2; + auto eb_r = 1 / eb; + auto leap3 = dim3(1, len3.x, len3.x * len3.y); + auto ec_leap3 = dim3(1, ec_len3.x, ec_len3.x * ec_len3.y); + auto an_leap3 = dim3(1, an_len3.x, an_len3.x * an_len3.y); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + auto d = ndim(); + + if (d == 1) { // + throw std::runtime_error("Spline1 not implemented"); + } + else if (d == 2) { + throw std::runtime_error("Spline2 not implemented"); + } + else if (d == 3) { + cusz::c_spline3d_infprecis_32x8x8data // + <<>> // + (data, len3, leap3, // + errctrl, ec_len3, ec_leap3, // + anchor, an_leap3, // + eb_r, ebx2, radius); + } + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + TIME_ELAPSED_CUDAEVENT(&time_elapsed); + + DESTROY_CUDAEVENT_PAIR; +} + +template +void launch_reconstruct_Spline3( + T* xdata, + dim3 const len3, + T* anchor, + dim3 const an_len3, + E* errctrl, + dim3 const ec_len3, + double const eb, + int const radius, + float& time_elapsed, + cudaStream_t stream) +{ + auto divide3 = [](dim3 len, dim3 sublen) { + return dim3( + (len.x - 1) / sublen.x + 1, // + (len.y - 1) / sublen.y + 1, // + (len.z - 1) / sublen.z + 1); + }; + + /* + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + */ + + constexpr auto SUBLEN_3D = dim3(32, 8, 8); + constexpr auto SEQ_3D = dim3(1, 8, 1); + constexpr auto BLOCK_3D = dim3(256, 1, 1); + auto GRID_3D = divide3(len3, SUBLEN_3D); + + { + constexpr auto SUBLEN_TOTAL = SUBLEN_3D.x * SUBLEN_3D.y * SUBLEN_3D.z; + constexpr auto SEQ_TOTAL = SEQ_3D.x * SEQ_3D.y * SEQ_3D.z; + constexpr auto BLOCK_TOTAL = BLOCK_3D.x * BLOCK_3D.y * BLOCK_3D.z; + + // static_assert(SUBLEN_TOTAL / SEQ_TOTAL == BLOCK_TOTAL, "parallelism does not match!"); + if (SUBLEN_TOTAL / SEQ_TOTAL != BLOCK_TOTAL) throw std::runtime_error("parallelism does not match!"); + } + + //////////////////////////////////////// + + auto ebx2 = eb * 2; + auto eb_r = 1 / eb; + auto leap3 = dim3(1, len3.x, len3.x * len3.y); + auto ec_leap3 = dim3(1, ec_len3.x, ec_len3.x * ec_len3.y); + auto an_leap3 = dim3(1, an_len3.x, an_len3.x * an_len3.y); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + cusz::x_spline3d_infprecis_32x8x8data // + <<>> // + (errctrl, ec_len3, ec_leap3, // + anchor, an_len3, an_leap3, // + xdata, len3, leap3, // + eb_r, ebx2, radius); + + STOP_CUDAEVENT_RECORDING(stream); + + CHECK_CUDA(cudaStreamSynchronize(stream)); + + TIME_ELAPSED_CUDAEVENT(&time_elapsed); + DESTROY_CUDAEVENT_PAIR; +} + +#endif diff --git a/qtensor/compression/cusz/src/kernel/detail/subroutine.inl b/qtensor/compression/cusz/src/kernel/detail/subroutine.inl index 2aa5bb5c..15d10ade 100644 --- a/qtensor/compression/cusz/src/kernel/detail/subroutine.inl +++ b/qtensor/compression/cusz/src/kernel/detail/subroutine.inl @@ -1,1074 +1,1074 @@ -/** - * @file subroutine.inl - * @author Jiannan Tian - * @brief subroutines of kernels - * @version 0.4 - * @date 2022-12-22 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include -#include -#include "cusz/pn.hh" -#include "pipeline/compaction_g.inl" -#include "subsub.inl" - -namespace psz { -namespace cuda { -namespace __device { - -//////// 1D - -namespace v0 { - -// compression load -template -__forceinline__ __device__ void load_prequant_1d( - T* data, - uint32_t dimx, - uint32_t id_base, - volatile T* shmem, - T private_buffer[SEQ], - T& prev, - FP ebx2_r); - -// decompression load -template -__forceinline__ __device__ void load_fuse_1d( - EQ* quant, - T* outlier, - uint32_t dimx, - uint32_t id_base, - int radius, - volatile T* shmem, - T private_buffer[SEQ]); - -namespace delta_only { - -template -__forceinline__ __device__ void -load_1d(EQ* quant, uint32_t dimx, uint32_t id_base, volatile T* shmem, T private_buffer[SEQ]); - -} - -// compression and decompression store -template -__forceinline__ __device__ void write_1d( // - volatile T1* shmem_a1, - volatile T2* shmem_a2, - uint32_t dimx, - uint32_t id_base, - T1* a1, - T2* a2); - -// compression pred-quant, method 1 -template -__forceinline__ __device__ void predict_quantize__no_outlier_1d( // - T private_buffer[SEQ], - volatile EQ* shmem_quant, - T prev = 0); - -// compression pred-quant, method 2 -template -__forceinline__ __device__ void predict_quantize_1d( // - T private_buffer[SEQ], - volatile EQ* shmem_quant, - volatile T* shmem_outlier, - int radius, - T prev = 0); - -namespace compaction { - -template < - typename T, - typename EQ, - int SEQ, - bool FIRST_POINT, - typename Compaction = CompactionDRAM> -__forceinline__ __device__ void predict_quantize_1d( // - T thp_buffer[SEQ], - volatile EQ* s_quant, - uint32_t dimx, - int radius, - uint32_t g_id_base, - Compaction g_outlier, - T prev = 0); - -} - -// decompression pred-quant -template -__forceinline__ __device__ void block_scan_1d( - T private_buffer[SEQ], - T ebx2, - volatile T* exchange_in, - volatile T* exchange_out, - volatile T* shmem_buffer); - -} // namespace v0 - -namespace v1_pn { - -template -__forceinline__ __device__ void -load_fuse_1d(EQ* quant, T* outlier, uint32_t dimx, uint32_t id_base, volatile T* shmem, T private_buffer[SEQ]); - -template -__forceinline__ __device__ void -predict_quantize__no_outlier_1d(T private_buffer[SEQ], volatile EQ* shmem_quant, T prev); - -template -__forceinline__ __device__ void -predict_quantize_1d(T private_buffer[SEQ], volatile EQ* shmem_quant, volatile T* shmem_outlier, int radius, T prev); - -namespace compaction { - -template -__forceinline__ __device__ void predict_quantize_1d( - T thp_buffer[SEQ], - volatile EQ* s_quant, - uint32_t dimx, - int radius, - uint32_t g_idx_base, - Compaction outlier, - T prev); - -} - -namespace delta_only { - -template -__forceinline__ __device__ void -load_1d(EQ* quant, uint32_t dimx, uint32_t id_base, volatile T* shmem, T private_buffer[SEQ]); - -} - -} // namespace v1_pn - -//////// 2D - -namespace v0 { - -template -__forceinline__ __device__ void load_prequant_2d( - T* data, - uint32_t dimx, - uint32_t gix, - uint32_t dimy, - uint32_t giy_base, - uint32_t stridey, - FP ebx2_r, - T center[YSEQ + 1]); - -template -__forceinline__ __device__ void predict_2d(T center[YSEQ + 1]); - -template -__forceinline__ __device__ void quantize_write_2d( - T delta[YSEQ + 1], - uint32_t dimx, - uint32_t gix, - uint32_t dimy, - uint32_t giy_base, - uint32_t stridey, - int radius, - EQ* quant, - T* outlier); - -namespace delta_only { - -template -__forceinline__ __device__ void quantize_write_2d( - T delta[YSEQ + 1], - uint32_t dimx, - uint32_t gix, - uint32_t dimy, - uint32_t giy_base, - uint32_t stridey, - EQ* quant); - -} - -namespace compaction { - -template -__forceinline__ __device__ void quantize_write_2d( - T delta[YSEQ + 1], - uint32_t dimx, - uint32_t gix, - uint32_t dimy, - uint32_t giy_base, - uint32_t stridey, - int radius, - EQ* quant, - Compaction outlier); - -}; - -// decompression load -template -__forceinline__ __device__ void load_fuse_2d( - EQ* quant, - T* outlier, - uint32_t dimx, - uint32_t gix, - uint32_t dimy, - uint32_t giy_base, - uint32_t stridey, - int radius, - T private_buffer[YSEQ]); - -namespace delta_only { -// decompression load -template -__forceinline__ __device__ void load_2d( - EQ* quant, - uint32_t dimx, - uint32_t gix, - uint32_t dimy, - uint32_t giy_base, - uint32_t stridey, - T private_buffer[YSEQ]); - -} // namespace delta_only - -template -__forceinline__ __device__ void block_scan_2d( // - T thread_private[YSEQ], - volatile T* intermediate, - FP ebx2); - -template -__forceinline__ __device__ void decomp_write_2d( - T thread_private[YSEQ], - uint32_t dimx, - uint32_t gix, - uint32_t dimy, - uint32_t giy_base, - uint32_t stridey, - T* xdata); - -} // namespace v0 - -namespace v1_pn { - -namespace compaction { -template -__forceinline__ __device__ void quantize_write_2d( - // clang-format off - T delta[YSEQ + 1], - uint32_t dimx, uint32_t gix, - uint32_t dimy, uint32_t giy_base, uint32_t stridey, - int radius, - EQ* quant, - Compaction outlier - // clang-format on -); - -} - -template -__forceinline__ __device__ void load_fuse_2d( - // clang-format off - EQ* quant, - T* outlier, - uint32_t dimx, uint32_t gix, - uint32_t dimy, uint32_t giy_base, uint32_t stridey, - int radius, - T thread_private[YSEQ] - // clang-format on -); - -namespace delta_only { - -template -__forceinline__ __device__ void load_2d( - // clang-format off - EQ* quant, - uint32_t dimx, uint32_t gix, - uint32_t dimy, uint32_t giy_base, uint32_t stridey, - T thread_private[YSEQ] - // clang-format on -); - -template -__forceinline__ __device__ void quantize_write_2d( - T delta[YSEQ + 1], - uint32_t dimx, - uint32_t gix, - uint32_t dimy, - uint32_t giy_base, - uint32_t stridey, - EQ* quant); - -} // namespace delta_only - -} // namespace v1_pn - -//////// 3D - -namespace v0 { - -// TODO move subroutines for 3D here - -} - -} // namespace __device -} // namespace cuda -} // namespace psz - -//////////////////////////////////////////////////////////////////////////////// - -//////// 1D - -template -__forceinline__ __device__ void psz::cuda::__device::v0::load_prequant_1d( - T* data, - uint32_t dimx, - uint32_t id_base, - volatile T* shmem, - T private_buffer[SEQ], - T& prev, // TODO use pointer? - FP ebx2_r) -{ -#pragma unroll - for (auto i = 0; i < SEQ; i++) { - auto id = id_base + threadIdx.x + i * NTHREAD; - if (id < dimx) shmem[threadIdx.x + i * NTHREAD] = round(data[id] * ebx2_r); - } - __syncthreads(); - -#pragma unroll - for (auto i = 0; i < SEQ; i++) private_buffer[i] = shmem[threadIdx.x * SEQ + i]; - if (threadIdx.x > 0) prev = shmem[threadIdx.x * SEQ - 1]; - __syncthreads(); -} - -template -__forceinline__ __device__ void psz::cuda::__device::v0::load_fuse_1d( - EQ* quant, - T* outlier, - uint32_t dimx, - uint32_t id_base, - int radius, - volatile T* shmem, - T private_buffer[SEQ]) -{ -#pragma unroll - for (auto i = 0; i < SEQ; i++) { - auto local_id = threadIdx.x + i * NTHREAD; - auto id = id_base + local_id; - if (id < dimx) shmem[local_id] = outlier[id] + static_cast(quant[id]) - radius; - } - __syncthreads(); - -#pragma unroll - for (auto i = 0; i < SEQ; i++) private_buffer[i] = shmem[threadIdx.x * SEQ + i]; - __syncthreads(); -} - -template -__forceinline__ __device__ void psz::cuda::__device::v1_pn::load_fuse_1d( - EQ* quant, - T* outlier, - uint32_t dimx, - uint32_t id_base, - volatile T* shmem, - T private_buffer[SEQ]) -{ - constexpr auto BYTEWIDTH = sizeof(EQ); - - using UI = EQ; - using I = typename psz::typing::Int::T; - -#pragma unroll - for (auto i = 0; i < SEQ; i++) { - auto local_id = threadIdx.x + i * NTHREAD; - auto id = id_base + local_id; - if (id < dimx) shmem[local_id] = outlier[id] + PN::decode(quant[id]); - } - __syncthreads(); - -#pragma unroll - for (auto i = 0; i < SEQ; i++) private_buffer[i] = shmem[threadIdx.x * SEQ + i]; - __syncthreads(); -} - -template -__forceinline__ __device__ void psz::cuda::__device::v0::delta_only::load_1d( - EQ* quant, - uint32_t dimx, - uint32_t id_base, - volatile T* shmem, - T private_buffer[SEQ]) -{ -#pragma unroll - for (auto i = 0; i < SEQ; i++) { - auto local_id = threadIdx.x + i * NTHREAD; - auto id = id_base + local_id; - if (id < dimx) shmem[local_id] = static_cast(quant[id]); - } - __syncthreads(); - -#pragma unroll - for (auto i = 0; i < SEQ; i++) private_buffer[i] = shmem[threadIdx.x * SEQ + i]; - __syncthreads(); -} - -template -__forceinline__ __device__ void psz::cuda::__device::v1_pn::delta_only::load_1d( - EQ* quant, - uint32_t dimx, - uint32_t id_base, - volatile T* shmem, - T private_buffer[SEQ]) -{ - constexpr auto BYTEWIDTH = sizeof(EQ); - - using UI = EQ; - using I = typename psz::typing::Int::T; - -#pragma unroll - for (auto i = 0; i < SEQ; i++) { - auto local_id = threadIdx.x + i * NTHREAD; - auto id = id_base + local_id; - if (id < dimx) shmem[local_id] = PN::decode(quant[id]); - } - __syncthreads(); - -#pragma unroll - for (auto i = 0; i < SEQ; i++) private_buffer[i] = shmem[threadIdx.x * SEQ + i]; - __syncthreads(); -} - -template // TODO remove NO_OUTLIER, use nullable -__forceinline__ __device__ void psz::cuda::__device::v0::write_1d( - volatile T1* shmem_a1, - volatile T2* shmem_a2, - uint32_t dimx, - uint32_t id_base, - T1* a1, - T2* a2) -{ -#pragma unroll - for (auto i = 0; i < SEQ; i++) { - auto id = id_base + threadIdx.x + i * NTHREAD; - if (id < dimx) { - if (NO_OUTLIER) { // - a1[id] = shmem_a1[threadIdx.x + i * NTHREAD]; - } - else { - a1[id] = shmem_a1[threadIdx.x + i * NTHREAD]; - a2[id] = shmem_a2[threadIdx.x + i * NTHREAD]; - } - } - } -} - -template -__forceinline__ __device__ void psz::cuda::__device::v0::predict_quantize__no_outlier_1d( // - T private_buffer[SEQ], - volatile EQ* shmem_quant, - T prev) -{ - auto quantize_1d = [&](T& cur, T& prev, uint32_t idx) { - shmem_quant[idx + threadIdx.x * SEQ] = static_cast(cur - prev); - }; - - if (FIRST_POINT) { // i == 0 - quantize_1d(private_buffer[0], prev, 0); - } - else { -#pragma unroll - for (auto i = 1; i < SEQ; i++) quantize_1d(private_buffer[i], private_buffer[i - 1], i); - __syncthreads(); - } -} - -template -__forceinline__ __device__ void psz::cuda::__device::v0::predict_quantize_1d( - T private_buffer[SEQ], - volatile EQ* shmem_quant, - volatile T* shmem_outlier, - int radius, - T prev) -{ - auto quantize_1d = [&](T& cur, T& prev, uint32_t idx) { - T delta = cur - prev; - bool quantizable = fabs(delta) < radius; - T candidate = delta + radius; - - // otherwise, need to reset shared memory (to 0) - shmem_quant[idx + threadIdx.x * SEQ] = quantizable * static_cast(candidate); - shmem_outlier[idx + threadIdx.x * SEQ] = (not quantizable) * candidate; - }; - - if (FIRST_POINT) { // i == 0 - quantize_1d(private_buffer[0], prev, 0); - } - else { -#pragma unroll - for (auto i = 1; i < SEQ; i++) quantize_1d(private_buffer[i], private_buffer[i - 1], i); - __syncthreads(); - } -} - -template -__forceinline__ __device__ void psz::cuda::__device::v0::compaction::predict_quantize_1d( - T thp_buffer[SEQ], - volatile EQ* s_quant, - uint32_t dimx, // put x-related - int radius, - uint32_t g_idx_base, // TODO this file `id_base` to `g_idx_base` - Compaction outlier, - T prev) -{ - auto quantize_1d = [&](T& cur, T& prev, uint32_t inloop_idx) { - T delta = cur - prev; - bool quantizable = fabs(delta) < radius; - T candidate = delta + radius; - - auto inblock_idx = inloop_idx + threadIdx.x * SEQ; // TODO this file use `inblock_idx` - - // though quantizable, need to set non-quantizable position as 0 - s_quant[inblock_idx] = quantizable * static_cast(candidate); - - // very small chance running into this block - if (not quantizable) { - auto g_idx = inblock_idx + g_idx_base; - if (g_idx < dimx) { - auto cur_idx = atomicAdd(outlier.count, 1); - outlier.val[cur_idx] = candidate; - outlier.idx[cur_idx] = g_idx; - } - } - }; - - if (FIRST_POINT) { // i == 0 - quantize_1d(thp_buffer[0], prev, 0); - } - else { -#pragma unroll - for (auto i = 1; i < SEQ; i++) quantize_1d(thp_buffer[i], thp_buffer[i - 1], i); - __syncthreads(); // TODO move __syncthreads() outside this subroutine? - } -} - -template -__forceinline__ __device__ void psz::cuda::__device::v1_pn::compaction::predict_quantize_1d( - T thp_buffer[SEQ], - volatile EQ* s_quant, - uint32_t dimx, // put x-related - int radius, - uint32_t g_idx_base, // TODO this file `id_base` to `g_idx_base` - Compaction outlier, - T prev) -{ - constexpr auto BYTEWIDTH = sizeof(EQ); - - using UI = EQ; - using I = typename psz::typing::Int::T; - - auto quantize_1d = [&](T& cur, T& prev, uint32_t inloop_idx) { - T delta = cur - prev; - bool quantizable = fabs(delta) < radius; - UI UI_delta = PN::encode(static_cast(delta)); - - auto inblock_idx = inloop_idx + threadIdx.x * SEQ; // TODO this file use `inblock_idx` - - // though quantizable, need to set non-quantizable position as 0 - s_quant[inblock_idx] = quantizable * UI_delta; - - // very small chance running into this block - if (not quantizable) { - auto g_idx = inblock_idx + g_idx_base; - if (g_idx < dimx) { - auto cur_idx = atomicAdd(outlier.count, 1); - outlier.val[cur_idx] = delta; - outlier.idx[cur_idx] = g_idx; - } - } - }; - - if (FIRST_POINT) { // i == 0 - quantize_1d(thp_buffer[0], prev, 0); - } - else { -#pragma unroll - for (auto i = 1; i < SEQ; i++) quantize_1d(thp_buffer[i], thp_buffer[i - 1], i); - __syncthreads(); // TODO move __syncthreads() outside this subroutine? - } -} - -// decompression pred-quant -template -__forceinline__ __device__ void psz::cuda::__device::v0::block_scan_1d( - T private_buffer[SEQ], - T ebx2, - volatile T* exchange_in, - volatile T* exchange_out, - volatile T* shmem_buffer) -{ - namespace wave32 = psz::cuda::__device::wave32; - wave32::intrawarp_inclusivescan_1d(private_buffer); - wave32::intrablock_exclusivescan_1d(private_buffer, exchange_in, exchange_out); - - // put back to shmem -#pragma unroll - for (auto i = 0; i < SEQ; i++) shmem_buffer[threadIdx.x * SEQ + i] = private_buffer[i] * ebx2; - __syncthreads(); -} - -// v1_pn: quantization code uses PN::encode -template -__forceinline__ __device__ void psz::cuda::__device::v1_pn::predict_quantize__no_outlier_1d( // - T private_buffer[SEQ], - volatile EQ* shmem_quant, - T prev) -{ - constexpr auto BYTEWIDTH = sizeof(EQ); - - using UI = EQ; - using I = typename psz::typing::Int::T; - - auto quantize_1d = [&](T& cur, T& prev, uint32_t idx) { - UI UI_delta = PN::encode(static_cast(cur - prev)); - shmem_quant[idx + threadIdx.x * SEQ] = UI_delta; - }; - - if (FIRST_POINT) { // i == 0 - quantize_1d(private_buffer[0], prev, 0); - } - else { -#pragma unroll - for (auto i = 1; i < SEQ; i++) quantize_1d(private_buffer[i], private_buffer[i - 1], i); - __syncthreads(); - } -} - -// template -// __forceinline__ __device__ void psz::cuda::__device::v1_pn::predict_quantize_1d( -// T private_buffer[SEQ], -// volatile EQ* shmem_quant, -// volatile T* shmem_outlier, -// int radius, -// T prev) -// { -// constexpr auto BYTEWIDTH = sizeof(EQ); -// using UI = EQ; -// using I = typename psz::typing::Int::T; - -// auto quantize_1d = [&](T& cur, T& prev, uint32_t idx) { -// T delta = cur - prev; -// bool quantizable = fabs(delta) < radius; -// UI UI_delta = PN::encode(static_cast(delta)); - -// // otherwise, need to reset shared memory (to 0) -// shmem_quant[idx + threadIdx.x * SEQ] = quantizable * UI_delta; -// shmem_outlier[idx + threadIdx.x * SEQ] = (not quantizable) * delta; -// }; - -// if (FIRST_POINT) { // i == 0 -// quantize_1d(private_buffer[0], prev, 0); -// } -// else { -// #pragma unroll -// for (auto i = 1; i < SEQ; i++) quantize_1d(private_buffer[i], private_buffer[i - 1], i); -// __syncthreads(); -// } -// } - -//////////////////////////////////////////////////////////////////////////////// - -//////// 2D - -template -__forceinline__ __device__ void psz::cuda::__device::v0::load_prequant_2d( - // clang-format off - T* data, - uint32_t dimx, uint32_t gix, - uint32_t dimy, uint32_t giy_base, uint32_t stridey, - FP ebx2_r, - T center[YSEQ + 1] - // clang-format on -) -{ - auto g_id = [&](auto iy) { return (giy_base + iy) * stridey + gix; }; - - // use a warp as two half-warps - // block_dim = (16, 2, 1) makes a full warp internally - -#pragma unroll - for (auto iy = 0; iy < YSEQ; iy++) { - if (gix < dimx and giy_base + iy < dimy) center[iy + 1] = round(data[g_id(iy)] * ebx2_r); - } - auto tmp = __shfl_up_sync(0xffffffff, center[YSEQ], 16, 32); // same-warp, next-16 - if (threadIdx.y == 1) center[0] = tmp; -} - -template -__forceinline__ __device__ void psz::cuda::__device::v0::predict_2d(T center[YSEQ + 1]) -{ - /* - Lorenzo 2D (1-layer) illustration - NW N NE - notation W C E "->" to predict - -------- SW S SE - - normal data layout | considering register file - col(k-1) col(k) | thread(k-1) thread(k) - | - r(i-1) -west[i-1] +center[i-1] | -center(k-1)[i-1] +center(k)[i-1] - r(i ) +west[i] ->center[i] | +center(k-1)[i] ->center(k)[i] - - calculation - ----------- - delta = center[i] - (center[i-1] + west[i] - west[i-1]) - = (center[i] - center[i-1]) - (west[i] - west[i-1]) - - With center[i] -= center[i-1] and west[i] -= west[i-1], - delta = center[i] - west[i] - - For thread(k), - delta(k) = center(k)[i] - center(k-1)[i] - = center(k)[i] - SHFL_UP(center(k)[i], 1, HALF_WARP) - */ - -#pragma unroll - for (auto i = YSEQ; i > 0; i--) { - // with center[i-1] intact in this iteration - center[i] -= center[i - 1]; - // within a halfwarp (32/2) - auto west = __shfl_up_sync(0xffffffff, center[i], 1, 16); - if (threadIdx.x > 0) center[i] -= west; // delta - } - __syncthreads(); -} - -template -__forceinline__ __device__ void psz::cuda::__device::v0::quantize_write_2d( - // clang-format off - T delta[YSEQ + 1], - uint32_t dimx, uint32_t gix, - uint32_t dimy, uint32_t giy_base, uint32_t stridey, - int radius, - EQ* quant, - T* outlier - // clang-format on -) -{ - auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; - -#pragma unroll - for (auto i = 1; i < YSEQ + 1; i++) { - auto gid = get_gid(i - 1); - - if (gix < dimx and giy_base + (i - 1) < dimy) { - bool quantizable = fabs(delta[i]) < radius; - T candidate = delta[i] + radius; - - // outlier array is not in sparse form in this version - quant[gid] = quantizable * static_cast(candidate); - outlier[gid] = (not quantizable) * candidate; - } - } -} - -template -__forceinline__ __device__ void psz::cuda::__device::v0::delta_only::quantize_write_2d( - // clang-format off - T delta[YSEQ + 1], - uint32_t dimx, uint32_t gix, - uint32_t dimy, uint32_t giy_base, uint32_t stridey, - EQ* quant - // clang-format on -) -{ - auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; - -#pragma unroll - for (auto i = 1; i < YSEQ + 1; i++) { - auto gid = get_gid(i - 1); - if (gix < dimx and giy_base + (i - 1) < dimy) quant[gid] = static_cast(delta[i]); - } -} - -template -__forceinline__ __device__ void psz::cuda::__device::v1_pn::delta_only::quantize_write_2d( - // clang-format off - T delta[YSEQ + 1], - uint32_t dimx, uint32_t gix, - uint32_t dimy, uint32_t giy_base, uint32_t stridey, - EQ* quant - // clang-format on -) -{ - constexpr auto BYTEWIDTH = sizeof(EQ); - - using UI = EQ; - using I = typename psz::typing::Int::T; - - auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; - -#pragma unroll - for (auto i = 1; i < YSEQ + 1; i++) { - auto gid = get_gid(i - 1); - if (gix < dimx and giy_base + (i - 1) < dimy) quant[gid] = PN::encode(static_cast(delta[i])); - } -} - -template -__forceinline__ __device__ void psz::cuda::__device::v0::compaction::quantize_write_2d( - // clang-format off - T delta[YSEQ + 1], - uint32_t dimx, uint32_t gix, - uint32_t dimy, uint32_t giy_base, uint32_t stridey, - int radius, - EQ* quant, - Compaction outlier - // clang-format on -) -{ - auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; - -#pragma unroll - for (auto i = 1; i < YSEQ + 1; i++) { - auto gid = get_gid(i - 1); - - if (gix < dimx and giy_base + (i - 1) < dimy) { - bool quantizable = fabs(delta[i]) < radius; - T candidate = delta[i] + radius; - - // The non-quantizable is recorded as "0" (radius). - quant[gid] = quantizable * static_cast(candidate); - - if (not quantizable) { - auto cur_idx = atomicAdd(outlier.count, 1); - outlier.idx[cur_idx] = gid; - outlier.val[cur_idx] = candidate; - } - } - } -} - -template -__forceinline__ __device__ void psz::cuda::__device::v1_pn::compaction::quantize_write_2d( - // clang-format off - T delta[YSEQ + 1], - uint32_t dimx, uint32_t gix, - uint32_t dimy, uint32_t giy_base, uint32_t stridey, - int radius, - EQ* quant, - Compaction outlier - // clang-format on -) -{ - constexpr auto BYTEWIDTH = sizeof(EQ); - - using UI = EQ; - using I = typename psz::typing::Int::T; - - auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; - -#pragma unroll - for (auto i = 1; i < YSEQ + 1; i++) { - auto gid = get_gid(i - 1); - - if (gix < dimx and giy_base + (i - 1) < dimy) { - bool quantizable = fabs(delta[i]) < radius; - UI UI_delta = PN::encode(static_cast(delta[i])); - - // The non-quantizable is recorded as "0" (radius). - quant[gid] = quantizable * UI_delta; - - if (not quantizable) { - auto cur_idx = atomicAdd(outlier.count, 1); - outlier.idx[cur_idx] = gid; - outlier.val[cur_idx] = delta[i]; - } - } - } -} - -// load to thread-private array (fuse at the same time) -template -__forceinline__ __device__ void psz::cuda::__device::v0::load_fuse_2d( - // clang-format off - EQ* quant, - T* outlier, - uint32_t dimx, uint32_t gix, - uint32_t dimy, uint32_t giy_base, uint32_t stridey, - int radius, - T thread_private[YSEQ] - // clang-format on -) -{ - auto get_gid = [&](auto iy) { return (giy_base + iy) * stridey + gix; }; - -#pragma unroll - for (auto i = 0; i < YSEQ; i++) { - auto gid = get_gid(i); - // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously - if (gix < dimx and (giy_base + i) < dimy) - thread_private[i] = outlier[gid] + static_cast(quant[gid]) - radius; // fuse - else - thread_private[i] = 0; // TODO set as init state? - } -} - -// load to thread-private array (fuse at the same time) -template -__forceinline__ __device__ void psz::cuda::__device::v1_pn::load_fuse_2d( - // clang-format off - EQ* quant, - T* outlier, - uint32_t dimx, uint32_t gix, - uint32_t dimy, uint32_t giy_base, uint32_t stridey, - int radius, - T thread_private[YSEQ] - // clang-format on -) -{ - constexpr auto BYTEWIDTH = sizeof(EQ); - - using UI = EQ; - using I = typename psz::typing::Int::T; - - auto get_gid = [&](auto iy) { return (giy_base + iy) * stridey + gix; }; - -#pragma unroll - for (auto i = 0; i < YSEQ; i++) { - auto gid = get_gid(i); - // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously - if (gix < dimx and (giy_base + i) < dimy) - thread_private[i] = outlier[gid] + PN::decode(quant[gid]); // fuse - else - thread_private[i] = 0; // TODO set as init state? - } -} - -// load to thread-private array (fuse at the same time) -template -__forceinline__ __device__ void psz::cuda::__device::v0::delta_only::load_2d( - // clang-format off - EQ* quant, - uint32_t dimx, uint32_t gix, - uint32_t dimy, uint32_t giy_base, uint32_t stridey, - T thread_private[YSEQ] - // clang-format on -) -{ - auto get_gid = [&](auto iy) { return (giy_base + iy) * stridey + gix; }; - -#pragma unroll - for (auto i = 0; i < YSEQ; i++) { - auto gid = get_gid(i); - // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously - if (gix < dimx and (giy_base + i) < dimy) - thread_private[i] = static_cast(quant[gid]); - else - thread_private[i] = 0; // TODO set as init state? - } -} - -// load to thread-private array (fuse at the same time) -template -__forceinline__ __device__ void psz::cuda::__device::v1_pn::delta_only::load_2d( - // clang-format off - EQ* quant, - uint32_t dimx, uint32_t gix, - uint32_t dimy, uint32_t giy_base, uint32_t stridey, - T thread_private[YSEQ] - // clang-format on -) -{ - constexpr auto BYTEWIDTH = sizeof(EQ); - - using UI = EQ; - using I = typename psz::typing::Int::T; - - auto get_gid = [&](auto iy) { return (giy_base + iy) * stridey + gix; }; - -#pragma unroll - for (auto i = 0; i < YSEQ; i++) { - auto gid = get_gid(i); - // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously - if (gix < dimx and (giy_base + i) < dimy) - thread_private[i] = PN::decode(quant[gid]); - else - thread_private[i] = 0; // TODO set as init state? - } -} - -// partial-sum along y-axis, sequantially -// then, in-warp partial-sum along x-axis -template -__forceinline__ __device__ void -psz::cuda::__device::v0::block_scan_2d(T thread_private[YSEQ], volatile T* intermediate, FP ebx2) -{ - // ------> gix (x) - // - // | t(0,0) t(0,1) t(0,2) t(0,3) ... t(0,f) - // | - // | thp(0,0)[0] thp(0,0)[0] thp(0,0)[0] thp(0,0)[0] - // giy thp(0,0)[1] thp(0,0)[1] thp(0,0)[1] thp(0,0)[1] - // (y) | | | | - // thp(0,0)[7] thp(0,0)[7] thp(0,0)[7] thp(0,0)[7] - // - // | t(1,0) t(1,1) t(1,2) t(1,3) ... t(1,f) - // | - // | thp(1,0)[0] thp(1,0)[0] thp(1,0)[0] thp(1,0)[0] - // giy thp(1,0)[1] thp(1,0)[1] thp(1,0)[1] thp(1,0)[1] - // (y) | | | | - // thp(1,0)[7] thp(1,0)[7] thp(1,0)[7] thp(1,0)[7] - - constexpr auto BLOCK = 16; - - for (auto i = 1; i < YSEQ; i++) thread_private[i] += thread_private[i - 1]; - // two-pass: store for cross-thread-private update - // TODO shuffle up by 16 in the same warp - if (threadIdx.y == 0) intermediate[threadIdx.x] = thread_private[YSEQ - 1]; - __syncthreads(); - // broadcast the partial-sum result from a previous segment - if (threadIdx.y == 1) { - auto tmp = intermediate[threadIdx.x]; -#pragma unroll - for (auto i = 0; i < YSEQ; i++) thread_private[i] += tmp; // regression as pointer - } - // implicit sync as there is half-warp divergence - -#pragma unroll - for (auto i = 0; i < YSEQ; i++) { - for (auto d = 1; d < BLOCK; d *= 2) { - T n = __shfl_up_sync(0xffffffff, thread_private[i], d, 16); // half-warp shuffle - if (threadIdx.x >= d) thread_private[i] += n; - } - thread_private[i] *= ebx2; // scale accordingly - } -} - -// write to DRAM -template -__forceinline__ __device__ void psz::cuda::__device::v0::decomp_write_2d( - // clang-format off - T thread_private[YSEQ], - uint32_t dimx, uint32_t gix, - uint32_t dimy, uint32_t giy_base, uint32_t stridey, - T* xdata - // clang-format on -) -{ - auto get_gid = [&](auto iy) { return (giy_base + iy) * stridey + gix; }; - -#pragma unroll - for (auto i = 0; i < YSEQ; i++) { - auto gid = get_gid(i); - if (gix < dimx and (giy_base + i) < dimy) xdata[gid] = thread_private[i]; - } -} - -//////////////////////////////////////////////////////////////////////////////// - -//////// 3D +/** + * @file subroutine.inl + * @author Jiannan Tian + * @brief subroutines of kernels + * @version 0.4 + * @date 2022-12-22 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include +#include +#include "cusz/pn.hh" +#include "pipeline/compaction_g.inl" +#include "subsub.inl" + +namespace psz { +namespace cuda { +namespace __device { + +//////// 1D + +namespace v0 { + +// compression load +template +__forceinline__ __device__ void load_prequant_1d( + T* data, + uint32_t dimx, + uint32_t id_base, + volatile T* shmem, + T private_buffer[SEQ], + T& prev, + FP ebx2_r); + +// decompression load +template +__forceinline__ __device__ void load_fuse_1d( + EQ* quant, + T* outlier, + uint32_t dimx, + uint32_t id_base, + int radius, + volatile T* shmem, + T private_buffer[SEQ]); + +namespace delta_only { + +template +__forceinline__ __device__ void +load_1d(EQ* quant, uint32_t dimx, uint32_t id_base, volatile T* shmem, T private_buffer[SEQ]); + +} + +// compression and decompression store +template +__forceinline__ __device__ void write_1d( // + volatile T1* shmem_a1, + volatile T2* shmem_a2, + uint32_t dimx, + uint32_t id_base, + T1* a1, + T2* a2); + +// compression pred-quant, method 1 +template +__forceinline__ __device__ void predict_quantize__no_outlier_1d( // + T private_buffer[SEQ], + volatile EQ* shmem_quant, + T prev = 0); + +// compression pred-quant, method 2 +template +__forceinline__ __device__ void predict_quantize_1d( // + T private_buffer[SEQ], + volatile EQ* shmem_quant, + volatile T* shmem_outlier, + int radius, + T prev = 0); + +namespace compaction { + +template < + typename T, + typename EQ, + int SEQ, + bool FIRST_POINT, + typename Compaction = CompactionDRAM> +__forceinline__ __device__ void predict_quantize_1d( // + T thp_buffer[SEQ], + volatile EQ* s_quant, + uint32_t dimx, + int radius, + uint32_t g_id_base, + Compaction g_outlier, + T prev = 0); + +} + +// decompression pred-quant +template +__forceinline__ __device__ void block_scan_1d( + T private_buffer[SEQ], + T ebx2, + volatile T* exchange_in, + volatile T* exchange_out, + volatile T* shmem_buffer); + +} // namespace v0 + +namespace v1_pn { + +template +__forceinline__ __device__ void +load_fuse_1d(EQ* quant, T* outlier, uint32_t dimx, uint32_t id_base, volatile T* shmem, T private_buffer[SEQ]); + +template +__forceinline__ __device__ void +predict_quantize__no_outlier_1d(T private_buffer[SEQ], volatile EQ* shmem_quant, T prev); + +template +__forceinline__ __device__ void +predict_quantize_1d(T private_buffer[SEQ], volatile EQ* shmem_quant, volatile T* shmem_outlier, int radius, T prev); + +namespace compaction { + +template +__forceinline__ __device__ void predict_quantize_1d( + T thp_buffer[SEQ], + volatile EQ* s_quant, + uint32_t dimx, + int radius, + uint32_t g_idx_base, + Compaction outlier, + T prev); + +} + +namespace delta_only { + +template +__forceinline__ __device__ void +load_1d(EQ* quant, uint32_t dimx, uint32_t id_base, volatile T* shmem, T private_buffer[SEQ]); + +} + +} // namespace v1_pn + +//////// 2D + +namespace v0 { + +template +__forceinline__ __device__ void load_prequant_2d( + T* data, + uint32_t dimx, + uint32_t gix, + uint32_t dimy, + uint32_t giy_base, + uint32_t stridey, + FP ebx2_r, + T center[YSEQ + 1]); + +template +__forceinline__ __device__ void predict_2d(T center[YSEQ + 1]); + +template +__forceinline__ __device__ void quantize_write_2d( + T delta[YSEQ + 1], + uint32_t dimx, + uint32_t gix, + uint32_t dimy, + uint32_t giy_base, + uint32_t stridey, + int radius, + EQ* quant, + T* outlier); + +namespace delta_only { + +template +__forceinline__ __device__ void quantize_write_2d( + T delta[YSEQ + 1], + uint32_t dimx, + uint32_t gix, + uint32_t dimy, + uint32_t giy_base, + uint32_t stridey, + EQ* quant); + +} + +namespace compaction { + +template +__forceinline__ __device__ void quantize_write_2d( + T delta[YSEQ + 1], + uint32_t dimx, + uint32_t gix, + uint32_t dimy, + uint32_t giy_base, + uint32_t stridey, + int radius, + EQ* quant, + Compaction outlier); + +}; + +// decompression load +template +__forceinline__ __device__ void load_fuse_2d( + EQ* quant, + T* outlier, + uint32_t dimx, + uint32_t gix, + uint32_t dimy, + uint32_t giy_base, + uint32_t stridey, + int radius, + T private_buffer[YSEQ]); + +namespace delta_only { +// decompression load +template +__forceinline__ __device__ void load_2d( + EQ* quant, + uint32_t dimx, + uint32_t gix, + uint32_t dimy, + uint32_t giy_base, + uint32_t stridey, + T private_buffer[YSEQ]); + +} // namespace delta_only + +template +__forceinline__ __device__ void block_scan_2d( // + T thread_private[YSEQ], + volatile T* intermediate, + FP ebx2); + +template +__forceinline__ __device__ void decomp_write_2d( + T thread_private[YSEQ], + uint32_t dimx, + uint32_t gix, + uint32_t dimy, + uint32_t giy_base, + uint32_t stridey, + T* xdata); + +} // namespace v0 + +namespace v1_pn { + +namespace compaction { +template +__forceinline__ __device__ void quantize_write_2d( + // clang-format off + T delta[YSEQ + 1], + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + int radius, + EQ* quant, + Compaction outlier + // clang-format on +); + +} + +template +__forceinline__ __device__ void load_fuse_2d( + // clang-format off + EQ* quant, + T* outlier, + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + int radius, + T thread_private[YSEQ] + // clang-format on +); + +namespace delta_only { + +template +__forceinline__ __device__ void load_2d( + // clang-format off + EQ* quant, + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + T thread_private[YSEQ] + // clang-format on +); + +template +__forceinline__ __device__ void quantize_write_2d( + T delta[YSEQ + 1], + uint32_t dimx, + uint32_t gix, + uint32_t dimy, + uint32_t giy_base, + uint32_t stridey, + EQ* quant); + +} // namespace delta_only + +} // namespace v1_pn + +//////// 3D + +namespace v0 { + +// TODO move subroutines for 3D here + +} + +} // namespace __device +} // namespace cuda +} // namespace psz + +//////////////////////////////////////////////////////////////////////////////// + +//////// 1D + +template +__forceinline__ __device__ void psz::cuda::__device::v0::load_prequant_1d( + T* data, + uint32_t dimx, + uint32_t id_base, + volatile T* shmem, + T private_buffer[SEQ], + T& prev, // TODO use pointer? + FP ebx2_r) +{ +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto id = id_base + threadIdx.x + i * NTHREAD; + if (id < dimx) shmem[threadIdx.x + i * NTHREAD] = round(data[id] * ebx2_r); + } + __syncthreads(); + +#pragma unroll + for (auto i = 0; i < SEQ; i++) private_buffer[i] = shmem[threadIdx.x * SEQ + i]; + if (threadIdx.x > 0) prev = shmem[threadIdx.x * SEQ - 1]; + __syncthreads(); +} + +template +__forceinline__ __device__ void psz::cuda::__device::v0::load_fuse_1d( + EQ* quant, + T* outlier, + uint32_t dimx, + uint32_t id_base, + int radius, + volatile T* shmem, + T private_buffer[SEQ]) +{ +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto local_id = threadIdx.x + i * NTHREAD; + auto id = id_base + local_id; + if (id < dimx) shmem[local_id] = outlier[id] + static_cast(quant[id]) - radius; + } + __syncthreads(); + +#pragma unroll + for (auto i = 0; i < SEQ; i++) private_buffer[i] = shmem[threadIdx.x * SEQ + i]; + __syncthreads(); +} + +template +__forceinline__ __device__ void psz::cuda::__device::v1_pn::load_fuse_1d( + EQ* quant, + T* outlier, + uint32_t dimx, + uint32_t id_base, + volatile T* shmem, + T private_buffer[SEQ]) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto local_id = threadIdx.x + i * NTHREAD; + auto id = id_base + local_id; + if (id < dimx) shmem[local_id] = outlier[id] + PN::decode(quant[id]); + } + __syncthreads(); + +#pragma unroll + for (auto i = 0; i < SEQ; i++) private_buffer[i] = shmem[threadIdx.x * SEQ + i]; + __syncthreads(); +} + +template +__forceinline__ __device__ void psz::cuda::__device::v0::delta_only::load_1d( + EQ* quant, + uint32_t dimx, + uint32_t id_base, + volatile T* shmem, + T private_buffer[SEQ]) +{ +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto local_id = threadIdx.x + i * NTHREAD; + auto id = id_base + local_id; + if (id < dimx) shmem[local_id] = static_cast(quant[id]); + } + __syncthreads(); + +#pragma unroll + for (auto i = 0; i < SEQ; i++) private_buffer[i] = shmem[threadIdx.x * SEQ + i]; + __syncthreads(); +} + +template +__forceinline__ __device__ void psz::cuda::__device::v1_pn::delta_only::load_1d( + EQ* quant, + uint32_t dimx, + uint32_t id_base, + volatile T* shmem, + T private_buffer[SEQ]) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto local_id = threadIdx.x + i * NTHREAD; + auto id = id_base + local_id; + if (id < dimx) shmem[local_id] = PN::decode(quant[id]); + } + __syncthreads(); + +#pragma unroll + for (auto i = 0; i < SEQ; i++) private_buffer[i] = shmem[threadIdx.x * SEQ + i]; + __syncthreads(); +} + +template // TODO remove NO_OUTLIER, use nullable +__forceinline__ __device__ void psz::cuda::__device::v0::write_1d( + volatile T1* shmem_a1, + volatile T2* shmem_a2, + uint32_t dimx, + uint32_t id_base, + T1* a1, + T2* a2) +{ +#pragma unroll + for (auto i = 0; i < SEQ; i++) { + auto id = id_base + threadIdx.x + i * NTHREAD; + if (id < dimx) { + if (NO_OUTLIER) { // + a1[id] = shmem_a1[threadIdx.x + i * NTHREAD]; + } + else { + a1[id] = shmem_a1[threadIdx.x + i * NTHREAD]; + a2[id] = shmem_a2[threadIdx.x + i * NTHREAD]; + } + } + } +} + +template +__forceinline__ __device__ void psz::cuda::__device::v0::predict_quantize__no_outlier_1d( // + T private_buffer[SEQ], + volatile EQ* shmem_quant, + T prev) +{ + auto quantize_1d = [&](T& cur, T& prev, uint32_t idx) { + shmem_quant[idx + threadIdx.x * SEQ] = static_cast(cur - prev); + }; + + if (FIRST_POINT) { // i == 0 + quantize_1d(private_buffer[0], prev, 0); + } + else { +#pragma unroll + for (auto i = 1; i < SEQ; i++) quantize_1d(private_buffer[i], private_buffer[i - 1], i); + __syncthreads(); + } +} + +template +__forceinline__ __device__ void psz::cuda::__device::v0::predict_quantize_1d( + T private_buffer[SEQ], + volatile EQ* shmem_quant, + volatile T* shmem_outlier, + int radius, + T prev) +{ + auto quantize_1d = [&](T& cur, T& prev, uint32_t idx) { + T delta = cur - prev; + bool quantizable = fabs(delta) < radius; + T candidate = delta + radius; + + // otherwise, need to reset shared memory (to 0) + shmem_quant[idx + threadIdx.x * SEQ] = quantizable * static_cast(candidate); + shmem_outlier[idx + threadIdx.x * SEQ] = (not quantizable) * candidate; + }; + + if (FIRST_POINT) { // i == 0 + quantize_1d(private_buffer[0], prev, 0); + } + else { +#pragma unroll + for (auto i = 1; i < SEQ; i++) quantize_1d(private_buffer[i], private_buffer[i - 1], i); + __syncthreads(); + } +} + +template +__forceinline__ __device__ void psz::cuda::__device::v0::compaction::predict_quantize_1d( + T thp_buffer[SEQ], + volatile EQ* s_quant, + uint32_t dimx, // put x-related + int radius, + uint32_t g_idx_base, // TODO this file `id_base` to `g_idx_base` + Compaction outlier, + T prev) +{ + auto quantize_1d = [&](T& cur, T& prev, uint32_t inloop_idx) { + T delta = cur - prev; + bool quantizable = fabs(delta) < radius; + T candidate = delta + radius; + + auto inblock_idx = inloop_idx + threadIdx.x * SEQ; // TODO this file use `inblock_idx` + + // though quantizable, need to set non-quantizable position as 0 + s_quant[inblock_idx] = quantizable * static_cast(candidate); + + // very small chance running into this block + if (not quantizable) { + auto g_idx = inblock_idx + g_idx_base; + if (g_idx < dimx) { + auto cur_idx = atomicAdd(outlier.count, 1); + outlier.val[cur_idx] = candidate; + outlier.idx[cur_idx] = g_idx; + } + } + }; + + if (FIRST_POINT) { // i == 0 + quantize_1d(thp_buffer[0], prev, 0); + } + else { +#pragma unroll + for (auto i = 1; i < SEQ; i++) quantize_1d(thp_buffer[i], thp_buffer[i - 1], i); + __syncthreads(); // TODO move __syncthreads() outside this subroutine? + } +} + +template +__forceinline__ __device__ void psz::cuda::__device::v1_pn::compaction::predict_quantize_1d( + T thp_buffer[SEQ], + volatile EQ* s_quant, + uint32_t dimx, // put x-related + int radius, + uint32_t g_idx_base, // TODO this file `id_base` to `g_idx_base` + Compaction outlier, + T prev) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + + auto quantize_1d = [&](T& cur, T& prev, uint32_t inloop_idx) { + T delta = cur - prev; + bool quantizable = fabs(delta) < radius; + UI UI_delta = PN::encode(static_cast(delta)); + + auto inblock_idx = inloop_idx + threadIdx.x * SEQ; // TODO this file use `inblock_idx` + + // though quantizable, need to set non-quantizable position as 0 + s_quant[inblock_idx] = quantizable * UI_delta; + + // very small chance running into this block + if (not quantizable) { + auto g_idx = inblock_idx + g_idx_base; + if (g_idx < dimx) { + auto cur_idx = atomicAdd(outlier.count, 1); + outlier.val[cur_idx] = delta; + outlier.idx[cur_idx] = g_idx; + } + } + }; + + if (FIRST_POINT) { // i == 0 + quantize_1d(thp_buffer[0], prev, 0); + } + else { +#pragma unroll + for (auto i = 1; i < SEQ; i++) quantize_1d(thp_buffer[i], thp_buffer[i - 1], i); + __syncthreads(); // TODO move __syncthreads() outside this subroutine? + } +} + +// decompression pred-quant +template +__forceinline__ __device__ void psz::cuda::__device::v0::block_scan_1d( + T private_buffer[SEQ], + T ebx2, + volatile T* exchange_in, + volatile T* exchange_out, + volatile T* shmem_buffer) +{ + namespace wave32 = psz::cuda::__device::wave32; + wave32::intrawarp_inclusivescan_1d(private_buffer); + wave32::intrablock_exclusivescan_1d(private_buffer, exchange_in, exchange_out); + + // put back to shmem +#pragma unroll + for (auto i = 0; i < SEQ; i++) shmem_buffer[threadIdx.x * SEQ + i] = private_buffer[i] * ebx2; + __syncthreads(); +} + +// v1_pn: quantization code uses PN::encode +template +__forceinline__ __device__ void psz::cuda::__device::v1_pn::predict_quantize__no_outlier_1d( // + T private_buffer[SEQ], + volatile EQ* shmem_quant, + T prev) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + + auto quantize_1d = [&](T& cur, T& prev, uint32_t idx) { + UI UI_delta = PN::encode(static_cast(cur - prev)); + shmem_quant[idx + threadIdx.x * SEQ] = UI_delta; + }; + + if (FIRST_POINT) { // i == 0 + quantize_1d(private_buffer[0], prev, 0); + } + else { +#pragma unroll + for (auto i = 1; i < SEQ; i++) quantize_1d(private_buffer[i], private_buffer[i - 1], i); + __syncthreads(); + } +} + +// template +// __forceinline__ __device__ void psz::cuda::__device::v1_pn::predict_quantize_1d( +// T private_buffer[SEQ], +// volatile EQ* shmem_quant, +// volatile T* shmem_outlier, +// int radius, +// T prev) +// { +// constexpr auto BYTEWIDTH = sizeof(EQ); +// using UI = EQ; +// using I = typename psz::typing::Int::T; + +// auto quantize_1d = [&](T& cur, T& prev, uint32_t idx) { +// T delta = cur - prev; +// bool quantizable = fabs(delta) < radius; +// UI UI_delta = PN::encode(static_cast(delta)); + +// // otherwise, need to reset shared memory (to 0) +// shmem_quant[idx + threadIdx.x * SEQ] = quantizable * UI_delta; +// shmem_outlier[idx + threadIdx.x * SEQ] = (not quantizable) * delta; +// }; + +// if (FIRST_POINT) { // i == 0 +// quantize_1d(private_buffer[0], prev, 0); +// } +// else { +// #pragma unroll +// for (auto i = 1; i < SEQ; i++) quantize_1d(private_buffer[i], private_buffer[i - 1], i); +// __syncthreads(); +// } +// } + +//////////////////////////////////////////////////////////////////////////////// + +//////// 2D + +template +__forceinline__ __device__ void psz::cuda::__device::v0::load_prequant_2d( + // clang-format off + T* data, + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + FP ebx2_r, + T center[YSEQ + 1] + // clang-format on +) +{ + auto g_id = [&](auto iy) { return (giy_base + iy) * stridey + gix; }; + + // use a warp as two half-warps + // block_dim = (16, 2, 1) makes a full warp internally + +#pragma unroll + for (auto iy = 0; iy < YSEQ; iy++) { + if (gix < dimx and giy_base + iy < dimy) center[iy + 1] = round(data[g_id(iy)] * ebx2_r); + } + auto tmp = __shfl_up_sync(0xffffffff, center[YSEQ], 16, 32); // same-warp, next-16 + if (threadIdx.y == 1) center[0] = tmp; +} + +template +__forceinline__ __device__ void psz::cuda::__device::v0::predict_2d(T center[YSEQ + 1]) +{ + /* + Lorenzo 2D (1-layer) illustration + NW N NE + notation W C E "->" to predict + -------- SW S SE + + normal data layout | considering register file + col(k-1) col(k) | thread(k-1) thread(k) + | + r(i-1) -west[i-1] +center[i-1] | -center(k-1)[i-1] +center(k)[i-1] + r(i ) +west[i] ->center[i] | +center(k-1)[i] ->center(k)[i] + + calculation + ----------- + delta = center[i] - (center[i-1] + west[i] - west[i-1]) + = (center[i] - center[i-1]) - (west[i] - west[i-1]) + + With center[i] -= center[i-1] and west[i] -= west[i-1], + delta = center[i] - west[i] + + For thread(k), + delta(k) = center(k)[i] - center(k-1)[i] + = center(k)[i] - SHFL_UP(center(k)[i], 1, HALF_WARP) + */ + +#pragma unroll + for (auto i = YSEQ; i > 0; i--) { + // with center[i-1] intact in this iteration + center[i] -= center[i - 1]; + // within a halfwarp (32/2) + auto west = __shfl_up_sync(0xffffffff, center[i], 1, 16); + if (threadIdx.x > 0) center[i] -= west; // delta + } + __syncthreads(); +} + +template +__forceinline__ __device__ void psz::cuda::__device::v0::quantize_write_2d( + // clang-format off + T delta[YSEQ + 1], + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + int radius, + EQ* quant, + T* outlier + // clang-format on +) +{ + auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; + +#pragma unroll + for (auto i = 1; i < YSEQ + 1; i++) { + auto gid = get_gid(i - 1); + + if (gix < dimx and giy_base + (i - 1) < dimy) { + bool quantizable = fabs(delta[i]) < radius; + T candidate = delta[i] + radius; + + // outlier array is not in sparse form in this version + quant[gid] = quantizable * static_cast(candidate); + outlier[gid] = (not quantizable) * candidate; + } + } +} + +template +__forceinline__ __device__ void psz::cuda::__device::v0::delta_only::quantize_write_2d( + // clang-format off + T delta[YSEQ + 1], + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + EQ* quant + // clang-format on +) +{ + auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; + +#pragma unroll + for (auto i = 1; i < YSEQ + 1; i++) { + auto gid = get_gid(i - 1); + if (gix < dimx and giy_base + (i - 1) < dimy) quant[gid] = static_cast(delta[i]); + } +} + +template +__forceinline__ __device__ void psz::cuda::__device::v1_pn::delta_only::quantize_write_2d( + // clang-format off + T delta[YSEQ + 1], + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + EQ* quant + // clang-format on +) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + + auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; + +#pragma unroll + for (auto i = 1; i < YSEQ + 1; i++) { + auto gid = get_gid(i - 1); + if (gix < dimx and giy_base + (i - 1) < dimy) quant[gid] = PN::encode(static_cast(delta[i])); + } +} + +template +__forceinline__ __device__ void psz::cuda::__device::v0::compaction::quantize_write_2d( + // clang-format off + T delta[YSEQ + 1], + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + int radius, + EQ* quant, + Compaction outlier + // clang-format on +) +{ + auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; + +#pragma unroll + for (auto i = 1; i < YSEQ + 1; i++) { + auto gid = get_gid(i - 1); + + if (gix < dimx and giy_base + (i - 1) < dimy) { + bool quantizable = fabs(delta[i]) < radius; + T candidate = delta[i] + radius; + + // The non-quantizable is recorded as "0" (radius). + quant[gid] = quantizable * static_cast(candidate); + + if (not quantizable) { + auto cur_idx = atomicAdd(outlier.count, 1); + outlier.idx[cur_idx] = gid; + outlier.val[cur_idx] = candidate; + } + } + } +} + +template +__forceinline__ __device__ void psz::cuda::__device::v1_pn::compaction::quantize_write_2d( + // clang-format off + T delta[YSEQ + 1], + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + int radius, + EQ* quant, + Compaction outlier + // clang-format on +) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + + auto get_gid = [&](auto i) { return (giy_base + i) * stridey + gix; }; + +#pragma unroll + for (auto i = 1; i < YSEQ + 1; i++) { + auto gid = get_gid(i - 1); + + if (gix < dimx and giy_base + (i - 1) < dimy) { + bool quantizable = fabs(delta[i]) < radius; + UI UI_delta = PN::encode(static_cast(delta[i])); + + // The non-quantizable is recorded as "0" (radius). + quant[gid] = quantizable * UI_delta; + + if (not quantizable) { + auto cur_idx = atomicAdd(outlier.count, 1); + outlier.idx[cur_idx] = gid; + outlier.val[cur_idx] = delta[i]; + } + } + } +} + +// load to thread-private array (fuse at the same time) +template +__forceinline__ __device__ void psz::cuda::__device::v0::load_fuse_2d( + // clang-format off + EQ* quant, + T* outlier, + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + int radius, + T thread_private[YSEQ] + // clang-format on +) +{ + auto get_gid = [&](auto iy) { return (giy_base + iy) * stridey + gix; }; + +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + auto gid = get_gid(i); + // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously + if (gix < dimx and (giy_base + i) < dimy) + thread_private[i] = outlier[gid] + static_cast(quant[gid]) - radius; // fuse + else + thread_private[i] = 0; // TODO set as init state? + } +} + +// load to thread-private array (fuse at the same time) +template +__forceinline__ __device__ void psz::cuda::__device::v1_pn::load_fuse_2d( + // clang-format off + EQ* quant, + T* outlier, + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + int radius, + T thread_private[YSEQ] + // clang-format on +) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + + auto get_gid = [&](auto iy) { return (giy_base + iy) * stridey + gix; }; + +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + auto gid = get_gid(i); + // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously + if (gix < dimx and (giy_base + i) < dimy) + thread_private[i] = outlier[gid] + PN::decode(quant[gid]); // fuse + else + thread_private[i] = 0; // TODO set as init state? + } +} + +// load to thread-private array (fuse at the same time) +template +__forceinline__ __device__ void psz::cuda::__device::v0::delta_only::load_2d( + // clang-format off + EQ* quant, + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + T thread_private[YSEQ] + // clang-format on +) +{ + auto get_gid = [&](auto iy) { return (giy_base + iy) * stridey + gix; }; + +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + auto gid = get_gid(i); + // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously + if (gix < dimx and (giy_base + i) < dimy) + thread_private[i] = static_cast(quant[gid]); + else + thread_private[i] = 0; // TODO set as init state? + } +} + +// load to thread-private array (fuse at the same time) +template +__forceinline__ __device__ void psz::cuda::__device::v1_pn::delta_only::load_2d( + // clang-format off + EQ* quant, + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + T thread_private[YSEQ] + // clang-format on +) +{ + constexpr auto BYTEWIDTH = sizeof(EQ); + + using UI = EQ; + using I = typename psz::typing::Int::T; + + auto get_gid = [&](auto iy) { return (giy_base + iy) * stridey + gix; }; + +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + auto gid = get_gid(i); + // even if we hit the else branch, all threads in a warp hit the y-boundary simultaneously + if (gix < dimx and (giy_base + i) < dimy) + thread_private[i] = PN::decode(quant[gid]); + else + thread_private[i] = 0; // TODO set as init state? + } +} + +// partial-sum along y-axis, sequantially +// then, in-warp partial-sum along x-axis +template +__forceinline__ __device__ void +psz::cuda::__device::v0::block_scan_2d(T thread_private[YSEQ], volatile T* intermediate, FP ebx2) +{ + // ------> gix (x) + // + // | t(0,0) t(0,1) t(0,2) t(0,3) ... t(0,f) + // | + // | thp(0,0)[0] thp(0,0)[0] thp(0,0)[0] thp(0,0)[0] + // giy thp(0,0)[1] thp(0,0)[1] thp(0,0)[1] thp(0,0)[1] + // (y) | | | | + // thp(0,0)[7] thp(0,0)[7] thp(0,0)[7] thp(0,0)[7] + // + // | t(1,0) t(1,1) t(1,2) t(1,3) ... t(1,f) + // | + // | thp(1,0)[0] thp(1,0)[0] thp(1,0)[0] thp(1,0)[0] + // giy thp(1,0)[1] thp(1,0)[1] thp(1,0)[1] thp(1,0)[1] + // (y) | | | | + // thp(1,0)[7] thp(1,0)[7] thp(1,0)[7] thp(1,0)[7] + + constexpr auto BLOCK = 16; + + for (auto i = 1; i < YSEQ; i++) thread_private[i] += thread_private[i - 1]; + // two-pass: store for cross-thread-private update + // TODO shuffle up by 16 in the same warp + if (threadIdx.y == 0) intermediate[threadIdx.x] = thread_private[YSEQ - 1]; + __syncthreads(); + // broadcast the partial-sum result from a previous segment + if (threadIdx.y == 1) { + auto tmp = intermediate[threadIdx.x]; +#pragma unroll + for (auto i = 0; i < YSEQ; i++) thread_private[i] += tmp; // regression as pointer + } + // implicit sync as there is half-warp divergence + +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + for (auto d = 1; d < BLOCK; d *= 2) { + T n = __shfl_up_sync(0xffffffff, thread_private[i], d, 16); // half-warp shuffle + if (threadIdx.x >= d) thread_private[i] += n; + } + thread_private[i] *= ebx2; // scale accordingly + } +} + +// write to DRAM +template +__forceinline__ __device__ void psz::cuda::__device::v0::decomp_write_2d( + // clang-format off + T thread_private[YSEQ], + uint32_t dimx, uint32_t gix, + uint32_t dimy, uint32_t giy_base, uint32_t stridey, + T* xdata + // clang-format on +) +{ + auto get_gid = [&](auto iy) { return (giy_base + iy) * stridey + gix; }; + +#pragma unroll + for (auto i = 0; i < YSEQ; i++) { + auto gid = get_gid(i); + if (gix < dimx and (giy_base + i) < dimy) xdata[gid] = thread_private[i]; + } +} + +//////////////////////////////////////////////////////////////////////////////// + +//////// 3D diff --git a/qtensor/compression/cusz/src/kernel/detail/subsub.inl b/qtensor/compression/cusz/src/kernel/detail/subsub.inl index 4d34fdc6..e8da624f 100644 --- a/qtensor/compression/cusz/src/kernel/detail/subsub.inl +++ b/qtensor/compression/cusz/src/kernel/detail/subsub.inl @@ -1,92 +1,92 @@ -/** - * @file subsub.inl - * @author Jiannan Tian - * @brief - * @version 0.4 - * @date 2022-12-26 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -namespace psz { -namespace cuda { -namespace __device { - -namespace wave32 { -template -__forceinline__ __device__ void intrawarp_inclusivescan_1d( // - T private_buffer[SEQ]); - -template -__forceinline__ __device__ void intrablock_exclusivescan_1d( // - T private_buffer[SEQ], - volatile T* exchange_in, - volatile T* exchange_out); -} // namespace wave32 - -} // namespace __device -} // namespace cuda -} // namespace psz - -template -__forceinline__ __device__ void psz::cuda::__device::wave32::intrawarp_inclusivescan_1d(T private_buffer[SEQ]) -{ - for (auto i = 1; i < SEQ; i++) private_buffer[i] += private_buffer[i - 1]; - T addend = private_buffer[SEQ - 1]; - - // in-warp shuffle - for (auto d = 1; d < 32; d *= 2) { - T n = __shfl_up_sync(0xffffffff, addend, d, 32); - if (threadIdx.x % 32 >= d) addend += n; - } - // exclusive scan - T prev_addend = __shfl_up_sync(0xffffffff, addend, 1, 32); - - // propagate - if (threadIdx.x % 32 > 0) - for (auto i = 0; i < SEQ; i++) private_buffer[i] += prev_addend; -} - -template -__forceinline__ __device__ void psz::cuda::__device::wave32::intrablock_exclusivescan_1d( - T private_buffer[SEQ], - volatile T* exchange_in, - volatile T* exchange_out) -{ - constexpr auto NWARP = NTHREAD / 32; - static_assert(NWARP <= 32, "too big"); - - auto warp_id = threadIdx.x / 32; - auto lane_id = threadIdx.x % 32; - - if (lane_id == 31) exchange_in[warp_id] = private_buffer[SEQ - 1]; - __syncthreads(); - - if (NWARP <= 8) { - if (threadIdx.x == 0) { - exchange_out[0] = 0; - for (auto i = 1; i < NWARP; i++) exchange_out[i] = exchange_out[i - 1] + exchange_in[i - 1]; - } - } - else if (NWARP <= 32) { - if (threadIdx.x <= 32) { - auto addend = exchange_in[threadIdx.x]; - - for (auto d = 1; d < 32; d *= 2) { - T n = __shfl_up_sync(0xffffffff, addend, d, 32); - if (threadIdx.x >= d) addend += n; - } - // exclusive scan - T prev_addend = __shfl_up_sync(0xffffffff, addend, 1, 32); - exchange_out[warp_id] = (warp_id > 0) * prev_addend; - } - } - // else-case handled by static_assert - __syncthreads(); - - // propagate - auto addend = exchange_out[warp_id]; - for (auto i = 0; i < SEQ; i++) private_buffer[i] += addend; - __syncthreads(); -}; +/** + * @file subsub.inl + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2022-12-26 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +namespace psz { +namespace cuda { +namespace __device { + +namespace wave32 { +template +__forceinline__ __device__ void intrawarp_inclusivescan_1d( // + T private_buffer[SEQ]); + +template +__forceinline__ __device__ void intrablock_exclusivescan_1d( // + T private_buffer[SEQ], + volatile T* exchange_in, + volatile T* exchange_out); +} // namespace wave32 + +} // namespace __device +} // namespace cuda +} // namespace psz + +template +__forceinline__ __device__ void psz::cuda::__device::wave32::intrawarp_inclusivescan_1d(T private_buffer[SEQ]) +{ + for (auto i = 1; i < SEQ; i++) private_buffer[i] += private_buffer[i - 1]; + T addend = private_buffer[SEQ - 1]; + + // in-warp shuffle + for (auto d = 1; d < 32; d *= 2) { + T n = __shfl_up_sync(0xffffffff, addend, d, 32); + if (threadIdx.x % 32 >= d) addend += n; + } + // exclusive scan + T prev_addend = __shfl_up_sync(0xffffffff, addend, 1, 32); + + // propagate + if (threadIdx.x % 32 > 0) + for (auto i = 0; i < SEQ; i++) private_buffer[i] += prev_addend; +} + +template +__forceinline__ __device__ void psz::cuda::__device::wave32::intrablock_exclusivescan_1d( + T private_buffer[SEQ], + volatile T* exchange_in, + volatile T* exchange_out) +{ + constexpr auto NWARP = NTHREAD / 32; + static_assert(NWARP <= 32, "too big"); + + auto warp_id = threadIdx.x / 32; + auto lane_id = threadIdx.x % 32; + + if (lane_id == 31) exchange_in[warp_id] = private_buffer[SEQ - 1]; + __syncthreads(); + + if (NWARP <= 8) { + if (threadIdx.x == 0) { + exchange_out[0] = 0; + for (auto i = 1; i < NWARP; i++) exchange_out[i] = exchange_out[i - 1] + exchange_in[i - 1]; + } + } + else if (NWARP <= 32) { + if (threadIdx.x <= 32) { + auto addend = exchange_in[threadIdx.x]; + + for (auto d = 1; d < 32; d *= 2) { + T n = __shfl_up_sync(0xffffffff, addend, d, 32); + if (threadIdx.x >= d) addend += n; + } + // exclusive scan + T prev_addend = __shfl_up_sync(0xffffffff, addend, 1, 32); + exchange_out[warp_id] = (warp_id > 0) * prev_addend; + } + } + // else-case handled by static_assert + __syncthreads(); + + // propagate + auto addend = exchange_out[warp_id]; + for (auto i = 0; i < SEQ; i++) private_buffer[i] += addend; + __syncthreads(); +}; diff --git a/qtensor/compression/cusz/src/kernel/lorenzo.cu b/qtensor/compression/cusz/src/kernel/lorenzo.cu index fe5e6a25..ff46e548 100644 --- a/qtensor/compression/cusz/src/kernel/lorenzo.cu +++ b/qtensor/compression/cusz/src/kernel/lorenzo.cu @@ -1,209 +1,209 @@ -/** - * @file lorenzo.cu - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-11-01 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "cusz/type.h" -#include "utils/cuda_err.cuh" -#include "utils/timer.h" - -#include "kernel/lorenzo_all.hh" - -// #include "detail/lorenzo.inl" -#include "detail/lorenzo23.inl" - -template -cusz_error_status compress_predict_lorenzo_i( - T* const data, - dim3 const len3, - double const eb, - int const radius, - EQ* const eq, - T* const outlier, - uint32_t* outlier_idx, - uint32_t* num_outliers, - float* time_elapsed, - cudaStream_t stream) -{ - auto divide3 = [](dim3 len, dim3 sublen) { - return dim3( - (len.x - 1) / sublen.x + 1, // - (len.y - 1) / sublen.y + 1, // - (len.z - 1) / sublen.z + 1); - }; - - auto ndim = [&]() { - if (len3.z == 1 and len3.y == 1) - return 1; - else if (len3.z == 1 and len3.y != 1) - return 2; - else - return 3; - }; - - constexpr auto SUBLEN_1D = 256; - constexpr auto SEQ_1D = 4; // x-sequentiality == 4 - constexpr auto BLOCK_1D = dim3(256 / 4, 1, 1); - auto GRID_1D = divide3(len3, SUBLEN_1D); - - constexpr auto SUBLEN_2D = dim3(16, 16, 1); - // constexpr auto SEQ_2D = dim3(1, 8, 1); // y-sequentiality == 8 - constexpr auto BLOCK_2D = dim3(16, 2, 1); - auto GRID_2D = divide3(len3, SUBLEN_2D); - - constexpr auto SUBLEN_3D = dim3(32, 8, 8); - // constexpr auto SEQ_3D = dim3(1, 8, 1); // y-sequentiality == 8 - // constexpr auto BLOCK_3D = dim3(32, 1, 8); // for v0 - constexpr auto BLOCK_3D = dim3(32, 8, 1); // for v0::r1_shfl - auto GRID_3D = divide3(len3, SUBLEN_3D); - - auto d = ndim(); - - // error bound - auto ebx2 = eb * 2; - auto ebx2_r = 1 / ebx2; - auto leap3 = dim3(1, len3.x, len3.x * len3.y); - - CREATE_CUDAEVENT_PAIR; - START_CUDAEVENT_RECORDING(stream); - - if (d == 1) { - //::cusz::c_lorenzo_1d1l - //<<>>(data, eq, outlier, len3, leap3, radius, ebx2_r); - - psz::cuda::__kernel::v0::c_lorenzo_1d1l - <<>>(data, len3, leap3, radius, ebx2_r, eq, outlier); - } - else if (d == 2) { - //::cusz::c_lorenzo_2d1l_16x16data_mapto16x2 - //<<>>(data, eq, outlier, len3, leap3, radius, ebx2_r); - psz::cuda::__kernel::v0::c_lorenzo_2d1l - <<>>(data, len3, leap3, radius, ebx2_r, eq, outlier); - } - else if (d == 3) { - //::cusz::c_lorenzo_3d1l_32x8x8data_mapto32x1x8 - //<<>>(data, eq, outlier, len3, leap3, radius, ebx2_r); - psz::cuda::__kernel::v0::c_lorenzo_3d1l - <<>>(data, len3, leap3, radius, ebx2_r, eq, outlier); - } - - STOP_CUDAEVENT_RECORDING(stream); - CHECK_CUDA(cudaStreamSynchronize(stream)); - TIME_ELAPSED_CUDAEVENT(time_elapsed); - DESTROY_CUDAEVENT_PAIR; - - return CUSZ_SUCCESS; -} - -template -cusz_error_status decompress_predict_lorenzo_i( - EQ* eq, - dim3 const len3, - T* outlier, - uint32_t* outlier_idx, - uint32_t const num_outliers, - double const eb, - int const radius, - T* xdata, - float* time_elapsed, - cudaStream_t stream) -{ - auto divide3 = [](dim3 len, dim3 sublen) { - return dim3( - (len.x - 1) / sublen.x + 1, // - (len.y - 1) / sublen.y + 1, // - (len.z - 1) / sublen.z + 1); - }; - - auto ndim = [&]() { - if (len3.z == 1 and len3.y == 1) - return 1; - else if (len3.z == 1 and len3.y != 1) - return 2; - else - return 3; - }; - - constexpr auto SUBLEN_1D = 256; - constexpr auto SEQ_1D = 8; // x-sequentiality == 8 - constexpr auto BLOCK_1D = dim3(256 / 8, 1, 1); - auto GRID_1D = divide3(len3, SUBLEN_1D); - - constexpr auto SUBLEN_2D = dim3(16, 16, 1); - // constexpr auto SEQ_2D = dim3(1, 8, 1); // y-sequentiality == 8 - constexpr auto BLOCK_2D = dim3(16, 2, 1); - auto GRID_2D = divide3(len3, SUBLEN_2D); - - constexpr auto SUBLEN_3D = dim3(32, 8, 8); - // constexpr auto SEQ_3D = dim3(1, 8, 1); // y-sequentiality == 8 - constexpr auto BLOCK_3D = dim3(32, 1, 8); - auto GRID_3D = divide3(len3, SUBLEN_3D); - - // error bound - auto ebx2 = eb * 2; - auto ebx2_r = 1 / ebx2; - auto leap3 = dim3(1, len3.x, len3.x * len3.y); - - auto d = ndim(); - - CREATE_CUDAEVENT_PAIR; - START_CUDAEVENT_RECORDING(stream); - - if (d == 1) { - //::cusz::x_lorenzo_1d1l - //<<>>(outlier, eq, xdata, len3, leap3, radius, ebx2); - psz::cuda::__kernel::v0::x_lorenzo_1d1l - <<>>(eq, outlier, len3, leap3, radius, ebx2, xdata); - } - else if (d == 2) { - //::cusz::x_lorenzo_2d1l_16x16data_mapto16x2 - //<<>>(outlier, eq, xdata, len3, leap3, radius, ebx2); - psz::cuda::__kernel::v0::x_lorenzo_2d1l - <<>>(eq, outlier, len3, leap3, radius, ebx2, xdata); - } - else if (d == 3) { - //::cusz::x_lorenzo_3d1l_32x8x8data_mapto32x1x8 - //<<>>(outlier, eq, xdata, len3, leap3, radius, ebx2); - psz::cuda::__kernel::v0::x_lorenzo_3d1l - <<>>(eq, outlier, len3, leap3, radius, ebx2, xdata); - } - - STOP_CUDAEVENT_RECORDING(stream); - CHECK_CUDA(cudaStreamSynchronize(stream)); - TIME_ELAPSED_CUDAEVENT(time_elapsed); - DESTROY_CUDAEVENT_PAIR; - - return CUSZ_SUCCESS; -} - -#define CPP_TEMPLATE_INIT_AND_C_WRAPPER(T, EQ) \ - template cusz_error_status compress_predict_lorenzo_i( \ - T* const data, dim3 const len3, double const eb, int const radius, EQ* const eq, T* const outlier, \ - uint32_t* outlier_idx, uint32_t* num_outliers, float* time_elapsed, cudaStream_t stream); \ - \ - template cusz_error_status decompress_predict_lorenzo_i( \ - EQ * eq, dim3 const len3, T* outlier, uint32_t* outlier_idx, uint32_t const num_outliers, double const eb, \ - int const radius, T* xdata, float* time_elapsed, cudaStream_t stream); - -// before 2023 -CPP_TEMPLATE_INIT_AND_C_WRAPPER(float, uint8_t); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(float, uint16_t); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(float, uint32_t); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(float, float); - -CPP_TEMPLATE_INIT_AND_C_WRAPPER(double, uint8_t); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(double, uint16_t); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(double, uint32_t); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(double, float); - -// 2023 -CPP_TEMPLATE_INIT_AND_C_WRAPPER(float, int32_t); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(double, int32_t); - -#undef CPP_TEMPLATE_INIT_AND_C_WRAPPER +/** + * @file lorenzo.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-01 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "cusz/type.h" +#include "utils/cuda_err.cuh" +#include "utils/timer.h" + +#include "kernel/lorenzo_all.hh" + +// #include "detail/lorenzo.inl" +#include "detail/lorenzo23.inl" + +template +cusz_error_status compress_predict_lorenzo_i( + T* const data, + dim3 const len3, + double const eb, + int const radius, + EQ* const eq, + T* const outlier, + uint32_t* outlier_idx, + uint32_t* num_outliers, + float* time_elapsed, + cudaStream_t stream) +{ + auto divide3 = [](dim3 len, dim3 sublen) { + return dim3( + (len.x - 1) / sublen.x + 1, // + (len.y - 1) / sublen.y + 1, // + (len.z - 1) / sublen.z + 1); + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + constexpr auto SUBLEN_1D = 256; + constexpr auto SEQ_1D = 4; // x-sequentiality == 4 + constexpr auto BLOCK_1D = dim3(256 / 4, 1, 1); + auto GRID_1D = divide3(len3, SUBLEN_1D); + + constexpr auto SUBLEN_2D = dim3(16, 16, 1); + // constexpr auto SEQ_2D = dim3(1, 8, 1); // y-sequentiality == 8 + constexpr auto BLOCK_2D = dim3(16, 2, 1); + auto GRID_2D = divide3(len3, SUBLEN_2D); + + constexpr auto SUBLEN_3D = dim3(32, 8, 8); + // constexpr auto SEQ_3D = dim3(1, 8, 1); // y-sequentiality == 8 + // constexpr auto BLOCK_3D = dim3(32, 1, 8); // for v0 + constexpr auto BLOCK_3D = dim3(32, 8, 1); // for v0::r1_shfl + auto GRID_3D = divide3(len3, SUBLEN_3D); + + auto d = ndim(); + + // error bound + auto ebx2 = eb * 2; + auto ebx2_r = 1 / ebx2; + auto leap3 = dim3(1, len3.x, len3.x * len3.y); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + if (d == 1) { + //::cusz::c_lorenzo_1d1l + //<<>>(data, eq, outlier, len3, leap3, radius, ebx2_r); + + psz::cuda::__kernel::v0::c_lorenzo_1d1l + <<>>(data, len3, leap3, radius, ebx2_r, eq, outlier); + } + else if (d == 2) { + //::cusz::c_lorenzo_2d1l_16x16data_mapto16x2 + //<<>>(data, eq, outlier, len3, leap3, radius, ebx2_r); + psz::cuda::__kernel::v0::c_lorenzo_2d1l + <<>>(data, len3, leap3, radius, ebx2_r, eq, outlier); + } + else if (d == 3) { + //::cusz::c_lorenzo_3d1l_32x8x8data_mapto32x1x8 + //<<>>(data, eq, outlier, len3, leap3, radius, ebx2_r); + psz::cuda::__kernel::v0::c_lorenzo_3d1l + <<>>(data, len3, leap3, radius, ebx2_r, eq, outlier); + } + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + TIME_ELAPSED_CUDAEVENT(time_elapsed); + DESTROY_CUDAEVENT_PAIR; + + return CUSZ_SUCCESS; +} + +template +cusz_error_status decompress_predict_lorenzo_i( + EQ* eq, + dim3 const len3, + T* outlier, + uint32_t* outlier_idx, + uint32_t const num_outliers, + double const eb, + int const radius, + T* xdata, + float* time_elapsed, + cudaStream_t stream) +{ + auto divide3 = [](dim3 len, dim3 sublen) { + return dim3( + (len.x - 1) / sublen.x + 1, // + (len.y - 1) / sublen.y + 1, // + (len.z - 1) / sublen.z + 1); + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + constexpr auto SUBLEN_1D = 256; + constexpr auto SEQ_1D = 8; // x-sequentiality == 8 + constexpr auto BLOCK_1D = dim3(256 / 8, 1, 1); + auto GRID_1D = divide3(len3, SUBLEN_1D); + + constexpr auto SUBLEN_2D = dim3(16, 16, 1); + // constexpr auto SEQ_2D = dim3(1, 8, 1); // y-sequentiality == 8 + constexpr auto BLOCK_2D = dim3(16, 2, 1); + auto GRID_2D = divide3(len3, SUBLEN_2D); + + constexpr auto SUBLEN_3D = dim3(32, 8, 8); + // constexpr auto SEQ_3D = dim3(1, 8, 1); // y-sequentiality == 8 + constexpr auto BLOCK_3D = dim3(32, 1, 8); + auto GRID_3D = divide3(len3, SUBLEN_3D); + + // error bound + auto ebx2 = eb * 2; + auto ebx2_r = 1 / ebx2; + auto leap3 = dim3(1, len3.x, len3.x * len3.y); + + auto d = ndim(); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + if (d == 1) { + //::cusz::x_lorenzo_1d1l + //<<>>(outlier, eq, xdata, len3, leap3, radius, ebx2); + psz::cuda::__kernel::v0::x_lorenzo_1d1l + <<>>(eq, outlier, len3, leap3, radius, ebx2, xdata); + } + else if (d == 2) { + //::cusz::x_lorenzo_2d1l_16x16data_mapto16x2 + //<<>>(outlier, eq, xdata, len3, leap3, radius, ebx2); + psz::cuda::__kernel::v0::x_lorenzo_2d1l + <<>>(eq, outlier, len3, leap3, radius, ebx2, xdata); + } + else if (d == 3) { + //::cusz::x_lorenzo_3d1l_32x8x8data_mapto32x1x8 + //<<>>(outlier, eq, xdata, len3, leap3, radius, ebx2); + psz::cuda::__kernel::v0::x_lorenzo_3d1l + <<>>(eq, outlier, len3, leap3, radius, ebx2, xdata); + } + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + TIME_ELAPSED_CUDAEVENT(time_elapsed); + DESTROY_CUDAEVENT_PAIR; + + return CUSZ_SUCCESS; +} + +#define CPP_TEMPLATE_INIT_AND_C_WRAPPER(T, EQ) \ + template cusz_error_status compress_predict_lorenzo_i( \ + T* const data, dim3 const len3, double const eb, int const radius, EQ* const eq, T* const outlier, \ + uint32_t* outlier_idx, uint32_t* num_outliers, float* time_elapsed, cudaStream_t stream); \ + \ + template cusz_error_status decompress_predict_lorenzo_i( \ + EQ * eq, dim3 const len3, T* outlier, uint32_t* outlier_idx, uint32_t const num_outliers, double const eb, \ + int const radius, T* xdata, float* time_elapsed, cudaStream_t stream); + +// before 2023 +CPP_TEMPLATE_INIT_AND_C_WRAPPER(float, uint8_t); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(float, uint16_t); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(float, uint32_t); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(float, float); + +CPP_TEMPLATE_INIT_AND_C_WRAPPER(double, uint8_t); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(double, uint16_t); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(double, uint32_t); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(double, float); + +// 2023 +CPP_TEMPLATE_INIT_AND_C_WRAPPER(float, int32_t); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(double, int32_t); + +#undef CPP_TEMPLATE_INIT_AND_C_WRAPPER diff --git a/qtensor/compression/cusz/src/kernel/lorenzo_proto.cu b/qtensor/compression/cusz/src/kernel/lorenzo_proto.cu index 3dcbadb3..061aebb4 100644 --- a/qtensor/compression/cusz/src/kernel/lorenzo_proto.cu +++ b/qtensor/compression/cusz/src/kernel/lorenzo_proto.cu @@ -1,176 +1,176 @@ -/** - * @file claunch_cuda_proto.cu - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-09-22 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "cusz/type.h" -#include "utils/cuda_err.cuh" -#include "utils/timer.h" - -#include "kernel/lorenzo_all.h" -#include "kernel/lorenzo_all.hh" - -#include "detail/lorenzo_proto.inl" - -template -cusz_error_status compress_predict_lorenzo_iproto( - T* const data, - dim3 const len3, - double const eb, - int const radius, - EQ* const eq, - T* outlier, - uint32_t* outlier_idx, - uint32_t* num_outliers, - float* time_elapsed, - cudaStream_t stream) -{ - auto divide3 = [](dim3 len, dim3 sublen) { - return dim3((len.x - 1) / sublen.x + 1, (len.y - 1) / sublen.y + 1, (len.z - 1) / sublen.z + 1); - }; - - auto ndim = [&]() { - if (len3.z == 1 and len3.y == 1) - return 1; - else if (len3.z == 1 and len3.y != 1) - return 2; - else - return 3; - }; - - constexpr auto SUBLEN_1D = 256; - constexpr auto BLOCK_1D = dim3(256, 1, 1); - auto GRID_1D = divide3(len3, SUBLEN_1D); - - constexpr auto SUBLEN_2D = dim3(16, 16, 1); - constexpr auto BLOCK_2D = dim3(16, 16, 1); - auto GRID_2D = divide3(len3, SUBLEN_2D); - - constexpr auto SUBLEN_3D = dim3(8, 8, 8); - constexpr auto BLOCK_3D = dim3(8, 8, 8); - auto GRID_3D = divide3(len3, SUBLEN_3D); - - // error bound - auto ebx2 = eb * 2; - auto ebx2_r = 1 / ebx2; - auto leap3 = dim3(1, len3.x, len3.x * len3.y); - - CREATE_CUDAEVENT_PAIR; - START_CUDAEVENT_RECORDING(stream); - - using namespace psz::cuda::__kernel::prototype; - - if (ndim() == 1) { - c_lorenzo_1d1l<<>>(data, len3, leap3, radius, ebx2_r, eq, outlier); - } - else if (ndim() == 2) { - c_lorenzo_2d1l<<>>(data, len3, leap3, radius, ebx2_r, eq, outlier); - } - else if (ndim() == 3) { - c_lorenzo_3d1l<<>>(data, len3, leap3, radius, ebx2_r, eq, outlier); - } - else { - throw std::runtime_error("Lorenzo only works for 123-D."); - } - - STOP_CUDAEVENT_RECORDING(stream); - CHECK_CUDA(cudaStreamSynchronize(stream)); - - TIME_ELAPSED_CUDAEVENT(time_elapsed); - DESTROY_CUDAEVENT_PAIR; - - return CUSZ_SUCCESS; -} - -template -cusz_error_status decompress_predict_lorenzo_iproto( - EQ* eq, - dim3 const len3, - T* outlier, - uint32_t* outlier_idx, - uint32_t const num_outliers, - double const eb, - int const radius, - T* xdata, - float* time_elapsed, - cudaStream_t stream) -{ - auto divide3 = [](dim3 len, dim3 sublen) { - return dim3((len.x - 1) / sublen.x + 1, (len.y - 1) / sublen.y + 1, (len.z - 1) / sublen.z + 1); - }; - - auto ndim = [&]() { - if (len3.z == 1 and len3.y == 1) - return 1; - else if (len3.z == 1 and len3.y != 1) - return 2; - else - return 3; - }; - - constexpr auto SUBLEN_1D = 256; - constexpr auto BLOCK_1D = dim3(256, 1, 1); - auto GRID_1D = divide3(len3, SUBLEN_1D); - - constexpr auto SUBLEN_2D = dim3(16, 16, 1); - constexpr auto BLOCK_2D = dim3(16, 16, 1); - auto GRID_2D = divide3(len3, SUBLEN_2D); - - constexpr auto SUBLEN_3D = dim3(8, 8, 8); - constexpr auto BLOCK_3D = dim3(8, 8, 8); - auto GRID_3D = divide3(len3, SUBLEN_3D); - - // error bound - auto ebx2 = eb * 2; - auto ebx2_r = 1 / ebx2; - auto leap3 = dim3(1, len3.x, len3.x * len3.y); - - CREATE_CUDAEVENT_PAIR; - START_CUDAEVENT_RECORDING(stream); - - using namespace psz::cuda::__kernel::prototype; - - if (ndim() == 1) { - x_lorenzo_1d1l<<>>(eq, outlier, len3, leap3, radius, ebx2, xdata); - } - else if (ndim() == 2) { - x_lorenzo_2d1l<<>>(eq, outlier, len3, leap3, radius, ebx2, xdata); - } - else if (ndim() == 3) { - x_lorenzo_3d1l<<>>(eq, outlier, len3, leap3, radius, ebx2, xdata); - } - - STOP_CUDAEVENT_RECORDING(stream); - CHECK_CUDA(cudaStreamSynchronize(stream)); - - TIME_ELAPSED_CUDAEVENT(time_elapsed); - DESTROY_CUDAEVENT_PAIR; - - return CUSZ_SUCCESS; -} - -#define CPP_TEMPLATE_INIT_AND_C_WRAPPER(Tliteral, Eliteral, FPliteral, T, EQ, FP) \ - template cusz_error_status compress_predict_lorenzo_iproto( \ - T* const, dim3 const, double const, int const, EQ* const, T* const, uint32_t*, uint32_t*, float*, \ - cudaStream_t); \ - \ - template cusz_error_status decompress_predict_lorenzo_iproto( \ - EQ*, dim3 const, T*, uint32_t*, uint32_t const, double const, int const, T*, float*, cudaStream_t); - -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui8, fp32, float, uint8_t, float); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui16, fp32, float, uint16_t, float); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui32, fp32, float, uint32_t, float); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, fp32, fp32, float, float, float); - -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui8, fp64, double, uint8_t, double); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui16, fp64, double, uint16_t, double); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui32, fp64, double, uint32_t, double); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, fp32, fp64, double, float, double); - -#undef CPP_TEMPLATE_INIT_AND_C_WRAPPER +/** + * @file claunch_cuda_proto.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-09-22 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "cusz/type.h" +#include "utils/cuda_err.cuh" +#include "utils/timer.h" + +#include "kernel/lorenzo_all.h" +#include "kernel/lorenzo_all.hh" + +#include "detail/lorenzo_proto.inl" + +template +cusz_error_status compress_predict_lorenzo_iproto( + T* const data, + dim3 const len3, + double const eb, + int const radius, + EQ* const eq, + T* outlier, + uint32_t* outlier_idx, + uint32_t* num_outliers, + float* time_elapsed, + cudaStream_t stream) +{ + auto divide3 = [](dim3 len, dim3 sublen) { + return dim3((len.x - 1) / sublen.x + 1, (len.y - 1) / sublen.y + 1, (len.z - 1) / sublen.z + 1); + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + constexpr auto SUBLEN_1D = 256; + constexpr auto BLOCK_1D = dim3(256, 1, 1); + auto GRID_1D = divide3(len3, SUBLEN_1D); + + constexpr auto SUBLEN_2D = dim3(16, 16, 1); + constexpr auto BLOCK_2D = dim3(16, 16, 1); + auto GRID_2D = divide3(len3, SUBLEN_2D); + + constexpr auto SUBLEN_3D = dim3(8, 8, 8); + constexpr auto BLOCK_3D = dim3(8, 8, 8); + auto GRID_3D = divide3(len3, SUBLEN_3D); + + // error bound + auto ebx2 = eb * 2; + auto ebx2_r = 1 / ebx2; + auto leap3 = dim3(1, len3.x, len3.x * len3.y); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + using namespace psz::cuda::__kernel::prototype; + + if (ndim() == 1) { + c_lorenzo_1d1l<<>>(data, len3, leap3, radius, ebx2_r, eq, outlier); + } + else if (ndim() == 2) { + c_lorenzo_2d1l<<>>(data, len3, leap3, radius, ebx2_r, eq, outlier); + } + else if (ndim() == 3) { + c_lorenzo_3d1l<<>>(data, len3, leap3, radius, ebx2_r, eq, outlier); + } + else { + throw std::runtime_error("Lorenzo only works for 123-D."); + } + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + TIME_ELAPSED_CUDAEVENT(time_elapsed); + DESTROY_CUDAEVENT_PAIR; + + return CUSZ_SUCCESS; +} + +template +cusz_error_status decompress_predict_lorenzo_iproto( + EQ* eq, + dim3 const len3, + T* outlier, + uint32_t* outlier_idx, + uint32_t const num_outliers, + double const eb, + int const radius, + T* xdata, + float* time_elapsed, + cudaStream_t stream) +{ + auto divide3 = [](dim3 len, dim3 sublen) { + return dim3((len.x - 1) / sublen.x + 1, (len.y - 1) / sublen.y + 1, (len.z - 1) / sublen.z + 1); + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + constexpr auto SUBLEN_1D = 256; + constexpr auto BLOCK_1D = dim3(256, 1, 1); + auto GRID_1D = divide3(len3, SUBLEN_1D); + + constexpr auto SUBLEN_2D = dim3(16, 16, 1); + constexpr auto BLOCK_2D = dim3(16, 16, 1); + auto GRID_2D = divide3(len3, SUBLEN_2D); + + constexpr auto SUBLEN_3D = dim3(8, 8, 8); + constexpr auto BLOCK_3D = dim3(8, 8, 8); + auto GRID_3D = divide3(len3, SUBLEN_3D); + + // error bound + auto ebx2 = eb * 2; + auto ebx2_r = 1 / ebx2; + auto leap3 = dim3(1, len3.x, len3.x * len3.y); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + using namespace psz::cuda::__kernel::prototype; + + if (ndim() == 1) { + x_lorenzo_1d1l<<>>(eq, outlier, len3, leap3, radius, ebx2, xdata); + } + else if (ndim() == 2) { + x_lorenzo_2d1l<<>>(eq, outlier, len3, leap3, radius, ebx2, xdata); + } + else if (ndim() == 3) { + x_lorenzo_3d1l<<>>(eq, outlier, len3, leap3, radius, ebx2, xdata); + } + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + TIME_ELAPSED_CUDAEVENT(time_elapsed); + DESTROY_CUDAEVENT_PAIR; + + return CUSZ_SUCCESS; +} + +#define CPP_TEMPLATE_INIT_AND_C_WRAPPER(Tliteral, Eliteral, FPliteral, T, EQ, FP) \ + template cusz_error_status compress_predict_lorenzo_iproto( \ + T* const, dim3 const, double const, int const, EQ* const, T* const, uint32_t*, uint32_t*, float*, \ + cudaStream_t); \ + \ + template cusz_error_status decompress_predict_lorenzo_iproto( \ + EQ*, dim3 const, T*, uint32_t*, uint32_t const, double const, int const, T*, float*, cudaStream_t); + +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui8, fp32, float, uint8_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui16, fp32, float, uint16_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui32, fp32, float, uint32_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, fp32, fp32, float, float, float); + +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui8, fp64, double, uint8_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui16, fp64, double, uint16_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui32, fp64, double, uint32_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, fp32, fp64, double, float, double); + +#undef CPP_TEMPLATE_INIT_AND_C_WRAPPER diff --git a/qtensor/compression/cusz/src/kernel/lorenzo_serial.cc b/qtensor/compression/cusz/src/kernel/lorenzo_serial.cc index b274bc23..0ef5b9f5 100644 --- a/qtensor/compression/cusz/src/kernel/lorenzo_serial.cc +++ b/qtensor/compression/cusz/src/kernel/lorenzo_serial.cc @@ -1,118 +1,118 @@ -/** - * @file lorenzo.cu - * @author Jiannan Tian - * @brief - * @version 0.4 - * @date 2023-03-16 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "detail/lorenzo_serial.inl" -#include "cusz/type.h" - -template > -cusz_error_status serial_compress_predict_lorenzo_i( - T* const data, - psz_dim3 const len3, - double const eb, - int const radius, - EQ* const eq, - OUTLIER* outlier, - float* time_elapsed) -{ - auto divide3 = [](psz_dim3 len, psz_dim3 sublen) { - return psz_dim3{(len.x - 1) / sublen.x + 1, (len.y - 1) / sublen.y + 1, (len.z - 1) / sublen.z + 1}; - }; - - auto ndim = [&]() { - if (len3.z == 1 and len3.y == 1) - return 1; - else if (len3.z == 1 and len3.y != 1) - return 2; - else - return 3; - }; - - auto d = ndim(); - - // error bound - auto ebx2 = eb * 2; - auto ebx2_r = 1 / ebx2; - auto leap3 = psz_dim3{1, len3.x, len3.x * len3.y}; - - if (d == 1) { - psz::serial::__kernel::c_lorenzo_1d1l(data, len3, leap3, radius, ebx2_r, eq, outlier); - } - else if (d == 2) { - psz::serial::__kernel::c_lorenzo_2d1l(data, len3, leap3, radius, ebx2_r, eq, outlier); - } - else if (d == 3) { - psz::serial::__kernel::c_lorenzo_3d1l(data, len3, leap3, radius, ebx2_r, eq, outlier); - } - - return CUSZ_SUCCESS; -} - -template -cusz_error_status serial_decompress_predict_lorenzo_i( - EQ* eq, - psz_dim3 const len3, - T* outlier, - double const eb, - int const radius, - T* xdata, - float* time_elapsed) -{ - auto divide3 = [](psz_dim3 len, psz_dim3 sublen) { - return psz_dim3{(len.x - 1) / sublen.x + 1, (len.y - 1) / sublen.y + 1, (len.z - 1) / sublen.z + 1}; - }; - - auto ndim = [&]() { - if (len3.z == 1 and len3.y == 1) - return 1; - else if (len3.z == 1 and len3.y != 1) - return 2; - else - return 3; - }; - - // error bound - auto ebx2 = eb * 2; - auto ebx2_r = 1 / ebx2; - auto leap3 = psz_dim3{1, len3.x, len3.x * len3.y}; - - auto d = ndim(); - - if (d == 1) { - psz::serial::__kernel::x_lorenzo_1d1l(eq, outlier, len3, leap3, radius, ebx2, xdata); - } - else if (d == 2) { - psz::serial::__kernel::x_lorenzo_2d1l(eq, outlier, len3, leap3, radius, ebx2, xdata); - } - else if (d == 3) { - psz::serial::__kernel::x_lorenzo_3d1l(eq, outlier, len3, leap3, radius, ebx2, xdata); - } - - return CUSZ_SUCCESS; -} - -#define CPP_TEMPLATE_INIT_AND_C_WRAPPER(Tliteral, Eliteral, FPliteral, T, EQ, FP) \ - template cusz_error_status serial_compress_predict_lorenzo_i( \ - T* const, psz_dim3 const, double const, int const, EQ* const, psz_outlier_serial*, float*); \ - \ - template cusz_error_status serial_decompress_predict_lorenzo_i( \ - EQ*, psz_dim3 const, T*, double const, int const, T*, float*); - -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui8, fp32, float, uint8_t, float); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui16, fp32, float, uint16_t, float); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui32, fp32, float, uint32_t, float); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, fp32, fp32, float, float, float); - -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui8, fp64, double, uint8_t, double); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui16, fp64, double, uint16_t, double); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui32, fp64, double, uint32_t, double); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, fp32, fp64, double, float, double); - -#undef CPP_TEMPLATE_INIT_AND_C_WRAPPER +/** + * @file lorenzo.cu + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-03-16 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "detail/lorenzo_serial.inl" +#include "cusz/type.h" + +template > +cusz_error_status serial_compress_predict_lorenzo_i( + T* const data, + psz_dim3 const len3, + double const eb, + int const radius, + EQ* const eq, + OUTLIER* outlier, + float* time_elapsed) +{ + auto divide3 = [](psz_dim3 len, psz_dim3 sublen) { + return psz_dim3{(len.x - 1) / sublen.x + 1, (len.y - 1) / sublen.y + 1, (len.z - 1) / sublen.z + 1}; + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + auto d = ndim(); + + // error bound + auto ebx2 = eb * 2; + auto ebx2_r = 1 / ebx2; + auto leap3 = psz_dim3{1, len3.x, len3.x * len3.y}; + + if (d == 1) { + psz::serial::__kernel::c_lorenzo_1d1l(data, len3, leap3, radius, ebx2_r, eq, outlier); + } + else if (d == 2) { + psz::serial::__kernel::c_lorenzo_2d1l(data, len3, leap3, radius, ebx2_r, eq, outlier); + } + else if (d == 3) { + psz::serial::__kernel::c_lorenzo_3d1l(data, len3, leap3, radius, ebx2_r, eq, outlier); + } + + return CUSZ_SUCCESS; +} + +template +cusz_error_status serial_decompress_predict_lorenzo_i( + EQ* eq, + psz_dim3 const len3, + T* outlier, + double const eb, + int const radius, + T* xdata, + float* time_elapsed) +{ + auto divide3 = [](psz_dim3 len, psz_dim3 sublen) { + return psz_dim3{(len.x - 1) / sublen.x + 1, (len.y - 1) / sublen.y + 1, (len.z - 1) / sublen.z + 1}; + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + // error bound + auto ebx2 = eb * 2; + auto ebx2_r = 1 / ebx2; + auto leap3 = psz_dim3{1, len3.x, len3.x * len3.y}; + + auto d = ndim(); + + if (d == 1) { + psz::serial::__kernel::x_lorenzo_1d1l(eq, outlier, len3, leap3, radius, ebx2, xdata); + } + else if (d == 2) { + psz::serial::__kernel::x_lorenzo_2d1l(eq, outlier, len3, leap3, radius, ebx2, xdata); + } + else if (d == 3) { + psz::serial::__kernel::x_lorenzo_3d1l(eq, outlier, len3, leap3, radius, ebx2, xdata); + } + + return CUSZ_SUCCESS; +} + +#define CPP_TEMPLATE_INIT_AND_C_WRAPPER(Tliteral, Eliteral, FPliteral, T, EQ, FP) \ + template cusz_error_status serial_compress_predict_lorenzo_i( \ + T* const, psz_dim3 const, double const, int const, EQ* const, psz_outlier_serial*, float*); \ + \ + template cusz_error_status serial_decompress_predict_lorenzo_i( \ + EQ*, psz_dim3 const, T*, double const, int const, T*, float*); + +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui8, fp32, float, uint8_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui16, fp32, float, uint16_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui32, fp32, float, uint32_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, fp32, fp32, float, float, float); + +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui8, fp64, double, uint8_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui16, fp64, double, uint16_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui32, fp64, double, uint32_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, fp32, fp64, double, float, double); + +#undef CPP_TEMPLATE_INIT_AND_C_WRAPPER diff --git a/qtensor/compression/cusz/src/kernel/lorenzo_var.cu b/qtensor/compression/cusz/src/kernel/lorenzo_var.cu index 8fc3ff39..12773d35 100644 --- a/qtensor/compression/cusz/src/kernel/lorenzo_var.cu +++ b/qtensor/compression/cusz/src/kernel/lorenzo_var.cu @@ -1,206 +1,206 @@ -/** - * @file lorenzo_var.cu - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-10-27 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "cusz/type.h" -#include "utils/cuda_err.cuh" -#include "utils/timer.h" - -#include "kernel/lorenzo_all.h" -#include "kernel/lorenzo_all.hh" - -#include "detail/lorenzo_var.inl" - -template -cusz_error_status asz::experimental::compress_predict_lorenzo_ivar( - T* data, - dim3 const len3, - double const eb, - DeltaT* delta, - bool* signum, - float* time_elapsed, - cudaStream_t stream) -{ - auto pardeg3 = [](dim3 len, dim3 sublen) { - return dim3( - (len.x - 1) / sublen.x + 1, // - (len.y - 1) / sublen.y + 1, // - (len.z - 1) / sublen.z + 1); - }; - - auto ndim = [&]() { - if (len3.z == 1 and len3.y == 1) - return 1; - else if (len3.z == 1 and len3.y != 1) - return 2; - else - return 3; - }; - - constexpr auto SUBLEN_1D = 256; - constexpr auto SEQ_1D = 4; // x-sequentiality == 4 - constexpr auto BLOCK_1D = dim3(256 / 4, 1, 1); - auto GRID_1D = pardeg3(len3, SUBLEN_1D); - - constexpr auto SUBLEN_2D = dim3(16, 16, 1); - // constexpr auto SEQ_2D = dim3(1, 8, 1); // y-sequentiality == 8 - constexpr auto BLOCK_2D = dim3(16, 2, 1); - auto GRID_2D = pardeg3(len3, SUBLEN_2D); - - constexpr auto SUBLEN_3D = dim3(32, 8, 8); - // constexpr auto SEQ_3D = dim3(1, 8, 1); // y-sequentiality == 8 - constexpr auto BLOCK_3D = dim3(32, 1, 8); - auto GRID_3D = pardeg3(len3, SUBLEN_3D); - - // error bound - auto ebx2 = eb * 2; - auto ebx2_r = 1 / ebx2; - auto leap3 = dim3(1, len3.x, len3.x * len3.y); - - CREATE_CUDAEVENT_PAIR; - START_CUDAEVENT_RECORDING(stream); - - if (ndim() == 1) { - cusz::experimental::c_lorenzo_1d1l // - <<>> // - (data, delta, signum, len3, leap3, ebx2_r); - } - else if (ndim() == 2) { - cusz::experimental::c_lorenzo_2d1l_16x16data_mapto16x2 // - <<>> // - (data, delta, signum, len3, leap3, ebx2_r); - } - else if (ndim() == 3) { - cusz::experimental::c_lorenzo_3d1l_32x8x8data_mapto32x1x8 // - <<>> // - (data, delta, signum, len3, leap3, ebx2_r); - } - else { - throw std::runtime_error("Lorenzo only works for 123-D."); - } - - STOP_CUDAEVENT_RECORDING(stream); - CHECK_CUDA(cudaStreamSynchronize(stream)); - - TIME_ELAPSED_CUDAEVENT(time_elapsed); - DESTROY_CUDAEVENT_PAIR; - - return CUSZ_SUCCESS; -} - -template -cusz_error_status asz::experimental::decompress_predict_lorenzo_ivar( - DeltaT* delta, - bool* signum, - dim3 const len3, - double const eb, - T* xdata, - float* time_elapsed, - cudaStream_t stream) -{ - auto pardeg3 = [](dim3 len, dim3 sublen) { - return dim3( - (len.x - 1) / sublen.x + 1, // - (len.y - 1) / sublen.y + 1, // - (len.z - 1) / sublen.z + 1); - }; - - auto ndim = [&]() { - if (len3.z == 1 and len3.y == 1) - return 1; - else if (len3.z == 1 and len3.y != 1) - return 2; - else - return 3; - }; - - constexpr auto SUBLEN_1D = 256; - // constexpr auto SEQ_1D = 8; // x-sequentiality == 8 - constexpr auto BLOCK_1D = dim3(256 / 8, 1, 1); - auto GRID_1D = pardeg3(len3, SUBLEN_1D); - - constexpr auto SUBLEN_2D = dim3(16, 16, 1); - // constexpr auto SEQ_2D = dim3(1, 8, 1); // y-sequentiality == 8 - constexpr auto BLOCK_2D = dim3(16, 2, 1); - auto GRID_2D = pardeg3(len3, SUBLEN_2D); - - constexpr auto SUBLEN_3D = dim3(32, 8, 8); - // constexpr auto SEQ_3D = dim3(1, 8, 1); // y-sequentiality == 8 - constexpr auto BLOCK_3D = dim3(32, 1, 8); - auto GRID_3D = pardeg3(len3, SUBLEN_3D); - - // error bound - auto ebx2 = eb * 2; - auto ebx2_r = 1 / ebx2; - auto leap3 = dim3(1, len3.x, len3.x * len3.y); - - CREATE_CUDAEVENT_PAIR; - START_CUDAEVENT_RECORDING(stream); - - if (ndim() == 1) { - cusz::experimental::x_lorenzo_1d1l // - <<>> // - (signum, delta, xdata, len3, leap3, ebx2); - } - else if (ndim() == 2) { - cusz::experimental::x_lorenzo_2d1l_16x16data_mapto16x2 // - <<>> // - (signum, delta, xdata, len3, leap3, ebx2); - } - else { - cusz::experimental::x_lorenzo_3d1l_32x8x8data_mapto32x1x8 // - <<>> // - (signum, delta, xdata, len3, leap3, ebx2); - } - - STOP_CUDAEVENT_RECORDING(stream); - CHECK_CUDA(cudaStreamSynchronize(stream)); - - TIME_ELAPSED_CUDAEVENT(time_elapsed); - DESTROY_CUDAEVENT_PAIR; - - return CUSZ_SUCCESS; -} - -#define CPP_TEMPLATE_INIT_AND_C_WRAPPER(Tliteral, Eliteral, FPliteral, T, E, FP) \ - template cusz_error_status asz::experimental::compress_predict_lorenzo_ivar( \ - T*, dim3 const, double const, E*, bool*, float*, cudaStream_t); \ - \ - template cusz_error_status asz::experimental::decompress_predict_lorenzo_ivar( \ - E*, bool*, dim3 const, double const, T*, float*, cudaStream_t); \ - \ - cusz_error_status compress_predict_lorenzo_ivar_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ - T* const data, dim3 const len3, double const eb, E* delta, bool* signum, float* time_elapsed, \ - cudaStream_t stream) \ - { \ - asz::experimental::compress_predict_lorenzo_ivar( \ - data, len3, eb, delta, signum, time_elapsed, stream); \ - return CUSZ_SUCCESS; \ - } \ - \ - cusz_error_status decompress_predict_lorenzo_ivar_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ - E* delta, bool* signum, dim3 const len3, double const eb, T* xdata, float* time_elapsed, cudaStream_t stream) \ - { \ - asz::experimental::decompress_predict_lorenzo_ivar( \ - delta, signum, len3, eb, xdata, time_elapsed, stream); \ - return CUSZ_SUCCESS; \ - } - -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui8, fp32, float, uint8_t, float); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui16, fp32, float, uint16_t, float); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui32, fp32, float, uint32_t, float); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, fp32, fp32, float, float, float); - -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui8, fp64, double, uint8_t, double); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui16, fp64, double, uint16_t, double); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui32, fp64, double, uint32_t, double); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, fp32, fp64, double, float, double); - -#undef CPP_TEMPLATE_INIT_AND_C_WRAPPER +/** + * @file lorenzo_var.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-27 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "cusz/type.h" +#include "utils/cuda_err.cuh" +#include "utils/timer.h" + +#include "kernel/lorenzo_all.h" +#include "kernel/lorenzo_all.hh" + +#include "detail/lorenzo_var.inl" + +template +cusz_error_status asz::experimental::compress_predict_lorenzo_ivar( + T* data, + dim3 const len3, + double const eb, + DeltaT* delta, + bool* signum, + float* time_elapsed, + cudaStream_t stream) +{ + auto pardeg3 = [](dim3 len, dim3 sublen) { + return dim3( + (len.x - 1) / sublen.x + 1, // + (len.y - 1) / sublen.y + 1, // + (len.z - 1) / sublen.z + 1); + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + constexpr auto SUBLEN_1D = 256; + constexpr auto SEQ_1D = 4; // x-sequentiality == 4 + constexpr auto BLOCK_1D = dim3(256 / 4, 1, 1); + auto GRID_1D = pardeg3(len3, SUBLEN_1D); + + constexpr auto SUBLEN_2D = dim3(16, 16, 1); + // constexpr auto SEQ_2D = dim3(1, 8, 1); // y-sequentiality == 8 + constexpr auto BLOCK_2D = dim3(16, 2, 1); + auto GRID_2D = pardeg3(len3, SUBLEN_2D); + + constexpr auto SUBLEN_3D = dim3(32, 8, 8); + // constexpr auto SEQ_3D = dim3(1, 8, 1); // y-sequentiality == 8 + constexpr auto BLOCK_3D = dim3(32, 1, 8); + auto GRID_3D = pardeg3(len3, SUBLEN_3D); + + // error bound + auto ebx2 = eb * 2; + auto ebx2_r = 1 / ebx2; + auto leap3 = dim3(1, len3.x, len3.x * len3.y); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + if (ndim() == 1) { + cusz::experimental::c_lorenzo_1d1l // + <<>> // + (data, delta, signum, len3, leap3, ebx2_r); + } + else if (ndim() == 2) { + cusz::experimental::c_lorenzo_2d1l_16x16data_mapto16x2 // + <<>> // + (data, delta, signum, len3, leap3, ebx2_r); + } + else if (ndim() == 3) { + cusz::experimental::c_lorenzo_3d1l_32x8x8data_mapto32x1x8 // + <<>> // + (data, delta, signum, len3, leap3, ebx2_r); + } + else { + throw std::runtime_error("Lorenzo only works for 123-D."); + } + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + TIME_ELAPSED_CUDAEVENT(time_elapsed); + DESTROY_CUDAEVENT_PAIR; + + return CUSZ_SUCCESS; +} + +template +cusz_error_status asz::experimental::decompress_predict_lorenzo_ivar( + DeltaT* delta, + bool* signum, + dim3 const len3, + double const eb, + T* xdata, + float* time_elapsed, + cudaStream_t stream) +{ + auto pardeg3 = [](dim3 len, dim3 sublen) { + return dim3( + (len.x - 1) / sublen.x + 1, // + (len.y - 1) / sublen.y + 1, // + (len.z - 1) / sublen.z + 1); + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + constexpr auto SUBLEN_1D = 256; + // constexpr auto SEQ_1D = 8; // x-sequentiality == 8 + constexpr auto BLOCK_1D = dim3(256 / 8, 1, 1); + auto GRID_1D = pardeg3(len3, SUBLEN_1D); + + constexpr auto SUBLEN_2D = dim3(16, 16, 1); + // constexpr auto SEQ_2D = dim3(1, 8, 1); // y-sequentiality == 8 + constexpr auto BLOCK_2D = dim3(16, 2, 1); + auto GRID_2D = pardeg3(len3, SUBLEN_2D); + + constexpr auto SUBLEN_3D = dim3(32, 8, 8); + // constexpr auto SEQ_3D = dim3(1, 8, 1); // y-sequentiality == 8 + constexpr auto BLOCK_3D = dim3(32, 1, 8); + auto GRID_3D = pardeg3(len3, SUBLEN_3D); + + // error bound + auto ebx2 = eb * 2; + auto ebx2_r = 1 / ebx2; + auto leap3 = dim3(1, len3.x, len3.x * len3.y); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + if (ndim() == 1) { + cusz::experimental::x_lorenzo_1d1l // + <<>> // + (signum, delta, xdata, len3, leap3, ebx2); + } + else if (ndim() == 2) { + cusz::experimental::x_lorenzo_2d1l_16x16data_mapto16x2 // + <<>> // + (signum, delta, xdata, len3, leap3, ebx2); + } + else { + cusz::experimental::x_lorenzo_3d1l_32x8x8data_mapto32x1x8 // + <<>> // + (signum, delta, xdata, len3, leap3, ebx2); + } + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + TIME_ELAPSED_CUDAEVENT(time_elapsed); + DESTROY_CUDAEVENT_PAIR; + + return CUSZ_SUCCESS; +} + +#define CPP_TEMPLATE_INIT_AND_C_WRAPPER(Tliteral, Eliteral, FPliteral, T, E, FP) \ + template cusz_error_status asz::experimental::compress_predict_lorenzo_ivar( \ + T*, dim3 const, double const, E*, bool*, float*, cudaStream_t); \ + \ + template cusz_error_status asz::experimental::decompress_predict_lorenzo_ivar( \ + E*, bool*, dim3 const, double const, T*, float*, cudaStream_t); \ + \ + cusz_error_status compress_predict_lorenzo_ivar_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + T* const data, dim3 const len3, double const eb, E* delta, bool* signum, float* time_elapsed, \ + cudaStream_t stream) \ + { \ + asz::experimental::compress_predict_lorenzo_ivar( \ + data, len3, eb, delta, signum, time_elapsed, stream); \ + return CUSZ_SUCCESS; \ + } \ + \ + cusz_error_status decompress_predict_lorenzo_ivar_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + E* delta, bool* signum, dim3 const len3, double const eb, T* xdata, float* time_elapsed, cudaStream_t stream) \ + { \ + asz::experimental::decompress_predict_lorenzo_ivar( \ + delta, signum, len3, eb, xdata, time_elapsed, stream); \ + return CUSZ_SUCCESS; \ + } + +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui8, fp32, float, uint8_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui16, fp32, float, uint16_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui32, fp32, float, uint32_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, fp32, fp32, float, float, float); + +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui8, fp64, double, uint8_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui16, fp64, double, uint16_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui32, fp64, double, uint32_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, fp32, fp64, double, float, double); + +#undef CPP_TEMPLATE_INIT_AND_C_WRAPPER diff --git a/qtensor/compression/cusz/src/kernel/preprocess.cuh b/qtensor/compression/cusz/src/kernel/preprocess.cuh index f7c321f7..f082c193 100644 --- a/qtensor/compression/cusz/src/kernel/preprocess.cuh +++ b/qtensor/compression/cusz/src/kernel/preprocess.cuh @@ -1,65 +1,65 @@ -/** - * @file preprocess.cuh - * @author Jiannan Tian - * @brief Filters for preprocessing of cuSZ. - * @version 0.3 - * @date 2020-09-20 - * (created) 2020-05-03 (rev) 2021-06-21 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#ifndef CUSZ_KERNEL_PREPROCESS_CUH -#define CUSZ_KERNEL_PREPROCESS_CUH - -#include - -#include "common.hh" - -using std::cout; -using std::endl; - -namespace cusz { - -#include - -template -__global__ void log_transform() -{ - static_assert(std::is_floating_point::value, "[log_transform] must be floating-point type."); -} - -template -__global__ void binning2d(Data* input, Data* output, size_t d0, size_t d1, size_t new_d0, size_t new_d1) -{ - auto y = threadIdx.y; - auto x = threadIdx.x; - auto yid = blockIdx.y * blockDim.y + y; - auto xid = blockIdx.x * blockDim.x + x; - - __shared__ Data s[tBLK][tBLK]; - - if (yid >= new_d1 or xid >= new_d0) return; - - int xblk = (xid + 1) * DOWNSCALE_FACTOR >= d0 ? d0 - xid * DOWNSCALE_FACTOR : DOWNSCALE_FACTOR; - int yblk = (yid + 1) * DOWNSCALE_FACTOR >= d1 ? d1 - yid * DOWNSCALE_FACTOR : DOWNSCALE_FACTOR; - s[y][x] = 0; - - for (int j = 0; j < yblk; j++) - for (int i = 0; i < xblk; i++) - s[y][x] += input[(yid * DOWNSCALE_FACTOR + j) * d0 + (xid * DOWNSCALE_FACTOR + i)]; - - output[yid * new_d0 + xid] = s[y][x] / static_cast(yblk * xblk); -} -} // namespace cusz - -template __global__ void cusz::binning2d(float*, float*, size_t, size_t, size_t, size_t); -template __global__ void cusz::binning2d(double*, double*, size_t, size_t, size_t, size_t); -// template __global__ void cusz::binning2d(I1*, I1*, size_t, size_t, size_t, size_t); -// template __global__ void cusz::binning2d(I2*, I2*, size_t, size_t, size_t, size_t); -// template __global__ void cusz::binning2d(I4*, I4*, size_t, size_t, size_t, size_t); -// template __global__ void cusz::binning2d(I8*, I8*, size_t, size_t, size_t, size_t); - -#endif +/** + * @file preprocess.cuh + * @author Jiannan Tian + * @brief Filters for preprocessing of cuSZ. + * @version 0.3 + * @date 2020-09-20 + * (created) 2020-05-03 (rev) 2021-06-21 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#ifndef CUSZ_KERNEL_PREPROCESS_CUH +#define CUSZ_KERNEL_PREPROCESS_CUH + +#include + +#include "common.hh" + +using std::cout; +using std::endl; + +namespace cusz { + +#include + +template +__global__ void log_transform() +{ + static_assert(std::is_floating_point::value, "[log_transform] must be floating-point type."); +} + +template +__global__ void binning2d(Data* input, Data* output, size_t d0, size_t d1, size_t new_d0, size_t new_d1) +{ + auto y = threadIdx.y; + auto x = threadIdx.x; + auto yid = blockIdx.y * blockDim.y + y; + auto xid = blockIdx.x * blockDim.x + x; + + __shared__ Data s[tBLK][tBLK]; + + if (yid >= new_d1 or xid >= new_d0) return; + + int xblk = (xid + 1) * DOWNSCALE_FACTOR >= d0 ? d0 - xid * DOWNSCALE_FACTOR : DOWNSCALE_FACTOR; + int yblk = (yid + 1) * DOWNSCALE_FACTOR >= d1 ? d1 - yid * DOWNSCALE_FACTOR : DOWNSCALE_FACTOR; + s[y][x] = 0; + + for (int j = 0; j < yblk; j++) + for (int i = 0; i < xblk; i++) + s[y][x] += input[(yid * DOWNSCALE_FACTOR + j) * d0 + (xid * DOWNSCALE_FACTOR + i)]; + + output[yid * new_d0 + xid] = s[y][x] / static_cast(yblk * xblk); +} +} // namespace cusz + +template __global__ void cusz::binning2d(float*, float*, size_t, size_t, size_t, size_t); +template __global__ void cusz::binning2d(double*, double*, size_t, size_t, size_t, size_t); +// template __global__ void cusz::binning2d(I1*, I1*, size_t, size_t, size_t, size_t); +// template __global__ void cusz::binning2d(I2*, I2*, size_t, size_t, size_t, size_t); +// template __global__ void cusz::binning2d(I4*, I4*, size_t, size_t, size_t, size_t); +// template __global__ void cusz::binning2d(I8*, I8*, size_t, size_t, size_t, size_t); + +#endif diff --git a/qtensor/compression/cusz/src/kernel/rle.cuh b/qtensor/compression/cusz/src/kernel/rle.cuh index 6f01cff4..f8fe36ed 100644 --- a/qtensor/compression/cusz/src/kernel/rle.cuh +++ b/qtensor/compression/cusz/src/kernel/rle.cuh @@ -1,74 +1,74 @@ -// modified from thrust example -// attach the license below when push to master branch -// https://github.com/NVIDIA/thrust/blob/main/LICENSE - -/** - * @file rle.cuh - * @author Jiannan Tian - * @brief - * @version 0.2 - * @date 2021-04-01 - * - * (C) 2021 by Washington State University, Argonne National Laboratory - * - */ - -#ifndef KERNEL_RLE_CUH -#define KERNEL_RLE_CUH - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -using const_gen = thrust::constant_iterator; -using counter = thrust::counting_iterator; - -namespace kernel { - -template -void RunLengthEncoding(T* d_fullfmt_data, const size_t N, T* d_compact_data, int* d_lengths, size_t& num_runs) -{ - thrust::device_ptr input = thrust::device_pointer_cast(d_fullfmt_data); - thrust::device_ptr output = thrust::device_pointer_cast(d_compact_data); - thrust::device_ptr lengths = thrust::device_pointer_cast(d_lengths); - // compute the output size (run lengths) - num_runs = thrust::reduce_by_key( - input, input + N, // input::key (symbol) - const_gen(1), // input::value (count) - output, // output::key (symbol) - lengths) // output::value (count) - .first - - output; -} - -template -void RunLengthDecoding(T* d_fullfmt_data, const size_t N, T* d_compact_data, int* d_lengths, const size_t num_runs) -{ - thrust::device_ptr output = thrust::device_pointer_cast(d_fullfmt_data); - thrust::device_ptr input = thrust::device_pointer_cast(d_compact_data); - thrust::device_ptr lengths = thrust::device_pointer_cast(d_lengths); - - // scan the lengths - thrust::inclusive_scan(lengths, lengths + num_runs, lengths); - - // compute input index for each output element - thrust::device_vector indices(N); - thrust::lower_bound( - lengths, lengths + N, // - counter(1), counter(N + 1), // - indices.begin()); - - thrust::encode(indices.begin(), indices.end(), input, output); -} - -} // namespace kernel - -#endif +// modified from thrust example +// attach the license below when push to master branch +// https://github.com/NVIDIA/thrust/blob/main/LICENSE + +/** + * @file rle.cuh + * @author Jiannan Tian + * @brief + * @version 0.2 + * @date 2021-04-01 + * + * (C) 2021 by Washington State University, Argonne National Laboratory + * + */ + +#ifndef KERNEL_RLE_CUH +#define KERNEL_RLE_CUH + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +using const_gen = thrust::constant_iterator; +using counter = thrust::counting_iterator; + +namespace kernel { + +template +void RunLengthEncoding(T* d_fullfmt_data, const size_t N, T* d_compact_data, int* d_lengths, size_t& num_runs) +{ + thrust::device_ptr input = thrust::device_pointer_cast(d_fullfmt_data); + thrust::device_ptr output = thrust::device_pointer_cast(d_compact_data); + thrust::device_ptr lengths = thrust::device_pointer_cast(d_lengths); + // compute the output size (run lengths) + num_runs = thrust::reduce_by_key( + input, input + N, // input::key (symbol) + const_gen(1), // input::value (count) + output, // output::key (symbol) + lengths) // output::value (count) + .first - + output; +} + +template +void RunLengthDecoding(T* d_fullfmt_data, const size_t N, T* d_compact_data, int* d_lengths, const size_t num_runs) +{ + thrust::device_ptr output = thrust::device_pointer_cast(d_fullfmt_data); + thrust::device_ptr input = thrust::device_pointer_cast(d_compact_data); + thrust::device_ptr lengths = thrust::device_pointer_cast(d_lengths); + + // scan the lengths + thrust::inclusive_scan(lengths, lengths + num_runs, lengths); + + // compute input index for each output element + thrust::device_vector indices(N); + thrust::lower_bound( + lengths, lengths + N, // + counter(1), counter(N + 1), // + indices.begin()); + + thrust::encode(indices.begin(), indices.end(), input, output); +} + +} // namespace kernel + +#endif diff --git a/qtensor/compression/cusz/src/kernel/spv_gpu.cu b/qtensor/compression/cusz/src/kernel/spv_gpu.cu index 96b665a7..29bcee1c 100644 --- a/qtensor/compression/cusz/src/kernel/spv_gpu.cu +++ b/qtensor/compression/cusz/src/kernel/spv_gpu.cu @@ -1,60 +1,60 @@ -/** - * @file spv_gpu.cu - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-10-29 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "../detail/spv_gpu.inl" -#include "kernel/spv_gpu.h" -#include "kernel/spv_gpu.hh" - -#define SPV(Tliteral, Mliteral, T, M) \ - void spv_gather_T##Tliteral##_M##Mliteral( \ - T* in, size_t const in_len, T* d_val, uint32_t* d_idx, int* nnz, float* milliseconds, cudaStream_t stream) \ - { \ - psz::detail::spv_gather(in, in_len, d_val, d_idx, nnz, milliseconds, stream); \ - } \ - \ - void spv_scatter_T##Tliteral##_M##Mliteral( \ - T* d_val, uint32_t* d_idx, int const nnz, T* decoded, float* milliseconds, cudaStream_t stream) \ - { \ - psz::detail::spv_scatter(d_val, d_idx, nnz, decoded, milliseconds, stream); \ - } - -SPV(ui8, ui32, uint8_t, uint32_t) -SPV(ui16, ui32, uint16_t, uint32_t) -SPV(ui32, ui32, uint32_t, uint32_t) -SPV(ui64, ui32, uint64_t, uint32_t) -SPV(fp32, ui32, float, uint32_t) -SPV(fp64, ui32, double, uint32_t) - -#undef SPV - -#define SPV(Tliteral, Mliteral, T, M) \ - template <> \ - void psz::spv_gather( \ - T * in, size_t const in_len, T* d_val, uint32_t* d_idx, int* nnz, float* milliseconds, cudaStream_t stream) \ - { \ - spv_gather_T##Tliteral##_M##Mliteral(in, in_len, d_val, d_idx, nnz, milliseconds, stream); \ - } \ - \ - template <> \ - void psz::spv_scatter( \ - T * d_val, uint32_t * d_idx, int const nnz, T* decoded, float* milliseconds, cudaStream_t stream) \ - { \ - spv_scatter_T##Tliteral##_M##Mliteral(d_val, d_idx, nnz, decoded, milliseconds, stream); \ - } - -SPV(ui8, ui32, uint8_t, uint32_t) -SPV(ui16, ui32, uint16_t, uint32_t) -SPV(ui32, ui32, uint32_t, uint32_t) -SPV(ui64, ui32, uint64_t, uint32_t) -SPV(fp32, ui32, float, uint32_t) -SPV(fp64, ui32, double, uint32_t) - -#undef SPV +/** + * @file spv_gpu.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-29 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/spv_gpu.inl" +#include "kernel/spv_gpu.h" +#include "kernel/spv_gpu.hh" + +#define SPV(Tliteral, Mliteral, T, M) \ + void spv_gather_T##Tliteral##_M##Mliteral( \ + T* in, size_t const in_len, T* d_val, uint32_t* d_idx, int* nnz, float* milliseconds, cudaStream_t stream) \ + { \ + psz::detail::spv_gather(in, in_len, d_val, d_idx, nnz, milliseconds, stream); \ + } \ + \ + void spv_scatter_T##Tliteral##_M##Mliteral( \ + T* d_val, uint32_t* d_idx, int const nnz, T* decoded, float* milliseconds, cudaStream_t stream) \ + { \ + psz::detail::spv_scatter(d_val, d_idx, nnz, decoded, milliseconds, stream); \ + } + +SPV(ui8, ui32, uint8_t, uint32_t) +SPV(ui16, ui32, uint16_t, uint32_t) +SPV(ui32, ui32, uint32_t, uint32_t) +SPV(ui64, ui32, uint64_t, uint32_t) +SPV(fp32, ui32, float, uint32_t) +SPV(fp64, ui32, double, uint32_t) + +#undef SPV + +#define SPV(Tliteral, Mliteral, T, M) \ + template <> \ + void psz::spv_gather( \ + T * in, size_t const in_len, T* d_val, uint32_t* d_idx, int* nnz, float* milliseconds, cudaStream_t stream) \ + { \ + spv_gather_T##Tliteral##_M##Mliteral(in, in_len, d_val, d_idx, nnz, milliseconds, stream); \ + } \ + \ + template <> \ + void psz::spv_scatter( \ + T * d_val, uint32_t * d_idx, int const nnz, T* decoded, float* milliseconds, cudaStream_t stream) \ + { \ + spv_scatter_T##Tliteral##_M##Mliteral(d_val, d_idx, nnz, decoded, milliseconds, stream); \ + } + +SPV(ui8, ui32, uint8_t, uint32_t) +SPV(ui16, ui32, uint16_t, uint32_t) +SPV(ui32, ui32, uint32_t, uint32_t) +SPV(ui64, ui32, uint64_t, uint32_t) +SPV(fp32, ui32, float, uint32_t) +SPV(fp64, ui32, double, uint32_t) + +#undef SPV diff --git a/qtensor/compression/cusz/src/kernel/v2_lorenzo.cu b/qtensor/compression/cusz/src/kernel/v2_lorenzo.cu index fb2c22ed..b7263613 100644 --- a/qtensor/compression/cusz/src/kernel/v2_lorenzo.cu +++ b/qtensor/compression/cusz/src/kernel/v2_lorenzo.cu @@ -1,118 +1,118 @@ -/** - * @file v2_lorenzo.cu - * @author Jiannan Tian - * @brief - * @version 0.4 - * @date 2023-01-23 - * - * (C) 2023 by Indiana University, Argonne National Laboratory - * - */ - -#include "cusz/type.h" -#include "utils/cuda_err.cuh" -#include "utils/timer.h" - -#include "kernel/lorenzo_all.hh" -#include "kernel/v2_lorenzo.hh" - -template -cusz_error_status v2_compress_predict_lorenzo_i( - T* const data, - dim3 const len3, - double const eb, - int const radius, - E* const errctrl, - dim3 const placeholder_2, - T* const anchor, - dim3 const placeholder_1, - CompactionDRAM outlier, - float* time_elapsed, - cudaStream_t stream) -{ - auto divide3 = [](dim3 len, dim3 sublen) { - return dim3( - (len.x - 1) / sublen.x + 1, // - (len.y - 1) / sublen.y + 1, // - (len.z - 1) / sublen.z + 1); - }; - - auto ndim = [&]() { - if (len3.z == 1 and len3.y == 1) - return 1; - else if (len3.z == 1 and len3.y != 1) - return 2; - else - return 3; - }; - - constexpr auto SUBLEN_1D = 256; - constexpr auto SEQ_1D = 4; // x-sequentiality == 4 - constexpr auto BLOCK_1D = dim3(256 / 4, 1, 1); - auto GRID_1D = divide3(len3, SUBLEN_1D); - - constexpr auto SUBLEN_2D = dim3(16, 16, 1); - constexpr auto BLOCK_2D = dim3(16, 2, 1); - auto GRID_2D = divide3(len3, SUBLEN_2D); - - constexpr auto SUBLEN_3D = dim3(32, 8, 8); - constexpr auto BLOCK_3D = dim3(32, 8, 1); // for v0::r1_shfl - auto GRID_3D = divide3(len3, SUBLEN_3D); - - auto d = ndim(); - - // error bound - auto ebx2 = eb * 2; - auto ebx2_r = 1 / ebx2; - auto leap3 = dim3(1, len3.x, len3.x * len3.y); - - CREATE_CUDAEVENT_PAIR; - START_CUDAEVENT_RECORDING(stream); - - if (d == 1) { - psz::cuda::__kernel::v0::compaction::c_lorenzo_1d1l - <<>>(data, len3, leap3, radius, ebx2_r, errctrl, outlier); - } - else if (d == 2) { - psz::cuda::__kernel::v0::compaction::c_lorenzo_2d1l - <<>>(data, len3, leap3, radius, ebx2_r, errctrl, outlier); - } - else if (d == 3) { - psz::cuda::__kernel::v0::compaction::c_lorenzo_3d1l - <<>>(data, len3, leap3, radius, ebx2_r, errctrl, outlier); - } - - STOP_CUDAEVENT_RECORDING(stream); - CHECK_CUDA(cudaStreamSynchronize(stream)); - TIME_ELAPSED_CUDAEVENT(time_elapsed); - DESTROY_CUDAEVENT_PAIR; - - return CUSZ_SUCCESS; -} - -#define CPP_TEMPLATE_INIT_AND_C_WRAPPER(Tliteral, Eliteral, FPliteral, T, E, FP) \ - template cusz_error_status v2_compress_predict_lorenzo_i( \ - T* const, dim3 const, double const, int const, E* const, dim3 const, T* const, dim3 const, \ - struct CompactionDRAM, float*, cudaStream_t); \ - \ - // cusz_error_status v2_compress_predict_lorenzo_i_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ - // T* const data, dim3 const len3, T* const anchor, dim3 const placeholder_1, E* const errctrl, \ - // dim3 const placeholder_2, T* outlier, double const eb, int const radius, float* time_elapsed, \ - // cudaStream_t stream) \ - // { \ - // return v2_compress_predict_lorenzo_i( \ - // data, len3, eb, radius, errctrl, placeholder_2, anchor, placeholder_1, outlier, nullptr, nullptr, \ - // time_elapsed, stream); \ - // } - -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui8, fp32, float, uint8_t, float); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui16, fp32, float, uint16_t, float); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui32, fp32, float, uint32_t, float); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, fp32, fp32, float, float, float); - -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui8, fp64, double, uint8_t, double); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui16, fp64, double, uint16_t, double); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui32, fp64, double, uint32_t, double); -CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, fp32, fp64, double, float, double); - -#undef CPP_TEMPLATE_INIT_AND_C_WRAPPER +/** + * @file v2_lorenzo.cu + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-01-23 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#include "cusz/type.h" +#include "utils/cuda_err.cuh" +#include "utils/timer.h" + +#include "kernel/lorenzo_all.hh" +#include "kernel/v2_lorenzo.hh" + +template +cusz_error_status v2_compress_predict_lorenzo_i( + T* const data, + dim3 const len3, + double const eb, + int const radius, + E* const errctrl, + dim3 const placeholder_2, + T* const anchor, + dim3 const placeholder_1, + CompactionDRAM outlier, + float* time_elapsed, + cudaStream_t stream) +{ + auto divide3 = [](dim3 len, dim3 sublen) { + return dim3( + (len.x - 1) / sublen.x + 1, // + (len.y - 1) / sublen.y + 1, // + (len.z - 1) / sublen.z + 1); + }; + + auto ndim = [&]() { + if (len3.z == 1 and len3.y == 1) + return 1; + else if (len3.z == 1 and len3.y != 1) + return 2; + else + return 3; + }; + + constexpr auto SUBLEN_1D = 256; + constexpr auto SEQ_1D = 4; // x-sequentiality == 4 + constexpr auto BLOCK_1D = dim3(256 / 4, 1, 1); + auto GRID_1D = divide3(len3, SUBLEN_1D); + + constexpr auto SUBLEN_2D = dim3(16, 16, 1); + constexpr auto BLOCK_2D = dim3(16, 2, 1); + auto GRID_2D = divide3(len3, SUBLEN_2D); + + constexpr auto SUBLEN_3D = dim3(32, 8, 8); + constexpr auto BLOCK_3D = dim3(32, 8, 1); // for v0::r1_shfl + auto GRID_3D = divide3(len3, SUBLEN_3D); + + auto d = ndim(); + + // error bound + auto ebx2 = eb * 2; + auto ebx2_r = 1 / ebx2; + auto leap3 = dim3(1, len3.x, len3.x * len3.y); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + if (d == 1) { + psz::cuda::__kernel::v0::compaction::c_lorenzo_1d1l + <<>>(data, len3, leap3, radius, ebx2_r, errctrl, outlier); + } + else if (d == 2) { + psz::cuda::__kernel::v0::compaction::c_lorenzo_2d1l + <<>>(data, len3, leap3, radius, ebx2_r, errctrl, outlier); + } + else if (d == 3) { + psz::cuda::__kernel::v0::compaction::c_lorenzo_3d1l + <<>>(data, len3, leap3, radius, ebx2_r, errctrl, outlier); + } + + STOP_CUDAEVENT_RECORDING(stream); + CHECK_CUDA(cudaStreamSynchronize(stream)); + TIME_ELAPSED_CUDAEVENT(time_elapsed); + DESTROY_CUDAEVENT_PAIR; + + return CUSZ_SUCCESS; +} + +#define CPP_TEMPLATE_INIT_AND_C_WRAPPER(Tliteral, Eliteral, FPliteral, T, E, FP) \ + template cusz_error_status v2_compress_predict_lorenzo_i( \ + T* const, dim3 const, double const, int const, E* const, dim3 const, T* const, dim3 const, \ + struct CompactionDRAM, float*, cudaStream_t); \ + \ + // cusz_error_status v2_compress_predict_lorenzo_i_T##Tliteral##_E##Eliteral##_FP##FPliteral( \ + // T* const data, dim3 const len3, T* const anchor, dim3 const placeholder_1, E* const errctrl, \ + // dim3 const placeholder_2, T* outlier, double const eb, int const radius, float* time_elapsed, \ + // cudaStream_t stream) \ + // { \ + // return v2_compress_predict_lorenzo_i( \ + // data, len3, eb, radius, errctrl, placeholder_2, anchor, placeholder_1, outlier, nullptr, nullptr, \ + // time_elapsed, stream); \ + // } + +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui8, fp32, float, uint8_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui16, fp32, float, uint16_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, ui32, fp32, float, uint32_t, float); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp32, fp32, fp32, float, float, float); + +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui8, fp64, double, uint8_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui16, fp64, double, uint16_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, ui32, fp64, double, uint32_t, double); +CPP_TEMPLATE_INIT_AND_C_WRAPPER(fp64, fp32, fp64, double, float, double); + +#undef CPP_TEMPLATE_INIT_AND_C_WRAPPER diff --git a/qtensor/compression/cusz/src/pipeline/v2_compressor.cc b/qtensor/compression/cusz/src/pipeline/v2_compressor.cc index 73ee3c83..a9449447 100644 --- a/qtensor/compression/cusz/src/pipeline/v2_compressor.cc +++ b/qtensor/compression/cusz/src/pipeline/v2_compressor.cc @@ -1,112 +1,112 @@ -/** - * @file v2_compressor.cc - * @author Jiannan Tian - * @brief - * @version 0.4 - * @date 2023-01-29 - * - * (C) 2023 by Indiana University, Argonne National Laboratory - * - */ - -#include "pipeline/v2_compressor.hh" -#include "common/configs.hh" -#include "framework.hh" - -namespace psz { - -template -v2_Compressor::~v2_Compressor() -{ - pimpl.reset(); -} - -template -v2_Compressor::v2_Compressor() : pimpl{std::make_unique()} -{ -} - -template -v2_Compressor::v2_Compressor(const v2_Compressor& old) : pimpl{std::make_unique(*old.pimpl)} -{ -} - -template -v2_Compressor& v2_Compressor::operator=(const v2_Compressor& old) -{ - *pimpl = *old.pimpl; - return *this; -} - -template -v2_Compressor::v2_Compressor(v2_Compressor&&) = default; - -template -v2_Compressor& v2_Compressor::operator=(v2_Compressor&&) = default; - -//------------------------------------------------------------------------------ - -template -void v2_Compressor::init(Context* config) -{ - pimpl->init(config); -} - -template -void v2_Compressor::init(v2_header* config) -{ - pimpl->init(config); -} - -template -void v2_Compressor::compress( - Context* config, - v2_Compressor::T* uncompressed, - BYTE*& compressed, - size_t& compressed_len, - cudaStream_t stream, - bool dbg_print) -{ - pimpl->compress(config, uncompressed, compressed, compressed_len, stream, dbg_print); -} - -template -void v2_Compressor::decompress( - v2_header* config, - BYTE* compressed, - v2_Compressor::T* decompressed, - cudaStream_t stream, - bool dbg_print) -{ - pimpl->decompress(config, compressed, decompressed, stream, dbg_print); -} - -// template -// void v2_Compressor::clear_buffer() -// { -// pimpl->clear_buffer(); -// } - -// getter - -template -void v2_Compressor::export_header(v2_header& header) -{ - pimpl->export_header(header); -} - -template -void v2_Compressor::export_header(v2_header* header) -{ - pimpl->export_header(header); -} - -// template -// void v2_Compressor::export_timerecord(TimeRecord* ext_timerecord) -// { -// pimpl->export_timerecord(ext_timerecord); -// } - -} // namespace psz - +/** + * @file v2_compressor.cc + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-01-29 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#include "pipeline/v2_compressor.hh" +#include "common/configs.hh" +#include "framework.hh" + +namespace psz { + +template +v2_Compressor::~v2_Compressor() +{ + pimpl.reset(); +} + +template +v2_Compressor::v2_Compressor() : pimpl{std::make_unique()} +{ +} + +template +v2_Compressor::v2_Compressor(const v2_Compressor& old) : pimpl{std::make_unique(*old.pimpl)} +{ +} + +template +v2_Compressor& v2_Compressor::operator=(const v2_Compressor& old) +{ + *pimpl = *old.pimpl; + return *this; +} + +template +v2_Compressor::v2_Compressor(v2_Compressor&&) = default; + +template +v2_Compressor& v2_Compressor::operator=(v2_Compressor&&) = default; + +//------------------------------------------------------------------------------ + +template +void v2_Compressor::init(Context* config) +{ + pimpl->init(config); +} + +template +void v2_Compressor::init(v2_header* config) +{ + pimpl->init(config); +} + +template +void v2_Compressor::compress( + Context* config, + v2_Compressor::T* uncompressed, + BYTE*& compressed, + size_t& compressed_len, + cudaStream_t stream, + bool dbg_print) +{ + pimpl->compress(config, uncompressed, compressed, compressed_len, stream, dbg_print); +} + +template +void v2_Compressor::decompress( + v2_header* config, + BYTE* compressed, + v2_Compressor::T* decompressed, + cudaStream_t stream, + bool dbg_print) +{ + pimpl->decompress(config, compressed, decompressed, stream, dbg_print); +} + +// template +// void v2_Compressor::clear_buffer() +// { +// pimpl->clear_buffer(); +// } + +// getter + +template +void v2_Compressor::export_header(v2_header& header) +{ + pimpl->export_header(header); +} + +template +void v2_Compressor::export_header(v2_header* header) +{ + pimpl->export_header(header); +} + +// template +// void v2_Compressor::export_timerecord(TimeRecord* ext_timerecord) +// { +// pimpl->export_timerecord(ext_timerecord); +// } + +} // namespace psz + template class psz::v2_Compressor>; \ No newline at end of file diff --git a/qtensor/compression/cusz/src/pipeline/v2_compressor_impl.cu b/qtensor/compression/cusz/src/pipeline/v2_compressor_impl.cu index 32eeb39d..0fcc6ebc 100644 --- a/qtensor/compression/cusz/src/pipeline/v2_compressor_impl.cu +++ b/qtensor/compression/cusz/src/pipeline/v2_compressor_impl.cu @@ -1,15 +1,15 @@ -/** - * @file v2_compressor_impl.cu - * @author Jiannan Tian - * @brief - * @version 0.4 - * @date 2023-01-23 - * - * (C) 2023 by Indiana University, Argonne National Laboratory - * - */ - -#include "framework.hh" -#include "v2_compressor_impl.inl" - +/** + * @file v2_compressor_impl.cu + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-01-23 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#include "framework.hh" +#include "v2_compressor_impl.inl" + template class psz::v2_Compressor>::impl; \ No newline at end of file diff --git a/qtensor/compression/cusz/src/pipeline/v2_compressor_impl.inl b/qtensor/compression/cusz/src/pipeline/v2_compressor_impl.inl index 2a2788f4..0dd96f91 100644 --- a/qtensor/compression/cusz/src/pipeline/v2_compressor_impl.inl +++ b/qtensor/compression/cusz/src/pipeline/v2_compressor_impl.inl @@ -1,239 +1,239 @@ -/** - * @file v2_compressor_impl.inl - * @author Jiannan Tian - * @brief - * @version 0.4 - * @date 2023-01-23 - * - * (C) 2023 by Indiana University, Argonne National Laboratory - * - */ - -#ifndef F4D645B7_B2E3_41AB_BCFD_DCF919C4C56D -#define F4D645B7_B2E3_41AB_BCFD_DCF919C4C56D - -#include - -#include "component.hh" -#include "header.h" -#include "pipeline/v2_compressor.hh" -// #include "kernel/cpplaunch_cuda.hh" -#include "kernel/v2_lorenzo.hh" -#include "stat/stat_g.hh" -#include "utils/cuda_err.cuh" - -#include "../detail/spv_gpu.inl" -#include "../kernel/detail/lorenzo23.inl" - -#define TEMPLATE_TYPE template -#define IMPL v2_Compressor::impl - -#define ARCHIVE(VAR, FIELD) \ - if (segments[v2_header::FIELD] != 0 and VAR != nullptr) { \ - auto dst = var_archive() + header.entry[v2_header::FIELD]; \ - auto src = reinterpret_cast(VAR); \ - CHECK_CUDA(cudaMemcpyAsync(dst, src, segments[v2_header::FIELD], cudaMemcpyDeviceToDevice, stream)); \ - } - -#define ACCESS_VAR(SYM, TYPE) reinterpret_cast(in_compressed + header->entry[v2_header::SYM]) - -namespace psz { - -TEMPLATE_TYPE -IMPL::impl() -{ - codec = new Codec; - // TODO re-enable fallback codec - // fb_codec = new FallbackCodec; -} - -TEMPLATE_TYPE -void IMPL::destroy() -{ - if (codec) delete codec; - // if (fb_codec) delete codec; - - // also deallocate buffer -} - -TEMPLATE_TYPE -void IMPL::init(Context* config) { __init(config); } - -TEMPLATE_TYPE -void IMPL::init(v2_header* config) { __init(config); } - -TEMPLATE_TYPE -template -void IMPL::__init(ContextOrHeader* c) -{ - static_assert( - std::is_same::value or // - std::is_same::value, - "[v2_Compressor::impl::init] not a valid comrpessor config type."); - - auto len = c->x * c->y * c->z; - // TODO allocate anchor - - // allocate eq - cudaMalloc(&d_errctrl, len * sizeof(EQ)); // to overlap with one of vle/hf buffers - - // allocate outlier - outlier.allocate(len / sp_factor, true); - - // allocate vle/hf - codec->init(len, c->radius * 2, c->vle_pardeg); - // TODO disable fallback codec for now -} - -TEMPLATE_TYPE -void IMPL::compress( - Context* c, - T* uncompressed, - BYTE*& compressed, - size_t& compressed_len, - cudaStream_t stream, - bool dbg_print) -{ - auto const eb = c->eb; - auto const radius = c->radius; - auto const pardeg = c->vle_pardeg; - - if (dbg_print) { - printf("[dbg] eb: %lf\n", eb); - printf("[dbg] radius: %d\n", radius); - printf("[dbg] pardeg: %d\n", pardeg); - // printf("[dbg] codecs_in_use: %d\n", codecs_in_use); - printf("[dbg] sp_factor: %d\n", sp_factor); - } - - data_len3 = dim3(c->x, c->y, c->z); - data_len = c->x * c->y * c->z; - - header.sp.factor = sp_factor; - - BYTE* d_codec_out{nullptr}; - size_t codec_outlen{0}; - - // size_t sublen; - auto booklen = radius * 2; - - /******************************************************************************/ - - // TODO version clarification - // with compaction - v2_compress_predict_lorenzo_i( - uncompressed, data_len3, eb, radius, d_errctrl, dim3(1, 1, 1), d_anchor, dim3(1, 1, 1), outlier, - &comp_time.construct, stream); - - outlier.make_count_host_accessible(stream); - - asz::stat::histogram(d_errctrl, data_len, d_freq, booklen, &comp_time.hist, stream); - - CHECK_CUDA(cudaStreamSynchronize(stream)); - - // TODO overlapping memory - codec->encode(d_errctrl, data_len, d_codec_out, codec_outlen, stream); - - CHECK_CUDA(cudaStreamSynchronize(stream)); - - // update header - { - header.x = c->x, header.y = c->y, header.z = c->z, header.w = 1; - header.sp.count = outlier.access_count_on_host(); - // TODO the new - { - // header.config.radius = radius, header.config.eb = eb; - // header.hf.pardeg = pardeg; - } - - // the compat - { - header.radius = radius, header.eb = eb; - header.vle_pardeg = pardeg; - } - - // header.byte_vle = 4; // regardless of fallback codec - }; - - size_t segments[v2_header::END] = {0}; - - // gather archive - { - // calculate offsets - segments[v2_header::HEADER] = sizeof(v2_header); - segments[v2_header::ANCHOR] = 0; // placeholder - segments[v2_header::SP_IDX] = outlier.access_count_on_host() * sizeof(IDX); - segments[v2_header::SP_VAL] = outlier.access_count_on_host() * sizeof(T); - segments[v2_header::HF] = codec_outlen; - - header.entry[0] = 0; - for (auto i = 1; i < v2_header::END + 1; i++) { header.entry[i] = segments[i - 1]; } - for (auto i = 1; i < v2_header::END + 1; i++) { header.entry[i] += header.entry[i - 1]; } - - CHECK_CUDA(cudaStreamSynchronize(stream)); - - // memcpy - ARCHIVE(d_anchor, ANCHOR); - ARCHIVE(outlier.idx, SP_IDX); - ARCHIVE(outlier.val, SP_VAL); - ARCHIVE(d_codec_out, HF); - - CHECK_CUDA(cudaStreamSynchronize(stream)); - } - - // output - compressed_len = header.entry[v2_header::END]; - compressed = var_archive(); - - // collect_compress_timerecord(); -} - -TEMPLATE_TYPE -void IMPL::decompress(v2_header* header, BYTE* in_compressed, T* out_decompressed, cudaStream_t stream, bool dbg_print) -{ - // TODO host having copy of header when compressing - if (not header) { - header = new v2_header; - CHECK_CUDA(cudaMemcpyAsync(header, in_compressed, sizeof(v2_header), cudaMemcpyDeviceToHost, stream)); - CHECK_CUDA(cudaStreamSynchronize(stream)); - } - - data_len3 = dim3(header->x, header->y, header->z); - - // use_fallback_codec = header->byte_vle == 8; - // auto const vle_pardeg = header->hf.pardeg; - - // The inputs of components are from `compressed`. - // auto d_anchor = ACCESS_VAR(ANCHOR, T); - auto d_vle = ACCESS_VAR(HF, BYTE); - auto d_spidx = ACCESS_VAR(SP_IDX, IDX); - auto d_spval = ACCESS_VAR(SP_VAL, T); - - // wire and aliasing - auto d_outlier = out_decompressed; - auto d_xdata = out_decompressed; - - psz::detail::spv_scatter(d_spval, d_spidx, header->sp.count, d_outlier, &decomp_time.scatter, stream); - - codec->decode(d_vle, d_errctrl); - - decompress_predict_lorenzo_i( - d_errctrl, data_len3, // - d_outlier, // - nullptr, 0, // TODO remove - header->eb, header->radius, - d_xdata, // output - &decomp_time.reconstruct, stream); - - // collect_decompress_timerecord(); - - // clear state for the next decompression after reporting - // use_fallback_codec = false; -} - -} // namespace psz - -#undef TEMPLATE_TYPE -#undef IMPL - -#endif /* F4D645B7_B2E3_41AB_BCFD_DCF919C4C56D */ +/** + * @file v2_compressor_impl.inl + * @author Jiannan Tian + * @brief + * @version 0.4 + * @date 2023-01-23 + * + * (C) 2023 by Indiana University, Argonne National Laboratory + * + */ + +#ifndef F4D645B7_B2E3_41AB_BCFD_DCF919C4C56D +#define F4D645B7_B2E3_41AB_BCFD_DCF919C4C56D + +#include + +#include "component.hh" +#include "header.h" +#include "pipeline/v2_compressor.hh" +// #include "kernel/cpplaunch_cuda.hh" +#include "kernel/v2_lorenzo.hh" +#include "stat/stat_g.hh" +#include "utils/cuda_err.cuh" + +#include "../detail/spv_gpu.inl" +#include "../kernel/detail/lorenzo23.inl" + +#define TEMPLATE_TYPE template +#define IMPL v2_Compressor::impl + +#define ARCHIVE(VAR, FIELD) \ + if (segments[v2_header::FIELD] != 0 and VAR != nullptr) { \ + auto dst = var_archive() + header.entry[v2_header::FIELD]; \ + auto src = reinterpret_cast(VAR); \ + CHECK_CUDA(cudaMemcpyAsync(dst, src, segments[v2_header::FIELD], cudaMemcpyDeviceToDevice, stream)); \ + } + +#define ACCESS_VAR(SYM, TYPE) reinterpret_cast(in_compressed + header->entry[v2_header::SYM]) + +namespace psz { + +TEMPLATE_TYPE +IMPL::impl() +{ + codec = new Codec; + // TODO re-enable fallback codec + // fb_codec = new FallbackCodec; +} + +TEMPLATE_TYPE +void IMPL::destroy() +{ + if (codec) delete codec; + // if (fb_codec) delete codec; + + // also deallocate buffer +} + +TEMPLATE_TYPE +void IMPL::init(Context* config) { __init(config); } + +TEMPLATE_TYPE +void IMPL::init(v2_header* config) { __init(config); } + +TEMPLATE_TYPE +template +void IMPL::__init(ContextOrHeader* c) +{ + static_assert( + std::is_same::value or // + std::is_same::value, + "[v2_Compressor::impl::init] not a valid comrpessor config type."); + + auto len = c->x * c->y * c->z; + // TODO allocate anchor + + // allocate eq + cudaMalloc(&d_errctrl, len * sizeof(EQ)); // to overlap with one of vle/hf buffers + + // allocate outlier + outlier.allocate(len / sp_factor, true); + + // allocate vle/hf + codec->init(len, c->radius * 2, c->vle_pardeg); + // TODO disable fallback codec for now +} + +TEMPLATE_TYPE +void IMPL::compress( + Context* c, + T* uncompressed, + BYTE*& compressed, + size_t& compressed_len, + cudaStream_t stream, + bool dbg_print) +{ + auto const eb = c->eb; + auto const radius = c->radius; + auto const pardeg = c->vle_pardeg; + + if (dbg_print) { + printf("[dbg] eb: %lf\n", eb); + printf("[dbg] radius: %d\n", radius); + printf("[dbg] pardeg: %d\n", pardeg); + // printf("[dbg] codecs_in_use: %d\n", codecs_in_use); + printf("[dbg] sp_factor: %d\n", sp_factor); + } + + data_len3 = dim3(c->x, c->y, c->z); + data_len = c->x * c->y * c->z; + + header.sp.factor = sp_factor; + + BYTE* d_codec_out{nullptr}; + size_t codec_outlen{0}; + + // size_t sublen; + auto booklen = radius * 2; + + /******************************************************************************/ + + // TODO version clarification + // with compaction + v2_compress_predict_lorenzo_i( + uncompressed, data_len3, eb, radius, d_errctrl, dim3(1, 1, 1), d_anchor, dim3(1, 1, 1), outlier, + &comp_time.construct, stream); + + outlier.make_count_host_accessible(stream); + + asz::stat::histogram(d_errctrl, data_len, d_freq, booklen, &comp_time.hist, stream); + + CHECK_CUDA(cudaStreamSynchronize(stream)); + + // TODO overlapping memory + codec->encode(d_errctrl, data_len, d_codec_out, codec_outlen, stream); + + CHECK_CUDA(cudaStreamSynchronize(stream)); + + // update header + { + header.x = c->x, header.y = c->y, header.z = c->z, header.w = 1; + header.sp.count = outlier.access_count_on_host(); + // TODO the new + { + // header.config.radius = radius, header.config.eb = eb; + // header.hf.pardeg = pardeg; + } + + // the compat + { + header.radius = radius, header.eb = eb; + header.vle_pardeg = pardeg; + } + + // header.byte_vle = 4; // regardless of fallback codec + }; + + size_t segments[v2_header::END] = {0}; + + // gather archive + { + // calculate offsets + segments[v2_header::HEADER] = sizeof(v2_header); + segments[v2_header::ANCHOR] = 0; // placeholder + segments[v2_header::SP_IDX] = outlier.access_count_on_host() * sizeof(IDX); + segments[v2_header::SP_VAL] = outlier.access_count_on_host() * sizeof(T); + segments[v2_header::HF] = codec_outlen; + + header.entry[0] = 0; + for (auto i = 1; i < v2_header::END + 1; i++) { header.entry[i] = segments[i - 1]; } + for (auto i = 1; i < v2_header::END + 1; i++) { header.entry[i] += header.entry[i - 1]; } + + CHECK_CUDA(cudaStreamSynchronize(stream)); + + // memcpy + ARCHIVE(d_anchor, ANCHOR); + ARCHIVE(outlier.idx, SP_IDX); + ARCHIVE(outlier.val, SP_VAL); + ARCHIVE(d_codec_out, HF); + + CHECK_CUDA(cudaStreamSynchronize(stream)); + } + + // output + compressed_len = header.entry[v2_header::END]; + compressed = var_archive(); + + // collect_compress_timerecord(); +} + +TEMPLATE_TYPE +void IMPL::decompress(v2_header* header, BYTE* in_compressed, T* out_decompressed, cudaStream_t stream, bool dbg_print) +{ + // TODO host having copy of header when compressing + if (not header) { + header = new v2_header; + CHECK_CUDA(cudaMemcpyAsync(header, in_compressed, sizeof(v2_header), cudaMemcpyDeviceToHost, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + } + + data_len3 = dim3(header->x, header->y, header->z); + + // use_fallback_codec = header->byte_vle == 8; + // auto const vle_pardeg = header->hf.pardeg; + + // The inputs of components are from `compressed`. + // auto d_anchor = ACCESS_VAR(ANCHOR, T); + auto d_vle = ACCESS_VAR(HF, BYTE); + auto d_spidx = ACCESS_VAR(SP_IDX, IDX); + auto d_spval = ACCESS_VAR(SP_VAL, T); + + // wire and aliasing + auto d_outlier = out_decompressed; + auto d_xdata = out_decompressed; + + psz::detail::spv_scatter(d_spval, d_spidx, header->sp.count, d_outlier, &decomp_time.scatter, stream); + + codec->decode(d_vle, d_errctrl); + + decompress_predict_lorenzo_i( + d_errctrl, data_len3, // + d_outlier, // + nullptr, 0, // TODO remove + header->eb, header->radius, + d_xdata, // output + &decomp_time.reconstruct, stream); + + // collect_decompress_timerecord(); + + // clear state for the next decompression after reporting + // use_fallback_codec = false; +} + +} // namespace psz + +#undef TEMPLATE_TYPE +#undef IMPL + +#endif /* F4D645B7_B2E3_41AB_BCFD_DCF919C4C56D */ diff --git a/qtensor/compression/cusz/src/stat/cmpg1_1.cu b/qtensor/compression/cusz/src/stat/cmpg1_1.cu index ccf91661..a32a02eb 100644 --- a/qtensor/compression/cusz/src/stat/cmpg1_1.cu +++ b/qtensor/compression/cusz/src/stat/cmpg1_1.cu @@ -1,30 +1,30 @@ -/** - * @file cmpg1.cu - * @author Jiannan Tian - * @brief (split to speed up buid process; part 1) - * @version 0.3 - * @date 2022-10-09 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "../detail/compare_gpu.inl" -#include "stat/compare.h" -#include "stat/compare_gpu.hh" - -#define THRUSTGPU_DESCRIPTION(Tliteral, T) \ - void thrustgpu_get_extrema_rawptr_T##Tliteral(T* d_ptr, size_t len, T res[4]) \ - { \ - psz::detail::thrustgpu_get_extrema_rawptr(d_ptr, len, res); \ - } \ - \ - template <> \ - void psz::thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]) \ - { \ - thrustgpu_get_extrema_rawptr_T##Tliteral(d_ptr, len, res); \ - } - -THRUSTGPU_DESCRIPTION(ui8, uint8_t) - -#undef THRUSTGPU_DESCRIPTION +/** + * @file cmpg1.cu + * @author Jiannan Tian + * @brief (split to speed up buid process; part 1) + * @version 0.3 + * @date 2022-10-09 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_DESCRIPTION(Tliteral, T) \ + void thrustgpu_get_extrema_rawptr_T##Tliteral(T* d_ptr, size_t len, T res[4]) \ + { \ + psz::detail::thrustgpu_get_extrema_rawptr(d_ptr, len, res); \ + } \ + \ + template <> \ + void psz::thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]) \ + { \ + thrustgpu_get_extrema_rawptr_T##Tliteral(d_ptr, len, res); \ + } + +THRUSTGPU_DESCRIPTION(ui8, uint8_t) + +#undef THRUSTGPU_DESCRIPTION diff --git a/qtensor/compression/cusz/src/stat/cmpg1_2.cu b/qtensor/compression/cusz/src/stat/cmpg1_2.cu index 8b44a9e6..b85c6477 100644 --- a/qtensor/compression/cusz/src/stat/cmpg1_2.cu +++ b/qtensor/compression/cusz/src/stat/cmpg1_2.cu @@ -1,30 +1,30 @@ -/** - * @file cmpg1_2.cu - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-11-03 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "../detail/compare_gpu.inl" -#include "stat/compare.h" -#include "stat/compare_gpu.hh" - -#define THRUSTGPU_DESCRIPTION(Tliteral, T) \ - void thrustgpu_get_extrema_rawptr_T##Tliteral(T* d_ptr, size_t len, T res[4]) \ - { \ - psz::detail::thrustgpu_get_extrema_rawptr(d_ptr, len, res); \ - } \ - \ - template <> \ - void psz::thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]) \ - { \ - thrustgpu_get_extrema_rawptr_T##Tliteral(d_ptr, len, res); \ - } - -THRUSTGPU_DESCRIPTION(ui16, uint16_t) - +/** + * @file cmpg1_2.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_DESCRIPTION(Tliteral, T) \ + void thrustgpu_get_extrema_rawptr_T##Tliteral(T* d_ptr, size_t len, T res[4]) \ + { \ + psz::detail::thrustgpu_get_extrema_rawptr(d_ptr, len, res); \ + } \ + \ + template <> \ + void psz::thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]) \ + { \ + thrustgpu_get_extrema_rawptr_T##Tliteral(d_ptr, len, res); \ + } + +THRUSTGPU_DESCRIPTION(ui16, uint16_t) + #undef THRUSTGPU_DESCRIPTION \ No newline at end of file diff --git a/qtensor/compression/cusz/src/stat/cmpg1_3.cu b/qtensor/compression/cusz/src/stat/cmpg1_3.cu index 169741bc..a68f760c 100644 --- a/qtensor/compression/cusz/src/stat/cmpg1_3.cu +++ b/qtensor/compression/cusz/src/stat/cmpg1_3.cu @@ -1,30 +1,30 @@ -/** - * @file cmpg1_3.cu - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-11-03 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "../detail/compare_gpu.inl" -#include "stat/compare.h" -#include "stat/compare_gpu.hh" - -#define THRUSTGPU_DESCRIPTION(Tliteral, T) \ - void thrustgpu_get_extrema_rawptr_T##Tliteral(T* d_ptr, size_t len, T res[4]) \ - { \ - psz::detail::thrustgpu_get_extrema_rawptr(d_ptr, len, res); \ - } \ - \ - template <> \ - void psz::thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]) \ - { \ - thrustgpu_get_extrema_rawptr_T##Tliteral(d_ptr, len, res); \ - } - -THRUSTGPU_DESCRIPTION(ui32, uint32_t) - +/** + * @file cmpg1_3.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_DESCRIPTION(Tliteral, T) \ + void thrustgpu_get_extrema_rawptr_T##Tliteral(T* d_ptr, size_t len, T res[4]) \ + { \ + psz::detail::thrustgpu_get_extrema_rawptr(d_ptr, len, res); \ + } \ + \ + template <> \ + void psz::thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]) \ + { \ + thrustgpu_get_extrema_rawptr_T##Tliteral(d_ptr, len, res); \ + } + +THRUSTGPU_DESCRIPTION(ui32, uint32_t) + #undef THRUSTGPU_DESCRIPTION \ No newline at end of file diff --git a/qtensor/compression/cusz/src/stat/cmpg1_4.cu b/qtensor/compression/cusz/src/stat/cmpg1_4.cu index 4ec93b20..47dcc774 100644 --- a/qtensor/compression/cusz/src/stat/cmpg1_4.cu +++ b/qtensor/compression/cusz/src/stat/cmpg1_4.cu @@ -1,30 +1,30 @@ -/** - * @file cmpg1_4.cu - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-11-03 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "../detail/compare_gpu.inl" -#include "stat/compare.h" -#include "stat/compare_gpu.hh" - -#define THRUSTGPU_DESCRIPTION(Tliteral, T) \ - void thrustgpu_get_extrema_rawptr_T##Tliteral(T* d_ptr, size_t len, T res[4]) \ - { \ - psz::detail::thrustgpu_get_extrema_rawptr(d_ptr, len, res); \ - } \ - \ - template <> \ - void psz::thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]) \ - { \ - thrustgpu_get_extrema_rawptr_T##Tliteral(d_ptr, len, res); \ - } - -THRUSTGPU_DESCRIPTION(fp32, float) - +/** + * @file cmpg1_4.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_DESCRIPTION(Tliteral, T) \ + void thrustgpu_get_extrema_rawptr_T##Tliteral(T* d_ptr, size_t len, T res[4]) \ + { \ + psz::detail::thrustgpu_get_extrema_rawptr(d_ptr, len, res); \ + } \ + \ + template <> \ + void psz::thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]) \ + { \ + thrustgpu_get_extrema_rawptr_T##Tliteral(d_ptr, len, res); \ + } + +THRUSTGPU_DESCRIPTION(fp32, float) + #undef THRUSTGPU_DESCRIPTION \ No newline at end of file diff --git a/qtensor/compression/cusz/src/stat/cmpg1_5.cu b/qtensor/compression/cusz/src/stat/cmpg1_5.cu index 3b08e576..5828860d 100644 --- a/qtensor/compression/cusz/src/stat/cmpg1_5.cu +++ b/qtensor/compression/cusz/src/stat/cmpg1_5.cu @@ -1,30 +1,30 @@ -/** - * @file cmpg1_5.cu - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-11-03 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "../detail/compare_gpu.inl" -#include "stat/compare.h" -#include "stat/compare_gpu.hh" - -#define THRUSTGPU_DESCRIPTION(Tliteral, T) \ - void thrustgpu_get_extrema_rawptr_T##Tliteral(T* d_ptr, size_t len, T res[4]) \ - { \ - psz::detail::thrustgpu_get_extrema_rawptr(d_ptr, len, res); \ - } \ - \ - template <> \ - void psz::thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]) \ - { \ - thrustgpu_get_extrema_rawptr_T##Tliteral(d_ptr, len, res); \ - } - -THRUSTGPU_DESCRIPTION(fp64, double) - +/** + * @file cmpg1_5.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_DESCRIPTION(Tliteral, T) \ + void thrustgpu_get_extrema_rawptr_T##Tliteral(T* d_ptr, size_t len, T res[4]) \ + { \ + psz::detail::thrustgpu_get_extrema_rawptr(d_ptr, len, res); \ + } \ + \ + template <> \ + void psz::thrustgpu_get_extrema_rawptr(T* d_ptr, size_t len, T res[4]) \ + { \ + thrustgpu_get_extrema_rawptr_T##Tliteral(d_ptr, len, res); \ + } + +THRUSTGPU_DESCRIPTION(fp64, double) + #undef THRUSTGPU_DESCRIPTION \ No newline at end of file diff --git a/qtensor/compression/cusz/src/stat/cmpg2.cu b/qtensor/compression/cusz/src/stat/cmpg2.cu index 0ece52b5..a8bdcd29 100644 --- a/qtensor/compression/cusz/src/stat/cmpg2.cu +++ b/qtensor/compression/cusz/src/stat/cmpg2.cu @@ -1,34 +1,34 @@ -/** - * @file cmp2g.cu - * @author Jiannan Tian - * @brief (split to speed up buid process; part 2) - * @version 0.3 - * @date 2022-11-03 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "../detail/compare_gpu.inl" -#include "stat/compare.h" -#include "stat/compare_gpu.hh" - -#define THRUSTGPU_COMPARE_LOSSLESS(Tliteral, T) \ - bool thrustgpu_identical_T##Tliteral(T* d1, T* d2, size_t const len) \ - { \ - return psz::detail::thrustgpu_identical(d1, d2, len); \ - } \ - \ - template <> \ - bool psz::thrustgpu_identical(T * d1, T * d2, size_t const len) \ - { \ - return thrustgpu_identical_T##Tliteral(d1, d2, len); \ - } - -THRUSTGPU_COMPARE_LOSSLESS(fp32, float) -THRUSTGPU_COMPARE_LOSSLESS(fp64, double) -THRUSTGPU_COMPARE_LOSSLESS(ui8, uint8_t) -THRUSTGPU_COMPARE_LOSSLESS(ui16, uint16_t) -THRUSTGPU_COMPARE_LOSSLESS(ui32, uint32_t) - -#undef THRUSTGPU_COMPARE_LOSSLESS +/** + * @file cmp2g.cu + * @author Jiannan Tian + * @brief (split to speed up buid process; part 2) + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_COMPARE_LOSSLESS(Tliteral, T) \ + bool thrustgpu_identical_T##Tliteral(T* d1, T* d2, size_t const len) \ + { \ + return psz::detail::thrustgpu_identical(d1, d2, len); \ + } \ + \ + template <> \ + bool psz::thrustgpu_identical(T * d1, T * d2, size_t const len) \ + { \ + return thrustgpu_identical_T##Tliteral(d1, d2, len); \ + } + +THRUSTGPU_COMPARE_LOSSLESS(fp32, float) +THRUSTGPU_COMPARE_LOSSLESS(fp64, double) +THRUSTGPU_COMPARE_LOSSLESS(ui8, uint8_t) +THRUSTGPU_COMPARE_LOSSLESS(ui16, uint16_t) +THRUSTGPU_COMPARE_LOSSLESS(ui32, uint32_t) + +#undef THRUSTGPU_COMPARE_LOSSLESS diff --git a/qtensor/compression/cusz/src/stat/cmpg3.cu b/qtensor/compression/cusz/src/stat/cmpg3.cu index 05c7af97..61f71f13 100644 --- a/qtensor/compression/cusz/src/stat/cmpg3.cu +++ b/qtensor/compression/cusz/src/stat/cmpg3.cu @@ -1,32 +1,32 @@ -/** - * @file cmp3g.cu - * @author Jiannan Tian - * @brief (split to speed up buid process; part 3) - * @version 0.3 - * @date 2022-11-03 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "../detail/compare_gpu.inl" -#include "stat/compare.h" -#include "stat/compare_gpu.hh" - -#define THRUSTGPU_COMPARE_LOSSY(Tliteral, T) \ - bool thrustgpu_error_bounded_T##Tliteral( \ - T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx = nullptr) \ - { \ - return psz::detail::thrustgpu_error_bounded(a, b, len, eb, first_faulty_idx); \ - } \ - \ - template <> \ - bool psz::thrustgpu_error_bounded(T * a, T * b, size_t const len, double const eb, size_t* first_faulty_idx) \ - { \ - return thrustgpu_error_bounded_T##Tliteral(a, b, len, eb, first_faulty_idx); \ - } - -THRUSTGPU_COMPARE_LOSSY(fp32, float); -THRUSTGPU_COMPARE_LOSSY(fp64, double); - -#undef THRUSTGPU_COMPARE_LOSSY +/** + * @file cmp3g.cu + * @author Jiannan Tian + * @brief (split to speed up buid process; part 3) + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_COMPARE_LOSSY(Tliteral, T) \ + bool thrustgpu_error_bounded_T##Tliteral( \ + T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx = nullptr) \ + { \ + return psz::detail::thrustgpu_error_bounded(a, b, len, eb, first_faulty_idx); \ + } \ + \ + template <> \ + bool psz::thrustgpu_error_bounded(T * a, T * b, size_t const len, double const eb, size_t* first_faulty_idx) \ + { \ + return thrustgpu_error_bounded_T##Tliteral(a, b, len, eb, first_faulty_idx); \ + } + +THRUSTGPU_COMPARE_LOSSY(fp32, float); +THRUSTGPU_COMPARE_LOSSY(fp64, double); + +#undef THRUSTGPU_COMPARE_LOSSY diff --git a/qtensor/compression/cusz/src/stat/cmpg4_1.cu b/qtensor/compression/cusz/src/stat/cmpg4_1.cu index b3e5edaf..34d74884 100644 --- a/qtensor/compression/cusz/src/stat/cmpg4_1.cu +++ b/qtensor/compression/cusz/src/stat/cmpg4_1.cu @@ -1,24 +1,24 @@ -/** - * @file cmpg4_1.cu - * @author Jiannan Tian - * @brief (split to speed up buid process; part 4) - * @version 0.3 - * @date 2022-11-03 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "../detail/compare_gpu.inl" -#include "stat/compare.h" -#include "stat/compare_gpu.hh" - -#define THRUSTGPU_ASSESS(Tliteral, T) \ - void thrustgpu_assess_quality_T##Tliteral(cusz_stats* s, T* xdata, T* odata, size_t const len) \ - { \ - psz::detail::thrustgpu_assess_quality(s, xdata, odata, len); \ - } - -THRUSTGPU_ASSESS(fp32, float); - -#undef THRUSTGPU_ASSESS +/** + * @file cmpg4_1.cu + * @author Jiannan Tian + * @brief (split to speed up buid process; part 4) + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_ASSESS(Tliteral, T) \ + void thrustgpu_assess_quality_T##Tliteral(cusz_stats* s, T* xdata, T* odata, size_t const len) \ + { \ + psz::detail::thrustgpu_assess_quality(s, xdata, odata, len); \ + } + +THRUSTGPU_ASSESS(fp32, float); + +#undef THRUSTGPU_ASSESS diff --git a/qtensor/compression/cusz/src/stat/cmpg4_2.cu b/qtensor/compression/cusz/src/stat/cmpg4_2.cu index 7a62b06d..73dcde1f 100644 --- a/qtensor/compression/cusz/src/stat/cmpg4_2.cu +++ b/qtensor/compression/cusz/src/stat/cmpg4_2.cu @@ -1,25 +1,25 @@ -/** - * @file cmpg4_2.cu - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-11-03 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "../detail/compare_gpu.inl" -#include "stat/compare.h" -#include "stat/compare_gpu.hh" - -#define THRUSTGPU_ASSESS(Tliteral, T) \ - template <> \ - void psz::thrustgpu_assess_quality(cusz_stats * s, T * xdata, T * odata, size_t const len) \ - { \ - thrustgpu_assess_quality_T##Tliteral(s, xdata, odata, len); \ - } - -THRUSTGPU_ASSESS(fp32, float); - -#undef THRUSTGPU_ASSESS +/** + * @file cmpg4_2.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_ASSESS(Tliteral, T) \ + template <> \ + void psz::thrustgpu_assess_quality(cusz_stats * s, T * xdata, T * odata, size_t const len) \ + { \ + thrustgpu_assess_quality_T##Tliteral(s, xdata, odata, len); \ + } + +THRUSTGPU_ASSESS(fp32, float); + +#undef THRUSTGPU_ASSESS diff --git a/qtensor/compression/cusz/src/stat/cmpg4_3.cu b/qtensor/compression/cusz/src/stat/cmpg4_3.cu index b9361bfb..bbca7c6c 100644 --- a/qtensor/compression/cusz/src/stat/cmpg4_3.cu +++ b/qtensor/compression/cusz/src/stat/cmpg4_3.cu @@ -1,24 +1,24 @@ -/** - * @file cmpg4_3.cu - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-11-03 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "../detail/compare_gpu.inl" -#include "stat/compare.h" -#include "stat/compare_gpu.hh" - -#define THRUSTGPU_ASSESS(Tliteral, T) \ - void thrustgpu_assess_quality_T##Tliteral(cusz_stats* s, T* xdata, T* odata, size_t const len) \ - { \ - psz::detail::thrustgpu_assess_quality(s, xdata, odata, len); \ - } - -THRUSTGPU_ASSESS(fp64, double); - +/** + * @file cmpg4_3.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_ASSESS(Tliteral, T) \ + void thrustgpu_assess_quality_T##Tliteral(cusz_stats* s, T* xdata, T* odata, size_t const len) \ + { \ + psz::detail::thrustgpu_assess_quality(s, xdata, odata, len); \ + } + +THRUSTGPU_ASSESS(fp64, double); + #undef THRUSTGPU_ASSESS \ No newline at end of file diff --git a/qtensor/compression/cusz/src/stat/cmpg4_4.cu b/qtensor/compression/cusz/src/stat/cmpg4_4.cu index 4df3919f..d60b8b97 100644 --- a/qtensor/compression/cusz/src/stat/cmpg4_4.cu +++ b/qtensor/compression/cusz/src/stat/cmpg4_4.cu @@ -1,25 +1,25 @@ -/** - * @file cmpg4_4.cu - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-11-03 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "../detail/compare_gpu.inl" -#include "stat/compare.h" -#include "stat/compare_gpu.hh" - -#define THRUSTGPU_ASSESS(Tliteral, T) \ - template <> \ - void psz::thrustgpu_assess_quality(cusz_stats * s, T * xdata, T * odata, size_t const len) \ - { \ - thrustgpu_assess_quality_T##Tliteral(s, xdata, odata, len); \ - } - -THRUSTGPU_ASSESS(fp64, double); - +/** + * @file cmpg4_4.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-11-03 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_gpu.inl" +#include "stat/compare.h" +#include "stat/compare_gpu.hh" + +#define THRUSTGPU_ASSESS(Tliteral, T) \ + template <> \ + void psz::thrustgpu_assess_quality(cusz_stats * s, T * xdata, T * odata, size_t const len) \ + { \ + thrustgpu_assess_quality_T##Tliteral(s, xdata, odata, len); \ + } + +THRUSTGPU_ASSESS(fp64, double); + #undef THRUSTGPU_ASSESS \ No newline at end of file diff --git a/qtensor/compression/cusz/src/stat/compare_cpu.cc b/qtensor/compression/cusz/src/stat/compare_cpu.cc index c9432bb4..8a22dbe3 100644 --- a/qtensor/compression/cusz/src/stat/compare_cpu.cc +++ b/qtensor/compression/cusz/src/stat/compare_cpu.cc @@ -1,43 +1,43 @@ -/** - * @file _compare.cc - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-10-09 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "../detail/compare_cpu.inl" -#include "stat/compare.h" - -#define CPPSTD_COMPARE_LOSSLESS(Tliteral, T) \ - bool cppstd_identical_T##Tliteral(T* d1, T* d2, size_t const len) \ - { \ - return psz::detail::cppstd_identical(d1, d2, len); \ - } - -#define CPPSTD_COMPARE_LOSSY(Tliteral, T) \ - bool cppstd_error_bounded_T##Tliteral( \ - T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx = nullptr) \ - { \ - return psz::detail::cppstd_error_bounded(a, b, len, eb, first_faulty_idx); \ - } \ - \ - void cppstd_assess_quality_T##Tliteral(cusz_stats* s, T* xdata, T* odata, size_t const len) \ - { \ - psz::detail::cppstd_assess_quality(s, xdata, odata, len); \ - } - -CPPSTD_COMPARE_LOSSLESS(fp32, float) -CPPSTD_COMPARE_LOSSLESS(fp64, double) -CPPSTD_COMPARE_LOSSLESS(ui8, uint8_t) -CPPSTD_COMPARE_LOSSLESS(ui16, uint16_t) -CPPSTD_COMPARE_LOSSLESS(ui32, uint32_t) - -CPPSTD_COMPARE_LOSSY(fp32, float) -CPPSTD_COMPARE_LOSSY(fp64, double) - -#undef CPPSTD_COMPARE_LOSSLESS -#undef CPPSTD_COMPARE_LOSSY +/** + * @file _compare.cc + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-09 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../detail/compare_cpu.inl" +#include "stat/compare.h" + +#define CPPSTD_COMPARE_LOSSLESS(Tliteral, T) \ + bool cppstd_identical_T##Tliteral(T* d1, T* d2, size_t const len) \ + { \ + return psz::detail::cppstd_identical(d1, d2, len); \ + } + +#define CPPSTD_COMPARE_LOSSY(Tliteral, T) \ + bool cppstd_error_bounded_T##Tliteral( \ + T* a, T* b, size_t const len, double const eb, size_t* first_faulty_idx = nullptr) \ + { \ + return psz::detail::cppstd_error_bounded(a, b, len, eb, first_faulty_idx); \ + } \ + \ + void cppstd_assess_quality_T##Tliteral(cusz_stats* s, T* xdata, T* odata, size_t const len) \ + { \ + psz::detail::cppstd_assess_quality(s, xdata, odata, len); \ + } + +CPPSTD_COMPARE_LOSSLESS(fp32, float) +CPPSTD_COMPARE_LOSSLESS(fp64, double) +CPPSTD_COMPARE_LOSSLESS(ui8, uint8_t) +CPPSTD_COMPARE_LOSSLESS(ui16, uint16_t) +CPPSTD_COMPARE_LOSSLESS(ui32, uint32_t) + +CPPSTD_COMPARE_LOSSY(fp32, float) +CPPSTD_COMPARE_LOSSY(fp64, double) + +#undef CPPSTD_COMPARE_LOSSLESS +#undef CPPSTD_COMPARE_LOSSY diff --git a/qtensor/compression/cusz/src/stat/stat_g.cu b/qtensor/compression/cusz/src/stat/stat_g.cu index 2fcc81c6..c3c18c12 100644 --- a/qtensor/compression/cusz/src/stat/stat_g.cu +++ b/qtensor/compression/cusz/src/stat/stat_g.cu @@ -1,96 +1,96 @@ -/** - * @file stat_g.cu - * @author Cody Rivera, Jiannan Tian - * @brief Fast histogramming from [Gómez-Luna et al. 2013], wrapper - * @version 0.3 - * @date 2022-11-02 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "../kernel/detail/hist.inl" - -#include "cusz/type.h" -#include "stat/stat.h" -#include "stat/stat_g.hh" - -template -cusz_error_status asz::stat::histogram( - T* in_data, - size_t const in_len, - uint32_t* out_freq, - int const num_buckets, - float* milliseconds, - cudaStream_t stream) -{ - int device_id, max_bytes, num_SMs; - int items_per_thread, r_per_block, grid_dim, block_dim, shmem_use; - - cudaGetDevice(&device_id); - cudaDeviceGetAttribute(&num_SMs, cudaDevAttrMultiProcessorCount, device_id); - - auto query_maxbytes = [&]() { - int max_bytes_opt_in; - cudaDeviceGetAttribute(&max_bytes, cudaDevAttrMaxSharedMemoryPerBlock, device_id); - - // account for opt-in extra shared memory on certain architectures - cudaDeviceGetAttribute(&max_bytes_opt_in, cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id); - max_bytes = std::max(max_bytes, max_bytes_opt_in); - - // config kernel attribute - cudaFuncSetAttribute( - kernel::p2013Histogram, cudaFuncAttributeMaxDynamicSharedMemorySize, max_bytes); - }; - - auto optimize_launch = [&]() { - items_per_thread = 1; - r_per_block = (max_bytes / sizeof(int)) / (num_buckets + 1); - grid_dim = num_SMs; - // fits to size - block_dim = ((((in_len / (grid_dim * items_per_thread)) + 1) / 64) + 1) * 64; - while (block_dim > 1024) { - if (r_per_block <= 1) { block_dim = 1024; } - else { - r_per_block /= 2; - grid_dim *= 2; - block_dim = ((((in_len / (grid_dim * items_per_thread)) + 1) / 64) + 1) * 64; - } - } - shmem_use = ((num_buckets + 1) * r_per_block) * sizeof(int); - }; - - query_maxbytes(); - optimize_launch(); - - CREATE_CUDAEVENT_PAIR; - START_CUDAEVENT_RECORDING(stream); - - kernel::p2013Histogram<<>> // - (in_data, out_freq, in_len, num_buckets, r_per_block); - - STOP_CUDAEVENT_RECORDING(stream); - - cudaStreamSynchronize(stream); - TIME_ELAPSED_CUDAEVENT(milliseconds); - DESTROY_CUDAEVENT_PAIR; - - return CUSZ_SUCCESS; -} - -#define INIT_HIST_AND_C(Tname, T) \ - template cusz_error_status asz::stat::histogram(T*, size_t const, uint32_t*, int const, float*, cudaStream_t); \ - \ - cusz_error_status histogram_T##Tname( \ - T* in_data, size_t const in_len, uint32_t* out_freq, int const num_buckets, float* milliseconds, \ - cudaStream_t stream) \ - { \ - return asz::stat::histogram(in_data, in_len, out_freq, num_buckets, milliseconds, stream); \ - } - -INIT_HIST_AND_C(ui8, uint8_t) -INIT_HIST_AND_C(ui16, uint16_t) -INIT_HIST_AND_C(ui32, uint32_t) -INIT_HIST_AND_C(ui64, uint64_t) - +/** + * @file stat_g.cu + * @author Cody Rivera, Jiannan Tian + * @brief Fast histogramming from [Gómez-Luna et al. 2013], wrapper + * @version 0.3 + * @date 2022-11-02 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "../kernel/detail/hist.inl" + +#include "cusz/type.h" +#include "stat/stat.h" +#include "stat/stat_g.hh" + +template +cusz_error_status asz::stat::histogram( + T* in_data, + size_t const in_len, + uint32_t* out_freq, + int const num_buckets, + float* milliseconds, + cudaStream_t stream) +{ + int device_id, max_bytes, num_SMs; + int items_per_thread, r_per_block, grid_dim, block_dim, shmem_use; + + cudaGetDevice(&device_id); + cudaDeviceGetAttribute(&num_SMs, cudaDevAttrMultiProcessorCount, device_id); + + auto query_maxbytes = [&]() { + int max_bytes_opt_in; + cudaDeviceGetAttribute(&max_bytes, cudaDevAttrMaxSharedMemoryPerBlock, device_id); + + // account for opt-in extra shared memory on certain architectures + cudaDeviceGetAttribute(&max_bytes_opt_in, cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id); + max_bytes = std::max(max_bytes, max_bytes_opt_in); + + // config kernel attribute + cudaFuncSetAttribute( + kernel::p2013Histogram, cudaFuncAttributeMaxDynamicSharedMemorySize, max_bytes); + }; + + auto optimize_launch = [&]() { + items_per_thread = 1; + r_per_block = (max_bytes / sizeof(int)) / (num_buckets + 1); + grid_dim = num_SMs; + // fits to size + block_dim = ((((in_len / (grid_dim * items_per_thread)) + 1) / 64) + 1) * 64; + while (block_dim > 1024) { + if (r_per_block <= 1) { block_dim = 1024; } + else { + r_per_block /= 2; + grid_dim *= 2; + block_dim = ((((in_len / (grid_dim * items_per_thread)) + 1) / 64) + 1) * 64; + } + } + shmem_use = ((num_buckets + 1) * r_per_block) * sizeof(int); + }; + + query_maxbytes(); + optimize_launch(); + + CREATE_CUDAEVENT_PAIR; + START_CUDAEVENT_RECORDING(stream); + + kernel::p2013Histogram<<>> // + (in_data, out_freq, in_len, num_buckets, r_per_block); + + STOP_CUDAEVENT_RECORDING(stream); + + cudaStreamSynchronize(stream); + TIME_ELAPSED_CUDAEVENT(milliseconds); + DESTROY_CUDAEVENT_PAIR; + + return CUSZ_SUCCESS; +} + +#define INIT_HIST_AND_C(Tname, T) \ + template cusz_error_status asz::stat::histogram(T*, size_t const, uint32_t*, int const, float*, cudaStream_t); \ + \ + cusz_error_status histogram_T##Tname( \ + T* in_data, size_t const in_len, uint32_t* out_freq, int const num_buckets, float* milliseconds, \ + cudaStream_t stream) \ + { \ + return asz::stat::histogram(in_data, in_len, out_freq, num_buckets, milliseconds, stream); \ + } + +INIT_HIST_AND_C(ui8, uint8_t) +INIT_HIST_AND_C(ui16, uint16_t) +INIT_HIST_AND_C(ui32, uint32_t) +INIT_HIST_AND_C(ui64, uint64_t) + #undef INIT_HIST_AND_C \ No newline at end of file diff --git a/qtensor/compression/cusz/src/utils/dbg_print.cuh b/qtensor/compression/cusz/src/utils/dbg_print.cuh index 19334e2e..2c2b5580 100644 --- a/qtensor/compression/cusz/src/utils/dbg_print.cuh +++ b/qtensor/compression/cusz/src/utils/dbg_print.cuh @@ -1,132 +1,132 @@ -#ifndef UTILS_DBG_PRINT_CUH -#define UTILS_DBG_PRINT_CUH - -/** - * @file dbg_print.cuh - * @author Jiannan Tian - * @brief - * @version 0.2 - * @date 2020-09-20 - * Created on 2020-03-17 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -template -__global__ void print_deflated(Q* coded, size_t gid) -{ - if (blockIdx.x * blockDim.x + threadIdx.x != gid) return; - printf("print after deflating\n"); - // for_each(coded, coded + PART_SIZE, [](Q& i) { print_by_type(i, '_', '\n'); }); - for (size_t i = 0; i < PART_SIZE; i++) { print_by_type(*(coded + i), '_', '\n'); } - printf("\n"); -} - -template -__global__ void print_histogram(T* freq, size_t size, size_t radius = 20) -{ - const int DICT_SIZE = size; /* Dynamic sizing */ - if (blockIdx.x * blockDim.x + threadIdx.x == 0) { - for (size_t i = DICT_SIZE / 2 - radius; i < DICT_SIZE / 2 + radius; i++) { - if (i % 10 == 0) printf("\n"); - printf("%4lu: %-12lu", i, static_cast(freq[i])); - } - printf("\n"); - } -} - -template -__device__ __host__ void print_by_type(T num, char sep = '_', char ending = '\n') -{ - for (size_t j = 0; j < sizeof(T) * CHAR_BIT; j++) { - printf("%u", (num >> ((sizeof(T) * CHAR_BIT - 1) - j)) & 0x01u); - if (j != 0 and j != sizeof(T) * CHAR_BIT - 1 and j % 8 == 7) printf("%c", sep); - } - printf("%c", ending); -} - -// MSB to LSB -template -__device__ __host__ void print_code_only(T num, size_t bitwidth, char sep = '_', char ending = '\n') -{ - for (size_t j = 0; j < bitwidth; j++) { - printf("%u", (num >> ((bitwidth - 1) - j)) & 0x01u); - if (j != 0 and j != bitwidth - 1 and j % 8 == 7) printf("%c", sep); - } - printf("%c", ending); -} - -template -__device__ __host__ void snippet_print_bitset_full(T num) -{ - print_by_type(num, '_', '\t'); - size_t bitwidth = *((uint8_t*)&num + sizeof(T) - 1); - // size_t code_bitwidth = ((static_cast(0xffu) << (sizeof(T) * 8 - 8)) & num) >> (sizeof(T) * 8 - 8); - printf("len: %3lu\tcode: ", bitwidth); - print_code_only(num, bitwidth, '\0', '\n'); -} - -template -__global__ void print_codebook(T* codebook, size_t len) -{ - if (blockIdx.x * blockDim.x + threadIdx.x != 0) return; - printf("--------------------------------------------------------------------------------\n"); - printf("printing codebook\n"); - printf("--------------------------------------------------------------------------------\n"); - __shared__ T buffer; - for (size_t i = 0; i < len; i++) { - buffer = codebook[i]; - if (buffer == ~((T)0x0)) continue; - printf("%5lu\t", i); - snippet_print_bitset_full(buffer); - } - printf("--------------------------------------------------------------------------------\n"); - printf("done printing codebook\n"); - printf("--------------------------------------------------------------------------------\n"); -} - -template -__global__ void get_entropy(T* freq) -{ -} - -// TODO real GPU version -template -__global__ void get_theoretical_dense_Huffman_coded_length(T* codebook, Q* freq, size_t codebook_len) -{ -} - -// template -//__global__ void print_Huffman_coded_before_deflating(T* coded, size_t len=200) { -// if (blockIdx.x * blockDim.x + threadIdx.x != 0) return; -// printf("print Huffman coded before it is deflated\n"); -// for (size_t i = 0; i < 200; i++) { -// if (coded[i] == ~((T)0x0)) continue; -// printf("%5lu\t", i); -// snippet_print_bitset_full(coded[i]); -// } -// printf("\n"); -//} - -template -__global__ void print_Huffman_coded_before_deflating(T* coded, size_t len) -{ - if (blockIdx.x != 0) return; - size_t gid = blockDim.x * blockIdx.x + threadIdx.x; - if (coded[gid] == ~((T)0x0)) return; - printf("%5lu\t", gid); - snippet_print_bitset_full(coded[gid]); - - // if (coded[i] == ~((T)0x0)) continue; - // printf("print Huffman coded before it is deflated\n"); - // for (size_t i = 0; i < 200; i++) { - // if (coded[i] == ~((T)0x0)) continue; - // printf("%5lu\t", i); - // snippet_print_bitset_full(coded[i]); - // } - // printf("\n"); -} - +#ifndef UTILS_DBG_PRINT_CUH +#define UTILS_DBG_PRINT_CUH + +/** + * @file dbg_print.cuh + * @author Jiannan Tian + * @brief + * @version 0.2 + * @date 2020-09-20 + * Created on 2020-03-17 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +template +__global__ void print_deflated(Q* coded, size_t gid) +{ + if (blockIdx.x * blockDim.x + threadIdx.x != gid) return; + printf("print after deflating\n"); + // for_each(coded, coded + PART_SIZE, [](Q& i) { print_by_type(i, '_', '\n'); }); + for (size_t i = 0; i < PART_SIZE; i++) { print_by_type(*(coded + i), '_', '\n'); } + printf("\n"); +} + +template +__global__ void print_histogram(T* freq, size_t size, size_t radius = 20) +{ + const int DICT_SIZE = size; /* Dynamic sizing */ + if (blockIdx.x * blockDim.x + threadIdx.x == 0) { + for (size_t i = DICT_SIZE / 2 - radius; i < DICT_SIZE / 2 + radius; i++) { + if (i % 10 == 0) printf("\n"); + printf("%4lu: %-12lu", i, static_cast(freq[i])); + } + printf("\n"); + } +} + +template +__device__ __host__ void print_by_type(T num, char sep = '_', char ending = '\n') +{ + for (size_t j = 0; j < sizeof(T) * CHAR_BIT; j++) { + printf("%u", (num >> ((sizeof(T) * CHAR_BIT - 1) - j)) & 0x01u); + if (j != 0 and j != sizeof(T) * CHAR_BIT - 1 and j % 8 == 7) printf("%c", sep); + } + printf("%c", ending); +} + +// MSB to LSB +template +__device__ __host__ void print_code_only(T num, size_t bitwidth, char sep = '_', char ending = '\n') +{ + for (size_t j = 0; j < bitwidth; j++) { + printf("%u", (num >> ((bitwidth - 1) - j)) & 0x01u); + if (j != 0 and j != bitwidth - 1 and j % 8 == 7) printf("%c", sep); + } + printf("%c", ending); +} + +template +__device__ __host__ void snippet_print_bitset_full(T num) +{ + print_by_type(num, '_', '\t'); + size_t bitwidth = *((uint8_t*)&num + sizeof(T) - 1); + // size_t code_bitwidth = ((static_cast(0xffu) << (sizeof(T) * 8 - 8)) & num) >> (sizeof(T) * 8 - 8); + printf("len: %3lu\tcode: ", bitwidth); + print_code_only(num, bitwidth, '\0', '\n'); +} + +template +__global__ void print_codebook(T* codebook, size_t len) +{ + if (blockIdx.x * blockDim.x + threadIdx.x != 0) return; + printf("--------------------------------------------------------------------------------\n"); + printf("printing codebook\n"); + printf("--------------------------------------------------------------------------------\n"); + __shared__ T buffer; + for (size_t i = 0; i < len; i++) { + buffer = codebook[i]; + if (buffer == ~((T)0x0)) continue; + printf("%5lu\t", i); + snippet_print_bitset_full(buffer); + } + printf("--------------------------------------------------------------------------------\n"); + printf("done printing codebook\n"); + printf("--------------------------------------------------------------------------------\n"); +} + +template +__global__ void get_entropy(T* freq) +{ +} + +// TODO real GPU version +template +__global__ void get_theoretical_dense_Huffman_coded_length(T* codebook, Q* freq, size_t codebook_len) +{ +} + +// template +//__global__ void print_Huffman_coded_before_deflating(T* coded, size_t len=200) { +// if (blockIdx.x * blockDim.x + threadIdx.x != 0) return; +// printf("print Huffman coded before it is deflated\n"); +// for (size_t i = 0; i < 200; i++) { +// if (coded[i] == ~((T)0x0)) continue; +// printf("%5lu\t", i); +// snippet_print_bitset_full(coded[i]); +// } +// printf("\n"); +//} + +template +__global__ void print_Huffman_coded_before_deflating(T* coded, size_t len) +{ + if (blockIdx.x != 0) return; + size_t gid = blockDim.x * blockIdx.x + threadIdx.x; + if (coded[gid] == ~((T)0x0)) return; + printf("%5lu\t", gid); + snippet_print_bitset_full(coded[gid]); + + // if (coded[i] == ~((T)0x0)) continue; + // printf("print Huffman coded before it is deflated\n"); + // for (size_t i = 0; i < 200; i++) { + // if (coded[i] == ~((T)0x0)) continue; + // printf("%5lu\t", i); + // snippet_print_bitset_full(coded[i]); + // } + // printf("\n"); +} + #endif \ No newline at end of file diff --git a/qtensor/compression/cusz/src/utils/print_gpu.cu b/qtensor/compression/cusz/src/utils/print_gpu.cu index 9fd20040..2d2b195f 100644 --- a/qtensor/compression/cusz/src/utils/print_gpu.cu +++ b/qtensor/compression/cusz/src/utils/print_gpu.cu @@ -1,121 +1,121 @@ -/** - * @file print_gpu.cu - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-09-23 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -// #include "../detail/print_gpu.inl" -#include -#include -#include -#include "utils/print_gpu.h" -#include "utils/print_gpu.hh" - -#define PRINT_INT_LESS_THAN_64(Tliteral, T) \ - void peek_device_data_T##Tliteral(T* d_arr, size_t num, size_t offset) \ - { \ - thrust::for_each( \ - thrust::device, d_arr, d_arr + num, [=] __device__(const T i) { printf("%d\t", (int32_t)i); }); \ - printf("\n"); \ - } - -PRINT_INT_LESS_THAN_64(i8, int8_t) -PRINT_INT_LESS_THAN_64(i16, int16_t) -PRINT_INT_LESS_THAN_64(i32, int32_t) - -void peek_device_data_Ti64(int64_t* d_arr, size_t num, size_t offset) -{ - thrust::for_each(thrust::device, d_arr, d_arr + num, [=] __device__(const int64_t i) { printf("%ld\t", i); }); - printf("\n"); -} - -#define PRINT_UINT_LESS_THAN_64(Tliteral, T) \ - void peek_device_data_T##Tliteral(T* d_arr, size_t num, size_t offset) \ - { \ - thrust::for_each( \ - thrust::device, d_arr, d_arr + num, [=] __device__(const T i) { printf("%u\t", (uint32_t)i); }); \ - printf("\n"); \ - } - -PRINT_UINT_LESS_THAN_64(ui8, uint8_t) -PRINT_UINT_LESS_THAN_64(ui16, uint16_t) -PRINT_UINT_LESS_THAN_64(ui32, uint32_t) - -void peek_device_data_Tui64(uint64_t* d_arr, size_t num, size_t offset) -{ - thrust::for_each(thrust::device, d_arr, d_arr + num, [=] __device__(const uint64_t i) { printf("%lu\t", i); }); - printf("\n"); -} - -void peek_device_data_Tfp32(float* d_arr, size_t num, size_t offset) -{ - thrust::for_each(thrust::device, d_arr, d_arr + num, [=] __device__(const float i) { printf("%.7f\t", i); }); - printf("\n"); -} - -void peek_device_data_Tfp64(double* d_arr, size_t num, size_t offset) -{ - thrust::for_each(thrust::device, d_arr, d_arr + num, [=] __device__(const double i) { printf("%.7lf\t", i); }); - printf("\n"); -} - -template -void psz::peek_device_data(T* d_arr, size_t num, size_t offset) -{ - if (std::is_same::value) { // - peek_device_data_Ti8((int8_t*)d_arr, num, offset); - } - else if (std::is_same::value) { - peek_device_data_Ti16((int16_t*)d_arr, num, offset); - } - else if (std::is_same::value) { - peek_device_data_Ti32((int32_t*)d_arr, num, offset); - } - else if (std::is_same::value) { - peek_device_data_Ti64((int64_t*)d_arr, num, offset); - } - else if (std::is_same::value) { - peek_device_data_Tui8((uint8_t*)d_arr, num, offset); - } - else if (std::is_same::value) { - peek_device_data_Tui16((uint16_t*)d_arr, num, offset); - } - else if (std::is_same::value) { - peek_device_data_Tui32((uint32_t*)d_arr, num, offset); - } - else if (std::is_same::value) { - peek_device_data_Tui64((uint64_t*)d_arr, num, offset); - } - else if (std::is_same::value) { - peek_device_data_Tfp32((float*)d_arr, num, offset); - } - else if (std::is_same::value) { - peek_device_data_Tfp64((double*)d_arr, num, offset); - } - else { - std::runtime_error("peek_device_data cannot accept this type."); - } -} - -#define CPP_PEEK(Tliteral, T) template void psz::peek_device_data(T * d_arr, size_t num, size_t offset); - -CPP_PEEK(i8, int8_t); -CPP_PEEK(i16, int16_t); -CPP_PEEK(i32, int32_t); -CPP_PEEK(i64, int64_t); -CPP_PEEK(ui8, uint8_t); -CPP_PEEK(ui16, uint16_t); -CPP_PEEK(ui32, uint32_t); -CPP_PEEK(ui64, uint64_t); -CPP_PEEK(fp32, float); -CPP_PEEK(fp64, double); - -#undef CPP_PEEK - -#undef PRINT_INT_LESS_THAN_64 -#undef PRINT_UINT_LESS_THAN_64 +/** + * @file print_gpu.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-09-23 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +// #include "../detail/print_gpu.inl" +#include +#include +#include +#include "utils/print_gpu.h" +#include "utils/print_gpu.hh" + +#define PRINT_INT_LESS_THAN_64(Tliteral, T) \ + void peek_device_data_T##Tliteral(T* d_arr, size_t num, size_t offset) \ + { \ + thrust::for_each( \ + thrust::device, d_arr, d_arr + num, [=] __device__(const T i) { printf("%d\t", (int32_t)i); }); \ + printf("\n"); \ + } + +PRINT_INT_LESS_THAN_64(i8, int8_t) +PRINT_INT_LESS_THAN_64(i16, int16_t) +PRINT_INT_LESS_THAN_64(i32, int32_t) + +void peek_device_data_Ti64(int64_t* d_arr, size_t num, size_t offset) +{ + thrust::for_each(thrust::device, d_arr, d_arr + num, [=] __device__(const int64_t i) { printf("%ld\t", i); }); + printf("\n"); +} + +#define PRINT_UINT_LESS_THAN_64(Tliteral, T) \ + void peek_device_data_T##Tliteral(T* d_arr, size_t num, size_t offset) \ + { \ + thrust::for_each( \ + thrust::device, d_arr, d_arr + num, [=] __device__(const T i) { printf("%u\t", (uint32_t)i); }); \ + printf("\n"); \ + } + +PRINT_UINT_LESS_THAN_64(ui8, uint8_t) +PRINT_UINT_LESS_THAN_64(ui16, uint16_t) +PRINT_UINT_LESS_THAN_64(ui32, uint32_t) + +void peek_device_data_Tui64(uint64_t* d_arr, size_t num, size_t offset) +{ + thrust::for_each(thrust::device, d_arr, d_arr + num, [=] __device__(const uint64_t i) { printf("%lu\t", i); }); + printf("\n"); +} + +void peek_device_data_Tfp32(float* d_arr, size_t num, size_t offset) +{ + thrust::for_each(thrust::device, d_arr, d_arr + num, [=] __device__(const float i) { printf("%.7f\t", i); }); + printf("\n"); +} + +void peek_device_data_Tfp64(double* d_arr, size_t num, size_t offset) +{ + thrust::for_each(thrust::device, d_arr, d_arr + num, [=] __device__(const double i) { printf("%.7lf\t", i); }); + printf("\n"); +} + +template +void psz::peek_device_data(T* d_arr, size_t num, size_t offset) +{ + if (std::is_same::value) { // + peek_device_data_Ti8((int8_t*)d_arr, num, offset); + } + else if (std::is_same::value) { + peek_device_data_Ti16((int16_t*)d_arr, num, offset); + } + else if (std::is_same::value) { + peek_device_data_Ti32((int32_t*)d_arr, num, offset); + } + else if (std::is_same::value) { + peek_device_data_Ti64((int64_t*)d_arr, num, offset); + } + else if (std::is_same::value) { + peek_device_data_Tui8((uint8_t*)d_arr, num, offset); + } + else if (std::is_same::value) { + peek_device_data_Tui16((uint16_t*)d_arr, num, offset); + } + else if (std::is_same::value) { + peek_device_data_Tui32((uint32_t*)d_arr, num, offset); + } + else if (std::is_same::value) { + peek_device_data_Tui64((uint64_t*)d_arr, num, offset); + } + else if (std::is_same::value) { + peek_device_data_Tfp32((float*)d_arr, num, offset); + } + else if (std::is_same::value) { + peek_device_data_Tfp64((double*)d_arr, num, offset); + } + else { + std::runtime_error("peek_device_data cannot accept this type."); + } +} + +#define CPP_PEEK(Tliteral, T) template void psz::peek_device_data(T * d_arr, size_t num, size_t offset); + +CPP_PEEK(i8, int8_t); +CPP_PEEK(i16, int16_t); +CPP_PEEK(i32, int32_t); +CPP_PEEK(i64, int64_t); +CPP_PEEK(ui8, uint8_t); +CPP_PEEK(ui16, uint16_t); +CPP_PEEK(ui32, uint32_t); +CPP_PEEK(ui64, uint64_t); +CPP_PEEK(fp32, float); +CPP_PEEK(fp64, double); + +#undef CPP_PEEK + +#undef PRINT_INT_LESS_THAN_64 +#undef PRINT_UINT_LESS_THAN_64 diff --git a/qtensor/compression/cusz/src/utils/timer_cpu.cc b/qtensor/compression/cusz/src/utils/timer_cpu.cc index 3983bc0f..2422f6f2 100644 --- a/qtensor/compression/cusz/src/utils/timer_cpu.cc +++ b/qtensor/compression/cusz/src/utils/timer_cpu.cc @@ -1,30 +1,30 @@ -/** - * @file timer_cpu.cc - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-10-31 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include "utils/timer.h" - -#include -#include - -using hires = std::chrono::high_resolution_clock; -using duration_t = std::chrono::duration; -using hires_clock_t = std::chrono::time_point; - -struct asz_timer { - hires_clock_t start, stop; -}; - -// cpu timer specific -asz_timer* asz_cputimer_create() { return new asz_timer; } -void asz_cputimer_destroy(asz_timer* t) { delete t; } -void asz_cputimer_start(asz_timer* t) { t->start = hires::now(); } -void asz_cputimer_end(asz_timer* t) { t->stop = hires::now(); } -double asz_cputime_elapsed(asz_timer* t) { return static_cast((t->stop) - (t->start)).count(); } +/** + * @file timer_cpu.cc + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-31 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include "utils/timer.h" + +#include +#include + +using hires = std::chrono::high_resolution_clock; +using duration_t = std::chrono::duration; +using hires_clock_t = std::chrono::time_point; + +struct asz_timer { + hires_clock_t start, stop; +}; + +// cpu timer specific +asz_timer* asz_cputimer_create() { return new asz_timer; } +void asz_cputimer_destroy(asz_timer* t) { delete t; } +void asz_cputimer_start(asz_timer* t) { t->start = hires::now(); } +void asz_cputimer_end(asz_timer* t) { t->stop = hires::now(); } +double asz_cputime_elapsed(asz_timer* t) { return static_cast((t->stop) - (t->start)).count(); } diff --git a/qtensor/compression/cusz/src/utils/timer_gpu.cu b/qtensor/compression/cusz/src/utils/timer_gpu.cu index a44ee4bf..247c80f8 100644 --- a/qtensor/compression/cusz/src/utils/timer_gpu.cu +++ b/qtensor/compression/cusz/src/utils/timer_gpu.cu @@ -1,82 +1,82 @@ -/** - * @file timer_gpu.cu - * @author Jiannan Tian - * @brief - * @version 0.3 - * @date 2022-10-31 - * - * (C) 2022 by Indiana University, Argonne National Laboratory - * - */ - -#include -#include -#include "utils/timer.h" - -typedef struct asz_cudatimer { - cudaEvent_t a, b; - float milliseconds; - cudaStream_t stream; - - asz_cudatimer() { create(); } - asz_cudatimer(cudaStream_t stream) - { - create(); - this->stream = stream; - } - - void create() - { - cudaEventCreate(&a); - cudaEventCreate(&b); - } - - void destroy() - { - cudaEventDestroy(a); - cudaEventDestroy(b); - } - - // stream not involved - void start() { cudaEventRecord(a); } - - void stop() - { - cudaEventRecord(b); - cudaEventSynchronize(b); - } - - // stream involved - void stream_start() - { - cudaEventRecord(a, stream); // set event as not occurred - } - - void stream_stop() - { - cudaEventRecord(b, stream); - cudaEventSynchronize(b); // block host until `stream` meets `stop` - } - - // get time - float time_elapsed() - { - cudaEventElapsedTime(&milliseconds, a, b); - std::cout << "milliseconds: " << milliseconds << std::endl; - return milliseconds; - } -} asz_cudatimer; - -// cuda timer specific -asz_cudatimer* asz_cudatimer_create() { return new asz_cudatimer{}; } -void asz_cudatimer_destroy(asz_cudatimer* t) { t->destroy(); } -void asz_cudatimer_start(asz_cudatimer* t) { t->start(); } -void asz_cudatimer_end(asz_cudatimer* t) { t->stop(); } -double asz_cudatime_elapsed(asz_cudatimer* t) { return t->time_elapsed() / 1000; } - -// cuda streamtimer specific -asz_cudatimer* asz_cudastreamtimer_create(void* stream) { return new asz_cudatimer((cudaStream_t)stream); } -void asz_cudastreamtimer_destroy(asz_cudatimer* t) { t->destroy(); } -void asz_cudastreamtimer_start(asz_cudatimer* t) { t->stream_start(); } -void asz_cudastreamtimer_end(asz_cudatimer* t) { t->stream_stop(); } -double asz_cudastreamtime_elapsed(asz_cudatimer* t) { return t->time_elapsed() / 1000; } +/** + * @file timer_gpu.cu + * @author Jiannan Tian + * @brief + * @version 0.3 + * @date 2022-10-31 + * + * (C) 2022 by Indiana University, Argonne National Laboratory + * + */ + +#include +#include +#include "utils/timer.h" + +typedef struct asz_cudatimer { + cudaEvent_t a, b; + float milliseconds; + cudaStream_t stream; + + asz_cudatimer() { create(); } + asz_cudatimer(cudaStream_t stream) + { + create(); + this->stream = stream; + } + + void create() + { + cudaEventCreate(&a); + cudaEventCreate(&b); + } + + void destroy() + { + cudaEventDestroy(a); + cudaEventDestroy(b); + } + + // stream not involved + void start() { cudaEventRecord(a); } + + void stop() + { + cudaEventRecord(b); + cudaEventSynchronize(b); + } + + // stream involved + void stream_start() + { + cudaEventRecord(a, stream); // set event as not occurred + } + + void stream_stop() + { + cudaEventRecord(b, stream); + cudaEventSynchronize(b); // block host until `stream` meets `stop` + } + + // get time + float time_elapsed() + { + cudaEventElapsedTime(&milliseconds, a, b); + std::cout << "milliseconds: " << milliseconds << std::endl; + return milliseconds; + } +} asz_cudatimer; + +// cuda timer specific +asz_cudatimer* asz_cudatimer_create() { return new asz_cudatimer{}; } +void asz_cudatimer_destroy(asz_cudatimer* t) { t->destroy(); } +void asz_cudatimer_start(asz_cudatimer* t) { t->start(); } +void asz_cudatimer_end(asz_cudatimer* t) { t->stop(); } +double asz_cudatime_elapsed(asz_cudatimer* t) { return t->time_elapsed() / 1000; } + +// cuda streamtimer specific +asz_cudatimer* asz_cudastreamtimer_create(void* stream) { return new asz_cudatimer((cudaStream_t)stream); } +void asz_cudastreamtimer_destroy(asz_cudatimer* t) { t->destroy(); } +void asz_cudastreamtimer_start(asz_cudatimer* t) { t->stream_start(); } +void asz_cudastreamtimer_end(asz_cudatimer* t) { t->stream_stop(); } +double asz_cudastreamtime_elapsed(asz_cudatimer* t) { return t->time_elapsed() / 1000; } diff --git a/qtensor/compression/cusz/src/utils/vis_stat.hh b/qtensor/compression/cusz/src/utils/vis_stat.hh index 60099138..ff27695f 100644 --- a/qtensor/compression/cusz/src/utils/vis_stat.hh +++ b/qtensor/compression/cusz/src/utils/vis_stat.hh @@ -1,137 +1,137 @@ -#ifndef UTILS_VIS_STAT_HH -#define UTILS_VIS_STAT_HH - -/** - * @file vis_stat.hh - * @author Jiannan Tian - * @brief Analysis and visualization of datum. - * @version 0.1 - * @date 2020-09-20 - * Created on 2020-02-09 - * - * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory - * See LICENSE in top-level directory - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -using std::cerr; -using std::cout; -using std::endl; -using std::tuple; - -template -double GetEntropy(T* code, size_t l, size_t cap = 1024) -{ - if (cap == 0) { - cerr << "wrong cap" << endl; - exit(-1); - } - auto arr = new size_t[cap](); - for (size_t i = 0; i < l; i++) arr[code[i]]++; - std::vector raw(arr, arr + cap); - std::vector frequencies; - std::copy_if(raw.begin(), raw.end(), std::back_inserter(frequencies), [](double& e) { return e != 0; }); - double entropy = 0; - for (auto freq : frequencies) { entropy += -(freq * 1.0 / l) * log2(freq * 1.0 / l); } - - // cout << "entropy:\t" << entropy << endl; - delete[] arr; - return entropy; -} - -// TODO automatically omit bins that are less than 1% -template -void VisualizeHistogram( - const std::string& tag, - T* _d_POD, - size_t l, - size_t _bins = 16, - bool log_freq = false, - double override_min = 0, - double override_max = 0, - bool eliminate_zeros = false, - bool use_scientific_notation = true) -{ - std::vector _d(_d_POD, _d_POD + l); - std::vector _d_nonzero; - // std::vector arr; - // arr.reserve(_bins); - // for (size_t i = 0; i< _bins; i++) arr.push_back(0); - auto arr = new size_t[_bins](); - - if (eliminate_zeros) { - std::copy_if(_d.begin(), _d.end(), std::back_inserter(_d_nonzero), [](int i) { return i != 0; }); - } - double Min = *std::min_element(_d.begin(), _d.end()); - double Max = *std::max_element(_d.begin(), _d.end()); - // double sum = std::accumulate(_d.begin(), _d.end(), 0); - double rng = Max - Min; - // double avg = sum / l; - - cout << "\e[7m[[" << tag << "]]\e[0m"; - if (override_max > override_min) { - cout << "zoom into " << override_min << "--" << override_max << endl; - std::tie(Max, Min, rng) = std::make_tuple(override_max, override_min, override_max - override_min); - } - double step = rng / _bins; - for (size_t i = 0; i < l; i++) arr[static_cast((_d[i] - Min) / step)]++; - std::vector _viz(arr, arr + _bins); - // std::vector _viz(arr); - - // visualization - printf("\tbins:\t%zu\tbin_width:\t%lf\n", _bins, step); - // printf("count:\t%zu\tmin:\t%lf\tmax:\t%lf\trng:\t%lf\n", l, Min, Max, rng); - cout << "count:\t" << l << "\t"; - cout << "min:\t" << Min << "\t"; - cout << "max:\t" << Max << "\t"; - cout << "rng:\t" << rng << endl; - - if (log_freq) { - cout << "using log_freq" << endl; - std::for_each(_viz.begin(), _viz.end(), [](size_t& n) { n = log2(n); }); - } - - size_t longest = *std::max_element(_viz.begin(), _viz.end()); - size_t bar_str_len = 64; // scale according to the longest - std::for_each(_viz.begin(), _viz.end(), [&](size_t& n) { - n = static_cast(n / static_cast(longest) * bar_str_len); - }); - - for (size_t i = 0; i < _bins; i++) { - // normalize to width - cout << "|" - << "\33[43m"; - - for (size_t j = 0; j < bar_str_len + 1; j++) { - if (j < _viz[i]) - cout << "-"; - else if (j == _viz[i]) - cout << "\33[0m" - << "+"; - else - cout << " "; - } - cout.precision(2); - cout << " "; - if (use_scientific_notation) cout << std::scientific; - cout << Min + i * step << " -- " << Min + (i + 1) * step; - cout << " "; - cout << std::setw((int)log10(l) + 2); - cout << arr[i]; - cout << " "; - cout << std::defaultfloat << std::setw(5) << arr[i] / static_cast(l) * 100 << "%" << endl; - } - cout << endl; - // delete[] arr; -} - -#endif +#ifndef UTILS_VIS_STAT_HH +#define UTILS_VIS_STAT_HH + +/** + * @file vis_stat.hh + * @author Jiannan Tian + * @brief Analysis and visualization of datum. + * @version 0.1 + * @date 2020-09-20 + * Created on 2020-02-09 + * + * @copyright (C) 2020 by Washington State University, The University of Alabama, Argonne National Laboratory + * See LICENSE in top-level directory + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +using std::cerr; +using std::cout; +using std::endl; +using std::tuple; + +template +double GetEntropy(T* code, size_t l, size_t cap = 1024) +{ + if (cap == 0) { + cerr << "wrong cap" << endl; + exit(-1); + } + auto arr = new size_t[cap](); + for (size_t i = 0; i < l; i++) arr[code[i]]++; + std::vector raw(arr, arr + cap); + std::vector frequencies; + std::copy_if(raw.begin(), raw.end(), std::back_inserter(frequencies), [](double& e) { return e != 0; }); + double entropy = 0; + for (auto freq : frequencies) { entropy += -(freq * 1.0 / l) * log2(freq * 1.0 / l); } + + // cout << "entropy:\t" << entropy << endl; + delete[] arr; + return entropy; +} + +// TODO automatically omit bins that are less than 1% +template +void VisualizeHistogram( + const std::string& tag, + T* _d_POD, + size_t l, + size_t _bins = 16, + bool log_freq = false, + double override_min = 0, + double override_max = 0, + bool eliminate_zeros = false, + bool use_scientific_notation = true) +{ + std::vector _d(_d_POD, _d_POD + l); + std::vector _d_nonzero; + // std::vector arr; + // arr.reserve(_bins); + // for (size_t i = 0; i< _bins; i++) arr.push_back(0); + auto arr = new size_t[_bins](); + + if (eliminate_zeros) { + std::copy_if(_d.begin(), _d.end(), std::back_inserter(_d_nonzero), [](int i) { return i != 0; }); + } + double Min = *std::min_element(_d.begin(), _d.end()); + double Max = *std::max_element(_d.begin(), _d.end()); + // double sum = std::accumulate(_d.begin(), _d.end(), 0); + double rng = Max - Min; + // double avg = sum / l; + + cout << "\e[7m[[" << tag << "]]\e[0m"; + if (override_max > override_min) { + cout << "zoom into " << override_min << "--" << override_max << endl; + std::tie(Max, Min, rng) = std::make_tuple(override_max, override_min, override_max - override_min); + } + double step = rng / _bins; + for (size_t i = 0; i < l; i++) arr[static_cast((_d[i] - Min) / step)]++; + std::vector _viz(arr, arr + _bins); + // std::vector _viz(arr); + + // visualization + printf("\tbins:\t%zu\tbin_width:\t%lf\n", _bins, step); + // printf("count:\t%zu\tmin:\t%lf\tmax:\t%lf\trng:\t%lf\n", l, Min, Max, rng); + cout << "count:\t" << l << "\t"; + cout << "min:\t" << Min << "\t"; + cout << "max:\t" << Max << "\t"; + cout << "rng:\t" << rng << endl; + + if (log_freq) { + cout << "using log_freq" << endl; + std::for_each(_viz.begin(), _viz.end(), [](size_t& n) { n = log2(n); }); + } + + size_t longest = *std::max_element(_viz.begin(), _viz.end()); + size_t bar_str_len = 64; // scale according to the longest + std::for_each(_viz.begin(), _viz.end(), [&](size_t& n) { + n = static_cast(n / static_cast(longest) * bar_str_len); + }); + + for (size_t i = 0; i < _bins; i++) { + // normalize to width + cout << "|" + << "\33[43m"; + + for (size_t j = 0; j < bar_str_len + 1; j++) { + if (j < _viz[i]) + cout << "-"; + else if (j == _viz[i]) + cout << "\33[0m" + << "+"; + else + cout << " "; + } + cout.precision(2); + cout << " "; + if (use_scientific_notation) cout << std::scientific; + cout << Min + i * step << " -- " << Min + (i + 1) * step; + cout << " "; + cout << std::setw((int)log10(l) + 2); + cout << arr[i]; + cout << " "; + cout << std::defaultfloat << std::setw(5) << arr[i] / static_cast(l) * 100 << "%" << endl; + } + cout << endl; + // delete[] arr; +} + +#endif diff --git a/qtensor/compression/newsz/newsz.cu b/qtensor/compression/newsz/newsz.cu index 3ef211d5..00a394b6 100644 --- a/qtensor/compression/newsz/newsz.cu +++ b/qtensor/compression/newsz/newsz.cu @@ -1,248 +1,248 @@ -#include -#include "newsz.h" -#include -#include -#include -// #include "cuCompactor.cuh" - -#include "nvcomp/lz4.hpp" -#include "nvcomp.hpp" -#include "nvcomp/nvcompManagerFactory.hpp" - -#define BLKS 40 -#define THDS 128 -#define FULL_MASK 0xffffffff - -__device__ int g_ints; - -struct int_predicate -{ - - __host__ __device__ - bool operator()(const int x) - { - return x>0; - } -}; - -struct to_copy -{ - __host__ __device__ - bool operator()(const uint8_t x) - { - return x==1; - } -}; - - - - -__global__ void compress(float *data, float *scales, float *zeropts, int8_t *out){ - int bid = blockIdx.x; - int tid = threadIdx.x; - extern __shared__ float scratchpad[]; - __shared__ float min; - __shared__ float max; - - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage1; - - float item = data[blockIdx.x*blockDim.x+threadIdx.x]; - - float tmax = BlockReduce(temp_storage1).Reduce(item, cub::Max()); - float tmin = BlockReduce(temp_storage1).Reduce(item, cub::Min()); - - if (threadIdx.x==0) - { - max = tmax; - min = tmin; - } - - __syncthreads(); - - float vrange = max - min; - float scale = vrange/((2^8) - 1); - int zeropt = -1*lrintf(min*scale) - (2^7); - - int q_item = lrintf(item/scale) + zeropt; - - // Clamp quantized value - if(q_item>127)q_item = 127; - if(q_item <-128)q_item = -128; - int8_t q_val = (int8_t)(0xff & q_item); - out[blockIdx.x*blockDim.x+threadIdx.x] = q_val; - if (threadIdx.x==0) - { - scales[blockIdx.x] = scale; - zeropts[blockIdx.x]= zeropt; - } - -} - -__global__ void decompress(int8_t *q_data, float *scales, float *zeropts, float *out){ - int bid = blockIdx.x; - int tid = threadIdx.x; - extern __shared__ float scratchpad[]; - __shared__ float min; - __shared__ float max; - - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage1; - - int8_t q_val = q_data[blockIdx.x*blockDim.x+threadIdx.x]; - - out[blockIdx.x*blockDim.x+threadIdx.x] = (q_val - zeropts[bid])*scales[bid]; -} - -__global__ void p_ints(){ - printf("codebook entries used: %d\n", g_ints); -} - -unsigned char* SZ_device_compress(float *data, size_t num_elements, int blocksize, size_t *outsize){ - float *scales, *zeropts; - int8_t *q_out; - unsigned char *cmpbytes; - int num_blocks = num_elements/blocksize; - - cudaMalloc(&scales, sizeof(float)*num_blocks); - cudaMalloc(&zeropts,sizeof(float)*num_blocks); - cudaMalloc(&q_out, num_elements); - - using namespace nvcomp; - - cudaStream_t stream; - cudaStreamCreate(&stream); - - const int chunk_size = 1 << 16; - nvcompType_t data_type = NVCOMP_TYPE_CHAR; - - - - compress<<>>(data, scales, zeropts, q_out); - cudaDeviceSynchronize(); - - LZ4Manager nvcomp_manager{chunk_size, data_type, stream}; - CompressionConfig comp_config = nvcomp_manager.configure_compression(num_elements); - - uint8_t* comp_buffer; - cudaMalloc(&comp_buffer, comp_config.max_compressed_buffer_size); - - nvcomp_manager.compress((const uint8_t *)q_out, comp_buffer, comp_config); - - size_t c_size = nvcomp_manager.get_compressed_output_size(comp_buffer); - cudaFree(q_out); - - *outsize = sizeof(float)*(num_blocks+num_blocks)+c_size; - cudaMalloc(&cmpbytes, *outsize); - - cudaMemcpy(cmpbytes, (unsigned char *)scales, sizeof(float)*num_blocks, cudaMemcpyDeviceToDevice); - cudaMemcpy(cmpbytes+sizeof(float)*num_blocks, (unsigned char *)zeropts, sizeof(float)*num_blocks, cudaMemcpyDeviceToDevice); - cudaMemcpy(cmpbytes+sizeof(float)*num_blocks+sizeof(float)*num_blocks, comp_buffer, c_size, cudaMemcpyDeviceToDevice); - - float h_firstscale; - cudaMemcpy(&h_firstscale, cmpbytes, sizeof(float), cudaMemcpyDeviceToHost); - cudaFree(scales); - cudaFree(zeropts); - cudaFree(comp_buffer); - return cmpbytes; -} - -float* SZ_device_decompress(unsigned char *cmpbytes, size_t num_elements, int blocksize, size_t *cmpsize){ - float *scales, *zeropts; - uint8_t *q_cmp; - int8_t *q_vals; - float *out; - int num_blocks = num_elements/blocksize; - size_t c_size = *cmpsize-(2*sizeof(float)*num_blocks); - - float first_val, *d_first; - - cudaMalloc(&d_first, sizeof(float)); - cudaMemcpy((unsigned char *)&first_val, cmpbytes, sizeof(float), cudaMemcpyDeviceToHost); - - - - cudaMalloc((void **)&scales, sizeof(float)*num_blocks); - cudaMalloc((void **)&zeropts,sizeof(float)*num_blocks); - cudaMalloc((void **)&q_cmp, c_size); - cudaMemcpy((unsigned char *)scales, cmpbytes, sizeof(float)*num_blocks, cudaMemcpyDeviceToDevice); - - cudaMemcpy((unsigned char *)zeropts, cmpbytes+sizeof(float)*num_blocks, sizeof(float)*num_blocks, cudaMemcpyDeviceToDevice); - - cudaMemcpy(q_cmp, cmpbytes+sizeof(float)*num_blocks+sizeof(float)*num_blocks, c_size, cudaMemcpyDeviceToDevice); - cudaStream_t stream; - cudaStreamCreate(&stream); - - const int chunk_size = 1 << 16; - - - nvcompType_t data_type = NVCOMP_TYPE_CHAR; - - auto decomp_manager = nvcomp::create_manager(q_cmp, stream); - - nvcomp::DecompressionConfig decomp_config = decomp_manager->configure_decompression((uint8_t *)q_cmp); - cudaMalloc(&q_vals, num_elements); - - decomp_manager->decompress((uint8_t*)q_vals, (uint8_t*)q_cmp, decomp_config); - cudaFree(q_cmp); - - cudaMalloc(&out, sizeof(float)*num_elements); - - decompress<<>>(q_vals, scales, zeropts, out); - cudaDeviceSynchronize(); - - cudaFree(scales); - cudaFree(zeropts); - cudaFree(q_vals); - - return out; -} - -int main(int argc, char** argv){ - char oriFilePath[640], outputFilePath[645]; - float* data; - size_t nbEle; - if(argc < 3) - { - printf("Usage: testfloat_compress_fastmode2 [srcFilePath] [block size] [err bound] [--cuda]\n"); - printf("Example: testfloat_compress_fastmode2 testfloat_8_8_128.dat 64 1E-3 --cuda\n"); - exit(0); - } - - sprintf(oriFilePath, "%s", argv[1]); - int blockSize = atoi(argv[2]); - float errBound = atof(argv[3]); - nbEle = atoi(argv[4]); - - data = (float*)malloc(sizeof(float)*nbEle); - sprintf(outputFilePath, "%s.sznew", oriFilePath); - - FILE *in_file; - in_file = fopen(oriFilePath, "rb"); - - fread(data, sizeof(float), nbEle, in_file); - fclose(in_file); - - float max = data[0]; - float min = data[0]; - for(int i=0;i=max){ - max = data[i]; - } - if(data[i]<=min){ - min = data[i]; - } - } - errBound = errBound*(max-min); - - // Move to device - float *d_data; - unsigned char *cmpbytes; - size_t outsize; - cudaMalloc(&d_data, sizeof(float)*nbEle); - cudaMemcpy(d_data, data, sizeof(float)*nbEle, cudaMemcpyHostToDevice); - //SZ_device_compress(d_data, nbEle, errBound, blockSize, cmpbytes, &outsize); - - cudaFree(d_data); - -} +#include +#include "newsz.h" +#include +#include +#include +// #include "cuCompactor.cuh" + +#include "nvcomp/lz4.hpp" +#include "nvcomp.hpp" +#include "nvcomp/nvcompManagerFactory.hpp" + +#define BLKS 40 +#define THDS 128 +#define FULL_MASK 0xffffffff + +__device__ int g_ints; + +struct int_predicate +{ + + __host__ __device__ + bool operator()(const int x) + { + return x>0; + } +}; + +struct to_copy +{ + __host__ __device__ + bool operator()(const uint8_t x) + { + return x==1; + } +}; + + + + +__global__ void compress(float *data, float *scales, float *zeropts, int8_t *out){ + int bid = blockIdx.x; + int tid = threadIdx.x; + extern __shared__ float scratchpad[]; + __shared__ float min; + __shared__ float max; + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage1; + + float item = data[blockIdx.x*blockDim.x+threadIdx.x]; + + float tmax = BlockReduce(temp_storage1).Reduce(item, cub::Max()); + float tmin = BlockReduce(temp_storage1).Reduce(item, cub::Min()); + + if (threadIdx.x==0) + { + max = tmax; + min = tmin; + } + + __syncthreads(); + + float vrange = max - min; + float scale = vrange/((2^8) - 1); + int zeropt = -1*lrintf(min*scale) - (2^7); + + int q_item = lrintf(item/scale) + zeropt; + + // Clamp quantized value + if(q_item>127)q_item = 127; + if(q_item <-128)q_item = -128; + int8_t q_val = (int8_t)(0xff & q_item); + out[blockIdx.x*blockDim.x+threadIdx.x] = q_val; + if (threadIdx.x==0) + { + scales[blockIdx.x] = scale; + zeropts[blockIdx.x]= zeropt; + } + +} + +__global__ void decompress(int8_t *q_data, float *scales, float *zeropts, float *out){ + int bid = blockIdx.x; + int tid = threadIdx.x; + extern __shared__ float scratchpad[]; + __shared__ float min; + __shared__ float max; + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage1; + + int8_t q_val = q_data[blockIdx.x*blockDim.x+threadIdx.x]; + + out[blockIdx.x*blockDim.x+threadIdx.x] = (q_val - zeropts[bid])*scales[bid]; +} + +__global__ void p_ints(){ + printf("codebook entries used: %d\n", g_ints); +} + +unsigned char* SZ_device_compress(float *data, size_t num_elements, int blocksize, size_t *outsize){ + float *scales, *zeropts; + int8_t *q_out; + unsigned char *cmpbytes; + int num_blocks = num_elements/blocksize; + + cudaMalloc(&scales, sizeof(float)*num_blocks); + cudaMalloc(&zeropts,sizeof(float)*num_blocks); + cudaMalloc(&q_out, num_elements); + + using namespace nvcomp; + + cudaStream_t stream; + cudaStreamCreate(&stream); + + const int chunk_size = 1 << 16; + nvcompType_t data_type = NVCOMP_TYPE_CHAR; + + + + compress<<>>(data, scales, zeropts, q_out); + cudaDeviceSynchronize(); + + LZ4Manager nvcomp_manager{chunk_size, data_type, stream}; + CompressionConfig comp_config = nvcomp_manager.configure_compression(num_elements); + + uint8_t* comp_buffer; + cudaMalloc(&comp_buffer, comp_config.max_compressed_buffer_size); + + nvcomp_manager.compress((const uint8_t *)q_out, comp_buffer, comp_config); + + size_t c_size = nvcomp_manager.get_compressed_output_size(comp_buffer); + cudaFree(q_out); + + *outsize = sizeof(float)*(num_blocks+num_blocks)+c_size; + cudaMalloc(&cmpbytes, *outsize); + + cudaMemcpy(cmpbytes, (unsigned char *)scales, sizeof(float)*num_blocks, cudaMemcpyDeviceToDevice); + cudaMemcpy(cmpbytes+sizeof(float)*num_blocks, (unsigned char *)zeropts, sizeof(float)*num_blocks, cudaMemcpyDeviceToDevice); + cudaMemcpy(cmpbytes+sizeof(float)*num_blocks+sizeof(float)*num_blocks, comp_buffer, c_size, cudaMemcpyDeviceToDevice); + + float h_firstscale; + cudaMemcpy(&h_firstscale, cmpbytes, sizeof(float), cudaMemcpyDeviceToHost); + cudaFree(scales); + cudaFree(zeropts); + cudaFree(comp_buffer); + return cmpbytes; +} + +float* SZ_device_decompress(unsigned char *cmpbytes, size_t num_elements, int blocksize, size_t *cmpsize){ + float *scales, *zeropts; + uint8_t *q_cmp; + int8_t *q_vals; + float *out; + int num_blocks = num_elements/blocksize; + size_t c_size = *cmpsize-(2*sizeof(float)*num_blocks); + + float first_val, *d_first; + + cudaMalloc(&d_first, sizeof(float)); + cudaMemcpy((unsigned char *)&first_val, cmpbytes, sizeof(float), cudaMemcpyDeviceToHost); + + + + cudaMalloc((void **)&scales, sizeof(float)*num_blocks); + cudaMalloc((void **)&zeropts,sizeof(float)*num_blocks); + cudaMalloc((void **)&q_cmp, c_size); + cudaMemcpy((unsigned char *)scales, cmpbytes, sizeof(float)*num_blocks, cudaMemcpyDeviceToDevice); + + cudaMemcpy((unsigned char *)zeropts, cmpbytes+sizeof(float)*num_blocks, sizeof(float)*num_blocks, cudaMemcpyDeviceToDevice); + + cudaMemcpy(q_cmp, cmpbytes+sizeof(float)*num_blocks+sizeof(float)*num_blocks, c_size, cudaMemcpyDeviceToDevice); + cudaStream_t stream; + cudaStreamCreate(&stream); + + const int chunk_size = 1 << 16; + + + nvcompType_t data_type = NVCOMP_TYPE_CHAR; + + auto decomp_manager = nvcomp::create_manager(q_cmp, stream); + + nvcomp::DecompressionConfig decomp_config = decomp_manager->configure_decompression((uint8_t *)q_cmp); + cudaMalloc(&q_vals, num_elements); + + decomp_manager->decompress((uint8_t*)q_vals, (uint8_t*)q_cmp, decomp_config); + cudaFree(q_cmp); + + cudaMalloc(&out, sizeof(float)*num_elements); + + decompress<<>>(q_vals, scales, zeropts, out); + cudaDeviceSynchronize(); + + cudaFree(scales); + cudaFree(zeropts); + cudaFree(q_vals); + + return out; +} + +int main(int argc, char** argv){ + char oriFilePath[640], outputFilePath[645]; + float* data; + size_t nbEle; + if(argc < 3) + { + printf("Usage: testfloat_compress_fastmode2 [srcFilePath] [block size] [err bound] [--cuda]\n"); + printf("Example: testfloat_compress_fastmode2 testfloat_8_8_128.dat 64 1E-3 --cuda\n"); + exit(0); + } + + sprintf(oriFilePath, "%s", argv[1]); + int blockSize = atoi(argv[2]); + float errBound = atof(argv[3]); + nbEle = atoi(argv[4]); + + data = (float*)malloc(sizeof(float)*nbEle); + sprintf(outputFilePath, "%s.sznew", oriFilePath); + + FILE *in_file; + in_file = fopen(oriFilePath, "rb"); + + fread(data, sizeof(float), nbEle, in_file); + fclose(in_file); + + float max = data[0]; + float min = data[0]; + for(int i=0;i=max){ + max = data[i]; + } + if(data[i]<=min){ + min = data[i]; + } + } + errBound = errBound*(max-min); + + // Move to device + float *d_data; + unsigned char *cmpbytes; + size_t outsize; + cudaMalloc(&d_data, sizeof(float)*nbEle); + cudaMemcpy(d_data, data, sizeof(float)*nbEle, cudaMemcpyHostToDevice); + //SZ_device_compress(d_data, nbEle, errBound, blockSize, cmpbytes, &outsize); + + cudaFree(d_data); + +} diff --git a/qtensor/compression/newsz/newsz.h b/qtensor/compression/newsz/newsz.h index c537b889..1022e20a 100644 --- a/qtensor/compression/newsz/newsz.h +++ b/qtensor/compression/newsz/newsz.h @@ -1,3 +1,3 @@ - -unsigned char* SZ_device_compress(float *data, size_t num_elements, int blocksize, size_t *outsize); -float* SZ_device_decompress(unsigned char *cmpbytes, size_t num_elements, int blocksize, size_t *cmpsize); + +unsigned char* SZ_device_compress(float *data, size_t num_elements, int blocksize, size_t *outsize); +float* SZ_device_decompress(unsigned char *cmpbytes, size_t num_elements, int blocksize, size_t *cmpsize); diff --git a/qtensor/compression/newsz/newsz_wrapper.cu b/qtensor/compression/newsz/newsz_wrapper.cu index d067560d..a692af9b 100644 --- a/qtensor/compression/newsz/newsz_wrapper.cu +++ b/qtensor/compression/newsz/newsz_wrapper.cu @@ -1,21 +1,21 @@ -#include "newsz.h" -#include - -extern "C"{ - - unsigned char* newSZ_device_compress(float *oriData, size_t *outSize, size_t nbEle, int blockSize){ - //unsigned char* cmpbytes; - return SZ_device_compress(oriData, nbEle, blockSize, outSize); - //printf("in wrap cmpbytes: %p\n", cmpbytes); - //return cmpbytes; - } - - float* newSZ_device_decompress(size_t nbEle, unsigned char* cmpBytes, int blocksize, size_t cmpsize){ - size_t *cmpsize_ptr; - *cmpsize_ptr = cmpsize; - - float *res = SZ_device_decompress(cmpBytes, nbEle, blocksize, cmpsize_ptr); - return res; - } - -} +#include "newsz.h" +#include + +extern "C"{ + + unsigned char* newSZ_device_compress(float *oriData, size_t *outSize, size_t nbEle, int blockSize){ + //unsigned char* cmpbytes; + return SZ_device_compress(oriData, nbEle, blockSize, outSize); + //printf("in wrap cmpbytes: %p\n", cmpbytes); + //return cmpbytes; + } + + float* newSZ_device_decompress(size_t nbEle, unsigned char* cmpBytes, int blocksize, size_t cmpsize){ + size_t *cmpsize_ptr; + *cmpsize_ptr = cmpsize; + + float *res = SZ_device_decompress(cmpBytes, nbEle, blocksize, cmpsize_ptr); + return res; + } + +} diff --git a/qtensor/compression/newsz/newsz_wrapper.py b/qtensor/compression/newsz/newsz_wrapper.py index d40304fb..4cbc2692 100644 --- a/qtensor/compression/newsz/newsz_wrapper.py +++ b/qtensor/compression/newsz/newsz_wrapper.py @@ -1,161 +1,161 @@ -import numpy as np -import ctypes -from ctypes import * -import random -from qtensor.tools.lazy_import import cupy as cp -import time -import torch - -from pathlib import Path -LIB_PATH = str(Path(__file__).parent/'libnewsz_wrapper.so') - -NVCOMP_PATH = str(Path(__file__).parent/'libnvcomp.so') -#NVCOMP_PATH= './libnvcomp.so' -#LIB_PATH = './libnewsz_wrapper.so' - -# unsigned char* newSZ_device_compress(float *oriData, size_t *outSize, size_t nbEle, int blockSize) -def get_device_compress(): - dll_base = ctypes.CDLL(NVCOMP_PATH,mode=ctypes.RTLD_GLOBAL) - dll = ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) - func = dll.newSZ_device_compress - func.argtypes = [POINTER(c_float), POINTER(c_size_t), c_size_t, c_int] - func.restype = POINTER(c_ubyte) - return func - -# float* newSZ_device_decompress(size_t nbEle, unsigned char* cmpBytes, int blocksize, size_t cmpsize) -def get_device_decompress(): - - dll_base = ctypes.CDLL(NVCOMP_PATH,mode=ctypes.RTLD_GLOBAL) - dll = ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) - func = dll.newSZ_device_decompress - func.argtypes = [c_size_t, POINTER(c_ubyte), c_int, c_size_t] - func.restype = POINTER(c_float) - return func - - -def newsz_device_compress(oriData, nbEle, blockSize,threshold): - __cuszx_device_compress = get_device_compress() - ori_nbEle = nbEle - variable = ctypes.c_size_t(0) - outSize = ctypes.pointer(variable) - - oriData = oriData.flatten() - ori_real = oriData.real - ori_imag = oriData.imag - oriData = cp.concatenate((ori_real, ori_imag)) - sample = oriData[::2] - d = cp.amax(oriData) - cp.amin(oriData) - d = d.get() - if d.dtype == np.complex64: - d = d.real - threshold = threshold*(d) - truth_values = abs(oriData)<=threshold - oriData[truth_values] = 0.0 - nbEle = oriData.shape[0] - - - oriData_p = ctypes.cast(oriData.data.ptr, ctypes.POINTER(c_float)) - # newSZ_device_compress(float *oriData, size_t *outSize, size_t nbEle, int blockSize) - o_bytes = __cuszx_device_compress(oriData_p, outSize, np.ulonglong(nbEle), np.int32(blockSize)) - #print("testing") - #print(o_bytes.value) - return (o_bytes,outSize.contents.value, blockSize), outSize - - -def newsz_device_decompress(nbEle, cmpBytes, owner, dtype): - __cuszx_device_decompress=get_device_decompress() - (cmpBytes, cmpsize, blockSize) = cmpBytes - - nbEle_p = ctypes.c_size_t(nbEle) - # float* newSZ_device_decompress(size_t nbEle, unsigned char* cmpBytes, int blocksize, size_t cmpsize) - newData = __cuszx_device_decompress(nbEle_p, cmpBytes, np.int32(blockSize), ctypes.c_size_t(cmpsize)) - # decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, num_elements_eff) - # -- Workaround to convert GPU pointer to int - p_decompressed_ptr = ctypes.addressof(newData) - # cast to int64 pointer - # (effectively converting pointer to pointer to addr to pointer to int64) - p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) - decompressed_int = p_decompressed_int.contents - # -- - pointer_for_free = decompressed_int.value - # self.decompressed_own.append(decompressed_int.value) - mem = cp.cuda.UnownedMemory(decompressed_int.value, nbEle*4, owner, device_id=0) - mem_ptr = cp.cuda.memory.MemoryPointer(mem, 0) - #print("mem ptr") - #print(mem_ptr) - arr = cp.ndarray(shape=(nbEle,), dtype=np.float32, memptr=mem_ptr) - # res = cp.zeros((nbEle,)) - # ## need to convert newData to cupy - # cp.place(res,bitmap,arr) - - c_res = cp.zeros(int(nbEle/2), np.complex64) - c_res.real = arr[0:int(nbEle/2)] - c_res.imag = arr[int(nbEle/2):] - return (c_res, pointer_for_free) - -### Example of device compress/decompress wrapper usage -class Comp(): - def __init__(self): - self.name = "dummy" - -def free_compressed(ptr): - p_ptr = ctypes.addressof(ptr) - p_int = ctypes.cast(p_ptr, ctypes.POINTER(ctypes.c_uint64)) - decomp_int = p_int.contents - cp.cuda.runtime.free(decomp_int.value) - - -if __name__ == "__main__": - - DATA_SIZE = int(1024) - MAX_D = 10.0 - MIN_D = -10.0 - RANGE = MAX_D - MIN_D - r2r_threshold = 0.002 - r2r_error = 0.0001 - - in_vector = np.fromfile("all_sample.bin", dtype=np.complex64) - #print(np.max(in_vector)) - DATA_SIZE = len(in_vector) - #range_vr = np.max(in_vector)-np.min(in_vector) - #r2r_threshold = r2r_threshold*range_vr - #r2r_error = r2r_error*range_vr - #in_vector = np.zeros((DATA_SIZE,)) - #for i in range(0,int(DATA_SIZE/4)): - # in_vector[i] = 0.0 - #for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): - # in_vector[i] = 5.0 - #for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): - # in_vector[i] = random.uniform(MIN_D, MAX_D) - #for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): - # in_vector[i] = -7.0 - #for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): - # in_vector[i] = 0.001 - - print(DATA_SIZE) - #in_vector = in_vector.astype('float32') - in_vector_gpu = cp.asarray(in_vector) - - # variable = ctypes.c_size_t(0) - # outSize = ctypes.pointer(variable) - #print(in_vector[0:16]) - for i in range(200): - s_time = time.time() - #o_bytes, outSize = cusz_device_compress(in_vector_gpu, r2r_error, DATA_SIZE, 256, r2r_threshold) - - o_bytes, outSize = newsz_device_compress(in_vector_gpu, DATA_SIZE, 256,r2r_threshold) - print("Time python: "+str(time.time()-s_time)) - print(outSize[0]) - print("Compress Success...starting decompress ") - comp = Comp() - - s_time = time.time() - #(d_bytes,ptr )= cusz_device_decompress(DATA_SIZE*2, o_bytes, comp, in_vector_gpu.dtype) - - (d_bytes, ptr) = newsz_device_decompress(DATA_SIZE*2, o_bytes, comp, in_vector_gpu.dtype) - free_compressed(o_bytes[0]) - cp.cuda.runtime.free(ptr) - print("Time python: "+str(time.time()-s_time)) - #for i in d_bytes: - # print(i) - print("Decompress Success") +import numpy as np +import ctypes +from ctypes import * +import random +from qtensor.tools.lazy_import import cupy as cp +import time +import torch + +from pathlib import Path +LIB_PATH = str(Path(__file__).parent/'libnewsz_wrapper.so') + +NVCOMP_PATH = str(Path(__file__).parent/'libnvcomp.so') +#NVCOMP_PATH= './libnvcomp.so' +#LIB_PATH = './libnewsz_wrapper.so' + +# unsigned char* newSZ_device_compress(float *oriData, size_t *outSize, size_t nbEle, int blockSize) +def get_device_compress(): + dll_base = ctypes.CDLL(NVCOMP_PATH,mode=ctypes.RTLD_GLOBAL) + dll = ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) + func = dll.newSZ_device_compress + func.argtypes = [POINTER(c_float), POINTER(c_size_t), c_size_t, c_int] + func.restype = POINTER(c_ubyte) + return func + +# float* newSZ_device_decompress(size_t nbEle, unsigned char* cmpBytes, int blocksize, size_t cmpsize) +def get_device_decompress(): + + dll_base = ctypes.CDLL(NVCOMP_PATH,mode=ctypes.RTLD_GLOBAL) + dll = ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) + func = dll.newSZ_device_decompress + func.argtypes = [c_size_t, POINTER(c_ubyte), c_int, c_size_t] + func.restype = POINTER(c_float) + return func + + +def newsz_device_compress(oriData, nbEle, blockSize,threshold): + __cuszx_device_compress = get_device_compress() + ori_nbEle = nbEle + variable = ctypes.c_size_t(0) + outSize = ctypes.pointer(variable) + + oriData = oriData.flatten() + ori_real = oriData.real + ori_imag = oriData.imag + oriData = cp.concatenate((ori_real, ori_imag)) + sample = oriData[::2] + d = cp.amax(oriData) - cp.amin(oriData) + d = d.get() + if d.dtype == np.complex64: + d = d.real + threshold = threshold*(d) + truth_values = abs(oriData)<=threshold + oriData[truth_values] = 0.0 + nbEle = oriData.shape[0] + + + oriData_p = ctypes.cast(oriData.data.ptr, ctypes.POINTER(c_float)) + # newSZ_device_compress(float *oriData, size_t *outSize, size_t nbEle, int blockSize) + o_bytes = __cuszx_device_compress(oriData_p, outSize, np.ulonglong(nbEle), np.int32(blockSize)) + #print("testing") + #print(o_bytes.value) + return (o_bytes,outSize.contents.value, blockSize), outSize + + +def newsz_device_decompress(nbEle, cmpBytes, owner, dtype): + __cuszx_device_decompress=get_device_decompress() + (cmpBytes, cmpsize, blockSize) = cmpBytes + + nbEle_p = ctypes.c_size_t(nbEle) + # float* newSZ_device_decompress(size_t nbEle, unsigned char* cmpBytes, int blocksize, size_t cmpsize) + newData = __cuszx_device_decompress(nbEle_p, cmpBytes, np.int32(blockSize), ctypes.c_size_t(cmpsize)) + # decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, num_elements_eff) + # -- Workaround to convert GPU pointer to int + p_decompressed_ptr = ctypes.addressof(newData) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + decompressed_int = p_decompressed_int.contents + # -- + pointer_for_free = decompressed_int.value + # self.decompressed_own.append(decompressed_int.value) + mem = cp.cuda.UnownedMemory(decompressed_int.value, nbEle*4, owner, device_id=0) + mem_ptr = cp.cuda.memory.MemoryPointer(mem, 0) + #print("mem ptr") + #print(mem_ptr) + arr = cp.ndarray(shape=(nbEle,), dtype=np.float32, memptr=mem_ptr) + # res = cp.zeros((nbEle,)) + # ## need to convert newData to cupy + # cp.place(res,bitmap,arr) + + c_res = cp.zeros(int(nbEle/2), np.complex64) + c_res.real = arr[0:int(nbEle/2)] + c_res.imag = arr[int(nbEle/2):] + return (c_res, pointer_for_free) + +### Example of device compress/decompress wrapper usage +class Comp(): + def __init__(self): + self.name = "dummy" + +def free_compressed(ptr): + p_ptr = ctypes.addressof(ptr) + p_int = ctypes.cast(p_ptr, ctypes.POINTER(ctypes.c_uint64)) + decomp_int = p_int.contents + cp.cuda.runtime.free(decomp_int.value) + + +if __name__ == "__main__": + + DATA_SIZE = int(1024) + MAX_D = 10.0 + MIN_D = -10.0 + RANGE = MAX_D - MIN_D + r2r_threshold = 0.002 + r2r_error = 0.0001 + + in_vector = np.fromfile("all_sample.bin", dtype=np.complex64) + #print(np.max(in_vector)) + DATA_SIZE = len(in_vector) + #range_vr = np.max(in_vector)-np.min(in_vector) + #r2r_threshold = r2r_threshold*range_vr + #r2r_error = r2r_error*range_vr + #in_vector = np.zeros((DATA_SIZE,)) + #for i in range(0,int(DATA_SIZE/4)): + # in_vector[i] = 0.0 + #for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): + # in_vector[i] = 5.0 + #for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): + # in_vector[i] = random.uniform(MIN_D, MAX_D) + #for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): + # in_vector[i] = -7.0 + #for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): + # in_vector[i] = 0.001 + + print(DATA_SIZE) + #in_vector = in_vector.astype('float32') + in_vector_gpu = cp.asarray(in_vector) + + # variable = ctypes.c_size_t(0) + # outSize = ctypes.pointer(variable) + #print(in_vector[0:16]) + for i in range(200): + s_time = time.time() + #o_bytes, outSize = cusz_device_compress(in_vector_gpu, r2r_error, DATA_SIZE, 256, r2r_threshold) + + o_bytes, outSize = newsz_device_compress(in_vector_gpu, DATA_SIZE, 256,r2r_threshold) + print("Time python: "+str(time.time()-s_time)) + print(outSize[0]) + print("Compress Success...starting decompress ") + comp = Comp() + + s_time = time.time() + #(d_bytes,ptr )= cusz_device_decompress(DATA_SIZE*2, o_bytes, comp, in_vector_gpu.dtype) + + (d_bytes, ptr) = newsz_device_decompress(DATA_SIZE*2, o_bytes, comp, in_vector_gpu.dtype) + free_compressed(o_bytes[0]) + cp.cuda.runtime.free(ptr) + print("Time python: "+str(time.time()-s_time)) + #for i in d_bytes: + # print(i) + print("Decompress Success") diff --git a/qtensor/compression/szp/include/cuSZp.h b/qtensor/compression/szp/include/cuSZp.h index 0a168f34..d94e2943 100644 --- a/qtensor/compression/szp/include/cuSZp.h +++ b/qtensor/compression/szp/include/cuSZp.h @@ -1,12 +1,12 @@ -#ifndef CUSZP_INCLUDE_CUSZP_H -#define CUSZP_INCLUDE_CUSZP_H - -static const int cmp_tblock_size = 32; // 32 should be the best, not need to modify. -static const int dec_tblock_size = 32; // 32 should be the best, not need to modify. -static const int cmp_chunk = 8192; -static const int dec_chunk = 8192; - -__global__ void SZp_compress_kernel(const float* const __restrict__ oriData, unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle); -__global__ void SZp_decompress_kernel(float* const __restrict__ decData, const unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle); - +#ifndef CUSZP_INCLUDE_CUSZP_H +#define CUSZP_INCLUDE_CUSZP_H + +static const int cmp_tblock_size = 32; // 32 should be the best, not need to modify. +static const int dec_tblock_size = 32; // 32 should be the best, not need to modify. +static const int cmp_chunk = 8192; +static const int dec_chunk = 8192; + +__global__ void SZp_compress_kernel(const float* const __restrict__ oriData, unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle); +__global__ void SZp_decompress_kernel(float* const __restrict__ decData, const unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle); + #endif // CUSZP_INCLUDE_CUSZP_H \ No newline at end of file diff --git a/qtensor/compression/szp/include/cuSZp_entry.h b/qtensor/compression/szp/include/cuSZp_entry.h index 5acd97a5..fcdcb420 100644 --- a/qtensor/compression/szp/include/cuSZp_entry.h +++ b/qtensor/compression/szp/include/cuSZp_entry.h @@ -1,12 +1,12 @@ -#ifndef CUSZP_INCLUDE_CUSZP_ENTRY_H -#define CUSZP_INCLUDE_CUSZP_ENTRY_H - -#include - -void SZp_compress_hostptr(float* oriData, unsigned char* cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound); -void SZp_decompress_hostptr(float* decData, unsigned char* cmpBytes, size_t nbEle, size_t cmpSize, float errorBound); -extern "C" void SZp_compress_deviceptr(float* d_oriData, unsigned char* d_cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound, cudaStream_t stream = 0); -void SZp_dev_new(float* d_oriData, unsigned char* d_cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound, cudaStream_t stream = 0); -extern "C" void SZp_decompress_deviceptr(float* d_decData, unsigned char* d_cmpBytes, size_t nbEle, size_t cmpSize, float errorBound, cudaStream_t stream = 0); - +#ifndef CUSZP_INCLUDE_CUSZP_ENTRY_H +#define CUSZP_INCLUDE_CUSZP_ENTRY_H + +#include + +void SZp_compress_hostptr(float* oriData, unsigned char* cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound); +void SZp_decompress_hostptr(float* decData, unsigned char* cmpBytes, size_t nbEle, size_t cmpSize, float errorBound); +extern "C" void SZp_compress_deviceptr(float* d_oriData, unsigned char* d_cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound, cudaStream_t stream = 0); +void SZp_dev_new(float* d_oriData, unsigned char* d_cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound, cudaStream_t stream = 0); +extern "C" void SZp_decompress_deviceptr(float* d_decData, unsigned char* d_cmpBytes, size_t nbEle, size_t cmpSize, float errorBound, cudaStream_t stream = 0); + #endif // CUSZP_INCLUDE_CUSZP_ENTRY_H \ No newline at end of file diff --git a/qtensor/compression/szp/include/cuSZp_timer.h b/qtensor/compression/szp/include/cuSZp_timer.h index faca61c3..2777a919 100644 --- a/qtensor/compression/szp/include/cuSZp_timer.h +++ b/qtensor/compression/szp/include/cuSZp_timer.h @@ -1,31 +1,31 @@ -#ifndef CUSZP_INCLUDE_CUSZP_TIMER_H -#define CUSZP_INCLUDE_CUSZP_TIMER_H - -#include -#include - -struct PrivateTimingGPU { - cudaEvent_t start; - cudaEvent_t stop; -}; - -class TimingGPU -{ - private: - PrivateTimingGPU *privateTimingGPU; - - public: - - TimingGPU(); - - ~TimingGPU(); - - void StartCounter(); - - void StartCounterFlags(); - - float GetCounter(); - -}; - +#ifndef CUSZP_INCLUDE_CUSZP_TIMER_H +#define CUSZP_INCLUDE_CUSZP_TIMER_H + +#include +#include + +struct PrivateTimingGPU { + cudaEvent_t start; + cudaEvent_t stop; +}; + +class TimingGPU +{ + private: + PrivateTimingGPU *privateTimingGPU; + + public: + + TimingGPU(); + + ~TimingGPU(); + + void StartCounter(); + + void StartCounterFlags(); + + float GetCounter(); + +}; + #endif // CUSZP_INCLUDE_CUSZP_TIMER_H \ No newline at end of file diff --git a/qtensor/compression/szp/include/cuSZp_utility.h b/qtensor/compression/szp/include/cuSZp_utility.h index e698633f..32af7040 100644 --- a/qtensor/compression/szp/include/cuSZp_utility.h +++ b/qtensor/compression/szp/include/cuSZp_utility.h @@ -1,14 +1,14 @@ -#ifndef CUSZP_INCLUDE_CUSZP_UTILITY_H -#define CUSZP_INCLUDE_CUSZP_UTILITY_H - -void symTransForm_4Bytes(unsigned char data[4]); -unsigned char *readByteData_Yafan(char *srcFilePath, size_t *byteLength, int *status); -float *readFloatData_systemEndian_Yafan(char *srcFilePath, size_t *nbEle, int *status); -float *readFloatData_Yafan(char *srcFilePath, size_t *nbEle, int *status); -void writeByteData_Yafan(unsigned char *bytes, size_t byteLength, char *tgtFilePath, int *status); -void writeFloatData_inBytes_Yafan(float *data, size_t nbEle, char* tgtFilePath, int *status); -double SSIM_3d_calcWindow_float(float* data, float* other, size_t size1, size_t size0, int offset0, int offset1, int offset2, int windowSize0, int windowSize1, int windowSize2); -double computeSSIM(float* oriData, float* decData, size_t size2, size_t size1, size_t size0); -double *computePSNR(size_t nbEle, float *ori_data, float *data); - +#ifndef CUSZP_INCLUDE_CUSZP_UTILITY_H +#define CUSZP_INCLUDE_CUSZP_UTILITY_H + +void symTransForm_4Bytes(unsigned char data[4]); +unsigned char *readByteData_Yafan(char *srcFilePath, size_t *byteLength, int *status); +float *readFloatData_systemEndian_Yafan(char *srcFilePath, size_t *nbEle, int *status); +float *readFloatData_Yafan(char *srcFilePath, size_t *nbEle, int *status); +void writeByteData_Yafan(unsigned char *bytes, size_t byteLength, char *tgtFilePath, int *status); +void writeFloatData_inBytes_Yafan(float *data, size_t nbEle, char* tgtFilePath, int *status); +double SSIM_3d_calcWindow_float(float* data, float* other, size_t size1, size_t size0, int offset0, int offset1, int offset2, int windowSize0, int windowSize1, int windowSize2); +double computeSSIM(float* oriData, float* decData, size_t size2, size_t size1, size_t size0); +double *computePSNR(size_t nbEle, float *ori_data, float *data); + #endif // CUSZP_INCLUDE_CUSZP_UTILITY_H \ No newline at end of file diff --git a/qtensor/compression/szp/src/cuSZp.cu b/qtensor/compression/szp/src/cuSZp.cu index c58cf21f..f506ee97 100644 --- a/qtensor/compression/szp/src/cuSZp.cu +++ b/qtensor/compression/szp/src/cuSZp.cu @@ -1,393 +1,393 @@ -#include "cuSZp.h" - -__device__ inline int quantization(float data, float recipPrecision) -{ - float dataRecip = data*recipPrecision; - int s = dataRecip>=-0.5f?0:1; - return (int)(dataRecip+0.5f) - s; -} - - -__device__ inline int get_bit_num(unsigned int x) -{ - return (sizeof(unsigned int)*8) - __clz(x); -} - - -__global__ void SZp_compress_kernel(const float* const __restrict__ oriData, unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle) -{ - __shared__ unsigned int base_idx; - - const int tid = threadIdx.x; - const int bid = blockIdx.x; - const int idx = bid * blockDim.x + tid; - const int lane = idx & 31; - const int warp = idx >> 5; - const int block_num = cmp_chunk/32; - const int rate_ofs = (nbEle+31)/32; - const float recipPrecision = 0.5f/eb; - - int base_start_idx; - int base_block_start_idx, base_block_end_idx; - int quant_chunk_idx; - int block_idx; - int currQuant, lorenQuant, prevQuant, maxQuant; - int absQuant[cmp_chunk]; - unsigned int sign_flag[block_num]; - int sign_ofs; - int fixed_rate[block_num]; - unsigned int thread_ofs = 0; - - // Prequantization + Lorenzo Prediction + Fixed-length encoding + store fixed-length to global memory. - base_start_idx = warp * cmp_chunk * 32; - for(int j=0; j absQuant[quant_chunk_idx] ? maxQuant : absQuant[quant_chunk_idx]; - } - - // Record block info. - fixed_rate[j] = get_bit_num(maxQuant); - thread_ofs += (fixed_rate[j]) ? (32+fixed_rate[j]*32) : 0; - // Write block fixed rate to compressed data. - if(block_idx= i) thread_ofs += tmp; - } - __syncthreads(); - - // Write warp(i.e. thread-block)-level prefix-sum to global-memory. - if(lane==31) - { - cmpOffset[warp+1] = (thread_ofs+7)/8; - if(warp==0) - flag[1] = 2; - else - flag[warp+1] = 1; - } - __syncthreads(); - - // Global-level prefix-sum (exclusive). - if(warp>0) - { - if(!lane) - { - int temp_flag = 1; - while(temp_flag!=2) temp_flag = flag[warp]; - __threadfence(); - cmpOffset[warp] += cmpOffset[warp-1]; - __threadfence(); - flag[warp+1] = 2; - } - } - else - { - if(!lane) cmpOffset[0] = 0; - } - __syncthreads(); - - // Assigning compression bytes by given prefix-sum results. - if(!lane) base_idx = cmpOffset[warp] + rate_ofs; - __syncthreads(); - - // Bit shuffle for each index, also storing data to global memory. - unsigned int base_cmp_byte_ofs = base_idx; - unsigned int cmp_byte_ofs; - unsigned int tmp_byte_ofs = 0; - unsigned int cur_byte_ofs = 0; - for(int j=0; j= i) tmp_byte_ofs += tmp; - } - unsigned int prev_thread = __shfl_up_sync(0xffffffff, tmp_byte_ofs, 1); - if(!lane) cmp_byte_ofs = base_cmp_byte_ofs + cur_byte_ofs; - else cmp_byte_ofs = base_cmp_byte_ofs + cur_byte_ofs + prev_thread; - - // Operation for each block, if zero block then do nothing. - if(fixed_rate[j]) - { - // Assign sign information for one block. - cmpData[cmp_byte_ofs++] = 0xff & (sign_flag[j] >> 24); - cmpData[cmp_byte_ofs++] = 0xff & (sign_flag[j] >> 16); - cmpData[cmp_byte_ofs++] = 0xff & (sign_flag[j] >> 8); - cmpData[cmp_byte_ofs++] = 0xff & sign_flag[j]; - - // Assign quant bit information for one block by bit-shuffle. - unsigned char tmp_char0, tmp_char1, tmp_char2, tmp_char3; - int mask = 1; - for(int i=0; i> i) << 7) | - (((absQuant[chunk_idx_start+1] & mask) >> i) << 6) | - (((absQuant[chunk_idx_start+2] & mask) >> i) << 5) | - (((absQuant[chunk_idx_start+3] & mask) >> i) << 4) | - (((absQuant[chunk_idx_start+4] & mask) >> i) << 3) | - (((absQuant[chunk_idx_start+5] & mask) >> i) << 2) | - (((absQuant[chunk_idx_start+6] & mask) >> i) << 1) | - (((absQuant[chunk_idx_start+7] & mask) >> i) << 0); - - // Get ith bit in 8~15 quant, and store to tmp_char1. - tmp_char1 = (((absQuant[chunk_idx_start+8] & mask) >> i) << 7) | - (((absQuant[chunk_idx_start+9] & mask) >> i) << 6) | - (((absQuant[chunk_idx_start+10] & mask) >> i) << 5) | - (((absQuant[chunk_idx_start+11] & mask) >> i) << 4) | - (((absQuant[chunk_idx_start+12] & mask) >> i) << 3) | - (((absQuant[chunk_idx_start+13] & mask) >> i) << 2) | - (((absQuant[chunk_idx_start+14] & mask) >> i) << 1) | - (((absQuant[chunk_idx_start+15] & mask) >> i) << 0); - - // Get ith bit in 16~23 quant, and store to tmp_char2. - tmp_char2 = (((absQuant[chunk_idx_start+16] & mask) >> i) << 7) | - (((absQuant[chunk_idx_start+17] & mask) >> i) << 6) | - (((absQuant[chunk_idx_start+18] & mask) >> i) << 5) | - (((absQuant[chunk_idx_start+19] & mask) >> i) << 4) | - (((absQuant[chunk_idx_start+20] & mask) >> i) << 3) | - (((absQuant[chunk_idx_start+21] & mask) >> i) << 2) | - (((absQuant[chunk_idx_start+22] & mask) >> i) << 1) | - (((absQuant[chunk_idx_start+23] & mask) >> i) << 0); - - // Get ith bit in 24-31 quant, and store to tmp_char3. - tmp_char3 = (((absQuant[chunk_idx_start+24] & mask) >> i) << 7) | - (((absQuant[chunk_idx_start+25] & mask) >> i) << 6) | - (((absQuant[chunk_idx_start+26] & mask) >> i) << 5) | - (((absQuant[chunk_idx_start+27] & mask) >> i) << 4) | - (((absQuant[chunk_idx_start+28] & mask) >> i) << 3) | - (((absQuant[chunk_idx_start+29] & mask) >> i) << 2) | - (((absQuant[chunk_idx_start+30] & mask) >> i) << 1) | - (((absQuant[chunk_idx_start+31] & mask) >> i) << 0); - - // Move data to global memory. - cmpData[cmp_byte_ofs++] = tmp_char0; - cmpData[cmp_byte_ofs++] = tmp_char1; - cmpData[cmp_byte_ofs++] = tmp_char2; - cmpData[cmp_byte_ofs++] = tmp_char3; - mask <<= 1; - } - } - - // Index updating across different iterations. - cur_byte_ofs += __shfl_sync(0xffffffff, tmp_byte_ofs, 31); - } -} - - - -__global__ void SZp_decompress_kernel(float* const __restrict__ decData, const unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle) -{ - __shared__ unsigned int base_idx; - - const int tid = threadIdx.x; - const int bid = blockIdx.x; - const int idx = bid * blockDim.x + tid; - const int lane = idx & 31; - const int warp = idx >> 5; - const int block_num = dec_chunk/32; - const int rate_ofs = (nbEle+31)/32; - - int base_start_idx; - int base_block_start_idx; - int block_idx; - int absQuant[32]; - int currQuant, lorenQuant, prevQuant; - int sign_ofs; - int fixed_rate[block_num]; - unsigned int thread_ofs = 0; - - // Obtain fixed rate information for each block. - for(int j=0; j= i) thread_ofs += tmp; - } - __syncthreads(); - - // Write warp(i.e. thread-block)-level prefix-sum to global-memory. - if(lane==31) - { - cmpOffset[warp+1] = (thread_ofs+7)/8; - if(warp==0) - flag[1] = 2; - else - flag[warp+1] = 1; - } - __syncthreads(); - - // Global-level prefix-sum (exclusive). - if(warp>0) - { - if(!lane) - { - int temp_flag = 1; - while(temp_flag!=2) temp_flag = flag[warp]; - __threadfence(); - cmpOffset[warp] += cmpOffset[warp-1]; - __threadfence(); - flag[warp+1] = 2; - } - } - else - { - if(!lane) cmpOffset[0] = 0; - } - __syncthreads(); - - // Retrieving compression bytes and reconstruct decompression data. - if(!lane) base_idx = cmpOffset[warp] + rate_ofs; - __syncthreads(); - - // Restore bit-shuffle for each block. - unsigned int base_cmp_byte_ofs = base_idx; - unsigned int cmp_byte_ofs; - unsigned int tmp_byte_ofs = 0; - unsigned int cur_byte_ofs = 0; - base_start_idx = warp * dec_chunk * 32; - for(int j=0; j= i) tmp_byte_ofs += tmp; - } - unsigned int prev_thread = __shfl_up_sync(0xffffffff, tmp_byte_ofs, 1); - if(!lane) cmp_byte_ofs = base_cmp_byte_ofs + cur_byte_ofs; - else cmp_byte_ofs = base_cmp_byte_ofs + cur_byte_ofs + prev_thread; - - // Operation for each block, if zero block then do nothing. - if(fixed_rate[j]) - { - // Retrieve sign information for one block. - sign_flag = (0xff000000 & (cmpData[cmp_byte_ofs++] << 24)) | - (0x00ff0000 & (cmpData[cmp_byte_ofs++] << 16)) | - (0x0000ff00 & (cmpData[cmp_byte_ofs++] << 8)) | - (0x000000ff & cmpData[cmp_byte_ofs++]); - - // Retrieve quant data for one block. - unsigned char tmp_char0, tmp_char1, tmp_char2, tmp_char3; - for(int i=0; i<32; i++) absQuant[i] = 0; - for(int i=0; i> 7) & 0x00000001) << i; - absQuant[1] |= ((tmp_char0 >> 6) & 0x00000001) << i; - absQuant[2] |= ((tmp_char0 >> 5) & 0x00000001) << i; - absQuant[3] |= ((tmp_char0 >> 4) & 0x00000001) << i; - absQuant[4] |= ((tmp_char0 >> 3) & 0x00000001) << i; - absQuant[5] |= ((tmp_char0 >> 2) & 0x00000001) << i; - absQuant[6] |= ((tmp_char0 >> 1) & 0x00000001) << i; - absQuant[7] |= ((tmp_char0 >> 0) & 0x00000001) << i; - - // Get ith bit in 8~15 abs quant from global memory. - absQuant[8] |= ((tmp_char1 >> 7) & 0x00000001) << i; - absQuant[9] |= ((tmp_char1 >> 6) & 0x00000001) << i; - absQuant[10] |= ((tmp_char1 >> 5) & 0x00000001) << i; - absQuant[11] |= ((tmp_char1 >> 4) & 0x00000001) << i; - absQuant[12] |= ((tmp_char1 >> 3) & 0x00000001) << i; - absQuant[13] |= ((tmp_char1 >> 2) & 0x00000001) << i; - absQuant[14] |= ((tmp_char1 >> 1) & 0x00000001) << i; - absQuant[15] |= ((tmp_char1 >> 0) & 0x00000001) << i; - - // Get ith bit in 16-23 abs quant from global memory. - absQuant[16] |= ((tmp_char2 >> 7) & 0x00000001) << i; - absQuant[17] |= ((tmp_char2 >> 6) & 0x00000001) << i; - absQuant[18] |= ((tmp_char2 >> 5) & 0x00000001) << i; - absQuant[19] |= ((tmp_char2 >> 4) & 0x00000001) << i; - absQuant[20] |= ((tmp_char2 >> 3) & 0x00000001) << i; - absQuant[21] |= ((tmp_char2 >> 2) & 0x00000001) << i; - absQuant[22] |= ((tmp_char2 >> 1) & 0x00000001) << i; - absQuant[23] |= ((tmp_char2 >> 0) & 0x00000001) << i; - - // // Get ith bit in 24-31 abs quant from global memory. - absQuant[24] |= ((tmp_char3 >> 7) & 0x00000001) << i; - absQuant[25] |= ((tmp_char3 >> 6) & 0x00000001) << i; - absQuant[26] |= ((tmp_char3 >> 5) & 0x00000001) << i; - absQuant[27] |= ((tmp_char3 >> 4) & 0x00000001) << i; - absQuant[28] |= ((tmp_char3 >> 3) & 0x00000001) << i; - absQuant[29] |= ((tmp_char3 >> 2) & 0x00000001) << i; - absQuant[30] |= ((tmp_char3 >> 1) & 0x00000001) << i; - absQuant[31] |= ((tmp_char3 >> 0) & 0x00000001) << i; - } - - // Delorenzo and store data back to decompression data. - prevQuant = 0; - for(int i=0; i<32; i++) - { - sign_ofs = i % 32; - if(sign_flag & (1 << (31 - sign_ofs))) - lorenQuant = absQuant[i] * -1; - else - lorenQuant = absQuant[i]; - currQuant = lorenQuant + prevQuant; - decData[base_block_start_idx+i] = currQuant * eb * 2; - prevQuant = currQuant; - } - } - - // Index updating across different iterations. - cur_byte_ofs += __shfl_sync(0xffffffff, tmp_byte_ofs, 31); - } +#include "cuSZp.h" + +__device__ inline int quantization(float data, float recipPrecision) +{ + float dataRecip = data*recipPrecision; + int s = dataRecip>=-0.5f?0:1; + return (int)(dataRecip+0.5f) - s; +} + + +__device__ inline int get_bit_num(unsigned int x) +{ + return (sizeof(unsigned int)*8) - __clz(x); +} + + +__global__ void SZp_compress_kernel(const float* const __restrict__ oriData, unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle) +{ + __shared__ unsigned int base_idx; + + const int tid = threadIdx.x; + const int bid = blockIdx.x; + const int idx = bid * blockDim.x + tid; + const int lane = idx & 31; + const int warp = idx >> 5; + const int block_num = cmp_chunk/32; + const int rate_ofs = (nbEle+31)/32; + const float recipPrecision = 0.5f/eb; + + int base_start_idx; + int base_block_start_idx, base_block_end_idx; + int quant_chunk_idx; + int block_idx; + int currQuant, lorenQuant, prevQuant, maxQuant; + int absQuant[cmp_chunk]; + unsigned int sign_flag[block_num]; + int sign_ofs; + int fixed_rate[block_num]; + unsigned int thread_ofs = 0; + + // Prequantization + Lorenzo Prediction + Fixed-length encoding + store fixed-length to global memory. + base_start_idx = warp * cmp_chunk * 32; + for(int j=0; j absQuant[quant_chunk_idx] ? maxQuant : absQuant[quant_chunk_idx]; + } + + // Record block info. + fixed_rate[j] = get_bit_num(maxQuant); + thread_ofs += (fixed_rate[j]) ? (32+fixed_rate[j]*32) : 0; + // Write block fixed rate to compressed data. + if(block_idx= i) thread_ofs += tmp; + } + __syncthreads(); + + // Write warp(i.e. thread-block)-level prefix-sum to global-memory. + if(lane==31) + { + cmpOffset[warp+1] = (thread_ofs+7)/8; + if(warp==0) + flag[1] = 2; + else + flag[warp+1] = 1; + } + __syncthreads(); + + // Global-level prefix-sum (exclusive). + if(warp>0) + { + if(!lane) + { + int temp_flag = 1; + while(temp_flag!=2) temp_flag = flag[warp]; + __threadfence(); + cmpOffset[warp] += cmpOffset[warp-1]; + __threadfence(); + flag[warp+1] = 2; + } + } + else + { + if(!lane) cmpOffset[0] = 0; + } + __syncthreads(); + + // Assigning compression bytes by given prefix-sum results. + if(!lane) base_idx = cmpOffset[warp] + rate_ofs; + __syncthreads(); + + // Bit shuffle for each index, also storing data to global memory. + unsigned int base_cmp_byte_ofs = base_idx; + unsigned int cmp_byte_ofs; + unsigned int tmp_byte_ofs = 0; + unsigned int cur_byte_ofs = 0; + for(int j=0; j= i) tmp_byte_ofs += tmp; + } + unsigned int prev_thread = __shfl_up_sync(0xffffffff, tmp_byte_ofs, 1); + if(!lane) cmp_byte_ofs = base_cmp_byte_ofs + cur_byte_ofs; + else cmp_byte_ofs = base_cmp_byte_ofs + cur_byte_ofs + prev_thread; + + // Operation for each block, if zero block then do nothing. + if(fixed_rate[j]) + { + // Assign sign information for one block. + cmpData[cmp_byte_ofs++] = 0xff & (sign_flag[j] >> 24); + cmpData[cmp_byte_ofs++] = 0xff & (sign_flag[j] >> 16); + cmpData[cmp_byte_ofs++] = 0xff & (sign_flag[j] >> 8); + cmpData[cmp_byte_ofs++] = 0xff & sign_flag[j]; + + // Assign quant bit information for one block by bit-shuffle. + unsigned char tmp_char0, tmp_char1, tmp_char2, tmp_char3; + int mask = 1; + for(int i=0; i> i) << 7) | + (((absQuant[chunk_idx_start+1] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+2] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+3] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+4] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+5] & mask) >> i) << 2) | + (((absQuant[chunk_idx_start+6] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+7] & mask) >> i) << 0); + + // Get ith bit in 8~15 quant, and store to tmp_char1. + tmp_char1 = (((absQuant[chunk_idx_start+8] & mask) >> i) << 7) | + (((absQuant[chunk_idx_start+9] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+10] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+11] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+12] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+13] & mask) >> i) << 2) | + (((absQuant[chunk_idx_start+14] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+15] & mask) >> i) << 0); + + // Get ith bit in 16~23 quant, and store to tmp_char2. + tmp_char2 = (((absQuant[chunk_idx_start+16] & mask) >> i) << 7) | + (((absQuant[chunk_idx_start+17] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+18] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+19] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+20] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+21] & mask) >> i) << 2) | + (((absQuant[chunk_idx_start+22] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+23] & mask) >> i) << 0); + + // Get ith bit in 24-31 quant, and store to tmp_char3. + tmp_char3 = (((absQuant[chunk_idx_start+24] & mask) >> i) << 7) | + (((absQuant[chunk_idx_start+25] & mask) >> i) << 6) | + (((absQuant[chunk_idx_start+26] & mask) >> i) << 5) | + (((absQuant[chunk_idx_start+27] & mask) >> i) << 4) | + (((absQuant[chunk_idx_start+28] & mask) >> i) << 3) | + (((absQuant[chunk_idx_start+29] & mask) >> i) << 2) | + (((absQuant[chunk_idx_start+30] & mask) >> i) << 1) | + (((absQuant[chunk_idx_start+31] & mask) >> i) << 0); + + // Move data to global memory. + cmpData[cmp_byte_ofs++] = tmp_char0; + cmpData[cmp_byte_ofs++] = tmp_char1; + cmpData[cmp_byte_ofs++] = tmp_char2; + cmpData[cmp_byte_ofs++] = tmp_char3; + mask <<= 1; + } + } + + // Index updating across different iterations. + cur_byte_ofs += __shfl_sync(0xffffffff, tmp_byte_ofs, 31); + } +} + + + +__global__ void SZp_decompress_kernel(float* const __restrict__ decData, const unsigned char* const __restrict__ cmpData, volatile unsigned int* const __restrict__ cmpOffset, volatile int* const __restrict__ flag, const float eb, const size_t nbEle) +{ + __shared__ unsigned int base_idx; + + const int tid = threadIdx.x; + const int bid = blockIdx.x; + const int idx = bid * blockDim.x + tid; + const int lane = idx & 31; + const int warp = idx >> 5; + const int block_num = dec_chunk/32; + const int rate_ofs = (nbEle+31)/32; + + int base_start_idx; + int base_block_start_idx; + int block_idx; + int absQuant[32]; + int currQuant, lorenQuant, prevQuant; + int sign_ofs; + int fixed_rate[block_num]; + unsigned int thread_ofs = 0; + + // Obtain fixed rate information for each block. + for(int j=0; j= i) thread_ofs += tmp; + } + __syncthreads(); + + // Write warp(i.e. thread-block)-level prefix-sum to global-memory. + if(lane==31) + { + cmpOffset[warp+1] = (thread_ofs+7)/8; + if(warp==0) + flag[1] = 2; + else + flag[warp+1] = 1; + } + __syncthreads(); + + // Global-level prefix-sum (exclusive). + if(warp>0) + { + if(!lane) + { + int temp_flag = 1; + while(temp_flag!=2) temp_flag = flag[warp]; + __threadfence(); + cmpOffset[warp] += cmpOffset[warp-1]; + __threadfence(); + flag[warp+1] = 2; + } + } + else + { + if(!lane) cmpOffset[0] = 0; + } + __syncthreads(); + + // Retrieving compression bytes and reconstruct decompression data. + if(!lane) base_idx = cmpOffset[warp] + rate_ofs; + __syncthreads(); + + // Restore bit-shuffle for each block. + unsigned int base_cmp_byte_ofs = base_idx; + unsigned int cmp_byte_ofs; + unsigned int tmp_byte_ofs = 0; + unsigned int cur_byte_ofs = 0; + base_start_idx = warp * dec_chunk * 32; + for(int j=0; j= i) tmp_byte_ofs += tmp; + } + unsigned int prev_thread = __shfl_up_sync(0xffffffff, tmp_byte_ofs, 1); + if(!lane) cmp_byte_ofs = base_cmp_byte_ofs + cur_byte_ofs; + else cmp_byte_ofs = base_cmp_byte_ofs + cur_byte_ofs + prev_thread; + + // Operation for each block, if zero block then do nothing. + if(fixed_rate[j]) + { + // Retrieve sign information for one block. + sign_flag = (0xff000000 & (cmpData[cmp_byte_ofs++] << 24)) | + (0x00ff0000 & (cmpData[cmp_byte_ofs++] << 16)) | + (0x0000ff00 & (cmpData[cmp_byte_ofs++] << 8)) | + (0x000000ff & cmpData[cmp_byte_ofs++]); + + // Retrieve quant data for one block. + unsigned char tmp_char0, tmp_char1, tmp_char2, tmp_char3; + for(int i=0; i<32; i++) absQuant[i] = 0; + for(int i=0; i> 7) & 0x00000001) << i; + absQuant[1] |= ((tmp_char0 >> 6) & 0x00000001) << i; + absQuant[2] |= ((tmp_char0 >> 5) & 0x00000001) << i; + absQuant[3] |= ((tmp_char0 >> 4) & 0x00000001) << i; + absQuant[4] |= ((tmp_char0 >> 3) & 0x00000001) << i; + absQuant[5] |= ((tmp_char0 >> 2) & 0x00000001) << i; + absQuant[6] |= ((tmp_char0 >> 1) & 0x00000001) << i; + absQuant[7] |= ((tmp_char0 >> 0) & 0x00000001) << i; + + // Get ith bit in 8~15 abs quant from global memory. + absQuant[8] |= ((tmp_char1 >> 7) & 0x00000001) << i; + absQuant[9] |= ((tmp_char1 >> 6) & 0x00000001) << i; + absQuant[10] |= ((tmp_char1 >> 5) & 0x00000001) << i; + absQuant[11] |= ((tmp_char1 >> 4) & 0x00000001) << i; + absQuant[12] |= ((tmp_char1 >> 3) & 0x00000001) << i; + absQuant[13] |= ((tmp_char1 >> 2) & 0x00000001) << i; + absQuant[14] |= ((tmp_char1 >> 1) & 0x00000001) << i; + absQuant[15] |= ((tmp_char1 >> 0) & 0x00000001) << i; + + // Get ith bit in 16-23 abs quant from global memory. + absQuant[16] |= ((tmp_char2 >> 7) & 0x00000001) << i; + absQuant[17] |= ((tmp_char2 >> 6) & 0x00000001) << i; + absQuant[18] |= ((tmp_char2 >> 5) & 0x00000001) << i; + absQuant[19] |= ((tmp_char2 >> 4) & 0x00000001) << i; + absQuant[20] |= ((tmp_char2 >> 3) & 0x00000001) << i; + absQuant[21] |= ((tmp_char2 >> 2) & 0x00000001) << i; + absQuant[22] |= ((tmp_char2 >> 1) & 0x00000001) << i; + absQuant[23] |= ((tmp_char2 >> 0) & 0x00000001) << i; + + // // Get ith bit in 24-31 abs quant from global memory. + absQuant[24] |= ((tmp_char3 >> 7) & 0x00000001) << i; + absQuant[25] |= ((tmp_char3 >> 6) & 0x00000001) << i; + absQuant[26] |= ((tmp_char3 >> 5) & 0x00000001) << i; + absQuant[27] |= ((tmp_char3 >> 4) & 0x00000001) << i; + absQuant[28] |= ((tmp_char3 >> 3) & 0x00000001) << i; + absQuant[29] |= ((tmp_char3 >> 2) & 0x00000001) << i; + absQuant[30] |= ((tmp_char3 >> 1) & 0x00000001) << i; + absQuant[31] |= ((tmp_char3 >> 0) & 0x00000001) << i; + } + + // Delorenzo and store data back to decompression data. + prevQuant = 0; + for(int i=0; i<32; i++) + { + sign_ofs = i % 32; + if(sign_flag & (1 << (31 - sign_ofs))) + lorenQuant = absQuant[i] * -1; + else + lorenQuant = absQuant[i]; + currQuant = lorenQuant + prevQuant; + decData[base_block_start_idx+i] = currQuant * eb * 2; + prevQuant = currQuant; + } + } + + // Index updating across different iterations. + cur_byte_ofs += __shfl_sync(0xffffffff, tmp_byte_ofs, 31); + } } \ No newline at end of file diff --git a/qtensor/compression/szp/src/cuSZp_entry.cu b/qtensor/compression/szp/src/cuSZp_entry.cu index a04d8348..e92e669a 100644 --- a/qtensor/compression/szp/src/cuSZp_entry.cu +++ b/qtensor/compression/szp/src/cuSZp_entry.cu @@ -1,147 +1,147 @@ -#include "cuSZp_entry.h" -#include "cuSZp.h" - -void SZp_compress_hostptr(float* oriData, unsigned char* cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound) -{ - // Data blocking. - int bsize = cmp_tblock_size; - int gsize = (nbEle + bsize * cmp_chunk - 1) / (bsize * cmp_chunk); - int cmpOffSize = gsize + 1; - int pad_nbEle = gsize * bsize * cmp_chunk; - - // Initializing global memory for GPU compression. - float* d_oriData; - unsigned char* d_cmpData; - unsigned int* d_cmpOffset; - int* d_flag; - unsigned int glob_sync; - cudaMalloc((void**)&d_oriData, sizeof(float)*pad_nbEle); - cudaMemcpy(d_oriData, oriData, sizeof(float)*pad_nbEle, cudaMemcpyHostToDevice); - cudaMalloc((void**)&d_cmpData, sizeof(float)*pad_nbEle); - cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); - cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); - cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); - cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); - - // Initializing CUDA Stream. - cudaStream_t stream; - cudaStreamCreate(&stream); - - // cuSZp GPU compression. - dim3 blockSize(bsize); - dim3 gridSize(gsize); - SZp_compress_kernel<<>>(d_oriData, d_cmpData, d_cmpOffset, d_flag, errorBound, nbEle); - - // Obtain compression ratio and move data back to CPU. - cudaMemcpy(&glob_sync, d_cmpOffset+cmpOffSize-2, sizeof(unsigned int), cudaMemcpyDeviceToHost); - *cmpSize = (size_t)glob_sync + (nbEle+31)/32; - cudaMemcpy(cmpBytes, d_cmpData, *cmpSize*sizeof(unsigned char), cudaMemcpyDeviceToHost); - - // Free memory that is used. - cudaFree(d_oriData); - cudaFree(d_cmpData); - cudaFree(d_cmpOffset); - cudaFree(d_flag); - cudaStreamDestroy(stream); -} - - -void SZp_decompress_hostptr(float* decData, unsigned char* cmpBytes, size_t nbEle, size_t cmpSize, float errorBound) -{ - // Data blocking. - int bsize = dec_tblock_size; - int gsize = (nbEle + bsize * dec_chunk - 1) / (bsize * dec_chunk); - int cmpOffSize = gsize + 1; - int pad_nbEle = gsize * bsize * dec_chunk; - - // Initializing global memory for GPU compression. - float* d_decData; - unsigned char* d_cmpData; - unsigned int* d_cmpOffset; - int* d_flag; - cudaMalloc((void**)&d_decData, sizeof(float)*pad_nbEle); - cudaMemset(d_decData, 0, sizeof(float)*pad_nbEle); - cudaMalloc((void**)&d_cmpData, sizeof(float)*pad_nbEle); - cudaMemcpy(d_cmpData, cmpBytes, sizeof(unsigned char)*cmpSize, cudaMemcpyHostToDevice); - cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); - cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); - cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); - cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); - - // Initializing CUDA Stream. - cudaStream_t stream; - cudaStreamCreate(&stream); - - // cuSZp GPU compression. - dim3 blockSize(bsize); - dim3 gridSize(gsize); - SZp_decompress_kernel<<>>(d_decData, d_cmpData, d_cmpOffset, d_flag, errorBound, nbEle); - - // Move data back to CPU. - cudaMemcpy(decData, d_decData, sizeof(float)*nbEle, cudaMemcpyDeviceToHost); - - // Free memoy that is used. - cudaFree(d_decData); - cudaFree(d_cmpData); - cudaFree(d_cmpOffset); - cudaFree(d_flag); - cudaStreamDestroy(stream); -} - - -void SZp_compress_deviceptr(float* d_oriData, unsigned char* d_cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound, cudaStream_t stream) -{ - // Data blocking. - int bsize = cmp_tblock_size; - int gsize = (nbEle + bsize * cmp_chunk - 1) / (bsize * cmp_chunk); - int cmpOffSize = gsize + 1; - - // Initializing global memory for GPU compression. - unsigned int* d_cmpOffset; - int* d_flag; - unsigned int glob_sync; - cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); - cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); - cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); - cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); - - // cuSZp GPU compression. - dim3 blockSize(bsize); - dim3 gridSize(gsize); - SZp_compress_kernel<<>>(d_oriData, d_cmpBytes, d_cmpOffset, d_flag, errorBound, nbEle); - cudaDeviceSynchronize(); - // Obtain compression ratio and move data back to CPU. - cudaMemcpy(&glob_sync, d_cmpOffset+cmpOffSize-2, sizeof(unsigned int), cudaMemcpyDeviceToHost); - *cmpSize = (size_t)glob_sync + (nbEle+31)/32; - - // Free memory that is used. - cudaFree(d_cmpOffset); - cudaFree(d_flag); -} - - -void SZp_decompress_deviceptr(float* d_decData, unsigned char* d_cmpBytes, size_t nbEle, size_t cmpSize, float errorBound, cudaStream_t stream) -{ - // Data blocking. - int bsize = dec_tblock_size; - int gsize = (nbEle + bsize * dec_chunk - 1) / (bsize * dec_chunk); - int cmpOffSize = gsize + 1; - - // Initializing global memory for GPU compression. - unsigned int* d_cmpOffset; - int* d_flag; - cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); - cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); - cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); - cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); - cudaMemset(d_decData, 0, sizeof(float)*nbEle); - - // cuSZp GPU compression. - dim3 blockSize(bsize); - dim3 gridSize(gsize); - SZp_decompress_kernel<<>>(d_decData, d_cmpBytes, d_cmpOffset, d_flag, errorBound, nbEle); - cudaDeviceSynchronize(); - // Free memoy that is used. - cudaFree(d_cmpOffset); - cudaFree(d_flag); -} +#include "cuSZp_entry.h" +#include "cuSZp.h" + +void SZp_compress_hostptr(float* oriData, unsigned char* cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound) +{ + // Data blocking. + int bsize = cmp_tblock_size; + int gsize = (nbEle + bsize * cmp_chunk - 1) / (bsize * cmp_chunk); + int cmpOffSize = gsize + 1; + int pad_nbEle = gsize * bsize * cmp_chunk; + + // Initializing global memory for GPU compression. + float* d_oriData; + unsigned char* d_cmpData; + unsigned int* d_cmpOffset; + int* d_flag; + unsigned int glob_sync; + cudaMalloc((void**)&d_oriData, sizeof(float)*pad_nbEle); + cudaMemcpy(d_oriData, oriData, sizeof(float)*pad_nbEle, cudaMemcpyHostToDevice); + cudaMalloc((void**)&d_cmpData, sizeof(float)*pad_nbEle); + cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + + // Initializing CUDA Stream. + cudaStream_t stream; + cudaStreamCreate(&stream); + + // cuSZp GPU compression. + dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_compress_kernel<<>>(d_oriData, d_cmpData, d_cmpOffset, d_flag, errorBound, nbEle); + + // Obtain compression ratio and move data back to CPU. + cudaMemcpy(&glob_sync, d_cmpOffset+cmpOffSize-2, sizeof(unsigned int), cudaMemcpyDeviceToHost); + *cmpSize = (size_t)glob_sync + (nbEle+31)/32; + cudaMemcpy(cmpBytes, d_cmpData, *cmpSize*sizeof(unsigned char), cudaMemcpyDeviceToHost); + + // Free memory that is used. + cudaFree(d_oriData); + cudaFree(d_cmpData); + cudaFree(d_cmpOffset); + cudaFree(d_flag); + cudaStreamDestroy(stream); +} + + +void SZp_decompress_hostptr(float* decData, unsigned char* cmpBytes, size_t nbEle, size_t cmpSize, float errorBound) +{ + // Data blocking. + int bsize = dec_tblock_size; + int gsize = (nbEle + bsize * dec_chunk - 1) / (bsize * dec_chunk); + int cmpOffSize = gsize + 1; + int pad_nbEle = gsize * bsize * dec_chunk; + + // Initializing global memory for GPU compression. + float* d_decData; + unsigned char* d_cmpData; + unsigned int* d_cmpOffset; + int* d_flag; + cudaMalloc((void**)&d_decData, sizeof(float)*pad_nbEle); + cudaMemset(d_decData, 0, sizeof(float)*pad_nbEle); + cudaMalloc((void**)&d_cmpData, sizeof(float)*pad_nbEle); + cudaMemcpy(d_cmpData, cmpBytes, sizeof(unsigned char)*cmpSize, cudaMemcpyHostToDevice); + cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + + // Initializing CUDA Stream. + cudaStream_t stream; + cudaStreamCreate(&stream); + + // cuSZp GPU compression. + dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_decompress_kernel<<>>(d_decData, d_cmpData, d_cmpOffset, d_flag, errorBound, nbEle); + + // Move data back to CPU. + cudaMemcpy(decData, d_decData, sizeof(float)*nbEle, cudaMemcpyDeviceToHost); + + // Free memoy that is used. + cudaFree(d_decData); + cudaFree(d_cmpData); + cudaFree(d_cmpOffset); + cudaFree(d_flag); + cudaStreamDestroy(stream); +} + + +void SZp_compress_deviceptr(float* d_oriData, unsigned char* d_cmpBytes, size_t nbEle, size_t* cmpSize, float errorBound, cudaStream_t stream) +{ + // Data blocking. + int bsize = cmp_tblock_size; + int gsize = (nbEle + bsize * cmp_chunk - 1) / (bsize * cmp_chunk); + int cmpOffSize = gsize + 1; + + // Initializing global memory for GPU compression. + unsigned int* d_cmpOffset; + int* d_flag; + unsigned int glob_sync; + cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + + // cuSZp GPU compression. + dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_compress_kernel<<>>(d_oriData, d_cmpBytes, d_cmpOffset, d_flag, errorBound, nbEle); + cudaDeviceSynchronize(); + // Obtain compression ratio and move data back to CPU. + cudaMemcpy(&glob_sync, d_cmpOffset+cmpOffSize-2, sizeof(unsigned int), cudaMemcpyDeviceToHost); + *cmpSize = (size_t)glob_sync + (nbEle+31)/32; + + // Free memory that is used. + cudaFree(d_cmpOffset); + cudaFree(d_flag); +} + + +void SZp_decompress_deviceptr(float* d_decData, unsigned char* d_cmpBytes, size_t nbEle, size_t cmpSize, float errorBound, cudaStream_t stream) +{ + // Data blocking. + int bsize = dec_tblock_size; + int gsize = (nbEle + bsize * dec_chunk - 1) / (bsize * dec_chunk); + int cmpOffSize = gsize + 1; + + // Initializing global memory for GPU compression. + unsigned int* d_cmpOffset; + int* d_flag; + cudaMalloc((void**)&d_cmpOffset, sizeof(unsigned int)*cmpOffSize); + cudaMemset(d_cmpOffset, 0, sizeof(unsigned int)*cmpOffSize); + cudaMalloc((void**)&d_flag, sizeof(int)*cmpOffSize); + cudaMemset(d_flag, 0, sizeof(int)*cmpOffSize); + cudaMemset(d_decData, 0, sizeof(float)*nbEle); + + // cuSZp GPU compression. + dim3 blockSize(bsize); + dim3 gridSize(gsize); + SZp_decompress_kernel<<>>(d_decData, d_cmpBytes, d_cmpOffset, d_flag, errorBound, nbEle); + cudaDeviceSynchronize(); + // Free memoy that is used. + cudaFree(d_cmpOffset); + cudaFree(d_flag); +} diff --git a/qtensor/compression/szp/src/cuSZp_timer.cu b/qtensor/compression/szp/src/cuSZp_timer.cu index 74c81c30..5148af98 100644 --- a/qtensor/compression/szp/src/cuSZp_timer.cu +++ b/qtensor/compression/szp/src/cuSZp_timer.cu @@ -1,31 +1,31 @@ -#include "cuSZp_timer.h" - -TimingGPU::TimingGPU() { privateTimingGPU = new PrivateTimingGPU; } - -TimingGPU::~TimingGPU() { } - -void TimingGPU::StartCounter() -{ - cudaEventCreate(&((*privateTimingGPU).start)); - cudaEventCreate(&((*privateTimingGPU).stop)); - cudaEventRecord((*privateTimingGPU).start,0); -} - -void TimingGPU::StartCounterFlags() -{ - int eventflags = cudaEventBlockingSync; - - cudaEventCreateWithFlags(&((*privateTimingGPU).start),eventflags); - cudaEventCreateWithFlags(&((*privateTimingGPU).stop),eventflags); - cudaEventRecord((*privateTimingGPU).start,0); -} - -// Gets the counter in ms -float TimingGPU::GetCounter() -{ - float time; - cudaEventRecord((*privateTimingGPU).stop, 0); - cudaEventSynchronize((*privateTimingGPU).stop); - cudaEventElapsedTime(&time,(*privateTimingGPU).start,(*privateTimingGPU).stop); - return time; -} +#include "cuSZp_timer.h" + +TimingGPU::TimingGPU() { privateTimingGPU = new PrivateTimingGPU; } + +TimingGPU::~TimingGPU() { } + +void TimingGPU::StartCounter() +{ + cudaEventCreate(&((*privateTimingGPU).start)); + cudaEventCreate(&((*privateTimingGPU).stop)); + cudaEventRecord((*privateTimingGPU).start,0); +} + +void TimingGPU::StartCounterFlags() +{ + int eventflags = cudaEventBlockingSync; + + cudaEventCreateWithFlags(&((*privateTimingGPU).start),eventflags); + cudaEventCreateWithFlags(&((*privateTimingGPU).stop),eventflags); + cudaEventRecord((*privateTimingGPU).start,0); +} + +// Gets the counter in ms +float TimingGPU::GetCounter() +{ + float time; + cudaEventRecord((*privateTimingGPU).stop, 0); + cudaEventSynchronize((*privateTimingGPU).stop); + cudaEventElapsedTime(&time,(*privateTimingGPU).start,(*privateTimingGPU).stop); + return time; +} diff --git a/qtensor/compression/szp/src/cuSZp_utility.cu b/qtensor/compression/szp/src/cuSZp_utility.cu index 784d378a..ac4006d7 100644 --- a/qtensor/compression/szp/src/cuSZp_utility.cu +++ b/qtensor/compression/szp/src/cuSZp_utility.cu @@ -1,493 +1,493 @@ -// -// Created by Yafan Huang on 5/31/22. -// Copied from SZx. -// -#include -#include -#include -#include -#include -#include "cuSZp_utility.h" - -/*Macro Definition for Processing Data*/ -// #define SZ_SCES 0 //successful -#define RW_SCES 0 -#define RW_FERR 1 -#define RW_TERR 2 -#define LITTLE_ENDIAN_SYSTEM 0 -#define QCAT_BUFS 64 - -/*Global Varaibles for Processing Data*/ -int dataEndianType_Yafan = 0; -int sysEndianType_Yafan = 0; //0 means little endian, 1 means big endian - -typedef union lint32 -{ - int ivalue; - unsigned int uivalue; - unsigned char byte[4]; -} lint32; - -typedef union llfloat -{ - float value; - unsigned int ivalue; - unsigned char byte[4]; -} llfloat; - -/** ************************************************************************ - * @brief Reverse 4-bit-length unsigned char array. - * - * @param data[4] 4-bit-length unsigned char array. - * *********************************************************************** */ -void symTransForm_4Bytes(unsigned char data[4]) -{ - unsigned char tmp = data[0]; - data[0] = data[3]; - data[3] = tmp; - - tmp = data[1]; - data[1] = data[2]; - data[2] = tmp; -} - -/** ************************************************************************ - * @brief Read byte data from path to source binary format file. - * Usually used for decompressing data from input file. - * Variables byteLength and status can be obtained through this function. - * - * @param srcFilePath input source file path - * @param byteLength the length of byte array - * @param status data processing states (macro definitions) - * - * @return byteBuf unsigned char array with length byteLength - * *********************************************************************** */ -unsigned char *readByteData_Yafan(char *srcFilePath, size_t *byteLength, int *status) -{ - FILE *pFile = fopen(srcFilePath, "rb"); - if (pFile == NULL) - { - printf("Failed to open input file. 1\n"); - *status = RW_FERR; - return 0; - } - fseek(pFile, 0, SEEK_END); - *byteLength = ftell(pFile); - fclose(pFile); - - unsigned char *byteBuf = ( unsigned char *)malloc((*byteLength)*sizeof(unsigned char)); //sizeof(char)==1 - - pFile = fopen(srcFilePath, "rb"); - if (pFile == NULL) - { - printf("Failed to open input file. 2\n"); - *status = RW_FERR; - return 0; - } - fread(byteBuf, 1, *byteLength, pFile); - fclose(pFile); - *status = RW_SCES; - return byteBuf; -} - -/** ************************************************************************ - * @brief Read float data from path to source binary format file in endian systems. - * Usually used for compressing data from input file. - * Variables nbEle and status can be obtained through this function. - * - * @param srcFilePath input source file path - * @param nbEle the length of float array - * @param status data processing states (macro definitions) - * - * @return daBuf float array with length nbEle - * *********************************************************************** */ -float *readFloatData_systemEndian_Yafan(char *srcFilePath, size_t *nbEle, int *status) -{ - size_t inSize; - FILE *pFile = fopen(srcFilePath, "rb"); - if (pFile == NULL) - { - printf("Failed to open input file. 1\n"); - *status = RW_FERR; - return NULL; - } - fseek(pFile, 0, SEEK_END); - inSize = ftell(pFile); - *nbEle = inSize/4; - fclose(pFile); - - if(inSize<=0) - { - printf("Error: input file is wrong!\n"); - *status = RW_FERR; - } - - float *daBuf = (float *)malloc(inSize); - - pFile = fopen(srcFilePath, "rb"); - if (pFile == NULL) - { - printf("Failed to open input file. 2\n"); - *status = RW_FERR; - return NULL; - } - fread(daBuf, 4, *nbEle, pFile); - fclose(pFile); - *status = RW_SCES; - return daBuf; -} - -/** ************************************************************************ - * @brief Read float data from path to source binary format file. - * Usually used for compressing data from input file. - * Variables nbEle and status can be obtained through this function. - * - * @param srcFilePath input source file path - * @param nbEle the length of float array - * @param status data processing states (macro definitions) - * - * @return daBuf float array with length nbEle - * *********************************************************************** */ -float *readFloatData_Yafan(char *srcFilePath, size_t *nbEle, int *status) -{ - int state = RW_SCES; - if(dataEndianType_Yafan==sysEndianType_Yafan) - { - float *daBuf = readFloatData_systemEndian_Yafan(srcFilePath, nbEle, &state); - *status = state; - return daBuf; - } - else - { - size_t i,j; - - size_t byteLength; - unsigned char* bytes = readByteData_Yafan(srcFilePath, &byteLength, &state); - if(state == RW_FERR) - { - *status = RW_FERR; - return NULL; - } - float *daBuf = (float *)malloc(byteLength); - *nbEle = byteLength/4; - - llfloat buf; - for(i = 0;i<*nbEle;i++) - { - j = i*4; - memcpy(buf.byte, bytes+j, 4); - symTransForm_4Bytes(buf.byte); - daBuf[i] = buf.value; - } - free(bytes); - return daBuf; - } -} - -/** ************************************************************************ - * @brief Write byte data to binary format file. - * Usually used for writing compressed data. - * Variable status can be obtained/switched through this function. - * - * @param bytes unsigned char array (compressed data) - * @param byteLength the length of unsigned char array - * @param tgtFilePath output file path - * @param status data processing states (macro definitions) - * *********************************************************************** */ -void writeByteData_Yafan(unsigned char *bytes, size_t byteLength, char *tgtFilePath, int *status) -{ - FILE *pFile = fopen(tgtFilePath, "wb"); - if (pFile == NULL) - { - printf("Failed to open input file. 3\n"); - *status = RW_FERR; - return; - } - - fwrite(bytes, 1, byteLength, pFile); //write outSize bytes - fclose(pFile); - *status = RW_SCES; -} - -/** ************************************************************************ - * @brief Write float data to binary format file. - * Usually used for writing decompressed (reconstructed) data. - * Variable status can be obtained/switched through this function. - * - * @param bytes unsigned char array (compressed data) - * @param nbEle the length of float array - * @param tgtFilePath output file path - * @param status data processing states (macro definitions) - * *********************************************************************** */ -void writeFloatData_inBytes_Yafan(float *data, size_t nbEle, char* tgtFilePath, int *status) -{ - size_t i = 0; - int state = RW_SCES; - llfloat buf; - unsigned char* bytes = (unsigned char*)malloc(nbEle*sizeof(float)); - for(i=0;idata[index]) - xMin=data[index]; - if(xMaxother[index]) - yMin=other[index]; - if(yMaxsize0) { - printf("ERROR: windowSize0 = %d > %zu\n", windowSize0, size0); - } - if(windowSize1>size1) { - printf("ERROR: windowSize1 = %d > %zu\n", windowSize1, size1); - } - if(windowSize2>size2) { - printf("ERROR: windowSize2 = %d > %zu\n", windowSize2, size2); - } - //offsetInc0=windowSize0/2; - //offsetInc1=windowSize1/2; - //offsetInc2=windowSize2/2; - offsetInc0=windowShift0; - offsetInc1=windowShift1; - offsetInc2=windowShift2; - for(offset2=0; offset2+windowSize2<=size2; offset2+=offsetInc2) { //MOVING WINDOW - for(offset1=0; offset1+windowSize1<=size1; offset1+=offsetInc1) { //MOVING WINDOW - for(offset0=0; offset0+windowSize0<=size0; offset0+=offsetInc0) { //MOVING WINDOW - nw++; - ssimSum+=SSIM_3d_calcWindow_float(oriData, decData, size1, size0, offset0, offset1, offset2, windowSize0, windowSize1, windowSize2); - } - } - } - return ssimSum/nw; -} - - -/** ************************************************************************ - * @brief Calculate PSNR between 3D original and decompressed (reconstructed) data. - * API for computing PSNR. - * - * @param nbEle the length of float array - * @param ori_data original float array - * @param dec_data decompressed (reconstructed) float array - * - * @return result 6-length double array, which contains: - * 0. *Mean Square Error (MSE)* - * 1. *Value Range (Max-Min)* - * 2. *Peak Signal-to-noise Ratio (PSNR)* - * 3. Squared Error - * 4. Normalized Squared Error - * 5. Normalized Squared MSE - * *********************************************************************** */ -double *computePSNR(size_t nbEle, float *ori_data, float *data) { - size_t i = 0; - double Max = 0, Min = 0, diffMax = 0; - Max = ori_data[0]; - Min = ori_data[0]; - diffMax = data[0] > ori_data[0] ? data[0] - ori_data[0] : ori_data[0] - data[0]; - - //diffMax = fabs(data[0] - ori_data[0]); - double sum1 = 0, sum2 = 0, sum22 = 0; - - for (i = 0; i < nbEle; i++) { - sum1 += ori_data[i]; - sum2 += data[i]; - sum22 += data[i] * data[i]; - } - double mean1 = sum1 / nbEle; - double mean2 = sum2 / nbEle; - - double sum3 = 0, sum4 = 0; - double sum = 0, prodSum = 0, relerr = 0; - - double maxpw_relerr = 0; - for (i = 0; i < nbEle; i++) { - if (Max < ori_data[i]) Max = ori_data[i]; - if (Min > ori_data[i]) Min = ori_data[i]; - - float err = fabs(data[i] - ori_data[i]); - if (ori_data[i] != 0) { - relerr = err / fabs(ori_data[i]); - if (maxpw_relerr < relerr) - maxpw_relerr = relerr; - } - - if (diffMax < err) - diffMax = err; - prodSum += (ori_data[i] - mean1) * (data[i] - mean2); - sum3 += (ori_data[i] - mean1) * (ori_data[i] - mean1); - sum4 += (data[i] - mean2) * (data[i] - mean2); - sum += err * err; - } - double std1 = sqrt(sum3 / nbEle); - double std2 = sqrt(sum4 / nbEle); - double ee = prodSum / nbEle; - double acEff = ee / std1 / std2; - - double mse = sum / nbEle; - double range = Max - Min; - double psnr = 20 * log10(range) - 10 * log10(mse); - double normErr = sqrt(sum); - double normErr_norm = normErr / sqrt(sum22); - double nrmse = sqrt(mse) / range; - double *result = (double *) malloc(sizeof(double) * 6); - result[0] = mse; - result[1] = range; - result[2] = psnr; - result[3] = normErr; - result[4] = normErr_norm; - result[5] = nrmse; - - return result; +// +// Created by Yafan Huang on 5/31/22. +// Copied from SZx. +// +#include +#include +#include +#include +#include +#include "cuSZp_utility.h" + +/*Macro Definition for Processing Data*/ +// #define SZ_SCES 0 //successful +#define RW_SCES 0 +#define RW_FERR 1 +#define RW_TERR 2 +#define LITTLE_ENDIAN_SYSTEM 0 +#define QCAT_BUFS 64 + +/*Global Varaibles for Processing Data*/ +int dataEndianType_Yafan = 0; +int sysEndianType_Yafan = 0; //0 means little endian, 1 means big endian + +typedef union lint32 +{ + int ivalue; + unsigned int uivalue; + unsigned char byte[4]; +} lint32; + +typedef union llfloat +{ + float value; + unsigned int ivalue; + unsigned char byte[4]; +} llfloat; + +/** ************************************************************************ + * @brief Reverse 4-bit-length unsigned char array. + * + * @param data[4] 4-bit-length unsigned char array. + * *********************************************************************** */ +void symTransForm_4Bytes(unsigned char data[4]) +{ + unsigned char tmp = data[0]; + data[0] = data[3]; + data[3] = tmp; + + tmp = data[1]; + data[1] = data[2]; + data[2] = tmp; +} + +/** ************************************************************************ + * @brief Read byte data from path to source binary format file. + * Usually used for decompressing data from input file. + * Variables byteLength and status can be obtained through this function. + * + * @param srcFilePath input source file path + * @param byteLength the length of byte array + * @param status data processing states (macro definitions) + * + * @return byteBuf unsigned char array with length byteLength + * *********************************************************************** */ +unsigned char *readByteData_Yafan(char *srcFilePath, size_t *byteLength, int *status) +{ + FILE *pFile = fopen(srcFilePath, "rb"); + if (pFile == NULL) + { + printf("Failed to open input file. 1\n"); + *status = RW_FERR; + return 0; + } + fseek(pFile, 0, SEEK_END); + *byteLength = ftell(pFile); + fclose(pFile); + + unsigned char *byteBuf = ( unsigned char *)malloc((*byteLength)*sizeof(unsigned char)); //sizeof(char)==1 + + pFile = fopen(srcFilePath, "rb"); + if (pFile == NULL) + { + printf("Failed to open input file. 2\n"); + *status = RW_FERR; + return 0; + } + fread(byteBuf, 1, *byteLength, pFile); + fclose(pFile); + *status = RW_SCES; + return byteBuf; +} + +/** ************************************************************************ + * @brief Read float data from path to source binary format file in endian systems. + * Usually used for compressing data from input file. + * Variables nbEle and status can be obtained through this function. + * + * @param srcFilePath input source file path + * @param nbEle the length of float array + * @param status data processing states (macro definitions) + * + * @return daBuf float array with length nbEle + * *********************************************************************** */ +float *readFloatData_systemEndian_Yafan(char *srcFilePath, size_t *nbEle, int *status) +{ + size_t inSize; + FILE *pFile = fopen(srcFilePath, "rb"); + if (pFile == NULL) + { + printf("Failed to open input file. 1\n"); + *status = RW_FERR; + return NULL; + } + fseek(pFile, 0, SEEK_END); + inSize = ftell(pFile); + *nbEle = inSize/4; + fclose(pFile); + + if(inSize<=0) + { + printf("Error: input file is wrong!\n"); + *status = RW_FERR; + } + + float *daBuf = (float *)malloc(inSize); + + pFile = fopen(srcFilePath, "rb"); + if (pFile == NULL) + { + printf("Failed to open input file. 2\n"); + *status = RW_FERR; + return NULL; + } + fread(daBuf, 4, *nbEle, pFile); + fclose(pFile); + *status = RW_SCES; + return daBuf; +} + +/** ************************************************************************ + * @brief Read float data from path to source binary format file. + * Usually used for compressing data from input file. + * Variables nbEle and status can be obtained through this function. + * + * @param srcFilePath input source file path + * @param nbEle the length of float array + * @param status data processing states (macro definitions) + * + * @return daBuf float array with length nbEle + * *********************************************************************** */ +float *readFloatData_Yafan(char *srcFilePath, size_t *nbEle, int *status) +{ + int state = RW_SCES; + if(dataEndianType_Yafan==sysEndianType_Yafan) + { + float *daBuf = readFloatData_systemEndian_Yafan(srcFilePath, nbEle, &state); + *status = state; + return daBuf; + } + else + { + size_t i,j; + + size_t byteLength; + unsigned char* bytes = readByteData_Yafan(srcFilePath, &byteLength, &state); + if(state == RW_FERR) + { + *status = RW_FERR; + return NULL; + } + float *daBuf = (float *)malloc(byteLength); + *nbEle = byteLength/4; + + llfloat buf; + for(i = 0;i<*nbEle;i++) + { + j = i*4; + memcpy(buf.byte, bytes+j, 4); + symTransForm_4Bytes(buf.byte); + daBuf[i] = buf.value; + } + free(bytes); + return daBuf; + } +} + +/** ************************************************************************ + * @brief Write byte data to binary format file. + * Usually used for writing compressed data. + * Variable status can be obtained/switched through this function. + * + * @param bytes unsigned char array (compressed data) + * @param byteLength the length of unsigned char array + * @param tgtFilePath output file path + * @param status data processing states (macro definitions) + * *********************************************************************** */ +void writeByteData_Yafan(unsigned char *bytes, size_t byteLength, char *tgtFilePath, int *status) +{ + FILE *pFile = fopen(tgtFilePath, "wb"); + if (pFile == NULL) + { + printf("Failed to open input file. 3\n"); + *status = RW_FERR; + return; + } + + fwrite(bytes, 1, byteLength, pFile); //write outSize bytes + fclose(pFile); + *status = RW_SCES; +} + +/** ************************************************************************ + * @brief Write float data to binary format file. + * Usually used for writing decompressed (reconstructed) data. + * Variable status can be obtained/switched through this function. + * + * @param bytes unsigned char array (compressed data) + * @param nbEle the length of float array + * @param tgtFilePath output file path + * @param status data processing states (macro definitions) + * *********************************************************************** */ +void writeFloatData_inBytes_Yafan(float *data, size_t nbEle, char* tgtFilePath, int *status) +{ + size_t i = 0; + int state = RW_SCES; + llfloat buf; + unsigned char* bytes = (unsigned char*)malloc(nbEle*sizeof(float)); + for(i=0;idata[index]) + xMin=data[index]; + if(xMaxother[index]) + yMin=other[index]; + if(yMaxsize0) { + printf("ERROR: windowSize0 = %d > %zu\n", windowSize0, size0); + } + if(windowSize1>size1) { + printf("ERROR: windowSize1 = %d > %zu\n", windowSize1, size1); + } + if(windowSize2>size2) { + printf("ERROR: windowSize2 = %d > %zu\n", windowSize2, size2); + } + //offsetInc0=windowSize0/2; + //offsetInc1=windowSize1/2; + //offsetInc2=windowSize2/2; + offsetInc0=windowShift0; + offsetInc1=windowShift1; + offsetInc2=windowShift2; + for(offset2=0; offset2+windowSize2<=size2; offset2+=offsetInc2) { //MOVING WINDOW + for(offset1=0; offset1+windowSize1<=size1; offset1+=offsetInc1) { //MOVING WINDOW + for(offset0=0; offset0+windowSize0<=size0; offset0+=offsetInc0) { //MOVING WINDOW + nw++; + ssimSum+=SSIM_3d_calcWindow_float(oriData, decData, size1, size0, offset0, offset1, offset2, windowSize0, windowSize1, windowSize2); + } + } + } + return ssimSum/nw; +} + + +/** ************************************************************************ + * @brief Calculate PSNR between 3D original and decompressed (reconstructed) data. + * API for computing PSNR. + * + * @param nbEle the length of float array + * @param ori_data original float array + * @param dec_data decompressed (reconstructed) float array + * + * @return result 6-length double array, which contains: + * 0. *Mean Square Error (MSE)* + * 1. *Value Range (Max-Min)* + * 2. *Peak Signal-to-noise Ratio (PSNR)* + * 3. Squared Error + * 4. Normalized Squared Error + * 5. Normalized Squared MSE + * *********************************************************************** */ +double *computePSNR(size_t nbEle, float *ori_data, float *data) { + size_t i = 0; + double Max = 0, Min = 0, diffMax = 0; + Max = ori_data[0]; + Min = ori_data[0]; + diffMax = data[0] > ori_data[0] ? data[0] - ori_data[0] : ori_data[0] - data[0]; + + //diffMax = fabs(data[0] - ori_data[0]); + double sum1 = 0, sum2 = 0, sum22 = 0; + + for (i = 0; i < nbEle; i++) { + sum1 += ori_data[i]; + sum2 += data[i]; + sum22 += data[i] * data[i]; + } + double mean1 = sum1 / nbEle; + double mean2 = sum2 / nbEle; + + double sum3 = 0, sum4 = 0; + double sum = 0, prodSum = 0, relerr = 0; + + double maxpw_relerr = 0; + for (i = 0; i < nbEle; i++) { + if (Max < ori_data[i]) Max = ori_data[i]; + if (Min > ori_data[i]) Min = ori_data[i]; + + float err = fabs(data[i] - ori_data[i]); + if (ori_data[i] != 0) { + relerr = err / fabs(ori_data[i]); + if (maxpw_relerr < relerr) + maxpw_relerr = relerr; + } + + if (diffMax < err) + diffMax = err; + prodSum += (ori_data[i] - mean1) * (data[i] - mean2); + sum3 += (ori_data[i] - mean1) * (ori_data[i] - mean1); + sum4 += (data[i] - mean2) * (data[i] - mean2); + sum += err * err; + } + double std1 = sqrt(sum3 / nbEle); + double std2 = sqrt(sum4 / nbEle); + double ee = prodSum / nbEle; + double acEff = ee / std1 / std2; + + double mse = sum / nbEle; + double range = Max - Min; + double psnr = 20 * log10(range) - 10 * log10(mse); + double normErr = sqrt(sum); + double normErr_norm = normErr / sqrt(sum22); + double nrmse = sqrt(mse) / range; + double *result = (double *) malloc(sizeof(double) * 6); + result[0] = mse; + result[1] = range; + result[2] = psnr; + result[3] = normErr; + result[4] = normErr_norm; + result[5] = nrmse; + + return result; } \ No newline at end of file diff --git a/qtensor/compression/szp/src/cuSZp_wrapper.cu b/qtensor/compression/szp/src/cuSZp_wrapper.cu index 4d83f283..803dbbe1 100644 --- a/qtensor/compression/szp/src/cuSZp_wrapper.cu +++ b/qtensor/compression/szp/src/cuSZp_wrapper.cu @@ -1,37 +1,37 @@ -#include "cuSZp_entry.h" -#include "cuSZp_timer.h" -#include "cuSZp_utility.h" -#include "cuSZp.h" - - -extern "C"{ - /** Before entering SZp_compress, must allocate on device: - * - d_cmpBytes - */ - unsigned char* cuSZp_device_compress(float *oriData, size_t *outSize, float absErrBound, size_t nbEle){ - unsigned char *d_cmpBytes, *d_finalCmpBytes; - cudaStream_t stream; - cudaStreamCreate(&stream); - cudaMalloc((void**)&d_cmpBytes, sizeof(float)*nbEle); - SZp_compress_deviceptr(oriData, d_cmpBytes, nbEle, outSize, absErrBound, stream); - cudaMalloc((void**)&d_finalCmpBytes, *outSize); - cudaMemcpy(d_finalCmpBytes, d_cmpBytes, *outSize, cudaMemcpyDeviceToDevice); - cudaFree(d_cmpBytes); - //cudaFree(oriData); - return d_finalCmpBytes; - } - - /** Before entering SZp_decompress, must allocate on device: - * - d_decData - */ - float* cuSZp_device_decompress(size_t nbEle, unsigned char* cmpBytes, size_t cmpSize, float errorBound){ - float *d_decData; - cudaStream_t stream; - cudaStreamCreate(&stream); - cudaMalloc((void**)&d_decData, sizeof(float)*nbEle); - SZp_decompress_deviceptr(d_decData, cmpBytes, nbEle, cmpSize, errorBound, stream); - cudaFree(cmpBytes); - return d_decData; - } - -} +#include "cuSZp_entry.h" +#include "cuSZp_timer.h" +#include "cuSZp_utility.h" +#include "cuSZp.h" + + +extern "C"{ + /** Before entering SZp_compress, must allocate on device: + * - d_cmpBytes + */ + unsigned char* cuSZp_device_compress(float *oriData, size_t *outSize, float absErrBound, size_t nbEle){ + unsigned char *d_cmpBytes, *d_finalCmpBytes; + cudaStream_t stream; + cudaStreamCreate(&stream); + cudaMalloc((void**)&d_cmpBytes, sizeof(float)*nbEle); + SZp_compress_deviceptr(oriData, d_cmpBytes, nbEle, outSize, absErrBound, stream); + cudaMalloc((void**)&d_finalCmpBytes, *outSize); + cudaMemcpy(d_finalCmpBytes, d_cmpBytes, *outSize, cudaMemcpyDeviceToDevice); + cudaFree(d_cmpBytes); + //cudaFree(oriData); + return d_finalCmpBytes; + } + + /** Before entering SZp_decompress, must allocate on device: + * - d_decData + */ + float* cuSZp_device_decompress(size_t nbEle, unsigned char* cmpBytes, size_t cmpSize, float errorBound){ + float *d_decData; + cudaStream_t stream; + cudaStreamCreate(&stream); + cudaMalloc((void**)&d_decData, sizeof(float)*nbEle); + SZp_decompress_deviceptr(d_decData, cmpBytes, nbEle, cmpSize, errorBound, stream); + cudaFree(cmpBytes); + return d_decData; + } + +} diff --git a/qtensor/compression/szp/src/cuSZp_wrapper.py b/qtensor/compression/szp/src/cuSZp_wrapper.py index 6f4053ba..9abe1fb1 100644 --- a/qtensor/compression/szp/src/cuSZp_wrapper.py +++ b/qtensor/compression/szp/src/cuSZp_wrapper.py @@ -1,190 +1,190 @@ -import numpy as np -import ctypes -from ctypes import * -import random -from qtensor.tools.lazy_import import cupy as cp -import time -import torch - -from pathlib import Path -#LIB_PATH = str(Path(__file__).parent/'libcuszp_wrapper.so') -LIB_PATH = '/home/mkshah5/QTensor/qtensor/compression/szp/src/libcuszp_wrapper.so' -# unsigned char* cuSZp_device_compress(float *oriData, size_t *outSize, float absErrBound, size_t nbEle){ - -def get_device_compress(): - dll = ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) - func = dll.cuSZp_device_compress - # Returns: unsigned char *bytes - # Needs: float *oriData, size_t *outSize, float absErrBound, size_t nbEle, int blockSize, float threshold - func.argtypes = [POINTER(c_float), POINTER(c_size_t), c_float, c_size_t] - func.restype = POINTER(c_ubyte) - return func - -# float* cuSZp_device_decompress(size_t nbEle, unsigned char* cmpBytes, size_t cmpSize, float errorBound){ - -def get_device_decompress(): - dll = ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) - func = dll.cuSZp_device_decompress - # Returns: float *newData - # Needs: size_t nbEle, unsigned char *cmpBytes - func.argtypes = [c_size_t, POINTER(c_ubyte), c_size_t, c_float] - func.restype = POINTER(c_float) - return func - - - -def cuszp_device_compress(oriData, absErrBound, nbEle,threshold): - __cuszp_device_compress = get_device_compress() - - ori_nbEle = nbEle - variable = ctypes.c_size_t(0) - outSize = ctypes.pointer(variable) - - oriData = oriData.flatten() - #ori_real = oriData.real - #ori_imag = oriData.imag - #oriData = cp.concatenate((ori_real, ori_imag)) - #sample = oriData[::2] - - - d = cp.amax(oriData) - cp.amin(oriData) - #print("max min time (s): " +str(time.time()-v_time)) - d = d.get() - if d.dtype == np.complex64: - #d = min(d.real, d.imag) - d = d.real - absErrBound = absErrBound*(d) - threshold = threshold*(d) - s_1 = time.time() - #print(cp.get_array_module(oriData)) - truth_values = cp.absolute(oriData)<=threshold - #oriData[truth_values] = 0.0 - truth_values = cp.invert(truth_values) - # oriData = oriData[truth_values] - bitmap = truth_values - nbEle = oriData.shape[0]*2 - - - oriData_p = ctypes.cast(oriData.data.ptr, ctypes.POINTER(c_float)) - #print("starting") - # float *oriData, size_t *outSize, float absErrBound, size_t nbEle - o_bytes = __cuszp_device_compress(oriData_p, outSize,np.float32(absErrBound), np.ulonglong(nbEle)) - - mempool = cp.get_default_memory_pool() - pinned_mempool = cp.get_default_pinned_memory_pool() - #del oriData - - #print("tg and max time (s): "+str(time.time()-s_1)) - #print("bitmap shape: "+str(bitmap.shape[0])) - #print("percent nonzero bytes: "+str(bitmap[cp.nonzero(bitmap)].shape[0]/bitmap.shape[0])) - #print("CR") - #print((ori_nbEle*4)/(outSize[0] + bitmap.shape[0]/8)) - return (o_bytes,bitmap, absErrBound), outSize - - -def cuszp_device_decompress(nbEle, cmpBytes, cmpSize, owner, dtype): - __cuszp_device_decompress=get_device_decompress() - (cmpBytes, bitmap, absErrBound) = cmpBytes - #print("bitmap len:" +str(len(bitmap))) - #print(nbEle) - #tmp_nbEle = nbEle - # tmp_nbEle = cp.count_nonzero(bitmap).item() -# print(tmp_nbEle) - nbEle_p = ctypes.c_size_t(nbEle) - # size_t nbEle, unsigned char* cmpBytes, size_t cmpSize, float errorBound - newData = __cuszp_device_decompress(nbEle_p,cmpBytes, np.ulonglong(cmpSize), np.float32(absErrBound)) - - # decompressed_ptr = self.cuszp_decompress(isCuPy, cmp_bytes, num_elements_eff) - # -- Workaround to convert GPU pointer to int - p_decompressed_ptr = ctypes.addressof(newData) - # cast to int64 pointer - # (effectively converting pointer to pointer to addr to pointer to int64) - p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) - decompressed_int = p_decompressed_int.contents - # -- - pointer_for_free = decompressed_int.value - # self.decompressed_own.append(decompressed_int.value) - mem = cp.cuda.UnownedMemory(decompressed_int.value, nbEle, owner, device_id=0) - mem_ptr = cp.cuda.memory.MemoryPointer(mem, 0) - #print("mem ptr") - #print(mem_ptr) - arr = cp.ndarray(shape=nbEle, dtype=cp.float32, memptr=mem_ptr) -# print("attempt alloc") - # res = cp.zeros(nbEle,dtype=cp.float32) -# print("alloc passed") - ## need to convert newData to cupy - # cp.putmask(res,bitmap,arr) - mempool = cp.get_default_memory_pool() - pinned_mempool = cp.get_default_pinned_memory_pool() - #del arr - - #print(res[0]) - #print(res[int(nbEle/2)]) - #reshaped_data = arr.reshape(-1,2) - reshaped_data = arr.reshape(-1,2) - #c_res = arr - c_res = reshaped_data.view(dtype=np.complex64) - #print(c_res[0]) - #c_res = cp.zeros(int(nbEle/2), np.complex64) - #c_res.real = res[0:int(nbEle/2)] - #c_res.imag = res[int(nbEle/2):] - #del res - #del bitmap - #mempool.free_all_blocks() - #pinned_mempool.free_all_blocks() - - return (c_res, pointer_for_free) - -### Example of device compress/decompress wrapper usage -class Comp(): - def __init__(self): - self.name = "dummy" - -if __name__ == "__main__": - - DATA_SIZE = int(1024) - MAX_D = 10.0 - MIN_D = -10.0 - RANGE = MAX_D - MIN_D - r2r_threshold = 0.002 - r2r_error = 0.0001 - - in_vector = np.fromfile("real_sample.bin", dtype=np.float32) - #print(np.max(in_vector)) - DATA_SIZE = len(in_vector) - #range_vr = np.max(in_vector)-np.min(in_vector) - #r2r_threshold = r2r_threshold*range_vr - #r2r_error = r2r_error*range_vr - #in_vector = np.zeros((DATA_SIZE,)) - #for i in range(0,int(DATA_SIZE/4)): - # in_vector[i] = 0.0 - #for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): - # in_vector[i] = 5.0 - #for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): - # in_vector[i] = random.uniform(MIN_D, MAX_D) - #for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): - # in_vector[i] = -7.0 - #for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): - # in_vector[i] = 0.001 - - print(DATA_SIZE) - in_vector = in_vector.astype('float32') - in_vector_gpu = cp.asarray(in_vector) - - # variable = ctypes.c_size_t(0) - # outSize = ctypes.pointer(variable) - for i in range(30): - s_time = time.time() - o_bytes, outSize = cuszp_device_compress(in_vector_gpu, r2r_error, DATA_SIZE,r2r_threshold) - print("Time python: "+str(time.time()-s_time)) - print(outSize[0]) - print("Compress Success...starting decompress ") - comp = Comp() - - s_time = time.time() - (d_bytes,ptr )= cuszp_device_decompress(DATA_SIZE, o_bytes,outSize[0], comp, in_vector_gpu.dtype) - - print("Time python: "+str(time.time()-s_time)) - #for i in d_bytes: - # print(i) - print("Decompress Success") +import numpy as np +import ctypes +from ctypes import * +import random +from qtensor.tools.lazy_import import cupy as cp +import time +import torch + +from pathlib import Path +#LIB_PATH = str(Path(__file__).parent/'libcuszp_wrapper.so') +LIB_PATH = '/home/mkshah5/QTensor/qtensor/compression/szp/src/libcuszp_wrapper.so' +# unsigned char* cuSZp_device_compress(float *oriData, size_t *outSize, float absErrBound, size_t nbEle){ + +def get_device_compress(): + dll = ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) + func = dll.cuSZp_device_compress + # Returns: unsigned char *bytes + # Needs: float *oriData, size_t *outSize, float absErrBound, size_t nbEle, int blockSize, float threshold + func.argtypes = [POINTER(c_float), POINTER(c_size_t), c_float, c_size_t] + func.restype = POINTER(c_ubyte) + return func + +# float* cuSZp_device_decompress(size_t nbEle, unsigned char* cmpBytes, size_t cmpSize, float errorBound){ + +def get_device_decompress(): + dll = ctypes.CDLL(LIB_PATH, mode=ctypes.RTLD_GLOBAL) + func = dll.cuSZp_device_decompress + # Returns: float *newData + # Needs: size_t nbEle, unsigned char *cmpBytes + func.argtypes = [c_size_t, POINTER(c_ubyte), c_size_t, c_float] + func.restype = POINTER(c_float) + return func + + + +def cuszp_device_compress(oriData, absErrBound, nbEle,threshold): + __cuszp_device_compress = get_device_compress() + + ori_nbEle = nbEle + variable = ctypes.c_size_t(0) + outSize = ctypes.pointer(variable) + + oriData = oriData.flatten() + #ori_real = oriData.real + #ori_imag = oriData.imag + #oriData = cp.concatenate((ori_real, ori_imag)) + #sample = oriData[::2] + + + d = cp.amax(oriData) - cp.amin(oriData) + #print("max min time (s): " +str(time.time()-v_time)) + d = d.get() + if d.dtype == np.complex64: + #d = min(d.real, d.imag) + d = d.real + absErrBound = absErrBound*(d) + threshold = threshold*(d) + s_1 = time.time() + #print(cp.get_array_module(oriData)) + truth_values = cp.absolute(oriData)<=threshold + #oriData[truth_values] = 0.0 + truth_values = cp.invert(truth_values) + # oriData = oriData[truth_values] + bitmap = truth_values + nbEle = oriData.shape[0]*2 + + + oriData_p = ctypes.cast(oriData.data.ptr, ctypes.POINTER(c_float)) + #print("starting") + # float *oriData, size_t *outSize, float absErrBound, size_t nbEle + o_bytes = __cuszp_device_compress(oriData_p, outSize,np.float32(absErrBound), np.ulonglong(nbEle)) + + mempool = cp.get_default_memory_pool() + pinned_mempool = cp.get_default_pinned_memory_pool() + #del oriData + + #print("tg and max time (s): "+str(time.time()-s_1)) + #print("bitmap shape: "+str(bitmap.shape[0])) + #print("percent nonzero bytes: "+str(bitmap[cp.nonzero(bitmap)].shape[0]/bitmap.shape[0])) + #print("CR") + #print((ori_nbEle*4)/(outSize[0] + bitmap.shape[0]/8)) + return (o_bytes,bitmap, absErrBound), outSize + + +def cuszp_device_decompress(nbEle, cmpBytes, cmpSize, owner, dtype): + __cuszp_device_decompress=get_device_decompress() + (cmpBytes, bitmap, absErrBound) = cmpBytes + #print("bitmap len:" +str(len(bitmap))) + #print(nbEle) + #tmp_nbEle = nbEle + # tmp_nbEle = cp.count_nonzero(bitmap).item() +# print(tmp_nbEle) + nbEle_p = ctypes.c_size_t(nbEle) + # size_t nbEle, unsigned char* cmpBytes, size_t cmpSize, float errorBound + newData = __cuszp_device_decompress(nbEle_p,cmpBytes, np.ulonglong(cmpSize), np.float32(absErrBound)) + + # decompressed_ptr = self.cuszp_decompress(isCuPy, cmp_bytes, num_elements_eff) + # -- Workaround to convert GPU pointer to int + p_decompressed_ptr = ctypes.addressof(newData) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + decompressed_int = p_decompressed_int.contents + # -- + pointer_for_free = decompressed_int.value + # self.decompressed_own.append(decompressed_int.value) + mem = cp.cuda.UnownedMemory(decompressed_int.value, nbEle, owner, device_id=0) + mem_ptr = cp.cuda.memory.MemoryPointer(mem, 0) + #print("mem ptr") + #print(mem_ptr) + arr = cp.ndarray(shape=nbEle, dtype=cp.float32, memptr=mem_ptr) +# print("attempt alloc") + # res = cp.zeros(nbEle,dtype=cp.float32) +# print("alloc passed") + ## need to convert newData to cupy + # cp.putmask(res,bitmap,arr) + mempool = cp.get_default_memory_pool() + pinned_mempool = cp.get_default_pinned_memory_pool() + #del arr + + #print(res[0]) + #print(res[int(nbEle/2)]) + #reshaped_data = arr.reshape(-1,2) + reshaped_data = arr.reshape(-1,2) + #c_res = arr + c_res = reshaped_data.view(dtype=np.complex64) + #print(c_res[0]) + #c_res = cp.zeros(int(nbEle/2), np.complex64) + #c_res.real = res[0:int(nbEle/2)] + #c_res.imag = res[int(nbEle/2):] + #del res + #del bitmap + #mempool.free_all_blocks() + #pinned_mempool.free_all_blocks() + + return (c_res, pointer_for_free) + +### Example of device compress/decompress wrapper usage +class Comp(): + def __init__(self): + self.name = "dummy" + +if __name__ == "__main__": + + DATA_SIZE = int(1024) + MAX_D = 10.0 + MIN_D = -10.0 + RANGE = MAX_D - MIN_D + r2r_threshold = 0.002 + r2r_error = 0.0001 + + in_vector = np.fromfile("real_sample.bin", dtype=np.float32) + #print(np.max(in_vector)) + DATA_SIZE = len(in_vector) + #range_vr = np.max(in_vector)-np.min(in_vector) + #r2r_threshold = r2r_threshold*range_vr + #r2r_error = r2r_error*range_vr + #in_vector = np.zeros((DATA_SIZE,)) + #for i in range(0,int(DATA_SIZE/4)): + # in_vector[i] = 0.0 + #for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): + # in_vector[i] = 5.0 + #for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): + # in_vector[i] = random.uniform(MIN_D, MAX_D) + #for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): + # in_vector[i] = -7.0 + #for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): + # in_vector[i] = 0.001 + + print(DATA_SIZE) + in_vector = in_vector.astype('float32') + in_vector_gpu = cp.asarray(in_vector) + + # variable = ctypes.c_size_t(0) + # outSize = ctypes.pointer(variable) + for i in range(30): + s_time = time.time() + o_bytes, outSize = cuszp_device_compress(in_vector_gpu, r2r_error, DATA_SIZE,r2r_threshold) + print("Time python: "+str(time.time()-s_time)) + print(outSize[0]) + print("Compress Success...starting decompress ") + comp = Comp() + + s_time = time.time() + (d_bytes,ptr )= cuszp_device_decompress(DATA_SIZE, o_bytes,outSize[0], comp, in_vector_gpu.dtype) + + print("Time python: "+str(time.time()-s_time)) + #for i in d_bytes: + # print(i) + print("Decompress Success") diff --git a/qtensor/compression/torch_quant/torch_quant.py b/qtensor/compression/torch_quant/torch_quant.py index c5f04fc6..bbea4657 100644 --- a/qtensor/compression/torch_quant/torch_quant.py +++ b/qtensor/compression/torch_quant/torch_quant.py @@ -1,174 +1,174 @@ -import numpy as np -import ctypes -from ctypes import * -import random -from qtensor.tools.lazy_import import cupy as cp -import time -import torch - -from pathlib import Path - - - -def quant_device_compress(oriData, nbEle, blockSize,threshold): - #print(nbEle) - ori_nbEle = nbEle - variable = ctypes.c_size_t(0) - outSize = ctypes.pointer(variable) - - oriData = oriData.flatten() - ori_real = oriData.real - ori_imag = oriData.imag - oriData = cp.concatenate((ori_real, ori_imag)) - sample = oriData[::2] - max_val = cp.amax(oriData).get() - min_val = cp.amin(oriData).get() - d = max_val - min_val - if d.dtype == np.complex64: - d = d.real - threshold = threshold*(d) - s_1 = time.time() - truth_values = abs(oriData)<=threshold - oriData[truth_values] = 0.0 - truth_values = cp.invert(truth_values) - ori_len = oriData.shape[0] - nonzero_percent = cp.count_nonzero(oriData)/oriData.shape[0] - print("Percent nonzero: "+str(nonzero_percent)) - - isGrouped = False - if nonzero_percent<=0.5: - isGrouped=True - oriData = oriData[truth_values] - - nbEle = oriData.shape[0] - - # oriData = cp.reshape(oriData, (-1, blockSize)) # Reshape to blocksize - tensor = torch.as_tensor(oriData, device='cuda') - # print("Min val: "+str(cp.amin(oriData).get())+" range: "+str(d)) -# scale = d/255.0 -# zero_point = -1*round(min_val*scale) - 128 - - scale = d/((2**8) - 1) - #zero_point = -1*round(min_val*scale) - zero_point = -1*round(min_val*scale)+32 -# q_tensor = torch.quantize_per_tensor(tensor, scale, zero_point, dtype=torch.qint8) - - q_tensor = torch.quantize_per_tensor(tensor, scale, zero_point, dtype=torch.qint8) - del tensor - torch.cuda.empty_cache() - if isGrouped: - bitmap = cp.packbits(truth_values) - else: - bitmap = None - del truth_values - #q_ten2 = torch.dequantize(q_tensor) - #print(tensor) - #print(q_ten2) - #print("Max PW error") - #print(torch.max(torch.div(torch.abs(torch.sub(tensor[tensor!=0.0],q_ten2[tensor!=0.0])),tensor[tensor!=0.0]))) - return (q_tensor, bitmap, isGrouped), (nbEle/4)+(ori_len/8) - - -def quant_device_decompress(nbEle, cmpBytes, owner, dtype): - (q_tensor, bitmap, isGrouped) = cmpBytes - if isGrouped: - bitmap = cp.unpackbits(bitmap) - restored = torch.dequantize(q_tensor) - arr = cp.asarray(restored) - # uint8_t* cmpbytes, size_t len, size_t compressed_len, float r2r_error - - # decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, num_elements_eff) - # -- Workaround to convert GPU pointer to int - # p_decompressed_ptr = ctypes.addressof(newData) - # cast to int64 pointer - # (effectively converting pointer to pointer to addr to pointer to int64) - # p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) - # decompressed_int = p_decompressed_int.contents - # # -- - # pointer_for_free = decompressed_int.value - # # self.decompressed_own.append(decompressed_int.value) - # mem = cp.cuda.UnownedMemory(decompressed_int.value, nbEle*4, owner, device_id=0) - # mem_ptr = cp.cuda.memory.MemoryPointer(mem, 0) - #print("mem ptr") - #print(mem_ptr) - # arr = cp.ndarray(shape=(nbEle,), dtype=np.float32, memptr=mem_ptr) - #print(nbEle) - if isGrouped: - res = cp.zeros((nbEle,)) - # ## need to convert newData to cupy - cp.place(res,bitmap,arr) - - c_res = cp.zeros(int(nbEle/2), np.complex64) - #c_res.real = arr[0:int(nbEle/2)] - #c_res.imag = arr[int(nbEle/2):] - - c_res.real = res[0:int(nbEle/2)] - c_res.imag = res[int(nbEle/2):] - else: - c_res = cp.zeros(int(nbEle/2), np.complex64) - c_res.real = arr[0:int(nbEle/2)] - c_res.imag = arr[int(nbEle/2):] - return (c_res, None) - -### Example of device compress/decompress wrapper usage -class Comp(): - def __init__(self): - self.name = "dummy" - -def free_compressed(ptr): - p_ptr = ctypes.addressof(ptr) - p_int = ctypes.cast(p_ptr, ctypes.POINTER(ctypes.c_uint64)) - decomp_int = p_int.contents - cp.cuda.runtime.free(decomp_int.value) - - -if __name__ == "__main__": - - DATA_SIZE = int(1024) - MAX_D = 10.0 - MIN_D = -10.0 - RANGE = MAX_D - MIN_D - r2r_threshold = 0.002 - r2r_error = 0.0001 - - in_vector = np.fromfile("all_sample.bin", dtype=np.complex64) - #print(np.max(in_vector)) - DATA_SIZE = len(in_vector) - #range_vr = np.max(in_vector)-np.min(in_vector) - #r2r_threshold = r2r_threshold*range_vr - #r2r_error = r2r_error*range_vr - #in_vector = np.zeros((DATA_SIZE,)) - #for i in range(0,int(DATA_SIZE/4)): - # in_vector[i] = 0.0 - #for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): - # in_vector[i] = 5.0 - #for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): - # in_vector[i] = random.uniform(MIN_D, MAX_D) - #for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): - # in_vector[i] = -7.0 - #for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): - # in_vector[i] = 0.001 - - print(DATA_SIZE) - #in_vector = in_vector.astype('float32') - in_vector_gpu = cp.asarray(in_vector) - - # variable = ctypes.c_size_t(0) - # outSize = ctypes.pointer(variable) - for i in range(200): - s_time = time.time() - o_bytes, outSize = quant_device_compress(in_vector_gpu, DATA_SIZE, 256, r2r_threshold) - print("Time python: "+str(time.time()-s_time)) - # print(outSize[0]) - print("Compress Success...starting decompress ") - comp = Comp() - - s_time = time.time() - (d_bytes,ptr )= quant_device_decompress(DATA_SIZE*2, o_bytes, comp, in_vector_gpu.dtype) - - # free_compressed(o_bytes[0]) - # cp.cuda.runtime.free(ptr) - print("Time python: "+str(time.time()-s_time)) - #for i in d_bytes: - # print(i) - print("Decompress Success") +import numpy as np +import ctypes +from ctypes import * +import random +from qtensor.tools.lazy_import import cupy as cp +import time +import torch + +from pathlib import Path + + + +def quant_device_compress(oriData, nbEle, blockSize,threshold): + #print(nbEle) + ori_nbEle = nbEle + variable = ctypes.c_size_t(0) + outSize = ctypes.pointer(variable) + + oriData = oriData.flatten() + ori_real = oriData.real + ori_imag = oriData.imag + oriData = cp.concatenate((ori_real, ori_imag)) + sample = oriData[::2] + max_val = cp.amax(oriData).get() + min_val = cp.amin(oriData).get() + d = max_val - min_val + if d.dtype == np.complex64: + d = d.real + threshold = threshold*(d) + s_1 = time.time() + truth_values = abs(oriData)<=threshold + oriData[truth_values] = 0.0 + truth_values = cp.invert(truth_values) + ori_len = oriData.shape[0] + nonzero_percent = cp.count_nonzero(oriData)/oriData.shape[0] + print("Percent nonzero: "+str(nonzero_percent)) + + isGrouped = False + if nonzero_percent<=0.5: + isGrouped=True + oriData = oriData[truth_values] + + nbEle = oriData.shape[0] + + # oriData = cp.reshape(oriData, (-1, blockSize)) # Reshape to blocksize + tensor = torch.as_tensor(oriData, device='cuda') + # print("Min val: "+str(cp.amin(oriData).get())+" range: "+str(d)) +# scale = d/255.0 +# zero_point = -1*round(min_val*scale) - 128 + + scale = d/((2**8) - 1) + #zero_point = -1*round(min_val*scale) + zero_point = -1*round(min_val*scale)+32 +# q_tensor = torch.quantize_per_tensor(tensor, scale, zero_point, dtype=torch.qint8) + + q_tensor = torch.quantize_per_tensor(tensor, scale, zero_point, dtype=torch.qint8) + del tensor + torch.cuda.empty_cache() + if isGrouped: + bitmap = cp.packbits(truth_values) + else: + bitmap = None + del truth_values + #q_ten2 = torch.dequantize(q_tensor) + #print(tensor) + #print(q_ten2) + #print("Max PW error") + #print(torch.max(torch.div(torch.abs(torch.sub(tensor[tensor!=0.0],q_ten2[tensor!=0.0])),tensor[tensor!=0.0]))) + return (q_tensor, bitmap, isGrouped), (nbEle/4)+(ori_len/8) + + +def quant_device_decompress(nbEle, cmpBytes, owner, dtype): + (q_tensor, bitmap, isGrouped) = cmpBytes + if isGrouped: + bitmap = cp.unpackbits(bitmap) + restored = torch.dequantize(q_tensor) + arr = cp.asarray(restored) + # uint8_t* cmpbytes, size_t len, size_t compressed_len, float r2r_error + + # decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, num_elements_eff) + # -- Workaround to convert GPU pointer to int + # p_decompressed_ptr = ctypes.addressof(newData) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + # p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + # decompressed_int = p_decompressed_int.contents + # # -- + # pointer_for_free = decompressed_int.value + # # self.decompressed_own.append(decompressed_int.value) + # mem = cp.cuda.UnownedMemory(decompressed_int.value, nbEle*4, owner, device_id=0) + # mem_ptr = cp.cuda.memory.MemoryPointer(mem, 0) + #print("mem ptr") + #print(mem_ptr) + # arr = cp.ndarray(shape=(nbEle,), dtype=np.float32, memptr=mem_ptr) + #print(nbEle) + if isGrouped: + res = cp.zeros((nbEle,)) + # ## need to convert newData to cupy + cp.place(res,bitmap,arr) + + c_res = cp.zeros(int(nbEle/2), np.complex64) + #c_res.real = arr[0:int(nbEle/2)] + #c_res.imag = arr[int(nbEle/2):] + + c_res.real = res[0:int(nbEle/2)] + c_res.imag = res[int(nbEle/2):] + else: + c_res = cp.zeros(int(nbEle/2), np.complex64) + c_res.real = arr[0:int(nbEle/2)] + c_res.imag = arr[int(nbEle/2):] + return (c_res, None) + +### Example of device compress/decompress wrapper usage +class Comp(): + def __init__(self): + self.name = "dummy" + +def free_compressed(ptr): + p_ptr = ctypes.addressof(ptr) + p_int = ctypes.cast(p_ptr, ctypes.POINTER(ctypes.c_uint64)) + decomp_int = p_int.contents + cp.cuda.runtime.free(decomp_int.value) + + +if __name__ == "__main__": + + DATA_SIZE = int(1024) + MAX_D = 10.0 + MIN_D = -10.0 + RANGE = MAX_D - MIN_D + r2r_threshold = 0.002 + r2r_error = 0.0001 + + in_vector = np.fromfile("all_sample.bin", dtype=np.complex64) + #print(np.max(in_vector)) + DATA_SIZE = len(in_vector) + #range_vr = np.max(in_vector)-np.min(in_vector) + #r2r_threshold = r2r_threshold*range_vr + #r2r_error = r2r_error*range_vr + #in_vector = np.zeros((DATA_SIZE,)) + #for i in range(0,int(DATA_SIZE/4)): + # in_vector[i] = 0.0 + #for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): + # in_vector[i] = 5.0 + #for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): + # in_vector[i] = random.uniform(MIN_D, MAX_D) + #for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): + # in_vector[i] = -7.0 + #for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): + # in_vector[i] = 0.001 + + print(DATA_SIZE) + #in_vector = in_vector.astype('float32') + in_vector_gpu = cp.asarray(in_vector) + + # variable = ctypes.c_size_t(0) + # outSize = ctypes.pointer(variable) + for i in range(200): + s_time = time.time() + o_bytes, outSize = quant_device_compress(in_vector_gpu, DATA_SIZE, 256, r2r_threshold) + print("Time python: "+str(time.time()-s_time)) + # print(outSize[0]) + print("Compress Success...starting decompress ") + comp = Comp() + + s_time = time.time() + (d_bytes,ptr )= quant_device_decompress(DATA_SIZE*2, o_bytes, comp, in_vector_gpu.dtype) + + # free_compressed(o_bytes[0]) + # cp.cuda.runtime.free(ptr) + print("Time python: "+str(time.time()-s_time)) + #for i in d_bytes: + # print(i) + print("Decompress Success") diff --git a/qtensor/compression/torch_quant/torch_quant_perchannel.py b/qtensor/compression/torch_quant/torch_quant_perchannel.py index a41606b2..24cf703e 100644 --- a/qtensor/compression/torch_quant/torch_quant_perchannel.py +++ b/qtensor/compression/torch_quant/torch_quant_perchannel.py @@ -1,203 +1,203 @@ -import numpy as np -import ctypes -from ctypes import * -import random -from qtensor.tools.lazy_import import cupy as cp -import time -import torch - -from pathlib import Path - -BS = 32 - -def quant_device_compress(oriData, nbEle, blockSize,threshold): - #print(nbEle) - ori_nbEle = nbEle - variable = ctypes.c_size_t(0) - outSize = ctypes.pointer(variable) - - oriData = oriData.flatten() - ori_real = oriData.real - ori_imag = oriData.imag - oriData = cp.concatenate((ori_real, ori_imag)) - sample = oriData[::2] - max_val = cp.amax(oriData).get() - min_val = cp.amin(oriData).get() - d = max_val - min_val - if d.dtype == np.complex64: - d = d.real - threshold = threshold*(d) - s_1 = time.time() - truth_values = abs(oriData)<=threshold - oriData[truth_values] = 0.0 - truth_values = cp.invert(truth_values) - ori_len = oriData.shape[0] - nonzero_percent = cp.count_nonzero(oriData)/oriData.shape[0] - print("Percent nonzero: "+str(nonzero_percent)) - - isGrouped = False - if nonzero_percent<=0.5: - isGrouped=True - oriData = oriData[truth_values] - - nbEle = oriData.shape[0] - - # oriData = cp.reshape(oriData, (-1, blockSize)) # Reshape to blocksize - tensor = torch.as_tensor(oriData, device='cuda') - # print("Min val: "+str(cp.amin(oriData).get())+" range: "+str(d)) -# scale = d/255.0 -# zero_point = -1*round(min_val*scale) - 128 - if isGrouped: - pad_rows = int(nbEle/BS) - if nbEle%BS != 0: - pad_rows +=1 - - padded = torch.zeros(pad_rows*BS, device='cuda') - padded[:nbEle] = tensor - tensor = padded - tensor = torch.reshape(tensor, (-1, BS)) - maxs = torch.flatten(torch.max(tensor, dim=1)[0]) - mins = torch.flatten(torch.min(tensor, dim=1)[0]) - - #scales = torch.ones(tensor.shape[0], device='cuda') - #scales = torch.mul(scales, d/255.0) - #print(d) - #print(torch.max(torch.sub(maxs,mins))) - scales = torch.abs(torch.sub(maxs,mins))/127.0 - zero_points = torch.zeros(tensor.shape[0], device='cuda') - #zero_points = torch.round(torch.div(torch.add(maxs,mins)/2,scales)) - #zero_points = torch.neg(torch.round(torch.div(mins,scales)))+64 - - #print(zero_points) - - #scale = d/((2**8) - 1) - #zero_point = -1*round(min_val*scale) - #zero_point = -1*round(min_val*scale)+32 -# q_tensor = torch.quantize_per_tensor(tensor, scale, zero_point, dtype=torch.qint8) - #tensor = torch.flatten(tensor) - #tensor = torch.split(tensor, BS) - #print(maxs) - #print(mins) - #print(scales) - - q_tensor = torch.quantize_per_channel(tensor, scales, zero_points,0, dtype=torch.qint8) - #q_tensor = torch.quantize_per_tensor(tensor, scale, zero_point, dtype=torch.qint8) - del tensor - torch.cuda.empty_cache() - if isGrouped: - bitmap = cp.packbits(truth_values) - else: - bitmap = None - del truth_values - #q_ten2 = torch.dequantize(q_tensor) - #print(tensor) - #print(q_ten2) - #print("Max PW error") - #print(torch.max(torch.div(torch.abs(torch.sub(tensor[tensor!=0.0],q_ten2[tensor!=0.0])),tensor[tensor!=0.0]))) - return (q_tensor, bitmap, isGrouped), (nbEle/2)+(ori_len/8) - - -def quant_device_decompress(nbEle, cmpBytes, owner, dtype): - (q_tensor, bitmap, isGrouped) = cmpBytes - if isGrouped: - bitmap = cp.unpackbits(bitmap) - restored = torch.flatten(torch.dequantize(q_tensor)) - - arr = cp.asarray(restored) - # uint8_t* cmpbytes, size_t len, size_t compressed_len, float r2r_error - - # decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, num_elements_eff) - # -- Workaround to convert GPU pointer to int - # p_decompressed_ptr = ctypes.addressof(newData) - # cast to int64 pointer - # (effectively converting pointer to pointer to addr to pointer to int64) - # p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) - # decompressed_int = p_decompressed_int.contents - # # -- - # pointer_for_free = decompressed_int.value - # # self.decompressed_own.append(decompressed_int.value) - # mem = cp.cuda.UnownedMemory(decompressed_int.value, nbEle*4, owner, device_id=0) - # mem_ptr = cp.cuda.memory.MemoryPointer(mem, 0) - #print("mem ptr") - #print(mem_ptr) - # arr = cp.ndarray(shape=(nbEle,), dtype=np.float32, memptr=mem_ptr) - #print(nbEle) - if isGrouped: - res = cp.zeros((nbEle,)) - # ## need to convert newData to cupy - cp.place(res,bitmap,arr) - - c_res = cp.zeros(int(nbEle/2), np.complex64) - #c_res.real = arr[0:int(nbEle/2)] - #c_res.imag = arr[int(nbEle/2):] - - c_res.real = res[0:int(nbEle/2)] - c_res.imag = res[int(nbEle/2):] - else: - c_res = cp.zeros(int(nbEle/2), np.complex64) - c_res.real = arr[0:int(nbEle/2)] - c_res.imag = arr[int(nbEle/2):] - return (c_res, None) - -### Example of device compress/decompress wrapper usage -class Comp(): - def __init__(self): - self.name = "dummy" - -def free_compressed(ptr): - p_ptr = ctypes.addressof(ptr) - p_int = ctypes.cast(p_ptr, ctypes.POINTER(ctypes.c_uint64)) - decomp_int = p_int.contents - cp.cuda.runtime.free(decomp_int.value) - - -if __name__ == "__main__": - - DATA_SIZE = int(1024) - MAX_D = 10.0 - MIN_D = -10.0 - RANGE = MAX_D - MIN_D - r2r_threshold = 0.002 - r2r_error = 0.0001 - - in_vector = np.fromfile("all_sample.bin", dtype=np.complex64) - #print(np.max(in_vector)) - DATA_SIZE = len(in_vector) - #range_vr = np.max(in_vector)-np.min(in_vector) - #r2r_threshold = r2r_threshold*range_vr - #r2r_error = r2r_error*range_vr - #in_vector = np.zeros((DATA_SIZE,)) - #for i in range(0,int(DATA_SIZE/4)): - # in_vector[i] = 0.0 - #for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): - # in_vector[i] = 5.0 - #for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): - # in_vector[i] = random.uniform(MIN_D, MAX_D) - #for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): - # in_vector[i] = -7.0 - #for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): - # in_vector[i] = 0.001 - - print(DATA_SIZE) - #in_vector = in_vector.astype('float32') - in_vector_gpu = cp.asarray(in_vector) - - # variable = ctypes.c_size_t(0) - # outSize = ctypes.pointer(variable) - for i in range(200): - s_time = time.time() - o_bytes, outSize = quant_device_compress(in_vector_gpu, DATA_SIZE, 256, r2r_threshold) - print("Time python: "+str(time.time()-s_time)) - # print(outSize[0]) - print("Compress Success...starting decompress ") - comp = Comp() - - s_time = time.time() - (d_bytes,ptr )= quant_device_decompress(DATA_SIZE*2, o_bytes, comp, in_vector_gpu.dtype) - - # free_compressed(o_bytes[0]) - # cp.cuda.runtime.free(ptr) - print("Time python: "+str(time.time()-s_time)) - #for i in d_bytes: - # print(i) - print("Decompress Success") +import numpy as np +import ctypes +from ctypes import * +import random +from qtensor.tools.lazy_import import cupy as cp +import time +import torch + +from pathlib import Path + +BS = 32 + +def quant_device_compress(oriData, nbEle, blockSize,threshold): + #print(nbEle) + ori_nbEle = nbEle + variable = ctypes.c_size_t(0) + outSize = ctypes.pointer(variable) + + oriData = oriData.flatten() + ori_real = oriData.real + ori_imag = oriData.imag + oriData = cp.concatenate((ori_real, ori_imag)) + sample = oriData[::2] + max_val = cp.amax(oriData).get() + min_val = cp.amin(oriData).get() + d = max_val - min_val + if d.dtype == np.complex64: + d = d.real + threshold = threshold*(d) + s_1 = time.time() + truth_values = abs(oriData)<=threshold + oriData[truth_values] = 0.0 + truth_values = cp.invert(truth_values) + ori_len = oriData.shape[0] + nonzero_percent = cp.count_nonzero(oriData)/oriData.shape[0] + print("Percent nonzero: "+str(nonzero_percent)) + + isGrouped = False + if nonzero_percent<=0.5: + isGrouped=True + oriData = oriData[truth_values] + + nbEle = oriData.shape[0] + + # oriData = cp.reshape(oriData, (-1, blockSize)) # Reshape to blocksize + tensor = torch.as_tensor(oriData, device='cuda') + # print("Min val: "+str(cp.amin(oriData).get())+" range: "+str(d)) +# scale = d/255.0 +# zero_point = -1*round(min_val*scale) - 128 + if isGrouped: + pad_rows = int(nbEle/BS) + if nbEle%BS != 0: + pad_rows +=1 + + padded = torch.zeros(pad_rows*BS, device='cuda') + padded[:nbEle] = tensor + tensor = padded + tensor = torch.reshape(tensor, (-1, BS)) + maxs = torch.flatten(torch.max(tensor, dim=1)[0]) + mins = torch.flatten(torch.min(tensor, dim=1)[0]) + + #scales = torch.ones(tensor.shape[0], device='cuda') + #scales = torch.mul(scales, d/255.0) + #print(d) + #print(torch.max(torch.sub(maxs,mins))) + scales = torch.abs(torch.sub(maxs,mins))/127.0 + zero_points = torch.zeros(tensor.shape[0], device='cuda') + #zero_points = torch.round(torch.div(torch.add(maxs,mins)/2,scales)) + #zero_points = torch.neg(torch.round(torch.div(mins,scales)))+64 + + #print(zero_points) + + #scale = d/((2**8) - 1) + #zero_point = -1*round(min_val*scale) + #zero_point = -1*round(min_val*scale)+32 +# q_tensor = torch.quantize_per_tensor(tensor, scale, zero_point, dtype=torch.qint8) + #tensor = torch.flatten(tensor) + #tensor = torch.split(tensor, BS) + #print(maxs) + #print(mins) + #print(scales) + + q_tensor = torch.quantize_per_channel(tensor, scales, zero_points,0, dtype=torch.qint8) + #q_tensor = torch.quantize_per_tensor(tensor, scale, zero_point, dtype=torch.qint8) + del tensor + torch.cuda.empty_cache() + if isGrouped: + bitmap = cp.packbits(truth_values) + else: + bitmap = None + del truth_values + #q_ten2 = torch.dequantize(q_tensor) + #print(tensor) + #print(q_ten2) + #print("Max PW error") + #print(torch.max(torch.div(torch.abs(torch.sub(tensor[tensor!=0.0],q_ten2[tensor!=0.0])),tensor[tensor!=0.0]))) + return (q_tensor, bitmap, isGrouped), (nbEle/2)+(ori_len/8) + + +def quant_device_decompress(nbEle, cmpBytes, owner, dtype): + (q_tensor, bitmap, isGrouped) = cmpBytes + if isGrouped: + bitmap = cp.unpackbits(bitmap) + restored = torch.flatten(torch.dequantize(q_tensor)) + + arr = cp.asarray(restored) + # uint8_t* cmpbytes, size_t len, size_t compressed_len, float r2r_error + + # decompressed_ptr = self.cuszx_decompress(isCuPy, cmp_bytes, num_elements_eff) + # -- Workaround to convert GPU pointer to int + # p_decompressed_ptr = ctypes.addressof(newData) + # cast to int64 pointer + # (effectively converting pointer to pointer to addr to pointer to int64) + # p_decompressed_int= ctypes.cast(p_decompressed_ptr, ctypes.POINTER(ctypes.c_uint64)) + # decompressed_int = p_decompressed_int.contents + # # -- + # pointer_for_free = decompressed_int.value + # # self.decompressed_own.append(decompressed_int.value) + # mem = cp.cuda.UnownedMemory(decompressed_int.value, nbEle*4, owner, device_id=0) + # mem_ptr = cp.cuda.memory.MemoryPointer(mem, 0) + #print("mem ptr") + #print(mem_ptr) + # arr = cp.ndarray(shape=(nbEle,), dtype=np.float32, memptr=mem_ptr) + #print(nbEle) + if isGrouped: + res = cp.zeros((nbEle,)) + # ## need to convert newData to cupy + cp.place(res,bitmap,arr) + + c_res = cp.zeros(int(nbEle/2), np.complex64) + #c_res.real = arr[0:int(nbEle/2)] + #c_res.imag = arr[int(nbEle/2):] + + c_res.real = res[0:int(nbEle/2)] + c_res.imag = res[int(nbEle/2):] + else: + c_res = cp.zeros(int(nbEle/2), np.complex64) + c_res.real = arr[0:int(nbEle/2)] + c_res.imag = arr[int(nbEle/2):] + return (c_res, None) + +### Example of device compress/decompress wrapper usage +class Comp(): + def __init__(self): + self.name = "dummy" + +def free_compressed(ptr): + p_ptr = ctypes.addressof(ptr) + p_int = ctypes.cast(p_ptr, ctypes.POINTER(ctypes.c_uint64)) + decomp_int = p_int.contents + cp.cuda.runtime.free(decomp_int.value) + + +if __name__ == "__main__": + + DATA_SIZE = int(1024) + MAX_D = 10.0 + MIN_D = -10.0 + RANGE = MAX_D - MIN_D + r2r_threshold = 0.002 + r2r_error = 0.0001 + + in_vector = np.fromfile("all_sample.bin", dtype=np.complex64) + #print(np.max(in_vector)) + DATA_SIZE = len(in_vector) + #range_vr = np.max(in_vector)-np.min(in_vector) + #r2r_threshold = r2r_threshold*range_vr + #r2r_error = r2r_error*range_vr + #in_vector = np.zeros((DATA_SIZE,)) + #for i in range(0,int(DATA_SIZE/4)): + # in_vector[i] = 0.0 + #for i in range(int(DATA_SIZE/4), int(2*DATA_SIZE/4)): + # in_vector[i] = 5.0 + #for i in range(int(2*DATA_SIZE/4), int(3*DATA_SIZE/4)): + # in_vector[i] = random.uniform(MIN_D, MAX_D) + #for i in range(int(3*DATA_SIZE/4), int(3*DATA_SIZE/4)+6): + # in_vector[i] = -7.0 + #for i in range(int(3*DATA_SIZE/4)+6, DATA_SIZE): + # in_vector[i] = 0.001 + + print(DATA_SIZE) + #in_vector = in_vector.astype('float32') + in_vector_gpu = cp.asarray(in_vector) + + # variable = ctypes.c_size_t(0) + # outSize = ctypes.pointer(variable) + for i in range(200): + s_time = time.time() + o_bytes, outSize = quant_device_compress(in_vector_gpu, DATA_SIZE, 256, r2r_threshold) + print("Time python: "+str(time.time()-s_time)) + # print(outSize[0]) + print("Compress Success...starting decompress ") + comp = Comp() + + s_time = time.time() + (d_bytes,ptr )= quant_device_decompress(DATA_SIZE*2, o_bytes, comp, in_vector_gpu.dtype) + + # free_compressed(o_bytes[0]) + # cp.cuda.runtime.free(ptr) + print("Time python: "+str(time.time()-s_time)) + #for i in d_bytes: + # print(i) + print("Decompress Success") From 7e0286f4a2a55de74d4d08bc398f6e0ff59deba4 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Tue, 19 Mar 2024 18:43:45 -0500 Subject: [PATCH 06/14] torch backend no_sum fix --- qtensor/contraction_backends/torch.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/qtensor/contraction_backends/torch.py b/qtensor/contraction_backends/torch.py index e684bde4..8bb2a06c 100644 --- a/qtensor/contraction_backends/torch.py +++ b/qtensor/contraction_backends/torch.py @@ -140,7 +140,10 @@ def process_bucket(self, bucket, no_sum=False): key=int, reverse=True )) else: - result_data = result_data.sum(axis=-1) + if not no_sum: + result_data = result_data.sum(axis=-1) + else: + result_data = result_data From 53770f22622cd325a04c520bb8afeef3bb30e152 Mon Sep 17 00:00:00 2001 From: Dan Lykov Date: Wed, 20 Mar 2024 22:39:16 -0500 Subject: [PATCH 07/14] fix torch backend no_sum bug --- qtensor/contraction_backends/torch.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/qtensor/contraction_backends/torch.py b/qtensor/contraction_backends/torch.py index 8bb2a06c..4b92d956 100644 --- a/qtensor/contraction_backends/torch.py +++ b/qtensor/contraction_backends/torch.py @@ -65,7 +65,7 @@ def slice_torch_tensor(data:np.ndarray, indices_in, indices_out, slice_dict): indices_sliced = [ i for sl, i in zip(slice_bounds, indices_in) if not isinstance(sl, int) ] - #print(f'indicies_in {indices_in}, slice_dict {slice_dict}, bounds {slice_bounds}, slicedix {indices_sliced}, sshape {s_data.shape}') + #print(f'{indices_in=}, {indices_sliced=} {slice_dict=}, {slice_bounds=}, slicedix {indices_sliced}, sshape {s_data.shape}') indices_sized = [v.copy(size=size) for v, size in zip(indices_sliced, s_data.shape)] indices_out = [v for v in indices_out if not isinstance(slice_dict.get(v, None), int)] assert len(indices_sized) == len(s_data.shape) @@ -130,7 +130,7 @@ def process_bucket(self, bucket, no_sum=False): tensor = bucket[-1] expr = get_einsum_expr( list(map(int, result_indices)), list(map(int, tensor.indices)) - , contract = 1 + , contract = 0 if no_sum else 1 ) logger.trace('Before contract. Expr: {}, inputs: {}, {}', expr, result_data, tensor) result_data = torch.einsum(expr, result_data, tensor.data) @@ -146,10 +146,10 @@ def process_bucket(self, bucket, no_sum=False): result_data = result_data - if len(result_indices) > 0: first_index = result_indices[-1] - result_indices = result_indices[:-1] + if not no_sum: + result_indices = result_indices[:-1] tag = first_index.identity else: tag = 'f' From fd898d1046eab9808c627a6b189da3fd1adfa7d9 Mon Sep 17 00:00:00 2001 From: Danylo Lykov Date: Wed, 3 Apr 2024 23:10:15 -0500 Subject: [PATCH 08/14] fix bug in torch matm backend (scalar output) --- qtensor/contraction_backends/torch.py | 10 +- qtensor/tests/test_bucket_backends.py | 165 +++++++++++++++++++++++--- 2 files changed, 151 insertions(+), 24 deletions(-) diff --git a/qtensor/contraction_backends/torch.py b/qtensor/contraction_backends/torch.py index e684bde4..80bd7707 100644 --- a/qtensor/contraction_backends/torch.py +++ b/qtensor/contraction_backends/torch.py @@ -264,7 +264,7 @@ def pairwise_sum_contract(self, ixa, a, ixb, b, ixout): kix = common - set(out) fix = common - kix common = list(kix) + list(fix) - print(f'{ixa=} {ixb=} {ixout=}; {common=} {mix=} {nix=}') + #print(f'{ixa=} {ixb=} {ixout=}; {common=} {mix=} {nix=}') a = tensors[0].permute(*[ list(ixs[0]).index(x) for x in common + list(mix) ]) @@ -285,7 +285,8 @@ def pairwise_sum_contract(self, ixa, a, ixb, b, ixout): #print('outix', out, 'res', c.shape, 'kfmn',kix, fix, mix, nix) current_ord_ = list(fix) + list(mix) + list(nix) - c = c.permute(*[current_ord_.index(i) for i in out]) + if len(out): + c = c.permute(*[current_ord_.index(i) for i in out]) return c def process_bucket(self, bucket, no_sum=False): @@ -294,6 +295,7 @@ def process_bucket(self, bucket, no_sum=False): result_data = bucket[0].data width = len(set(bucket[0].indices)) + for tensor in bucket[1:-1]: ixr = list(map(int, result_indices)) @@ -336,12 +338,10 @@ def process_bucket(self, bucket, no_sum=False): result_data = result_data_new else: result_data = result_data.sum(axis=-1) - - + result_indices = result_indices[:-1] if len(result_indices) > 0: first_index = result_indices[-1] - result_indices = result_indices[:-1] tag = first_index.identity else: tag = 'f' diff --git a/qtensor/tests/test_bucket_backends.py b/qtensor/tests/test_bucket_backends.py index d26ba8a5..46600f30 100644 --- a/qtensor/tests/test_bucket_backends.py +++ b/qtensor/tests/test_bucket_backends.py @@ -1,8 +1,10 @@ from qtensor import QtreeQAOAComposer from qtensor.contraction_backends import PerfNumpyBackend from qtensor.contraction_backends import CuPyBackend, NumpyBackend, CompressionBackend +from qtensor.contraction_backends.torch import TorchBackendMatm from qtensor.compression import NumpyCompressor, CUSZCompressor from qtensor.Simulate import CirqSimulator, QtreeSimulator +import qtree import pytest import qtensor @@ -16,8 +18,7 @@ def test_profiled(capsys): G, gamma, beta = get_test_problem() - composer = QtreeQAOAComposer( - graph=G, gamma=[np.pi/3], beta=[np.pi/4]) + composer = QtreeQAOAComposer(graph=G, gamma=[np.pi / 3], beta=[np.pi / 4]) composer.ansatz_state() print(composer.circuit) @@ -32,30 +33,42 @@ def test_profiled(capsys): assert qtree_amp + def test_reverse_order_switch(): - backend = qtensor.contraction_backends.get_backend('torch') + backend = qtensor.contraction_backends.get_backend("torch") reverse = is_reverse_order_backend(backend) assert reverse - backend = qtensor.contraction_backends.get_backend('einsum') + backend = qtensor.contraction_backends.get_backend("einsum") reverse = is_reverse_order_backend(backend) assert not reverse -ref_backend_name = 'cupy' -@pytest.mark.parametrize('circ', [ - get_test_qaoa_ansatz_circ(n=6, p=3), - get_test_qaoa_ansatz_circ(n=12, p=4), -]) -@pytest.mark.parametrize(['backend', 'atol'], [ - ('cupy', 1e-10), - ('torch', 1e-10), - ('cupy_compressed', 1e-10), - (CompressionBackend( - CuPyBackend(), - CUSZCompressor(r2r_error=1e-4, r2r_threshold=1e-5), - 11 ), - 1e-5) -]) + +ref_backend_name = "einsum" + + +@pytest.mark.parametrize( + "circ", + [ + get_test_qaoa_ansatz_circ(n=6, p=3), + get_test_qaoa_ansatz_circ(n=12, p=4), + ], +) +@pytest.mark.parametrize( + ["backend", "atol"], + [ + # NOTE: 04/02/24 temporary disable cupy backend, it is not working on my machine + # ('cupy', 1e-10), + ("torch", 1e-10), + # ('cupy_compressed', 1e-10), + (TorchBackendMatm(), 1e-10), + # (CompressionBackend( + # CuPyBackend(), + # CUSZCompressor(r2r_error=1e-4, r2r_threshold=1e-5), + # 11 ), + # 1e-5) + ], +) def test_backends(circ, backend, atol): ref_backend = qtensor.contraction_backends.get_backend(ref_backend_name) if isinstance(backend, str): @@ -65,3 +78,117 @@ def test_backends(circ, backend, atol): sim_ref = QtreeSimulator(backend=ref_backend) res_ref = sim_ref.simulate(circ) assert np.allclose(res, res_ref, atol=atol) + + +ref_backend_name = "einsum" + + +# -- Bucket contraction tests + +def contract_bucket( + indices_list, backend: qtensor.contraction_backends.ContractionBackend, data_dict, + slice_dict={} +): + vars_list = [[qtree.optimizer.Var(i, size=2) for i in ix] for ix in indices_list] + bucket = [ + qtree.optimizer.Tensor(f"T{i}", indices=ix, data_key=i) + for i, ix in enumerate(vars_list) + ] + print(f"bucket: {bucket}") + # Empty slice, ensure compatible datatype + slice_dict = {qtree.optimizer.Var(i, size=2): v for i, v in slice_dict.items()} + buckets = backend.get_sliced_buckets([bucket], data_dict, slice_dict=slice_dict) + print(f"sliced bucket: {buckets}") + result = backend.process_bucket(buckets[0]) + return backend.get_result_data(result) + + +def contract_bucket_einsum(indices_list, data_dict): + index_strs = ["".join([chr(97 + i) for i in ix]) for ix in indices_list] + out_indices = [] + for ix in indices_list: + out_indices.extend(ix[1:]) + expr = ",".join(index_strs) + "->" + "".join([chr(97 + i) for i in out_indices]) + print(f"expr: {expr}") + res = np.einsum(expr, *[data_dict[i] for i in range(len(indices_list))]) + return res + + +@pytest.mark.parametrize( + ["backend", "atol"], + [ + # NOTE: 04/02/24 temporary disable cupy backend, it is not working on my machine + # ('cupy', 1e-10), + (qtensor.contraction_backends.get_backend("einsum"), 1e-10), + (qtensor.contraction_backends.get_backend("torch"), 1e-10), + # ('cupy_compressed', 1e-10), + (TorchBackendMatm(), 1e-10), + # (CompressionBackend( + # CuPyBackend(), + # CUSZCompressor(r2r_error=1e-4, r2r_threshold=1e-5), + # 11 ), + # 1e-5) + ], +) +def test_backend_single_bucket_general(backend, atol): + """ + + Test a single bucket contraction with multiple tensors and a single common + index + """ + + # -- Generate a simple bucket with decreasing number of indices + n_tensors = 3 + ix_common = 1 + ix_counter = 2 + indices_list = [] + for nown in range(n_tensors, 0, -1): + tensor_indices = [ix_common] + list(range(ix_counter, ix_counter + nown)) + ix_counter += nown + indices_list.append(tensor_indices) + print(f"indices_list: {indices_list}") + # -- Generate random data for the tensors + data_dict = {i: np.random.rand(*[2] * len(ix)) for i, ix in enumerate(indices_list)} + + # Test the slicing correctness as well + slice_dict = {ix_counter - 1: 1} + res_ref = contract_bucket_einsum(indices_list, data_dict) + res_ref = res_ref[..., 1] + + res = contract_bucket(indices_list, backend, data_dict, slice_dict=slice_dict) + assert np.allclose(res, res_ref, atol=atol) + +@pytest.mark.parametrize( + ["backend", "atol"], + [ + # NOTE: 04/02/24 temporary disable cupy backend, it is not working on my machine + # ('cupy', 1e-10), + (qtensor.contraction_backends.get_backend("einsum"), 1e-10), + (qtensor.contraction_backends.get_backend("torch"), 1e-10), + # ('cupy_compressed', 1e-10), + (TorchBackendMatm(), 1e-10), + # (CompressionBackend( + # CuPyBackend(), + # CUSZCompressor(r2r_error=1e-4, r2r_threshold=1e-5), + # 11 ), + # 1e-5) + ], +) +def test_backend_single_bucket_one_index(backend, atol): + """ + + Test a single bucket with several tensors sharing a single index + """ + + # Simple bucket with decreasing number of indices + n_tensors = 3 + ix = 1 + indices_list = [[ix] for _ in range(n_tensors)] + print(f"indices_list: {indices_list}") + data_dict = {i: np.random.rand(*[2] * len(ix)) for i, ix in enumerate(indices_list)} + + # Test the slicing correctness as well + res_ref = contract_bucket_einsum(indices_list, data_dict) + + res = contract_bucket(indices_list, backend, data_dict) + assert np.allclose(res, res_ref, atol=atol) From ab7bf85f5ad529b5af57868626caaa73cc977202 Mon Sep 17 00:00:00 2001 From: Danylo Lykov Date: Thu, 4 Apr 2024 22:04:38 -0500 Subject: [PATCH 09/14] update qtree commit --- qtree | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qtree b/qtree index 16efbba2..24d905e6 160000 --- a/qtree +++ b/qtree @@ -1 +1 @@ -Subproject commit 16efbba2566e65a37bb7927f06a80c9f88ac57ff +Subproject commit 24d905e60351e2758f3cdafe3497fb0947f1ea2c From 334148c1f1bf3ca49e0b5df7f82b3fbbd0679e74 Mon Sep 17 00:00:00 2001 From: Danylo Lykov Date: Fri, 5 Apr 2024 00:32:31 -0500 Subject: [PATCH 10/14] torch matm backend to support sizes --- qtensor/contraction_backends/torch.py | 76 ++++++++++++++++++--------- 1 file changed, 52 insertions(+), 24 deletions(-) diff --git a/qtensor/contraction_backends/torch.py b/qtensor/contraction_backends/torch.py index c90c3165..0074c1e9 100644 --- a/qtensor/contraction_backends/torch.py +++ b/qtensor/contraction_backends/torch.py @@ -228,18 +228,20 @@ def get_result_data(self, result): class TorchBackendMatm(TorchBackend): - def _get_index_sizes(self, *ixs): + def _get_index_sizes(self, *ixs, size_dict = None): + if size_dict is not None: + return [size_dict[i] for i in ixs] try: sizes = [ i.size for i in ixs ] except AttributeError: sizes = [2] * len(ixs) return sizes - def _get_index_space_size(self, *ixs): - sizes = self._get_index_sizes(*ixs) + def _get_index_space_size(self, *ixs, size_dict = None): + sizes = self._get_index_sizes(*ixs, size_dict = size_dict) return reduce(np.multiply, sizes, 1) - def pairwise_sum_contract(self, ixa, a, ixb, b, ixout): + def pairwise_sum_contract(self, ixa, a, ixb, b, ixout, size_dict = None): out = ixout common = set(ixa).intersection(set(ixb)) # -- sum indices that are in one tensor only @@ -267,16 +269,18 @@ def pairwise_sum_contract(self, ixa, a, ixb, b, ixout): kix = common - set(out) fix = common - kix common = list(kix) + list(fix) - #print(f'{ixa=} {ixb=} {ixout=}; {common=} {mix=} {nix=}') - a = tensors[0].permute(*[ - list(ixs[0]).index(x) for x in common + list(mix) - ]) - - b = tensors[1].permute(*[ - list(ixs[1]).index(x) for x in common + list(nix) - ]) - - k, f, m, n = [self._get_index_space_size(*ix) + #print(f'{ixa=} {ixb=} {ixout=}; {common=} {mix=} {nix=}, {size_dict=}') + if tensors[0].numel() > 1: + a = tensors[0].permute(*[ + list(ixs[0]).index(x) for x in common + list(mix) + ]) + + if tensors[1].numel() > 1: + b = tensors[1].permute(*[ + list(ixs[1]).index(x) for x in common + list(nix) + ]) + + k, f, m, n = [self._get_index_space_size(*ix, size_dict=size_dict) for ix in (kix, fix, mix, nix) ] a = a.reshape(k, f, m) @@ -284,12 +288,15 @@ def pairwise_sum_contract(self, ixa, a, ixb, b, ixout): c = torch.einsum('kfm, kfn -> fmn', a, b) if len(out): #print('out ix', out, 'kfmnix', kix, fix, mix, nix) - c = c.reshape(*self._get_index_sizes(*out)) + c = c.reshape(*self._get_index_sizes(*out, size_dict=size_dict)) #print('outix', out, 'res', c.shape, 'kfmn',kix, fix, mix, nix) current_ord_ = list(fix) + list(mix) + list(nix) if len(out): c = c.permute(*[current_ord_.index(i) for i in out]) + else: + c = c.flatten() + #print(f'c shape {c.shape}') return c def process_bucket(self, bucket, no_sum=False): @@ -303,17 +310,24 @@ def process_bucket(self, bucket, no_sum=False): ixr = list(map(int, result_indices)) ixt = list(map(int, tensor.indices)) - result_indices = tuple(sorted( + out_indices = tuple(sorted( set(result_indices + tensor.indices), key=int, reverse=True ) ) - ixout = list(map(int, result_indices)) - - logger.trace('Before contract. expr: {}, {} ->', ixr, ixt, ixout) - result_data_new = self.pairwise_sum_contract(ixr, result_data, ixt, tensor.data, ixout) + ixout = list(map(int, out_indices)) + + logger.trace('Before contract. expr: {}, {} -> {}', ixr, ixt, ixout) + size_dict = {} + for i in result_indices: + size_dict[int(i)] = i.size + for i in tensor.indices: + size_dict[int(i)] = i.size + logger.debug("result_indices: {}", result_indices) + result_data_new = self.pairwise_sum_contract(ixr, result_data, ixt, tensor.data, ixout, size_dict = size_dict) + result_indices = out_indices #result_data = torch.einsum(expr, result_data, tensor.data) - logger.trace("Data: {}, -> {}", result_data, tensor.data, result_data_new) + logger.trace("Data: {}, {} -> {}", result_data.shape, tensor.data.shape, result_data_new.shape) result_data = result_data_new # Merge and sort indices and shapes @@ -334,10 +348,24 @@ def process_bucket(self, bucket, no_sum=False): ))[:-1] ixout = list(map(int, result_indices)) - logger.trace('Before contract. expr: {}, {} ->', ixr, ixt, ixout) - result_data_new = self.pairwise_sum_contract(ixr, result_data, ixt, tensor.data, ixout) + logger.trace('Before contract. expr: {}, {} -> {}', ixr, ixt, ixout) + size_dict = {} + for i in result_indices: + size_dict[int(i)] = i.size + for i in tensor.indices: + size_dict[int(i)] = i.size + #logger.debug("result_indices: {}", result_indices) + result_data_new = self.pairwise_sum_contract(ixr, result_data, ixt, tensor.data, ixout, size_dict = size_dict) #result_data = torch.einsum(expr, result_data, tensor.data) - logger.trace("Data: {}, -> {}", result_data, tensor.data, result_data_new) + logger.trace("Data: {}, {} -> {}", result_data.mean(), tensor.data.mean(), result_data_new.mean()) + #if result_data_new.mean() == 0: + # logger.warning("Result is zero") + # logger.debug("result_indices: {}", result_indices) + # logger.debug("result_data: {}", result_data) + # logger.debug("tensor: {}", tensor) + # logger.debug("tensor_data: {}", tensor.data) + # logger.debug("result_data_new: {}", result_data_new) + # raise ValueError("Result is zero") result_data = result_data_new else: result_data = result_data.sum(axis=-1) From 9fb06b9b0213c52b25bea21e40c3b401905343c9 Mon Sep 17 00:00:00 2001 From: Danylo Lykov Date: Mon, 8 Apr 2024 00:12:59 -0400 Subject: [PATCH 11/14] torch matm backend with flattened reshaping --- .../contraction_backends/tests/test_torch.py | 19 +- qtensor/contraction_backends/torch.py | 347 ++++++++++++------ 2 files changed, 252 insertions(+), 114 deletions(-) diff --git a/qtensor/contraction_backends/tests/test_torch.py b/qtensor/contraction_backends/tests/test_torch.py index 32a4757d..36a1eacc 100644 --- a/qtensor/contraction_backends/tests/test_torch.py +++ b/qtensor/contraction_backends/tests/test_torch.py @@ -1,7 +1,8 @@ import qtensor import pytest import numpy as np -from qtensor.contraction_backends import TorchBackend, NumpyBackend +from qtensor.contraction_backends import NumpyBackend +from qtensor.contraction_backends.torch import TorchBackend, TorchBackendMatm, permute_flattened from qtensor import QtreeSimulator from qtensor.tests import get_test_qaoa_ansatz_circ torch = pytest.importorskip('torch') @@ -61,6 +62,22 @@ def contract_tn(backend, search_len=1, test_problem_kwargs={}): assert restr.shape == resnp.shape assert np.allclose(restr, resnp) +# -- Testing low-level functions for torch matm backend + +def test_torch_matm_permute(): + K = 5 + d = 2 + shape = [5] + [d]*(K-1) + x = torch.randn(shape) + for i in range(20): + perm = list(np.random.permutation(K)) + y = permute_flattened(x.flatten(), perm, shape) + assert y.ndim == 1 + assert y.numel() == x.numel() + print('perm', perm) + assert torch.allclose(y, x.permute(perm).flatten()) + +# -- Testing get_sliced_buckets def test_torch_get_sliced__slice(): backend = TorchBackend() diff --git a/qtensor/contraction_backends/torch.py b/qtensor/contraction_backends/torch.py index 0074c1e9..0f314834 100644 --- a/qtensor/contraction_backends/torch.py +++ b/qtensor/contraction_backends/torch.py @@ -7,10 +7,12 @@ from .common import get_slice_bounds, get_einsum_expr, slice_numpy_tensor import string from loguru import logger + CHARS = string.ascii_lowercase + string.ascii_uppercase + def qtree2torch_tensor(tensor, data_dict): - """ Converts qtree tensor to pytorch tensor using data dict""" + """Converts qtree tensor to pytorch tensor using data dict""" if isinstance(tensor.data, torch.Tensor): return tensor if tensor.data is not None: @@ -21,25 +23,25 @@ def qtree2torch_tensor(tensor, data_dict): data_dict[tensor.data_key] = torch_t return tensor.copy(data=torch_t) + def get_einsum_expr_bucket(bucket, all_indices_list, result_indices): - # converting elements to int will make stuff faster, + # converting elements to int will make stuff faster, # but will drop support for char indices # all_indices_list = [int(x) for x in all_indices] # to_small_int = lambda x: all_indices_list.index(int(x)) to_small_int = lambda x: all_indices_list.index(x) - expr = ','.join( - ''.join(CHARS[to_small_int(i)] for i in t.indices) - for t in bucket) +\ - '->'+''.join(CHARS[to_small_int(i)] for i in result_indices) + expr = ( + ",".join("".join(CHARS[to_small_int(i)] for i in t.indices) for t in bucket) + + "->" + + "".join(CHARS[to_small_int(i)] for i in result_indices) + ) return expr - - -def permute_torch_tensor_data(data:np.ndarray, indices_in, indices_out): +def permute_torch_tensor_data(data: np.ndarray, indices_in, indices_out): """ Permute the data of a numpy tensor to the given indices_out. - + Returns: permuted data """ @@ -49,7 +51,8 @@ def permute_torch_tensor_data(data:np.ndarray, indices_in, indices_out): # permute tensor return torch.permute(data, perm) -def slice_torch_tensor(data:np.ndarray, indices_in, indices_out, slice_dict): + +def slice_torch_tensor(data: np.ndarray, indices_in, indices_out, slice_dict): """ Args: data : np.ndarray @@ -65,9 +68,11 @@ def slice_torch_tensor(data:np.ndarray, indices_in, indices_out, slice_dict): indices_sliced = [ i for sl, i in zip(slice_bounds, indices_in) if not isinstance(sl, int) ] - #print(f'{indices_in=}, {indices_sliced=} {slice_dict=}, {slice_bounds=}, slicedix {indices_sliced}, sshape {s_data.shape}') + # print(f'{indices_in=}, {indices_sliced=} {slice_dict=}, {slice_bounds=}, slicedix {indices_sliced}, sshape {s_data.shape}') indices_sized = [v.copy(size=size) for v, size in zip(indices_sliced, s_data.shape)] - indices_out = [v for v in indices_out if not isinstance(slice_dict.get(v, None), int)] + indices_out = [ + v for v in indices_out if not isinstance(slice_dict.get(v, None), int) + ] assert len(indices_sized) == len(s_data.shape) assert len(indices_sliced) == len(s_data.shape) st_data = permute_torch_tensor_data(s_data, indices_sliced, indices_out) @@ -75,49 +80,48 @@ def slice_torch_tensor(data:np.ndarray, indices_in, indices_out, slice_dict): class TorchBackend(ContractionBackend): - - def __init__(self, device='cpu'): + def __init__(self, device="cpu"): # alias of gpu -> cuda - if device=='gpu': - device='cuda' + if device == "gpu": + device = "cuda" # Check that CUDA is available if specified - if device=='cuda': + if device == "cuda": if not torch.cuda.is_available(): logger.warning("Cuda is not available. Falling back to CPU") - device = 'cpu' - if device=='xpu': + device = "cpu" + if device == "xpu": import intel_extension_for_pytorch as ipex - self.device = torch.device(device) logger.debug("Torch backend using device {}", self.device) - self.dtype = ['float', 'double', 'complex64', 'complex128'] + self.dtype = ["float", "double", "complex64", "complex128"] self.width_dict = [set() for i in range(30)] - self.width_bc = [[0,0] for i in range(30)] #(#distinct_bc, #bc) + self.width_bc = [[0, 0] for i in range(30)] # (#distinct_bc, #bc) def process_bucket(self, bucket, no_sum=False): - bucket.sort(key = lambda x: len(x.indices)) + bucket.sort(key=lambda x: len(x.indices)) result_indices = bucket[0].indices result_data = bucket[0].data width = len(set(bucket[0].indices)) for tensor in bucket[1:-1]: - expr = get_einsum_expr( list(map(int, result_indices)), list(map(int, tensor.indices)) ) - logger.trace('Before contract. Expr: {}, inputs: {}, {}', expr, result_data, tensor) + logger.trace( + "Before contract. Expr: {}, inputs: {}, {}", expr, result_data, tensor + ) result_data = torch.einsum(expr, result_data, tensor.data) - logger.trace("expression {}. Data: {}, -> {}", expr, tensor.data, result_data) + logger.trace( + "expression {}. Data: {}, -> {}", expr, tensor.data, result_data + ) # Merge and sort indices and shapes - result_indices = tuple(sorted( - set(result_indices + tensor.indices), - key=int, reverse=True - ) + result_indices = tuple( + sorted(set(result_indices + tensor.indices), key=int, reverse=True) ) - + size = len(set(tensor.indices)) if size > width: width = size @@ -126,38 +130,40 @@ def process_bucket(self, bucket, no_sum=False): self.width_bc[width][0] = len(self.width_dict[width]) self.width_bc[width][1] += 1 - if len(bucket)>1: + if len(bucket) > 1: tensor = bucket[-1] expr = get_einsum_expr( - list(map(int, result_indices)), list(map(int, tensor.indices)) - , contract = 0 if no_sum else 1 + list(map(int, result_indices)), + list(map(int, tensor.indices)), + contract=0 if no_sum else 1, + ) + logger.trace( + "Before contract. Expr: {}, inputs: {}, {}", expr, result_data, tensor ) - logger.trace('Before contract. Expr: {}, inputs: {}, {}', expr, result_data, tensor) result_data = torch.einsum(expr, result_data, tensor.data) - logger.trace("expression {}. Data: {}, -> {}", expr, tensor.data, result_data) - result_indices = tuple(sorted( - set(result_indices + tensor.indices), - key=int, reverse=True - )) + logger.trace( + "expression {}. Data: {}, -> {}", expr, tensor.data, result_data + ) + result_indices = tuple( + sorted(set(result_indices + tensor.indices), key=int, reverse=True) + ) else: if not no_sum: result_data = result_data.sum(axis=-1) else: result_data = result_data - if len(result_indices) > 0: first_index = result_indices[-1] if not no_sum: result_indices = result_indices[:-1] tag = first_index.identity else: - tag = 'f' + tag = "f" result_indices = [] # reduce - result = qtree.optimizer.Tensor(f'E{tag}', result_indices, - data=result_data) + result = qtree.optimizer.Tensor(f"E{tag}", result_indices, data=result_data) return result def process_bucket_merged(self, ixs, bucket, no_sum=False): @@ -177,11 +183,11 @@ def process_bucket_merged(self, ixs, bucket, no_sum=False): tensors.append(tensor.data) if tensor.data.dtype == torch.complex128: is128 = True - + if is128: for i in range(len(tensors)): tensors[i] = tensors[i].type(torch.complex128) - + expr = get_einsum_expr_bucket(bucket, all_indices_list, result_indices) expect = len(result_indices) result_data = torch.einsum(expr, *tensors) @@ -190,11 +196,10 @@ def process_bucket_merged(self, ixs, bucket, no_sum=False): first_index, *_ = result_indices tag = str(first_index) else: - tag = 'f' + tag = "f" + + result = qtree.optimizer.Tensor(f"E{tag}", result_indices, data=result_data) - result = qtree.optimizer.Tensor(f'E{tag}', result_indices, - data=result_data) - return result def get_sliced_buckets(self, buckets, data_dict, slice_dict): @@ -210,15 +215,16 @@ def get_sliced_buckets(self, buckets, data_dict, slice_dict): else: data = tensor.data # Works for torch tensors just fine - if not isinstance(data, torch.Tensor): + if not isinstance(data, torch.Tensor): data = torch.from_numpy(data.astype(np.complex128)).to(self.device) else: data = data.type(torch.complex128) # slice data - data, new_indices = slice_torch_tensor(data, tensor.indices, out_indices, slice_dict) + data, new_indices = slice_torch_tensor( + data, tensor.indices, out_indices, slice_dict + ) - sliced_bucket.append( - tensor.copy(indices=new_indices, data=data)) + sliced_bucket.append(tensor.copy(indices=new_indices, data=data)) sliced_buckets.append(sliced_bucket) return sliced_buckets @@ -226,36 +232,113 @@ def get_sliced_buckets(self, buckets, data_dict, slice_dict): def get_result_data(self, result): return torch.permute(result.data, tuple(reversed(range(result.data.ndim)))) -class TorchBackendMatm(TorchBackend): - def _get_index_sizes(self, *ixs, size_dict = None): +def _swap_flattened(data, a: int, b: int, sprod, different_dims=False): + """ + Swap two dimensions in a flattened tensor. + + Args: + data: flattened tensor + a, b: dimensions to swap + sprod (iterable of ints): ith element is the product of dimensions 0 to i Last element should be 1 + """ + if a == b: + return data + ndim = len(sprod) - 1 + assert ndim >= max(a, b) + a, b = min(a, b), max(a, b) + d5 = data.reshape( + ( + sprod[a - 1], + sprod[a] // sprod[a - 1], + sprod[b - 1] // sprod[a], + sprod[b] // sprod[b - 1], + sprod[ndim - 1] // sprod[b], + ) + ) + # -- modify sprod accordingly + if different_dims: + adim = sprod[a] // sprod[a - 1] + bdim = sprod[b] // sprod[b - 1] + for i in range(a, b): + sprod[i] *= bdim + sprod[i] //= adim + return d5.transpose(1, 3).flatten() + + +def permute_flattened(data, perm, shape): + """ + Permute the data of a many-dimensional tensor stored as a flattened array. + This is a workaround for the limitation of 12 dimensions in intel extension + for pytorch. + + While permuting, tensor is reshaped to maximum of 5 dimensions: + + for each dimension swap a-b: + 1. Reshape to 5-dimensional tensor ... a ... b ... + 2. Swap a and b. + 3. Flatten to 1-dimensional tensor. + + Args: + data: flattened data + perm (iterable of ints): permutation, as in torch.permute + shape (iterable of ints): shape of the original tensor + + Returns: + permuted data, equivalent to torch.permute(data.reshape(shape), perm).flatten() + """ + sprod = [] + k = 1 + different_dims = False + for i in shape: + if i != shape[0]: + different_dims = True + k *= i + sprod.append(k) + sprod.append(1) + # print(f'different_dims {different_dims}') + # Is there a way to use only one dict? + d2l = {i: i for i in range(len(shape))} + + l2d = {i: i for i in range(len(shape))} + for t, s in enumerate(perm): + s = d2l[s] + data = _swap_flattened(data, s, t, sprod, different_dims) + l2d[s], l2d[t] = l2d[t], l2d[s] + d2l[l2d[s]], d2l[l2d[t]] = s, t + # print(f'{s=}, {t=}, {d2l=}, {l2d=}') + return data + + +class TorchBackendMatm(TorchBackend): + def _get_index_sizes(self, *ixs, size_dict=None): if size_dict is not None: return [size_dict[i] for i in ixs] try: - sizes = [ i.size for i in ixs ] + sizes = [i.size for i in ixs] except AttributeError: sizes = [2] * len(ixs) return sizes - def _get_index_space_size(self, *ixs, size_dict = None): - sizes = self._get_index_sizes(*ixs, size_dict = size_dict) + def _get_index_space_size(self, *ixs, size_dict=None): + sizes = self._get_index_sizes(*ixs, size_dict=size_dict) return reduce(np.multiply, sizes, 1) - def pairwise_sum_contract(self, ixa, a, ixb, b, ixout, size_dict = None): + def pairwise_sum_contract(self, ixa, a, ixb, b, ixout, size_dict=None): out = ixout common = set(ixa).intersection(set(ixb)) # -- sum indices that are in one tensor only - all_ix = set(ixa+ixb) + all_ix = set(ixa + ixb) sum_ix = all_ix - set(out) a_sum = sum_ix.intersection(set(ixa) - common) b_sum = sum_ix.intersection(set(ixb) - common) - #print('ab', ixa, ixb) - #print('all sum', sum_ix, 'a/b_sum', a_sum, b_sum) + # print('ab', ixa, ixb) + # print('all sum', sum_ix, 'a/b_sum', a_sum, b_sum) if len(a_sum): - a = a.sum(axis=tuple(ixa.index(x) for x in a_sum)) + #a = a.sum(axis=tuple(ixa.index(x) for x in a_sum)) ixa = [x for x in ixa if x not in a_sum] if len(b_sum): - b = b.sum(axis=tuple(ixb.index(x) for x in b_sum)) + #b = b.sum(axis=tuple(ixb.index(x) for x in b_sum)) ixb = [x for x in ixb if x not in b_sum] tensors = a, b # -- @@ -269,96 +352,120 @@ def pairwise_sum_contract(self, ixa, a, ixb, b, ixout, size_dict = None): kix = common - set(out) fix = common - kix common = list(kix) + list(fix) - #print(f'{ixa=} {ixb=} {ixout=}; {common=} {mix=} {nix=}, {size_dict=}') + # print(f'{ixa=} {ixb=} {ixout=}; {common=} {mix=} {nix=}, {size_dict=}') if tensors[0].numel() > 1: - a = tensors[0].permute(*[ - list(ixs[0]).index(x) for x in common + list(mix) - ]) + # a = tensors[0].permute(*[ + # list(ixs[0]).index(x) for x in common + list(mix) + # ]) + a = permute_flattened( + tensors[0], + [list(ixs[0]).index(x) for x in common + list(mix)], + self._get_index_sizes(*ixa, size_dict=size_dict), + ) if tensors[1].numel() > 1: - b = tensors[1].permute(*[ - list(ixs[1]).index(x) for x in common + list(nix) - ]) + # b = tensors[1].permute(*[ + # list(ixs[1]).index(x) for x in common + list(nix) + # ]) + b = permute_flattened( + tensors[1], + [list(ixs[1]).index(x) for x in common + list(nix)], + self._get_index_sizes(*ixb, size_dict=size_dict), + ) - k, f, m, n = [self._get_index_space_size(*ix, size_dict=size_dict) - for ix in (kix, fix, mix, nix) - ] + k, f, m, n = [ + self._get_index_space_size(*ix, size_dict=size_dict) + for ix in (kix, fix, mix, nix) + ] a = a.reshape(k, f, m) b = b.reshape(k, f, n) - c = torch.einsum('kfm, kfn -> fmn', a, b) - if len(out): - #print('out ix', out, 'kfmnix', kix, fix, mix, nix) - c = c.reshape(*self._get_index_sizes(*out, size_dict=size_dict)) - #print('outix', out, 'res', c.shape, 'kfmn',kix, fix, mix, nix) + c = torch.einsum("kfm, kfn -> fmn", a, b) + #if len(out): + # print('out ix', out, 'kfmnix', kix, fix, mix, nix) + #c = c.reshape(*self._get_index_sizes(*out, size_dict=size_dict)) + # print('outix', out, 'res', c.shape, 'kfmn',kix, fix, mix, nix) current_ord_ = list(fix) + list(mix) + list(nix) + c = c.flatten() if len(out): - c = c.permute(*[current_ord_.index(i) for i in out]) - else: - c = c.flatten() - #print(f'c shape {c.shape}') + #c = c.permute(*[current_ord_.index(i) for i in out]) + c = permute_flattened( + c, + [current_ord_.index(i) for i in out], + self._get_index_sizes(*out, size_dict=size_dict), + ) + # print(f'c shape {c.shape}') return c def process_bucket(self, bucket, no_sum=False): - bucket.sort(key = lambda x: len(x.indices)) + bucket.sort(key=lambda x: len(x.indices)) result_indices = bucket[0].indices result_data = bucket[0].data width = len(set(bucket[0].indices)) - + print("bucket", bucket) for tensor in bucket[1:-1]: - ixr = list(map(int, result_indices)) ixt = list(map(int, tensor.indices)) - out_indices = tuple(sorted( - set(result_indices + tensor.indices), - key=int, reverse=True - ) + out_indices = tuple( + sorted(set(result_indices + tensor.indices), key=int, reverse=True) ) ixout = list(map(int, out_indices)) - logger.trace('Before contract. expr: {}, {} -> {}', ixr, ixt, ixout) + logger.trace("Before contract. expr: {}, {} -> {}", ixr, ixt, ixout) size_dict = {} for i in result_indices: size_dict[int(i)] = i.size for i in tensor.indices: size_dict[int(i)] = i.size logger.debug("result_indices: {}", result_indices) - result_data_new = self.pairwise_sum_contract(ixr, result_data, ixt, tensor.data, ixout, size_dict = size_dict) + result_data_new = self.pairwise_sum_contract( + ixr, result_data, ixt, tensor.data, ixout, size_dict=size_dict + ) result_indices = out_indices - #result_data = torch.einsum(expr, result_data, tensor.data) - logger.trace("Data: {}, {} -> {}", result_data.shape, tensor.data.shape, result_data_new.shape) + # result_data = torch.einsum(expr, result_data, tensor.data) + logger.trace( + "Data: {}, {} -> {}", + result_data.shape, + tensor.data.shape, + result_data_new.shape, + ) result_data = result_data_new # Merge and sort indices and shapes - + size = len(set(tensor.indices)) if size > width: width = size - - if len(bucket)>1: + if len(bucket) > 1: tensor = bucket[-1] ixr = list(map(int, result_indices)) ixt = list(map(int, tensor.indices)) - result_indices = tuple(sorted( - set(result_indices + tensor.indices), - key=int, reverse=True - ))[:-1] + result_indices = tuple( + sorted(set(result_indices + tensor.indices), key=int, reverse=True) + )[:-1] ixout = list(map(int, result_indices)) - logger.trace('Before contract. expr: {}, {} -> {}', ixr, ixt, ixout) + logger.trace("Before contract. expr: {}, {} -> {}", ixr, ixt, ixout) size_dict = {} for i in result_indices: size_dict[int(i)] = i.size for i in tensor.indices: size_dict[int(i)] = i.size - #logger.debug("result_indices: {}", result_indices) - result_data_new = self.pairwise_sum_contract(ixr, result_data, ixt, tensor.data, ixout, size_dict = size_dict) - #result_data = torch.einsum(expr, result_data, tensor.data) - logger.trace("Data: {}, {} -> {}", result_data.mean(), tensor.data.mean(), result_data_new.mean()) - #if result_data_new.mean() == 0: + # logger.debug("result_indices: {}", result_indices) + result_data_new = self.pairwise_sum_contract( + ixr, result_data, ixt, tensor.data, ixout, size_dict=size_dict + ) + # result_data = torch.einsum(expr, result_data, tensor.data) + logger.trace( + "Data: {}, {} -> {}", + result_data.mean(), + tensor.data.mean(), + result_data_new.mean(), + ) + # if result_data_new.mean() == 0: # logger.warning("Result is zero") # logger.debug("result_indices: {}", result_indices) # logger.debug("result_data: {}", result_data) @@ -368,19 +475,33 @@ def process_bucket(self, bucket, no_sum=False): # raise ValueError("Result is zero") result_data = result_data_new else: - result_data = result_data.sum(axis=-1) + # Sum the last index + print("result_data", result_data.shape) + #shape = self._get_index_sizes(*result_indices) + if result_data.numel() > 2: + result_data = result_data.reshape(-1, 2).sum(axis=-1) + else: + result_data = result_data.reshape(2, 1).sum(axis=-1) + #result_data = result_data.sum(axis=-1) result_indices = result_indices[:-1] if len(result_indices) > 0: first_index = result_indices[-1] tag = first_index.identity else: - tag = 'f' + tag = "f" result_indices = [] # reduce - result = qtree.optimizer.Tensor(f'E{tag}', result_indices, - data=result_data) + result = qtree.optimizer.Tensor(f"E{tag}", result_indices, data=result_data) + print("result", result) + print("result_data", result_data.shape) return result + def get_result_data(self, result): + if len(result.indices): + d = result.data.reshape(self._get_index_sizes(*result.indices)) + else: + d = result.data + return torch.permute(d, tuple(reversed(range(d.ndim)))) From 3d9ba6e440e9808c146d65a7bc5ae1aa229e8959 Mon Sep 17 00:00:00 2001 From: Danylo Lykov Date: Thu, 11 Apr 2024 02:41:04 -0500 Subject: [PATCH 12/14] torch backend fix bug in data reshape --- qtensor/contraction_backends/torch.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/qtensor/contraction_backends/torch.py b/qtensor/contraction_backends/torch.py index 0f314834..01a413ee 100644 --- a/qtensor/contraction_backends/torch.py +++ b/qtensor/contraction_backends/torch.py @@ -46,8 +46,8 @@ def permute_torch_tensor_data(data: np.ndarray, indices_in, indices_out): permuted data """ # permute indices - out_locs = {idx: i for i, idx in enumerate(indices_out)} - perm = [out_locs[i] for i in indices_in] + in_locs = {idx: i for i, idx in enumerate(indices_in)} + perm = [in_locs[i] for i in indices_out] # permute tensor return torch.permute(data, perm) @@ -138,7 +138,7 @@ def process_bucket(self, bucket, no_sum=False): contract=0 if no_sum else 1, ) logger.trace( - "Before contract. Expr: {}, inputs: {}, {}", expr, result_data, tensor + "Before contract. Expr: {}, inputs: {}, {}", expr, result_data, tensor.data ) result_data = torch.einsum(expr, result_data, tensor.data) logger.trace( From 447a13da1dc02a140074af2c277d1118061391b0 Mon Sep 17 00:00:00 2001 From: Danylo Lykov Date: Fri, 12 Apr 2024 01:25:40 -0500 Subject: [PATCH 13/14] torch matm backend fixes and tests --- qtensor/contraction_backends/__init__.py | 3 +- .../contraction_backends/tests/test_common.py | 80 ++++++++++++++++- .../contraction_backends/tests/test_cupy.py | 19 ---- qtensor/contraction_backends/torch.py | 89 ++++++++++++++----- qtensor/optimisation/Optimizer.py | 29 ++++-- qtensor/tests/test_bucket_backends.py | 37 +++++++- 6 files changed, 207 insertions(+), 50 deletions(-) diff --git a/qtensor/contraction_backends/__init__.py b/qtensor/contraction_backends/__init__.py index adaa68d1..cb380d6d 100644 --- a/qtensor/contraction_backends/__init__.py +++ b/qtensor/contraction_backends/__init__.py @@ -2,7 +2,7 @@ from .base_class import ContractionBackend from .common import slice_numpy_tensor from .numpy import NumpyBackend -from .torch import TorchBackend +from .torch import TorchBackend, TorchBackendMatm from .cupy import CuPyBackend from .mkl import CMKLExtendedBackend from .cupy import CuPyBackend @@ -22,6 +22,7 @@ def get_backend(name): 'opt_einsum': OptEinusmBackend, 'torch_cpu': TorchBackend, 'torch_gpu': TorchBackend, + 'torch_matm': TorchBackendMatm, 'torch': TorchBackend, 'cupy': CuPyBackend, 'cutensor': CuTensorBackend, diff --git a/qtensor/contraction_backends/tests/test_common.py b/qtensor/contraction_backends/tests/test_common.py index c5905932..b30b4d92 100644 --- a/qtensor/contraction_backends/tests/test_common.py +++ b/qtensor/contraction_backends/tests/test_common.py @@ -1,6 +1,84 @@ +import qtensor +import pytest from qtensor.contraction_backends.common import slice_numpy_tensor +from qtensor.contraction_backends import ( + CuPyBackend, NumpyBackend, TorchBackend, + TorchBackendMatm +) +from qtensor.tests import get_test_qaoa_ansatz_circ +from qtensor import QtreeSimulator +from qtensor.contraction_algos import bucket_elimination import numpy as np -from qtree.optimizer import Var +from qtree.optimizer import Var, Tensor + +# -- Contraction + +def circ2tn(circ): + return qtensor.optimisation.TensorNet.QtreeTensorNet.from_qtree_gates(circ) + +TEST_BACKENDS = [ + #CuPyBackend, + NumpyBackend, + TorchBackend, + TorchBackendMatm, +] + +REFERENCE_BACKEND = NumpyBackend() + +# ---- Custom TN + +@pytest.mark.parametrize("backend_cls", TEST_BACKENDS) +def test_custom_tn_contract(backend_cls): + buckets = [ + [Tensor('a', indices=(Var(0, size=2), Var(1, size=3), Var(2, size=4)), data_key='a')], + [], + [Tensor('b', indices=(Var(2, size=4), Var(3, size=3)), data_key='b')], + [Tensor('c', indices=(Var(3, size=3), Var(4, size=2)), data_key='c')], + [], + ] + buckets = [ + [ + Tensor('a', indices=(Var(0, size=2), Var(1, size=3), Var(2, size=4)), data_key='a'), + Tensor('c', indices=(Var(3, size=3), Var(0, size=2)), data_key='c') + ], + [], + [Tensor('b', indices=(Var(2, size=4), Var(3, size=3)), data_key='b')], + [], + ] + data_dict = { + 'a': np.random.rand(2, 3, 4), + 'b': np.random.rand(4, 3), + 'c': np.random.rand(3, 2), + } + slice_dict = { + Var(1, size=3): 1 + } + bref = REFERENCE_BACKEND + def contract_buckets(buckets, slice_dict, data_dict, backend): + sliced_buckets = backend.get_sliced_buckets(buckets, data_dict, slice_dict) + print('sliced_buckets', sliced_buckets) + res = bucket_elimination(sliced_buckets, backend, n_var_nosum=0) + return backend.get_result_data(res) + ref = contract_buckets([]+buckets, slice_dict, data_dict, bref) + b = backend_cls() + ref = np.einsum('ij,jl,li->', + data_dict['a'][:, 1,:], + data_dict['b'], data_dict['c']) + res = contract_buckets([]+buckets, slice_dict, data_dict, b) + assert np.allclose(ref, res) + +# ---- QAOA ansatz + +@pytest.mark.parametrize("backend_cls", TEST_BACKENDS) +def test_qaoa_ansatz_contract(backend_cls): + circ = get_test_qaoa_ansatz_circ(p=3) + sim_ref = QtreeSimulator(backend=REFERENCE_BACKEND) + sim = QtreeSimulator(backend=backend_cls()) + ref = sim_ref.simulate(circ) + res = sim.simulate(circ) + assert np.allclose(ref, res) + +# -- Slicing def test_slice_numpy_tensor(): shape = (2, 3, 4, 5) diff --git a/qtensor/contraction_backends/tests/test_cupy.py b/qtensor/contraction_backends/tests/test_cupy.py index a559ae94..8f6c6a72 100644 --- a/qtensor/contraction_backends/tests/test_cupy.py +++ b/qtensor/contraction_backends/tests/test_cupy.py @@ -2,7 +2,6 @@ import numpy as np from qtensor.contraction_backends import CuPyBackend, NumpyBackend -from qtensor import QtreeSimulator import pytest cp = pytest.importorskip('cupy') @@ -31,24 +30,6 @@ def get_test_qaoa_tn(n=10, p=2, d=3, type='random'): return tn -def test_cupy_numpy(): - circ = get_test_qaoa_circ(p=3) - bcp = CuPyBackend() - bnp = NumpyBackend() - simcp = QtreeSimulator(backend=bcp) - simnp = QtreeSimulator(backend=bnp) - rescp = simcp.simulate(circ) - resnp = simnp.simulate(circ) - assert np.allclose(rescp, resnp) - -def test_cupy_vanilla(): - circ = get_test_qaoa_circ(p=3) - bcp = CuPyBackend() - simcp = QtreeSimulator(backend=bcp) - sim = QtreeSimulator() - rescp = simcp.simulate(circ) - res = sim.simulate(circ) - assert np.allclose(rescp, res) def test_cupy_process_bucket(): diff --git a/qtensor/contraction_backends/torch.py b/qtensor/contraction_backends/torch.py index 01a413ee..07ddb5de 100644 --- a/qtensor/contraction_backends/torch.py +++ b/qtensor/contraction_backends/torch.py @@ -68,15 +68,15 @@ def slice_torch_tensor(data: np.ndarray, indices_in, indices_out, slice_dict): indices_sliced = [ i for sl, i in zip(slice_bounds, indices_in) if not isinstance(sl, int) ] - # print(f'{indices_in=}, {indices_sliced=} {slice_dict=}, {slice_bounds=}, slicedix {indices_sliced}, sshape {s_data.shape}') - indices_sized = [v.copy(size=size) for v, size in zip(indices_sliced, s_data.shape)] + #print(f'{indices_in=}, {indices_sliced=} {slice_dict=}, {slice_bounds=}, slicedix {indices_sliced}, sshape {s_data.shape}') indices_out = [ v for v in indices_out if not isinstance(slice_dict.get(v, None), int) ] - assert len(indices_sized) == len(s_data.shape) assert len(indices_sliced) == len(s_data.shape) st_data = permute_torch_tensor_data(s_data, indices_sliced, indices_out) - return st_data, indices_out + indices_sized = [v.copy(size=size) for v, size in zip(indices_out, st_data.shape)] + assert len(indices_sized) == len(st_data.shape) + return st_data, indices_sized class TorchBackend(ContractionBackend): @@ -164,6 +164,7 @@ def process_bucket(self, bucket, no_sum=False): # reduce result = qtree.optimizer.Tensor(f"E{tag}", result_indices, data=result_data) + # print("returning result", [t.data.sum() for t in bucket], bucket, result.data.sum(), no_sum) return result def process_bucket_merged(self, ixs, bucket, no_sum=False): @@ -214,6 +215,7 @@ def get_sliced_buckets(self, buckets, data_dict, slice_dict): data = data_dict[tensor.data_key] else: data = tensor.data + #print("data", data.shape, tensor.data_key, data.sum(), tensor.data is None) # Works for torch tensors just fine if not isinstance(data, torch.Tensor): data = torch.from_numpy(data.astype(np.complex128)).to(self.device) @@ -223,7 +225,13 @@ def get_sliced_buckets(self, buckets, data_dict, slice_dict): data, new_indices = slice_torch_tensor( data, tensor.indices, out_indices, slice_dict ) + #print("slice_dict", slice_dict) + #print("tensor", tensor) + #print("tensorshape", data.shape) + #print("tensorindices", new_indices) + #print("tensorindicessizes", [v.size for v in new_indices]) + #print("sliced tensor: T, daata, indices", tensor, data.sum(), new_indices, ', old:', tensor.indices) sliced_bucket.append(tensor.copy(indices=new_indices, data=data)) sliced_buckets.append(sliced_bucket) @@ -309,6 +317,28 @@ def permute_flattened(data, perm, shape): # print(f'{s=}, {t=}, {d2l=}, {l2d=}') return data +def sum_flattened(data, axes, shape): + sprod = [] + k = 1 + for i in shape: + k *= i + sprod.append(k) + sprod.append(1) + ndim = len(shape) + for ix in axes: + ixd = sprod[ix] // sprod[ix - 1] + d3 = data.reshape( + ( + sprod[ix - 1], + ixd, + sprod[ndim-1] // sprod[ix], + ) + ) + sprod = sprod[:ix] + list(np.array(sprod[ix + 1:]) * ixd) + data = d3.sum(axis=1).flatten() + return data + + class TorchBackendMatm(TorchBackend): def _get_index_sizes(self, *ixs, size_dict=None): @@ -336,9 +366,12 @@ def pairwise_sum_contract(self, ixa, a, ixb, b, ixout, size_dict=None): # print('all sum', sum_ix, 'a/b_sum', a_sum, b_sum) if len(a_sum): #a = a.sum(axis=tuple(ixa.index(x) for x in a_sum)) + a = sum_flattened(a, [ixa.index(x) for x in a_sum], self._get_index_sizes(*ixa, size_dict=size_dict)) ixa = [x for x in ixa if x not in a_sum] + if len(b_sum): #b = b.sum(axis=tuple(ixb.index(x) for x in b_sum)) + b = sum_flattened(b, [ixb.index(x) for x in b_sum], self._get_index_sizes(*ixb, size_dict=size_dict)) ixb = [x for x in ixb if x not in b_sum] tensors = a, b # -- @@ -383,7 +416,7 @@ def pairwise_sum_contract(self, ixa, a, ixb, b, ixout, size_dict=None): #if len(out): # print('out ix', out, 'kfmnix', kix, fix, mix, nix) #c = c.reshape(*self._get_index_sizes(*out, size_dict=size_dict)) - # print('outix', out, 'res', c.shape, 'kfmn',kix, fix, mix, nix) + #print('outix', out, 'res', c.shape, 'kfmn',kix, fix, mix, nix) current_ord_ = list(fix) + list(mix) + list(nix) c = c.flatten() @@ -402,7 +435,7 @@ def process_bucket(self, bucket, no_sum=False): result_indices = bucket[0].indices result_data = bucket[0].data width = len(set(bucket[0].indices)) - print("bucket", bucket) + #print("bucket", bucket) for tensor in bucket[1:-1]: ixr = list(map(int, result_indices)) @@ -418,7 +451,7 @@ def process_bucket(self, bucket, no_sum=False): size_dict[int(i)] = i.size for i in tensor.indices: size_dict[int(i)] = i.size - logger.debug("result_indices: {}", result_indices) + logger.trace("result_indices: {}, out_indices {}, tensor {}, tensor.data.shape {}", result_indices, out_indices, tensor, tensor.data.shape) result_data_new = self.pairwise_sum_contract( ixr, result_data, ixt, tensor.data, ixout, size_dict=size_dict ) @@ -443,10 +476,12 @@ def process_bucket(self, bucket, no_sum=False): ixr = list(map(int, result_indices)) ixt = list(map(int, tensor.indices)) - result_indices = tuple( + out_indices = tuple( sorted(set(result_indices + tensor.indices), key=int, reverse=True) - )[:-1] - ixout = list(map(int, result_indices)) + ) + if not no_sum: + out_indices = out_indices[:-1] + ixout = list(map(int, out_indices)) logger.trace("Before contract. expr: {}, {} -> {}", ixr, ixt, ixout) size_dict = {} @@ -454,10 +489,11 @@ def process_bucket(self, bucket, no_sum=False): size_dict[int(i)] = i.size for i in tensor.indices: size_dict[int(i)] = i.size - # logger.debug("result_indices: {}", result_indices) + logger.trace("result_indices: {}, out_indices {}, tensor {}, tensor.data.shape {}", result_indices, out_indices, tensor, tensor.data.shape) result_data_new = self.pairwise_sum_contract( ixr, result_data, ixt, tensor.data, ixout, size_dict=size_dict ) + result_indices = out_indices # result_data = torch.einsum(expr, result_data, tensor.data) logger.trace( "Data: {}, {} -> {}", @@ -465,6 +501,11 @@ def process_bucket(self, bucket, no_sum=False): tensor.data.mean(), result_data_new.mean(), ) + #print("result_data", result_data_new.shape) + #print("result_indices", result_indices) + #print("ixonut", ixout) + #print("result_indicessizes", [v.size for v in result_indices]) + #print("size_dict", size_dict) # if result_data_new.mean() == 0: # logger.warning("Result is zero") # logger.debug("result_indices: {}", result_indices) @@ -476,14 +517,17 @@ def process_bucket(self, bucket, no_sum=False): result_data = result_data_new else: # Sum the last index - print("result_data", result_data.shape) - #shape = self._get_index_sizes(*result_indices) - if result_data.numel() > 2: - result_data = result_data.reshape(-1, 2).sum(axis=-1) - else: - result_data = result_data.reshape(2, 1).sum(axis=-1) + #print("result_data", result_data.shape) + #print("result_indices", result_indices) + #print("result_indicessizes", [v.size for v in result_indices]) + shape = self._get_index_sizes(*result_indices) + #print("shape", shape) + #print("no_sum", no_sum) + if not no_sum: + #print("reshaping",(-1, shape[-1])) + result_data = result_data.reshape(-1, shape[-1]).sum(axis=-1) #result_data = result_data.sum(axis=-1) - result_indices = result_indices[:-1] + result_indices = result_indices[:-1] if len(result_indices) > 0: first_index = result_indices[-1] @@ -494,13 +538,16 @@ def process_bucket(self, bucket, no_sum=False): # reduce result = qtree.optimizer.Tensor(f"E{tag}", result_indices, data=result_data) - print("result", result) - print("result_data", result_data.shape) + #print("returning result", result) + #print("returning result_data.shape", result_data.shape) + #print("returning result", [t.data.sum() for t in bucket], bucket,'r', result, result.data.sum(), no_sum) + #print(f'{result.name}({len(result.indices)})', end='', flush=True) return result def get_result_data(self, result): - if len(result.indices): + # In theory, This condition is redundant, both should be either True or False. + if len(result.indices) or result.data.ndim > 1: d = result.data.reshape(self._get_index_sizes(*result.indices)) else: d = result.data diff --git a/qtensor/optimisation/Optimizer.py b/qtensor/optimisation/Optimizer.py index dfa38957..ccb9a37b 100644 --- a/qtensor/optimisation/Optimizer.py +++ b/qtensor/optimisation/Optimizer.py @@ -51,14 +51,22 @@ def optimize(self, tensor_net): if free_vars: # It's more efficient to find ordering in-place to avoid copying # We'll need the copy of a graph only if we have free_vars + print('free vars', free_vars) + self.free_indices = free_vars graph = qtree.graph_model.make_clique_on(graph, free_vars) graph_copy = copy.deepcopy(graph) self.graph = graph_copy + else: + self.free_indices = None peo, path = self._get_ordering(graph, inplace=True) self.treewidth = max(path) if free_vars: + free_vars_trunk = [v for v in free_vars if int(v) in self.graph.nodes] + if len(free_vars_trunk) != len(free_vars): + raise ValueError(f'Free vars were sliced: {free_vars} -> {free_vars_trunk}') + free_vars = free_vars_trunk peo = qtree.graph_model.get_equivalent_peo(self.graph, peo, free_vars) peo = ignored_vars + peo @@ -199,16 +207,15 @@ def _update_peo_after_slice(self, p_graph, slice_vars): nodes, path = qtensor.utils.get_neighbors_path(p_graph, peo_ints) # -- Tree re-peo g_components = list(nx.connected_components(p_graph)) - print(f"# of components: {len(g_components)}, # of nodes total: {p_graph.number_of_nodes()}, # of nodes per component: {[len(c) for c in g_components]}") + #print(f"# of components: {len(g_components)}, # of nodes total: {p_graph.number_of_nodes()}, # of nodes per component: {[len(c) for c in g_components]}") from qtree.graph_model.clique_trees import ( get_tree_from_peo, get_peo_from_tree) tree = get_tree_from_peo(p_graph, peo_ints) clique_vertices = [] - print("Calling get_peo_from_tree") # ---- re-create peo from tree peo_recreate = [] components = list(nx.connected_components(tree)) - print("# of components: ", len(components)) + #print("# of components: ", len(components)) for subtree in components: peo_recreate += get_peo_from_tree(tree.subgraph(subtree).copy(), clique_vertices=clique_vertices) # ---- @@ -272,6 +279,12 @@ def get_ordering_ints(self, graph, inplace=True): name=graph.nodes[var]['name']) for var in par_vars] #log.info('peo {}', self.peo) + #print('graph nodes', len(graph.nodes)) + #print('pgraph nodes', len(p_graph.nodes)) + # Remove parallel vars from graph + for var in par_vars: + qtree.graph_model.base.remove_node(self.graph, var) + #self.graph = p_graph return peo, [self.treewidth] class TamakiOptimizer(Optimizer): @@ -339,10 +352,16 @@ def _split_graph(self, p_graph, max_tw): graph, label_dict = qtree.graph_model.relabel_graph_nodes( p_graph, dict(zip(peo_ints, range(len(p_graph.nodes())))) ) + if self.free_indices: + inv_label_dict = {v:k for k,v in label_dict.items()} + ignore_indices = [inv_label_dict[int(v)] for v in self.free_indices] + else: + ignore_indices = [] + #print('ignore_indices', ignore_indices) if self.cost_type == 'width': - par_vars, _ = qtree.graph_model.splitters.split_graph_by_tree_trimming_width(graph, var_target) + par_vars, _ = qtree.graph_model.splitters.split_graph_by_tree_trimming_width(graph, var_target, ignore_indices=ignore_indices) else: - par_vars, _ = qtree.graph_model.splitters.split_graph_by_tree_trimming(graph, var_target) + par_vars, _ = qtree.graph_model.splitters.split_graph_by_tree_trimming(graph, var_target, ignore_indices=ignore_indices) par_vars = [label_dict[i] for i in par_vars] for var in par_vars: log.debug('Remove node {}. Hood size {}', var, utils.n_neighbors(p_graph, var)) diff --git a/qtensor/tests/test_bucket_backends.py b/qtensor/tests/test_bucket_backends.py index 46600f30..8389dadf 100644 --- a/qtensor/tests/test_bucket_backends.py +++ b/qtensor/tests/test_bucket_backends.py @@ -107,7 +107,9 @@ def contract_bucket_einsum(indices_list, data_dict): index_strs = ["".join([chr(97 + i) for i in ix]) for ix in indices_list] out_indices = [] for ix in indices_list: - out_indices.extend(ix[1:]) + for i in ix[1:]: + if i not in out_indices: + out_indices.append(i) expr = ",".join(index_strs) + "->" + "".join([chr(97 + i) for i in out_indices]) print(f"expr: {expr}") res = np.einsum(expr, *[data_dict[i] for i in range(len(indices_list))]) @@ -186,9 +188,38 @@ def test_backend_single_bucket_one_index(backend, atol): indices_list = [[ix] for _ in range(n_tensors)] print(f"indices_list: {indices_list}") data_dict = {i: np.random.rand(*[2] * len(ix)) for i, ix in enumerate(indices_list)} - - # Test the slicing correctness as well res_ref = contract_bucket_einsum(indices_list, data_dict) + res = contract_bucket(indices_list, backend, data_dict) + assert np.allclose(res, res_ref, atol=atol) + +@pytest.mark.parametrize( + ["backend", "atol"], + [ + # NOTE: 04/02/24 temporary disable cupy backend, it is not working on my machine + # ('cupy', 1e-10), + (qtensor.contraction_backends.get_backend("einsum"), 1e-10), + (qtensor.contraction_backends.get_backend("torch"), 1e-10), + # ('cupy_compressed', 1e-10), + (TorchBackendMatm(), 1e-10), + # (CompressionBackend( + # CuPyBackend(), + # CUSZCompressor(r2r_error=1e-4, r2r_threshold=1e-5), + # 11 ), + # 1e-5) + ], +) +def test_backend_single_bucket_trick(backend, atol): + """ + Test a single bucket with different common indices + """ + + # Simple bucket with decreasing number of indices + n_tensors = 3 + ix = 1 + indices_list = [[1, 2], [2, 3], [3]] + print(f"indices_list: {indices_list}") + data_dict = {i: np.random.rand(*[2] * len(ix)) for i, ix in enumerate(indices_list)} + res_ref = contract_bucket_einsum(indices_list, data_dict) res = contract_bucket(indices_list, backend, data_dict) assert np.allclose(res, res_ref, atol=atol) From fdbff76571e95523c0aa6a588acbcddd0800086f Mon Sep 17 00:00:00 2001 From: Danylo Lykov Date: Fri, 12 Apr 2024 01:37:34 -0500 Subject: [PATCH 14/14] add moving results to host memory after contraction using torch --- qtensor/contraction_backends/torch.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/qtensor/contraction_backends/torch.py b/qtensor/contraction_backends/torch.py index 07ddb5de..b168c20b 100644 --- a/qtensor/contraction_backends/torch.py +++ b/qtensor/contraction_backends/torch.py @@ -551,4 +551,6 @@ def get_result_data(self, result): d = result.data.reshape(self._get_index_sizes(*result.indices)) else: d = result.data + # move to cpu + d = d.cpu() return torch.permute(d, tuple(reversed(range(d.ndim))))