From 3532399a1297b070eb980d3e668f5ca1321cdf6f Mon Sep 17 00:00:00 2001
From: Ashish Shevale
Date: Wed, 13 Jan 2021 09:30:16 +0530
Subject: [PATCH 1/5] Added wrapper code for building extension

---
 rle_fast/rle_fast_extension.c | 111 +++++++++++++++++++++++++++++++++++
 1 file changed, 111 insertions(+)
 create mode 100644 rle_fast/rle_fast_extension.c

diff --git a/rle_fast/rle_fast_extension.c b/rle_fast/rle_fast_extension.c
new file mode 100644
index 0000000..decd3aa
--- /dev/null
+++ b/rle_fast/rle_fast_extension.c
@@ -0,0 +1,111 @@
+// include the utils header containing code for encode and decode operations
+#include "rle_utils.h"
+// include the docs header for docstrings for package
+#include "rle_docs.h"
+
+// Python entry point: encode(seq) -> (values, counts).
+// Raises ValueError when seq has no length and AssertionError when it is empty.
+PyObject* encode_c(PyObject* self, PyObject* args)
+{
+    // Create a Python object to hold the iterable to be processed
+    PyObject* iterable = NULL;
+
+    // Parse the input arguments; on failure the exception is already set and
+    // returning NULL is what tells the interpreter to raise it
+    if( !PyArg_ParseTuple(args, "O", &iterable) )
+        return NULL;
+
+    // query the length once; -1 means the object has no sequence length
+    Py_ssize_t length = PySequence_Length(iterable);
+    if( length == -1 )
+    {
+        PyErr_SetString(PyExc_ValueError, "The given object is not iterable.");
+        return NULL;
+    }
+
+    // an empty sequence has nothing to encode
+    if( length == 0 )
+    {
+        PyErr_SetString( PyExc_AssertionError, "Sequence passed has zero length");
+        return NULL;
+    }
+
+    // get the run length encoding of the iterable
+    struct code result = encode_sequence(iterable);
+    if( result.values == NULL || result.counts == NULL )
+        return NULL;
+
+    // "OO" adds a reference to each list, so release ours once the tuple
+    // has been built (or has failed to build) to avoid leaking both lists
+    PyObject* encoded = Py_BuildValue("OO", result.values, result.counts);
+    Py_DECREF(result.values);
+    Py_DECREF(result.counts);
+    return encoded;
+}
+
+
+// Python entry point: decode(values, counts) -> list.
+// Raises ValueError when an argument has no length and AssertionError when
+// the two lengths differ.
+PyObject* decode_c(PyObject* self, PyObject* args)
+{
+    // create a python object to hold the values
+    PyObject* values = NULL;
+    // create a python object to hold the counts
+    PyObject* counts = NULL;
+
+    // Parse the input arguments; the exception is already set on failure
+    if( !PyArg_ParseTuple(args, "OO", &values, &counts) )
+        return NULL;
+
+    // check if the value argument is iterable
+    Py_ssize_t values_length = PySequence_Length(values);
+    if( values_length == -1 )
+    {
+        PyErr_SetString(PyExc_ValueError, "values argument is not iterable.");
+        return NULL;
+    }
+
+    // check if the count argument is iterable
+    Py_ssize_t counts_length = PySequence_Length(counts);
+    if( counts_length == -1 )
+    {
+        PyErr_SetString(PyExc_ValueError, "counts argument is not iterable.");
+        return NULL;
+    }
+
+    // check if the length of value list is same as that of count list
+    if( values_length != counts_length )
+    {
+        PyErr_SetString(PyExc_AssertionError, "len(values) != len(counts)");
+        return NULL;
+    }
+
+    // decode_sequence returns a new reference (or NULL with an exception
+    // set), so it is handed back to the caller as is
+    return decode_sequence(values, counts);
+}
+//######## MODULE LEVEL FUNCTIONS ########
+
+// method definitions
+static PyMethodDef methods[] = {
+    { "encode", encode_c, METH_VARARGS, encode_doc},
+    { "decode", decode_c, METH_VARARGS, decode_doc},
+    { NULL, NULL, 0, NULL }
+};
+
+// module definition
+static struct PyModuleDef rle_fast = {
+    PyModuleDef_HEAD_INIT,
+    "rle_fast",
+    rle_module_doc,
+    -1,
+    methods
+};
+
+// create the module
+PyMODINIT_FUNC PyInit_rle_fast(void) {
+    // the interpreter is already initialised whenever a module is imported,
+    // so no Py_Initialize() call belongs here
+    return PyModule_Create(&rle_fast);
+}

From 4c101faef006b9f5a3ec4a2b76f6d692d882f46a Mon Sep 17 00:00:00 2001
From: Ashish Shevale
Date: Wed, 13 Jan 2021 09:30:44 +0530
Subject: [PATCH 2/5] Added utility functions for encode and decode operations

---
 rle_fast/rle_utils.h | 200 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 200 insertions(+)
 create mode 100644 rle_fast/rle_utils.h

diff --git a/rle_fast/rle_utils.h b/rle_fast/rle_utils.h
new file mode 100644
index 0000000..075925e
--- /dev/null
+++ b/rle_fast/rle_utils.h
@@ -0,0 +1,200 @@
+#ifndef RLE_UTILS_H
+#define RLE_UTILS_H
+
+#define PY_SSIZE_T_CLEAN
+// NOTE(review): the header names were missing from this patch text;
+// <Python.h> is required by the code below, <stdio.h> is assumed - confirm
+#include <Python.h>
+#include <stdio.h>
+
+// create a structure to hold the encoded result
+struct code
+{
+    // python list object containing the values in encoded result
+    PyObject* values;
+    // python list object containing the counts to values in encoded result
+    PyObject* counts;
+};
+
+// append run_length to count_list as a Python int;
+// returns 0 on success, -1 with an exception set on failure
+static int append_run_length(PyObject* count_list, Py_ssize_t run_length)
+{
+    PyObject* count = PyLong_FromSsize_t(run_length);
+    if( count == NULL )
+        return -1;
+
+    // PyList_Append takes its own reference, so ours is released afterwards
+    int status = PyList_Append(count_list, count);
+    Py_DECREF(count);
+    return status;
+}
+
+// Run-length encode the items of sequence.
+// On success the structure holds new references to the value and count
+// lists; on failure both fields are NULL and a Python exception is set.
+// Items that cannot be compared with == raise NotImplementedError.
+struct code encode_sequence( PyObject* sequence )
+{
+    // both fields stay NULL unless encoding succeeds
+    struct code encoded = { NULL, NULL };
+
+    PyObject* iterator = NULL;
+    PyObject* values_list = NULL;
+    PyObject* count_list = NULL;
+    // value of the run being counted (owned reference, NULL before the first item)
+    PyObject* current_run_value = NULL;
+    // item most recently pulled from the iterator (owned reference)
+    PyObject* current_item = NULL;
+    // number of items seen so far in the current run
+    Py_ssize_t current_run_length = 0;
+
+    // create an iterator object to the sequence
+    iterator = PyObject_GetIter(sequence);
+    if( iterator == NULL )
+        goto cleanup;
+
+    // create the lists that will hold the values and their counts
+    values_list = PyList_New(0);
+    if( values_list == NULL )
+        goto cleanup;
+    count_list = PyList_New(0);
+    if( count_list == NULL )
+        goto cleanup;
+
+    // repeat the following as long as some element is returned from sequence
+    while( (current_item = PyIter_Next(iterator)) )
+    {
+        if( current_run_value != NULL )
+        {
+            // 1 means equal, 0 means different, -1 means the comparison raised
+            int same = PyObject_RichCompareBool(current_item, current_run_value, Py_EQ);
+            if( same < 0 )
+            {
+                PyErr_SetString(PyExc_NotImplementedError, "Datatype not supported.");
+                goto cleanup;
+            }
+
+            // the current element is still part of the run
+            if( same )
+            {
+                ++current_run_length;
+                Py_DECREF(current_item);
+                continue;
+            }
+
+            // the run has ended, so record its length
+            if( append_run_length(count_list, current_run_length) < 0 )
+                goto cleanup;
+        }
+
+        // start a new run with the current element
+        if( PyList_Append(values_list, current_item) < 0 )
+            goto cleanup;
+        Py_XDECREF(current_run_value);
+        current_run_value = current_item;
+        current_item = NULL;
+        current_run_length = 1;
+    }
+
+    // PyIter_Next returns NULL both at the end of the data and on error
+    if( PyErr_Occurred() )
+        goto cleanup;
+
+    // now process the last count (there is none for an empty sequence)
+    if( current_run_value != NULL &&
+        append_run_length(count_list, current_run_length) < 0 )
+        goto cleanup;
+
+    // C cannot return multiple values, so both lists travel in the structure
+    encoded.values = values_list;
+    encoded.counts = count_list;
+    values_list = NULL;
+    count_list = NULL;
+
+cleanup:
+    // release everything that was not handed to the caller
+    Py_XDECREF(current_item);
+    Py_XDECREF(current_run_value);
+    Py_XDECREF(iterator);
+    Py_XDECREF(values_list);
+    Py_XDECREF(count_list);
+
+    return encoded;
+}
+
+// Expand values by the matching entries of counts.
+// Returns a new list, or NULL with a Python exception set on failure.
+PyObject* decode_sequence(PyObject* values, PyObject* counts)
+{
+    PyObject* decoded_list = NULL;
+    PyObject* value_iterator = NULL;
+    PyObject* count_iterator = NULL;
+    // items most recently pulled from the iterators (owned references)
+    PyObject* current_value = NULL;
+    PyObject* current_count = NULL;
+
+    // create an empty python list object to hold the calculated sequence
+    decoded_list = PyList_New(0);
+    if( decoded_list == NULL )
+        goto error;
+
+    // create an iterator object for the value list
+    value_iterator = PyObject_GetIter( values );
+    if( value_iterator == NULL )
+        goto error;
+
+    // create an iterator object for the count list
+    count_iterator = PyObject_GetIter( counts );
+    if( count_iterator == NULL )
+        goto error;
+
+    // keep repeating until you can get the next value from value list
+    while( (current_value = PyIter_Next(value_iterator)) )
+    {
+        // get the count of current value from count list
+        current_count = PyIter_Next(count_iterator);
+        if( current_count == NULL )
+        {
+            // counts ran out before values did, unless iteration itself raised
+            if( !PyErr_Occurred() )
+                PyErr_SetString(PyExc_AssertionError, "len(values) != len(counts)");
+            goto error;
+        }
+
+        // -1 is only an error when an exception accompanies it
+        Py_ssize_t current_count_as_integer = PyLong_AsSsize_t(current_count);
+        if( current_count_as_integer == -1 && PyErr_Occurred() )
+            goto error;
+
+        for( Py_ssize_t i = 0; i < current_count_as_integer; ++i )
+        {
+            if( PyList_Append(decoded_list, current_value) < 0 )
+                goto error;
+        }
+
+        // PyIter_Next returned new references; release them before moving on
+        Py_CLEAR(current_count);
+        Py_CLEAR(current_value);
+    }
+
+    // PyIter_Next returns NULL both at the end of the data and on error
+    if( PyErr_Occurred() )
+        goto error;
+
+    goto cleanup;
+
+error:
+    // drop the partial result so the caller sees NULL
+    Py_CLEAR(decoded_list);
+
+cleanup:
+    Py_XDECREF(current_value);
+    Py_XDECREF(current_count);
+    Py_XDECREF(value_iterator);
+    Py_XDECREF(count_iterator);
+
+    return decoded_list;
+}
+
+#endif // RLE_UTILS_H

From 68dddce5728babca28998107bc352ec52376133e Mon Sep 17 00:00:00
2001
From: Ashish Shevale
Date: Wed, 13 Jan 2021 09:31:07 +0530
Subject: [PATCH 3/5] Added docstrings for module and functions

---
 rle_fast/rle_docs.h | 55 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)
 create mode 100644 rle_fast/rle_docs.h

diff --git a/rle_fast/rle_docs.h b/rle_fast/rle_docs.h
new file mode 100644
index 0000000..d1802b3
--- /dev/null
+++ b/rle_fast/rle_docs.h
@@ -0,0 +1,55 @@
+#ifndef RLE_DOCS_H
+#define RLE_DOCS_H
+
+// Docstrings for the rle_fast module. They are static const so the header
+// can be included from several source files without duplicate definitions.
+static const char rle_module_doc[] = "\n\
+Performs run-length encoding and decoding of a sequence in real time.\n\
+\n\
+Module containing C Extension for RLE operations on large sequences.\n\
+\n\
+Methods\n\
+-------\n\
+encode: performs run-length encoding of given sequence\n\
+decode: performs run-length decoding of given sequence\n\
+\n\
+Usage\n\
+-----\n\
+\n\
+# import module\n\
+import rle.rle_fast as rle\n\
+\n\
+# perform encoding on sequence 'a'\n\
+values, counts = rle.encode(a)\n\
+\n\
+# decode the values and counts to recover original sequence\n\
+sequence = rle.decode(values, counts)\n\
+";
+
+static const char encode_doc[] = "\n\
+Encodes run-length encoding of given iterable.\n\
+\n\
+Parameters\n\
+----------\n\
+seq: Any Python iterable, e.g. lists, strings, tuples,\n\
+    pandas Series, to perform run-length encoding on.\n\
+\n\
+Returns\n\
+-------\n\
+values, counts: list of contiguous unique values, and list of\n\
+    counts\n\
+";
+
+static const char decode_doc[] = "\n\
+Decodes run-length encoding of given iterable.\n\
+\n\
+Parameters\n\
+----------\n\
+values, counts: List of contiguous unique values, and list of counts\n\
+\n\
+Returns\n\
+-------\n\
+seq: Decoded sequence\n\
+";
+
+#endif // RLE_DOCS_H

From 0e57e399c5b9e1799266ab3d97b095d9c214280f Mon Sep 17 00:00:00 2001
From: Ashish Shevale
Date: Wed, 13 Jan 2021 09:31:26 +0530
Subject: [PATCH 4/5] Updated setup.py to build extension

---
 setup.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 864886e..ac1d67a 100644
--- a/setup.py
+++ b/setup.py
@@ -1,14 +1,18 @@
 import setuptools
 
+rle_fast_extension = setuptools.Extension("rle.rle_fast",
+                        sources = ["rle_fast/rle_fast_extension.c"] )
+
 with open('README.md', "r") as f:
     long_description = f.read()
 
 setuptools.setup(
-    name='python-rle',
+    name='python-rle',
     version="0.0.3",
     author="Tan Nian Wei",
     author_email="tannianwei@aggienetwork.com",
     description="Run-length encoding for data analysis in Python",
+    ext_modules = [rle_fast_extension],
     long_description=long_description,
     long_description_content_type="text/markdown",
     url="https://github.com/tnwei/python-rle",

From eb4c1c49b2f89652a980eaa0400a9b50eae1e639 Mon Sep 17 00:00:00 2001
From: Ashish Shevale
Date: Wed, 13 Jan 2021 09:31:42 +0530
Subject: [PATCH 5/5] Added unittests for testing rle_fast

---
 tests/test_decode_rlefast.py | 27 +++++++++++++++++++++++++++
 tests/test_encode_rlefast.py | 29 +++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+)
 create mode 100644 tests/test_decode_rlefast.py
 create mode 100644 tests/test_encode_rlefast.py

diff --git a/tests/test_decode_rlefast.py b/tests/test_decode_rlefast.py
new file mode 100644
index 0000000..944b3b8
--- /dev/null
+++ b/tests/test_decode_rlefast.py
@@ -0,0 +1,27 @@
+import rle.rle_fast as rle
+import unittest
+
+class TestDecodeRleFast(unittest.TestCase):
+    """Checks rle_fast.decode against hand-expanded sequences."""
+
+    def test_working(self):
+        values, counts = ['a', 'b', 'c', 'd', 'e'], [1, 2, 3, 2, 1]
+        self.assertEqual(rle.decode(values, counts), ['a', 'b', 'b', 'c', 'c', 'c', 'd', 'd', 'e'])
+
+    def test_formats(self):
+        # Mixed formats
+        values, counts = [6, 2, 'abc', 3], [2, 2, 1, 1]
+        self.assertEqual(rle.decode(values, counts), [6, 6, 2, 2, 'abc', 3])
+
+        # All chars
+        values, counts = ['a', 'b', 'e', 'd', 's', 'd', 'e'], [2, 2, 4, 2, 1, 1, 1]
+        self.assertEqual(rle.decode(values, counts), list('aabbeeeeddsde'))
+
+    def test_one_unique_value_only(self):
+        self.assertEqual(rle.decode([1], [50]), [1]*50)
+
+    def test_one_value_only(self):
+        self.assertEqual(rle.decode([1], [1]), [1])
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_encode_rlefast.py b/tests/test_encode_rlefast.py
new file mode 100644
index 0000000..1d0ae5d
--- /dev/null
+++ b/tests/test_encode_rlefast.py
@@ -0,0 +1,29 @@
+import rle.rle_fast as rle
+import unittest
+## Test rle.encode
+
+class TestEncodeRleFast(unittest.TestCase):
+    """Checks rle_fast.encode against hand-computed (values, counts) pairs."""
+
+    def test_working(self):
+        test_list = [1, 2, 2, 4, 4, 4, 5, 3]
+        self.assertEqual(rle.encode(test_list), ([1, 2, 4, 5, 3], [1, 2, 3, 1, 1]))
+
+    def test_formats(self):
+        test_tuples = (6, 6, 2, 2, 'abc', 3)
+        self.assertEqual(rle.encode(test_tuples), ([6, 2, 'abc', 3], [2, 2, 1, 1]))
+
+        test_string = 'aabbeeeeddsde'
+        self.assertEqual(rle.encode(test_string), (['a', 'b', 'e', 'd', 's', 'd', 'e'], [2, 2, 4, 2, 1, 1, 1]))
+
+    def test_one_unique_value_only(self):
+        test_list = [1]*50
+        self.assertEqual(rle.encode(test_list), ([1], [50]))
+
+    def test_last_value_different(self):
+        test_list = [1] * 49 + [2]
+        self.assertEqual(rle.encode(test_list), ([1, 2], [49, 1]))
+
+
+if __name__ == "__main__":
+    unittest.main()