tnwei · AshishS-1123 · Jan 13, 2021 · Jan 13, 2021 · Jan 13, 2021 · Jan 13, 2021
diff --git a/rle_fast/rle_docs.h b/rle_fast/rle_docs.h
@@ -0,0 +1,48 @@
+/*
+ * Docstring for the rle_fast module (wired into the module definition's doc
+ * field by the extension source).
+ *
+ * static const: this header *defines* the object, so internal linkage keeps
+ * it from producing duplicate symbols if the header is ever included from
+ * more than one translation unit, and const lets it live in read-only data.
+ */
+static const char rle_module_doc[] = "\n"
+    "Performs run-length encoding and decoding of a sequence in real time.\n"
+    "\n"
+    "Module containing C Extension for RLE operations on large sequences.\n"
+    "\n"
+    "Methods\n"
+    "-------\n"
+    "encode: performs run-length encoding of given sequence\n"
+    "decode: performs run-length decoding of given sequence\n"
+    "\n"
+    "Usage\n"
+    "-----\n"
+    "\n"
+    "# import module\n"
+    "import rle.rle_fast as rle\n"
+    "\n"
+    "# perform encoding on sequence 'a'\n"
+    "values, counts = rle.encode(a)\n"
+    "\n"
+    "# decode the values and counts to recover original sequence\n"
+    "sequence = rle.decode(values, counts)\n";
+
+/*
+ * Docstring attached to the module's "encode" method.
+ *
+ * static const: the header defines this object, so internal linkage avoids
+ * duplicate-symbol errors if the header is included more than once across
+ * translation units.
+ */
+static const char encode_doc[] = "\n"
+    "Encodes run-length encoding of given iterable.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "seq: Any Python iterable, e.g.  lists, strings, tuples,\n"
+    "    pandas Series, to perform run-length encoding on.\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "values, counts: list of contiguous unique values, and list of\n"
+    "    counts\n";
+
+/*
+ * Docstring attached to the module's "decode" method.
+ *
+ * static const: the header defines this object, so internal linkage avoids
+ * duplicate-symbol errors if the header is included more than once across
+ * translation units.
+ */
+static const char decode_doc[] = "\n"
+    "Decodes run-length encoding of given iterable.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "values, counts: List of contiguous unique values, and list of counts\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "seq: Decoded sequence\n";
diff --git a/rle_fast/rle_fast_extension.c b/rle_fast/rle_fast_extension.c
@@ -0,0 +1,84 @@
+// include the utils header containing code for encode and decode operations
+#include "rle_utils.h"
+// include the docs header for docstrings for package
+#include "rle_docs.h"
+
+/*
+ * encode(seq) -> (values, counts)
+ *
+ * Python-facing entry point. Validates the single positional argument and
+ * returns the run-length encoding produced by encode_sequence() as a 2-tuple
+ * of lists.
+ *
+ * Returns a new reference, or NULL with an exception set:
+ *   - ValueError      if the argument has no sequence length,
+ *   - AssertionError  if the sequence is empty,
+ *   - whatever exception was raised while iterating/comparing the items.
+ */
+PyObject* encode_c(PyObject* self, PyObject* args)
+{
+    PyObject* iterable = NULL;
+
+    // PyArg_ParseTuple has already set the exception; NULL is the only
+    // valid return value while an exception is pending
+    if( !PyArg_ParseTuple(args, "O", &iterable) )
+        return NULL;
+
+    // ask for the length once; -1 means the object does not support len()
+    Py_ssize_t length = PySequence_Length(iterable);
+
+    if( length == -1 )
+    {
+        PyErr_SetString(PyExc_ValueError, "The given object is not iterable.");
+        return NULL;
+    }
+
+    if( length == 0 )
+    {
+        PyErr_SetString( PyExc_AssertionError, "Sequence passed has zero length");
+        return NULL;
+    }
+
+    struct code result = encode_sequence(iterable);
+
+    // never hand a value back while an exception is pending; the members may
+    // be NULL or valid lists depending on how far the encoder got
+    if( PyErr_Occurred() )
+    {
+        Py_XDECREF(result.values);
+        Py_XDECREF(result.counts);
+        return NULL;
+    }
+
+    // "N" transfers our references into the tuple; "O" would add a second
+    // reference to each list and leak it
+    return Py_BuildValue("NN", result.values, result.counts);
+}
+
+
+/*
+ * decode(values, counts) -> list
+ *
+ * Python-facing entry point. Validates both positional arguments and returns
+ * the sequence rebuilt by decode_sequence().
+ *
+ * Returns a new reference, or NULL with an exception set:
+ *   - ValueError      if either argument has no sequence length,
+ *   - AssertionError  if the two arguments differ in length,
+ *   - whatever exception was raised while expanding the runs.
+ */
+PyObject* decode_c(PyObject* self, PyObject* args)
+{
+    PyObject* values = NULL;
+    PyObject* counts = NULL;
+
+    // PyArg_ParseTuple has already set the exception; NULL is the only
+    // valid return value while an exception is pending
+    if( !PyArg_ParseTuple(args, "OO", &values, &counts) )
+        return NULL;
+
+    // each length is queried once; -1 means the object does not support len()
+    Py_ssize_t values_length = PySequence_Length(values);
+    if( values_length == -1 )
+    {
+        PyErr_SetString(PyExc_ValueError, "values argument is not iterable.");
+        return NULL;
+    }
+
+    Py_ssize_t counts_length = PySequence_Length(counts);
+    if( counts_length == -1 )
+    {
+        PyErr_SetString(PyExc_ValueError, "counts argument is not iterable.");
+        return NULL;
+    }
+
+    // every value needs exactly one matching count
+    if( values_length != counts_length )
+    {
+        PyErr_SetString(PyExc_AssertionError, "len(values) != len(counts)");
+        return NULL;
+    }
+
+    PyObject* decoded_list = decode_sequence(values, counts);
+
+    // never hand a value back while an exception is pending
+    if( decoded_list == NULL || PyErr_Occurred() )
+    {
+        Py_XDECREF(decoded_list);
+        return NULL;
+    }
+
+    // we already own the only reference to the list, so return it directly;
+    // routing it through Py_BuildValue("O", ...) would add a leaked reference
+    return decoded_list;
+}
+//########        MODULE LEVEL FUNCTIONS        ########
+
+// table of the functions this module exposes to Python; each entry pairs the
+// Python-visible name with its C implementation, calling convention and docstring
+static PyMethodDef methods[] = {
+    { .ml_name = "encode", .ml_meth = encode_c, .ml_flags = METH_VARARGS, .ml_doc = encode_doc },
+    { .ml_name = "decode", .ml_meth = decode_c, .ml_flags = METH_VARARGS, .ml_doc = decode_doc },
+    // all-zero sentinel entry terminating the table
+    { .ml_name = NULL, .ml_meth = NULL, .ml_flags = 0, .ml_doc = NULL }
+};
+
+// description of the module handed to PyModule_Create():
+// name, docstring, per-module state size and method table
+static struct PyModuleDef rle_fast = {
+    .m_base    = PyModuleDef_HEAD_INIT,
+    .m_name    = "rle_fast",
+    .m_doc     = rle_module_doc,
+    .m_size    = -1,
+    .m_methods = methods
+};
+
+// Module initialisation function for rle_fast: builds the module object from
+// the rle_fast definition above. Returns the new module, or NULL with an
+// exception set if creation fails.
+PyMODINIT_FUNC PyInit_rle_fast(void) {
+    // Py_Initialize() is deliberately not called here: it is the embedding API
+    // for starting an interpreter, and an extension's init function only runs
+    // inside an interpreter that is already initialised.
+    return PyModule_Create(&rle_fast);
+}
diff --git a/rle_fast/rle_utils.h b/rle_fast/rle_utils.h
@@ -0,0 +1,123 @@
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+#include <stdbool.h>
+
+// Pair of Python list objects describing a run-length encoding.
+// Returned by value from encode_sequence().
+struct code
+{
+    PyObject *values;   // list of the value of each run
+    PyObject *counts;   // list of the length of each run, parallel to `values`
+};
+
+// Appends `run_length` as a Python int to `count_list`.
+// Returns 0 on success, -1 with an exception set on failure.
+static int append_run_length( PyObject* count_list, Py_ssize_t run_length )
+{
+    PyObject* length_object = PyLong_FromSsize_t(run_length);
+    if( length_object == NULL )
+        return -1;
+
+    // PyList_Append takes its own reference, so ours is released either way
+    int status = PyList_Append(count_list, length_object);
+    Py_DECREF(length_object);
+    return status;
+}
+
+/*
+ * Run-length encodes `sequence`.
+ *
+ * Iterates over `sequence` once, grouping consecutive items that compare
+ * equal (Py_EQ) to the first item of the current run.
+ *
+ * On success both members of the returned struct are new references to
+ * lists: `values` holds the first item of every run and `counts` the
+ * matching run lengths. An empty input yields two empty lists.
+ *
+ * On failure both members are NULL and the exception raised by the failing
+ * call (iteration, comparison or allocation) is left set for the caller.
+ */
+struct code encode_sequence( PyObject* sequence )
+{
+    // C cannot return two values, so the two lists travel in one struct;
+    // it stays {NULL, NULL} unless encoding completes
+    struct code encoded = { NULL, NULL };
+
+    PyObject* values_list = NULL;
+    PyObject* count_list = NULL;
+    // owned reference to the item that opened the run currently being counted
+    PyObject* current_run_value = NULL;
+    // owned reference to the item just pulled from the iterator
+    PyObject* current_item = NULL;
+    Py_ssize_t current_run_length = 0;
+
+    PyObject* iterator = PyObject_GetIter(sequence);
+    if( iterator == NULL )
+        return encoded;
+
+    values_list = PyList_New(0);
+    count_list = PyList_New(0);
+    if( values_list == NULL || count_list == NULL )
+        goto error;
+
+    while( (current_item = PyIter_Next(iterator)) )
+    {
+        if( current_run_value != NULL )
+        {
+            // -1 signals that the comparison itself raised
+            int same = PyObject_RichCompareBool(current_item, current_run_value, Py_EQ);
+            if( same < 0 )
+                goto error;
+
+            if( same )
+            {
+                // still inside the current run
+                ++current_run_length;
+                Py_DECREF(current_item);
+                continue;
+            }
+
+            // the run has ended: record its length before opening the next
+            if( append_run_length(count_list, current_run_length) < 0 )
+                goto error;
+        }
+
+        // open a new run headed by current_item
+        if( PyList_Append(values_list, current_item) < 0 )
+            goto error;
+
+        // the new run takes over our reference to current_item
+        Py_XDECREF(current_run_value);
+        current_run_value = current_item;
+        current_item = NULL;
+        current_run_length = 1;
+    }
+
+    // PyIter_Next returns NULL both at exhaustion and on error
+    if( PyErr_Occurred() )
+        goto error;
+
+    // record the length of the final run (absent only for empty input)
+    if( current_run_value != NULL
+        && append_run_length(count_list, current_run_length) < 0 )
+        goto error;
+
+    Py_XDECREF(current_run_value);
+    Py_DECREF(iterator);
+
+    encoded.values = values_list;
+    encoded.counts = count_list;
+    return encoded;
+
+error:
+    Py_XDECREF(current_item);
+    Py_XDECREF(current_run_value);
+    Py_XDECREF(values_list);
+    Py_XDECREF(count_list);
+    Py_DECREF(iterator);
+    return encoded;
+}
+
+/*
+ * Expands a run-length encoding back into a flat list.
+ *
+ * Walks `values` and `counts` in lockstep and appends each value to the
+ * result as many times as its count says. Counts that are zero or negative
+ * contribute nothing.
+ *
+ * Returns a new reference to the decoded list, or NULL with an exception set:
+ *   - AssertionError if `counts` runs out before `values`,
+ *   - the exception raised by PyLong_AsSsize_t for a count that is not an
+ *     integer or does not fit in Py_ssize_t,
+ *   - any exception raised while iterating or allocating.
+ */
+PyObject* decode_sequence(PyObject* values, PyObject* counts)
+{
+    PyObject* decoded_list = NULL;
+    PyObject* value_iterator = NULL;
+    PyObject* count_iterator = NULL;
+    // owned references to the pair currently being expanded
+    PyObject* current_value = NULL;
+    PyObject* current_count = NULL;
+
+    decoded_list = PyList_New(0);
+    if( decoded_list == NULL )
+        goto error;
+
+    value_iterator = PyObject_GetIter( values );
+    if( value_iterator == NULL )
+        goto error;
+
+    count_iterator = PyObject_GetIter( counts );
+    if( count_iterator == NULL )
+        goto error;
+
+    while( ( current_value = PyIter_Next(value_iterator) ) )
+    {
+        current_count = PyIter_Next( count_iterator );
+        if( current_count == NULL )
+        {
+            // NULL without an exception means counts is simply too short
+            if( !PyErr_Occurred() )
+                PyErr_SetString(PyExc_AssertionError, "len(values) != len(counts)");
+            goto error;
+        }
+
+        // -1 is also a legal value, so an error is only signalled when an
+        // exception is pending as well
+        Py_ssize_t repeat = PyLong_AsSsize_t(current_count);
+        if( repeat == -1 && PyErr_Occurred() )
+            goto error;
+        Py_CLEAR(current_count);
+
+        for( Py_ssize_t i = 0; i < repeat; ++i )
+        {
+            if( PyList_Append(decoded_list, current_value) < 0 )
+                goto error;
+        }
+
+        // the list holds its own references; release the iterator's one
+        Py_CLEAR(current_value);
+    }
+
+    // PyIter_Next returns NULL both at exhaustion and on error
+    if( PyErr_Occurred() )
+        goto error;
+
+    Py_DECREF(value_iterator);
+    Py_DECREF(count_iterator);
+
+    return decoded_list;
+
+error:
+    Py_XDECREF(current_value);
+    Py_XDECREF(current_count);
+    Py_XDECREF(value_iterator);
+    Py_XDECREF(count_iterator);
+    Py_XDECREF(decoded_list);
+    return NULL;
+}
diff --git a/setup.py b/setup.py
@@ -1,14 +1,18 @@
 import setuptools
 
+# C extension built from rle_fast/ and importable as rle.rle_fast.
+# The .c file only #includes the two headers, which hold the actual code, so
+# they are listed in `depends` to make the build notice changes to them.
+rle_fast_extension = setuptools.Extension(
+    "rle.rle_fast",
+    sources=["rle_fast/rle_fast_extension.c"],
+    depends=["rle_fast/rle_utils.h", "rle_fast/rle_docs.h"],
+)
+
 with open('README.md', "r") as f:
     long_description = f.read()
 
 setuptools.setup(
-    name='python-rle', 
+    name='python-rle',
     version="0.0.3",
     author="Tan Nian Wei",
     author_email="[email protected]",
     description="Run-length encoding for data analysis in Python",
+    ext_modules = [rle_fast_extension],
     long_description=long_description,
     long_description_content_type="text/markdown",
     url="https://github.com/tnwei/python-rle",

diff --git a/tests/test_decode_rlefast.py b/tests/test_decode_rlefast.py
@@ -0,0 +1,27 @@
+import rle.rle_fast as rle
+import unittest
+
+class TestDecodeRleFast(unittest.TestCase):
+    """Checks rle.decode() against hand-written expected sequences.
+
+    Uses assertEqual rather than bare ``assert`` so the checks still run
+    under ``python -O`` and failures report both operands.
+    """
+
+    def test_working(self):
+        values, counts = ['a', 'b', 'c', 'd', 'e'], [1, 2, 3, 2, 1]
+        self.assertEqual(
+            rle.decode(values, counts),
+            ['a', 'b', 'b', 'c', 'c', 'c', 'd', 'd', 'e'],
+        )
+
+    def test_formats(self):
+        # Mixed formats
+        values, counts = [6, 2, 'abc', 3], [2, 2, 1, 1]
+        self.assertEqual(rle.decode(values, counts), [6, 6, 2, 2, 'abc', 3])
+
+        # All chars
+        values, counts = ['a', 'b', 'e', 'd', 's', 'd', 'e'], [2, 2, 4, 2, 1, 1, 1]
+        self.assertEqual(rle.decode(values, counts), list('aabbeeeeddsde'))
+
+    def test_one_unique_value_only(self):
+        values, counts = [1], [50]
+        self.assertEqual(rle.decode(values, counts), [1] * 50)
+
+    def test_one_value_only(self):
+        values, counts = [1], [1]
+        self.assertEqual(rle.decode(values, counts), [1])
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_encode_rlefast.py b/tests/test_encode_rlefast.py
@@ -0,0 +1,29 @@
+import rle.rle_fast as rle
+import unittest
+## Test rle.encode
+
+class TestEncodeRleFast(unittest.TestCase):
+    """Checks rle.encode() against hand-written (values, counts) pairs.
+
+    Uses assertEqual rather than bare ``assert`` so the checks still run
+    under ``python -O`` and failures report both operands.
+    """
+
+    def test_working(self):
+        test_list = [1, 2, 2, 4, 4, 4, 5, 3]
+        self.assertEqual(
+            rle.encode(test_list),
+            ([1, 2, 4, 5, 3], [1, 2, 3, 1, 1]),
+        )
+
+    def test_formats(self):
+        test_tuples = (6, 6, 2, 2, 'abc', 3)
+        self.assertEqual(
+            rle.encode(test_tuples),
+            ([6, 2, 'abc', 3], [2, 2, 1, 1]),
+        )
+
+        test_string = 'aabbeeeeddsde'
+        self.assertEqual(
+            rle.encode(test_string),
+            (['a', 'b', 'e', 'd', 's', 'd', 'e'], [2, 2, 4, 2, 1, 1, 1]),
+        )
+
+    def test_one_unique_value_only(self):
+        test_list = [1] * 50
+        self.assertEqual(rle.encode(test_list), ([1], [50]))
+
+    def test_last_value_different(self):
+        test_list = [1] * 49 + [2]
+        self.assertEqual(rle.encode(test_list), ([1, 2], [49, 1]))
+
+if __name__ == "__main__":
+    unittest.main()