Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added rle_fast C extension to improve speed #2

Open
wants to merge 5 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions rle_fast/rle_docs.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
// Module-level docstring shown by help(rle.rle_fast).
// `static const`: this object is defined in a header, so internal linkage
// keeps a second includer from producing a duplicate-symbol link error, and
// `const` matches the `const char *` doc fields it is assigned to.
static const char rle_module_doc[] =
    "\n"
    "Performs run-length encoding and decoding of a sequence in real time.\n"
    "\n"
    "Module containing C Extension for RLE operations on large sequences.\n"
    "\n"
    "Methods\n"
    "-------\n"
    "encode: performs run-length encoding of given sequence\n"
    "decode: performs run-length decoding of given sequence\n"
    "\n"
    "Usage\n"
    "-----\n"
    "\n"
    "# import module\n"
    "import rle.rle_fast as rle\n"
    "\n"
    "# perform encoding on sequence 'a'\n"
    "values, counts = rle.encode(a)\n"
    "\n"
    "# decode the values and counts to recover original sequence\n"
    "sequence = rle.decode(values, counts)\n";

// Docstring attached to the Python-level `encode` function.
// `static const`: defined in a header, so it gets internal linkage to stay
// safe if more than one translation unit includes this file.
static const char encode_doc[] =
    "\n"
    "Encodes run-length encoding of given iterable.\n"
    "\n"
    "Parameters\n"
    "----------\n"
    "seq: Any Python iterable, e.g. lists, strings, tuples,\n"
    "pandas Series, to perform run-length encoding on.\n"
    "\n"
    "Returns\n"
    "-------\n"
    "values, counts: list of contiguous unique values, and list of\n"
    "counts\n";

// Docstring attached to the Python-level `decode` function.
// `static const`: defined in a header, so it gets internal linkage to stay
// safe if more than one translation unit includes this file.
static const char decode_doc[] =
    "\n"
    "Decodes run-length encoding of given iterable.\n"
    "\n"
    "Parameters\n"
    "----------\n"
    "values, counts: List of contiguous unique values, and list of counts\n"
    "\n"
    "Returns\n"
    "-------\n"
    "seq: Decoded sequence\n";
84 changes: 84 additions & 0 deletions rle_fast/rle_fast_extension.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
// include the utils header containing code for encode and decode operations
#include "rle_utils.h"
// include the docs header for docstrings for package
#include "rle_docs.h"

/*
 * Python entry point: encode(seq) -> (values, counts)
 *
 * Parses the single positional argument, validates it, and delegates the
 * actual run-length encoding to encode_sequence().
 *
 * Returns a new reference to a 2-tuple (values list, counts list), or NULL
 * with a Python exception set:
 *   - ValueError      if the argument has no usable sequence length
 *   - AssertionError  if the sequence is empty
 *   - whatever exception was raised while iterating / comparing items
 */
PyObject* encode_c(PyObject* self, PyObject* args)
{
    (void)self;

    PyObject *iterable = NULL;

    // On a parse failure PyArg_ParseTuple has already set the exception;
    // an extension function must signal that by returning NULL.
    if (!PyArg_ParseTuple(args, "O", &iterable)) {
        return NULL;
    }

    // Query the length once; -1 means the object does not support len().
    Py_ssize_t length = PySequence_Length(iterable);
    if (length == -1) {
        PyErr_SetString(PyExc_ValueError, "The given object is not iterable.");
        return NULL;
    }
    if (length == 0) {
        PyErr_SetString(PyExc_AssertionError, "Sequence passed has zero length");
        return NULL;
    }

    struct code result = encode_sequence(iterable);

    // Never hand a result back to the interpreter while an exception is
    // pending: drop whatever was built and propagate the error instead.
    if (result.values == NULL || result.counts == NULL || PyErr_Occurred()) {
        Py_XDECREF(result.values);
        Py_XDECREF(result.counts);
        return NULL;
    }

    // "N" hands our references over to the tuple ("O" would add another
    // reference to each list and leak the ones we own).
    return Py_BuildValue("NN", result.values, result.counts);
}


/*
 * Python entry point: decode(values, counts) -> list
 *
 * Parses the two positional arguments, validates them, and delegates the
 * expansion to decode_sequence().
 *
 * Returns a new reference to the decoded list, or NULL with a Python
 * exception set:
 *   - ValueError      if either argument has no usable sequence length
 *   - AssertionError  if the two arguments differ in length
 *   - whatever exception was raised while decoding
 */
PyObject* decode_c(PyObject* self, PyObject* args)
{
    (void)self;

    PyObject *values = NULL;
    PyObject *counts = NULL;

    // On a parse failure PyArg_ParseTuple has already set the exception;
    // an extension function must signal that by returning NULL.
    if (!PyArg_ParseTuple(args, "OO", &values, &counts)) {
        return NULL;
    }

    // Query each length once; -1 means the object does not support len().
    Py_ssize_t n_values = PySequence_Length(values);
    if (n_values == -1) {
        PyErr_SetString(PyExc_ValueError, "values argument is not iterable.");
        return NULL;
    }

    Py_ssize_t n_counts = PySequence_Length(counts);
    if (n_counts == -1) {
        PyErr_SetString(PyExc_ValueError, "counts argument is not iterable.");
        return NULL;
    }

    if (n_values != n_counts) {
        PyErr_SetString(PyExc_AssertionError, "len(values) != len(counts)");
        return NULL;
    }

    PyObject *decoded_list = decode_sequence(values, counts);

    // Never hand a result back to the interpreter while an exception is
    // pending: drop the partial list and propagate the error instead.
    if (decoded_list != NULL && PyErr_Occurred()) {
        Py_DECREF(decoded_list);
        return NULL;
    }

    // decode_sequence() already gives us an owned reference, so return it
    // directly; wrapping it in Py_BuildValue("O", ...) would leak it.
    return decoded_list;
}
//######## MODULE LEVEL FUNCTIONS ########

// method definitions
static PyMethodDef methods[] = {
{ "encode", encode_c, METH_VARARGS, encode_doc},
{ "decode", decode_c, METH_VARARGS, decode_doc},
{ NULL, NULL, 0, NULL }
};

// module definition
static struct PyModuleDef rle_fast = {
PyModuleDef_HEAD_INIT,
"rle_fast",
rle_module_doc,
-1,
methods
};

// Module initialisation hook, looked up by name when Python imports
// `rle_fast`. Returns the new module object, or NULL with an exception set.
//
// No Py_Initialize() call here: that function is for programs embedding the
// interpreter. An extension's init function is only ever invoked by an
// interpreter that is already running.
PyMODINIT_FUNC PyInit_rle_fast(void) {
    return PyModule_Create(&rle_fast);
}
123 changes: 123 additions & 0 deletions rle_fast/rle_utils.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <stdbool.h>

// Result bundle for run-length encoding: two Python lists returned together
// because a C function can only return a single value.
struct code
{
    // Python list with the value of each run, in order of appearance
    PyObject *values;
    // Python list with the length of each run; pairs up with `values`
    PyObject *counts;
};

struct code encode_sequence( PyObject* sequence )
{
// create an iterator object to the sequence
PyObject* iterator = PyObject_GetIter(sequence);
// create a python object to hold the current item from the sequence
PyObject* current_item;

// create a new empty python list object to hold all values in sequence
PyObject* values_list = PyList_New(0);
// create a new empty python list object to hold counts of values in value list
PyObject* count_list = PyList_New(0);

// get first item from sequence and store it in variable for further use
PyObject* current_run_value = PyIter_Next(iterator);

// create variable to hold the run length of current item
Py_ssize_t current_run_length = 1;

// insert the current_run_value in value list
PyList_Append(values_list, current_run_value);

// repeat the following as long as some element is returned from sequence
while( (current_item = PyIter_Next(iterator) ) )
{
// if the current element is still part of the lenght,
if ( PyObject_RichCompareBool(current_item,current_run_value, Py_EQ) )
// increment to count of current element
++current_run_length;
// otherwise,
else
{
// add count of current run to the count list
PyList_Append(count_list, PyLong_FromSsize_t(current_run_length) );

// add the current element to value list
PyList_Append(values_list, current_item);

// update the current run length for current element
current_run_length = 1;

// mark this element as the current element being processed
current_run_value = current_item;
}
}

// now process the last count
PyList_Append(count_list, PyLong_FromSsize_t(current_run_length) );

// remove the current reference to the iterator
Py_DECREF(iterator);

// create a structure to hold the value and count lists.
// this is because, C does not support returning multiple values from functions
// so we will wrap these lists in a structure, and return the structure
struct code encoded;

// add the value list to the structure
encoded.values = values_list;

// add the count list to the structure
encoded.counts = count_list;

if (PyErr_Occurred())
{
// if an error is thrown at some point in the code,
// most likely, the user is comparing strings.
// this is because, we are assuming the user only uses sequences with numbers,
// so when a string is passed, the function PyLong_AsLong throws an error.
// because, characters and strings cant be interpreted as long objects
PyErr_SetString(PyExc_NotImplementedError, "Datatype not supported.");
}

// return this structure
return encoded;
}

/*
 * Expand a run-length encoding back into a flat Python list.
 *
 * Parameters
 *   values: borrowed reference to an iterable of run values.
 *   counts: borrowed reference to an iterable of integer run lengths, paired
 *           positionally with `values`. A count <= 0 contributes no items;
 *           surplus entries in `counts` are ignored.
 *
 * Returns
 *   A new reference to a list in which each value is repeated `count` times,
 *   or NULL with a Python exception set:
 *     - AssertionError if `counts` runs out before `values`
 *     - the exception raised by iteration, by converting a count to an
 *       integer, or by allocation
 */
PyObject* decode_sequence(PyObject* values, PyObject* counts)
{
    PyObject *decoded_list = NULL;
    PyObject *value_iterator = NULL;
    PyObject *count_iterator = NULL;
    // items just pulled from the two iterators (owned references)
    PyObject *current_value = NULL;
    PyObject *current_count = NULL;

    decoded_list = PyList_New(0);
    if (decoded_list == NULL) {
        goto fail;
    }

    value_iterator = PyObject_GetIter(values);
    if (value_iterator == NULL) {
        goto fail;
    }

    count_iterator = PyObject_GetIter(counts);
    if (count_iterator == NULL) {
        goto fail;
    }

    while ((current_value = PyIter_Next(value_iterator)) != NULL)
    {
        current_count = PyIter_Next(count_iterator);
        if (current_count == NULL) {
            // Exhausted without an error: counts is shorter than values.
            if (!PyErr_Occurred()) {
                PyErr_SetString(PyExc_AssertionError, "len(values) != len(counts)");
            }
            goto fail;
        }

        Py_ssize_t repeat = PyLong_AsSsize_t(current_count);
        Py_CLEAR(current_count);
        // -1 is also a legal value, so confirm with the error indicator.
        if (repeat == -1 && PyErr_Occurred()) {
            goto fail;
        }

        for (Py_ssize_t i = 0; i < repeat; ++i) {
            if (PyList_Append(decoded_list, current_value) < 0) {
                goto fail;
            }
        }

        // The list holds its own references; release ours.
        Py_CLEAR(current_value);
    }

    // The loop also stops when the value iterator raised.
    if (PyErr_Occurred()) {
        goto fail;
    }

    Py_DECREF(value_iterator);
    Py_DECREF(count_iterator);
    return decoded_list;

fail:
    Py_XDECREF(current_count);
    Py_XDECREF(current_value);
    Py_XDECREF(count_iterator);
    Py_XDECREF(value_iterator);
    Py_XDECREF(decoded_list);
    return NULL;
}
6 changes: 5 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
import setuptools

# C extension built from a single source file and installed as the
# ``rle.rle_fast`` submodule.
rle_fast_extension = setuptools.Extension(
    name="rle.rle_fast",
    sources=["rle_fast/rle_fast_extension.c"],
)

# Read the README for use as the package's long description.
# The encoding is given explicitly so the build does not depend on the
# platform's default locale encoding.
with open('README.md', "r", encoding="utf-8") as f:
    long_description = f.read()

setuptools.setup(
name='python-rle',
name='python-rle',
version="0.0.3",
author="Tan Nian Wei",
author_email="[email protected]",
description="Run-length encoding for data analysis in Python",
ext_modules = [rle_fast_extension],
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/tnwei/python-rle",
Expand Down
27 changes: 27 additions & 0 deletions tests/test_decode_rlefast.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import rle.rle_fast as rle
import unittest

class TestDecodeRleFast(unittest.TestCase):
def test_working(self):
values, counts = ['a', 'b', 'c', 'd', 'e'], [1, 2, 3, 2, 1]
assert rle.decode(values, counts) == ['a', 'b', 'b', 'c', 'c', 'c', 'd', 'd', 'e']

def test_formats(self):
# Mixed formats
values, counts = [6, 2, 'abc', 3], [2, 2, 1, 1]
assert rle.decode(values, counts) == [6, 6, 2, 2, 'abc', 3]

# All chars
values, counts = ['a', 'b', 'e', 'd', 's', 'd', 'e'], [2, 2, 4, 2, 1, 1, 1]
assert rle.decode(values, counts) == [i for i in 'aabbeeeeddsde']

def test_one_unique_value_only(self):
values, counts = [1], [50]
assert rle.decode(values, counts) == [1]*50

def test_one_value_only(self):
values, counts = [1], [1]
assert rle.decode(values, counts) == ([1])

# Run this file's tests when it is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
29 changes: 29 additions & 0 deletions tests/test_encode_rlefast.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import rle.rle_fast as rle
import unittest
## Test rle.encode

class TestEncodeRleFast(unittest.TestCase):

def test_working(self,):
test_list = [1, 2, 2, 4, 4, 4, 5, 3]
assert rle.encode(test_list) == ([1, 2, 4, 5, 3], [1, 2, 3, 1, 1])


def test_formats(self,):
test_tuples = (6, 6, 2, 2, 'abc', 3)
assert rle.encode(test_tuples) == ([6, 2, 'abc', 3], [2, 2, 1, 1])

test_string = 'aabbeeeeddsde'
assert rle.encode(test_string) == (['a', 'b', 'e', 'd', 's', 'd', 'e'], [2, 2, 4, 2, 1, 1, 1])


def test_one_unique_value_only(self,):
test_list = [1]*50
assert rle.encode(test_list) == ([1], [50])

def test_last_value_different(self,):
test_list = [1] * 49 + [2]
assert rle.encode(test_list) == ([1, 2], [49, 1])

# Run this file's tests when it is executed directly as a script.
if __name__ == "__main__":
    unittest.main()