Added tests, added skipRaw, simplified ensureAvailable

markrileybot · Dec 8, 2016 · 345362d · 345362d
1 parent b86ad77
commit 345362d
Show file tree

Hide file tree

Showing 4 changed files with 217 additions and 29 deletions.
diff --git a/src/main/java/heatshrink/HsInputStream.java b/src/main/java/heatshrink/HsInputStream.java
@@ -3,24 +3,20 @@
 import java.io.EOFException;
 import java.io.FilterInputStream;
 import java.io.IOException;
-import java.io.InputStream;
 
 /**
  *
  */
 public class HsInputStream extends FilterInputStream {
 
-	private static final double LN2 = Math.log(2);
-
 	/**
 	 * State machine states.  Don't really need this but it's nice
 	 * to match the c code.
 	 */
 	enum State {
 		TAG_BIT,                /* tag bit */
 		YIELD_LITERAL,          /* ready to yield literal byte */
-		BACKREF_INDEX,          /* ready to yield backref index */
-		BACKREF_COUNT,          /* ready to yield backref count */
+		BACKREF_BOUNDS,         /* ready to yield backref bounds */
 		YIELD_BACKREF,          /* ready to yield back-reference */
 		BUFFER_EMPTY,           /* Not enough data to continue */
 	}
@@ -87,17 +83,17 @@ enum State {
 	 *           this instance is to be created without an underlying stream.
 	 */
 	public HsInputStream(java.io.InputStream in) {
-		this(in, 2 << 11, 11, 4);
+		this(in, 11, 4);
 	}
 
 
 	public HsInputStream(java.io.InputStream in, int windowSize, int lookaheadSize) {
-		this(in, 2 << windowSize, windowSize, lookaheadSize);
+		this(in, bestInputBufferSize(0, windowSize), windowSize, lookaheadSize);
 	}
 
 	public HsInputStream(java.io.InputStream in, int bufferSize, int windowSize, int lookaheadSize) {
 		super(in);
-		this.inputBuffer = new byte[Math.max(2 << windowSize, bufferSize)];
+		this.inputBuffer = new byte[bestInputBufferSize(bufferSize, windowSize)];
 		this.window = new byte[1 << windowSize];
 		this.windowSize = windowSize;
 		this.lookaheadSize = lookaheadSize;
@@ -185,17 +181,16 @@ public int read(byte b[], int off, int len) throws IOException {
 				case YIELD_LITERAL:
 					state = readLiteral(rr);
 					break;
-				case BACKREF_INDEX:
-					state = readBackrefIndex();
-					break;
-				case BACKREF_COUNT:
-					state = readBackrefCount();
+				case BACKREF_BOUNDS:
+					state = readBackrefBounds();
 					break;
 				case YIELD_BACKREF:
 					state = readBackref(rr);
 					break;
 				case BUFFER_EMPTY:
 					break;
+				default:
+					break;
 			}
 
 			if(state == State.BUFFER_EMPTY) {
@@ -216,18 +211,15 @@ private State readTagBit() throws IOException {
 			return State.YIELD_LITERAL;
 		}
 		outputCount = outputIndex = 0;
-		return State.BACKREF_INDEX;
+		return State.BACKREF_BOUNDS;
 	}
 
-	private State readBackrefIndex() throws IOException {
+	private State readBackrefBounds() throws IOException {
 		int bits = getBits(windowSize);
 		if(bits == -1) return State.BUFFER_EMPTY;
 		outputIndex = bits + 1;
-		return State.BACKREF_COUNT;
-	}
 
-	private State readBackrefCount() throws IOException {
-		int bits = getBits(lookaheadSize);
+		bits = getBits(lookaheadSize);
 		if(bits == -1) return State.BUFFER_EMPTY;
 		outputCount = bits + 1;
 		return State.YIELD_BACKREF;
@@ -264,13 +256,14 @@ private State readLiteral(ReadResult rr) throws IOException {
 	}
 
 	/**
+	 * Skips bytes (decompressing first).  This means that <code>n</code>
+	 * and the return value will be relative to the uncompressed bytes.
+	 *
 	 * Skips over and discards <code>n</code> bytes of data from the
 	 * input stream. The <code>skip</code> method may, for a variety of
 	 * reasons, end up skipping over some smaller number of bytes,
 	 * possibly <code>0</code>. The actual number of bytes skipped is
 	 * returned.
-	 * <p>
-	 * This method simply performs <code>in.skip(n)</code>.
 	 *
 	 * @param      n   the number of bytes to be skipped.
 	 * @return     the actual number of bytes skipped.
@@ -285,6 +278,25 @@ public long skip(long n) throws IOException {
 		return r;
 	}
 
+	/**
+	 * Like {@link #skip(long)}, but skips raw (compressed) input bytes instead of decompressed ones.
+	 * Any partially consumed input byte is discarded; a negative <code>n</code> skips nothing.
+	 * @param n the number of compressed bytes to skip
+	 * @return the number of compressed bytes actually skipped (may be less than <code>n</code>)
+	 * @throws IOException if the underlying stream fails while skipping
+	 * @see #skip(long)
+	 */
+	public long skipRaw(long n) throws IOException {
+		long toSkip = Math.max(0, Math.min(n, inputBufferLen - inputBufferPos)); // never move backwards
+		currentBytePos = 0;
+		inputBufferPos += toSkip;
+		n -= toSkip;
+		if(n > 0) {
+			toSkip += in.skip(n);
+		}
+		return toSkip;
+	}
+
 	/**
 	 * Returns an estimate of the number of bytes that can be read (or
 	 * skipped over) from this input stream without blocking by the next
@@ -384,15 +396,17 @@ public void clear() {
 		state = State.TAG_BIT;
 		outputCount = outputIndex = 0;
 		inputBufferPos = inputBufferLen = 0;
-		currentBytePos = 0;
 		inputExhausted = false;
+		currentBytePos = 0;
+		windowPos = 0;
 	}
 
-	private boolean ensureAvailable(int bitsRequired) throws IOException {
+	// exposed for testing
+	boolean ensureAvailable(int bitsRequired) throws IOException {
 		int bytesRemaining = inputBufferLen - inputBufferPos;
 		int bitsAvailable = bytesRemaining * 8;
 
-		bitsRequired -= (currentBytePos > 0 ? 8 - ((Math.log(currentBytePos) / LN2) + 1) : 0);
+		bitsRequired -= currentBytePos;
 		if(bitsRequired > bitsAvailable) {
 			if(bytesRemaining > 0) {
 				// lame buffer shift won't happen often
@@ -417,26 +431,30 @@ private int getBits(int numBits) throws IOException {
 		if(!ensureAvailable(numBits)) {
 			return -1;
 		}
-		for(; numBits > 0; numBits--, currentBytePos >>= 1) {
+		for(; numBits > 0; numBits--, currentBytePos--) {
 			if(currentBytePos == 0) {
 				currentByte = inputBuffer[inputBufferPos++];
-				currentBytePos = 0x80;
+				currentBytePos = 8;
 			}
 
 			ret <<= 1;
-			if (currentBytePos == 0x80 && numBits >= 8) {
+			if (currentBytePos == 8 && numBits >= 8) {
 				ret <<= 7;
 				ret |= currentByte & 0xff;
 				numBits -= 7;
-				currentBytePos = 0;
-			} else if ((currentByte & currentBytePos) != 0) {
+				currentBytePos = 1;
+			} else if ((currentByte & (1 << (currentBytePos - 1))) != 0) {
 				ret |= 0x01;
 			}
 		}
 
 		return ret;
 	}
 
+	private static int bestInputBufferSize(int bufferSize, int windowSize) {
+		return Math.max(1 << windowSize, bufferSize);
+	}
+
 	private static final class ReadResult {
 		int off;
 		int len;

diff --git a/src/test/java/heatshrink/HsInputStreamTest.java b/src/test/java/heatshrink/HsInputStreamTest.java
@@ -41,4 +41,36 @@ public void testAvailable() throws IOException {
 			Assert.assertEquals(3, hsi.available());
 		}
 	}
+
+	@Test
+	public void testSkip() throws IOException {
+		try (HsInputStream hsi = new HsInputStream(new ByteArrayInputStream(new byte[]{1,2,3}))) {
+			Assert.assertEquals(3, hsi.skip(10));
+		}
+	}
+
+	@Test
+	public void testSkipRaw() throws IOException {
+		try(HsInputStream hsi = new HsInputStream(new ByteArrayInputStream(new byte[] {}))) {
+			hsi.ensureAvailable(1);
+			Assert.assertEquals(0, hsi.skipRaw(10));
+		}
+		try(HsInputStream hsi = new HsInputStream(new ByteArrayInputStream(new byte[] {1,2,3}))) {
+			hsi.ensureAvailable(1);
+			Assert.assertEquals(3, hsi.skipRaw(10));
+		}
+		try(HsInputStream hsi = new HsInputStream(new ByteArrayInputStream(new byte[] {0,1,2,3,4,5,6,7,8,9}))) {
+			hsi.ensureAvailable(1);
+			Assert.assertEquals(10, hsi.skipRaw(10));
+		}
+		try(HsInputStream hsi = new HsInputStream(new ByteArrayInputStream(new byte[] {0,1,2,3,4,5,6,7,8,9}))) {
+			hsi.ensureAvailable(1);
+			Assert.assertEquals(5, hsi.skipRaw(5));
+		}
+		try(HsInputStream hsi = new HsInputStream(new ByteArrayInputStream(new byte[1024]), 5, 4)) {
+			hsi.ensureAvailable(1);
+			Assert.assertEquals(513, hsi.skipRaw(513));
+			Assert.assertEquals(511, hsi.skipRaw(513));
+		}
+	}
 }
diff --git a/src/test/resources/testfiles/README.md b/src/test/resources/testfiles/README.md
@@ -0,0 +1,138 @@
+# heatshrink
+
+A data compression/decompression library for embedded/real-time systems.
+
+
+## Key Features:
+
+- **Low memory usage (as low as 50 bytes)**
+    It is useful for some cases with less than 50 bytes, and useful
+    for many general cases with < 300 bytes.
+- **Incremental, bounded CPU use**
+    You can chew on input data in arbitrarily tiny bites.
+    This is a useful property in hard real-time environments.
+- **Can use either static or dynamic memory allocation**
+    The library doesn't impose any constraints on memory management.
+- **ISC license**
+    You can use it freely, even for commercial purposes.
+
+
+## Getting Started:
+
+There is a standalone command-line program, `heatshrink`, but the
+encoder and decoder can also be used as libraries, independent of each
+other. To do so, copy `heatshrink_common.h`, `heatshrink_config.h`, and
+either `heatshrink_encoder.c` or `heatshrink_decoder.c` (and their
+respective header) into your project. For projects that use both,
+static libraries are built that use static and dynamic allocation.
+
+Dynamic allocation is used by default, but in an embedded context, you
+probably want to statically allocate the encoder/decoder. Set
+`HEATSHRINK_DYNAMIC_ALLOC` to 0 in `heatshrink_config.h`.
+
+
+### Basic Usage
+
+1. Allocate a `heatshrink_encoder` or `heatshrink_decoder` state machine
+using their `alloc` function, or statically allocate one and call their
+`reset` function to initialize them. (See below for configuration
+options.)
+
+2. Use `sink` to sink an input buffer into the state machine. The
+`input_size` pointer argument will be set to indicate how many bytes of
+the input buffer were actually consumed. (If 0 bytes were conusmed, the
+buffer is full.)
+
+3. Use `poll` to move output from the state machine into an output
+buffer. The `output_size` pointer argument will be set to indicate how
+many bytes were output, and the function return value will indicate
+whether further output is available. (The state machine may not output
+any data until it has received enough input.)
+
+Repeat steps 2 and 3 to stream data through the state machine. Since
+it's doing data compression, the input and output sizes can vary
+significantly. Looping will be necessary to buffer the input and output
+as the data is processed.
+
+4. When the end of the input stream is reached, call `finish` to notify
+the state machine that no more input is available. The return value from
+`finish` will indicate whether any output remains. if so, call `poll` to
+get more.
+
+Continue calling `finish` and `poll`ing to flush remaining output until
+`finish` indicates that the output has been exhausted.
+
+Sinking more data after `finish` has been called will not work without
+calling `reset` on the state machine.
+
+
+## Configuration
+
+heatshrink has a couple configuration options, which impact its resource
+usage and how effectively it can compress data. These are set when
+dynamically allocating an encoder or decoder, or in `heatshrink_config.h`
+if they are statically allocated.
+
+- `window_sz2`, `-w` in the CLI: Set the window size to 2^W bytes.
+
+The window size determines how far back in the input can be searched for
+repeated patterns. A `window_sz2` of 8 will only use 256 bytes (2^8),
+while a `window_sz2` of 10 will use 1024 bytes (2^10). The latter uses
+more memory, but may also compress more effectively by detecting more
+repetition.
+
+The `window_sz2` setting currently must be between 4 and 15.
+
+- `lookahead_sz2`, `-l` in the CLI: Set the lookahead size to 2^L bytes.
+
+The lookahead size determines the max length for repeated patterns that
+are found. If the `lookahead_sz2` is 4, a 50-byte run of 'a' characters
+will be represented as several repeated 16-byte patterns (2^4 is 16),
+whereas a larger `lookahead_sz2` may be able to represent it all at
+once. The number of bits used for the lookahead size is fixed, so an
+overly large lookahead size can reduce compression by adding unused
+size bits to small patterns.
+
+The `lookahead_sz2` setting currently must be between 3 and the
+`window_sz2` - 1.
+
+- `input_buffer_size` - How large an input buffer to use for the
+decoder. This impacts how much work the decoder can do in a single
+step, and a larger buffer will use more memory. An extremely small
+buffer (say, 1 byte) will add overhead due to lots of suspend/resume
+function calls, but should not change how well data compresses.
+
+
+### Recommended Defaults
+
+For embedded/low memory contexts, a `window_sz2` in the 8 to 10 range is
+probably a good default, depending on how tight memory is. Smaller or
+larger window sizes may make better trade-offs in specific
+circumstances, but should be checked with representative data.
+
+The `lookahead_sz2` should probably start near the `window_sz2`/2, e.g.
+-w 8 -l 4 or -w 10 -l 5. The command-line program can be used to measure
+how well test data works with different settings.
+
+
+## More Information and Benchmarks:
+
+heatshrink is based on [LZSS], since it's particularly suitable for
+compression in small amounts of memory. It can use an optional, small
+[index] to make compression significantly faster, but otherwise can run
+in under 100 bytes of memory. The index currently adds 2^(window size+1)
+bytes to memory usage for compression, and temporarily allocates 512
+bytes on the stack during index construction (if the index is enabled).
+
+For more information, see the [blog post] for an overview, and the
+`heatshrink_encoder.h` / `heatshrink_decoder.h` header files for API
+documentation.
+
+[blog post]: http://spin.atomicobject.com/2013/03/14/heatshrink-embedded-data-compression/
+[index]: http://spin.atomicobject.com/2014/01/13/lightweight-indexing-for-embedded-systems/
+[LZSS]: http://en.wikipedia.org/wiki/Lempel-Ziv-Storer-Szymanski
+
+
+## Build Status
+
+  [![Build Status](https://travis-ci.org/atomicobject/heatshrink.png)](http://travis-ci.org/atomicobject/heatshrink)
diff --git a/src/test/resources/testfiles/README.md.hs.9.8 b/src/test/resources/testfiles/README.md.hs.9.8