Simple light-weight LZ compression library.

Dependents:   display-puck

Revision:
1:0c9aeae76ab2
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lz.cpp	Tue Jul 29 14:02:53 2014 +0000
@@ -0,0 +1,542 @@
+/*************************************************************************
+* Name:        lz.c
+* Author:      Marcus Geelnard
+* Description: LZ77 coder/decoder implementation.
+* Reentrant:   Yes
+*
+* The LZ77 compression scheme is a substitutional compression scheme
+* proposed by Abraham Lempel and Jakob Ziv in 1977. It is very simple in
+* its design, and uses no fancy bit level compression.
+*
+* This is my first attempt at an implementation of a LZ77 code/decoder.
+*
+* The principle of the LZ77 compression algorithm is to store repeated
+* occurrences of strings as references to previous occurrences of the same
+* string. The point is that the reference consumes less space than the
+* string itself, provided that the string is long enough (in this
+* implementation, the string has to be at least 4 bytes long, since the
+* minimum coded reference is 3 bytes long). Also note that the term
+* "string" refers to any kind of byte sequence (it does not have to be
+* an ASCII string, for instance).
+*
+* The coder uses a brute force approach to finding string matches in the
+* history buffer (or "sliding window", if you wish), which is very, very
+* slow. I recon the complexity is somewhere between O(n^2) and O(n^3),
+* depending on the input data.
+*
+* There is also a faster implementation that uses a large working buffer
+* in which a "jump table" is stored, which is used to quickly find
+* possible string matches (see the source code for LZ_CompressFast() for
+* more information). The faster method is an order of magnitude faster,
+* but still quite slow compared to other compression methods.
+*
+* The upside is that decompression is very fast, and the compression ratio
+* is often very good.
+*
+* The reference to a string is coded as a (length,offset) pair, where the
+* length indicates the length of the string, and the offset gives the
+* offset from the current data position. To distinguish between string
+* references and literal strings (uncompressed bytes), a string reference
+* is preceded by a marker byte, which is chosen as the least common byte
+* symbol in the input data stream (this marker byte is stored in the
+* output stream as the first byte).
+*
+* Occurrences of the marker byte in the stream are encoded as the marker
+* byte followed by a zero byte, which means that occurrences of the marker
+* byte have to be coded with two bytes.
+*
+* The lengths and offsets are coded in a variable length fashion, allowing
+* values of any magnitude (up to 4294967295 in this implementation).
+*
+* With this compression scheme, the worst case compression result is
+* (257/256)*insize + 1.
+*
+*-------------------------------------------------------------------------
+* Copyright (c) 2003-2006 Marcus Geelnard
+*
+* This software is provided 'as-is', without any express or implied
+* warranty. In no event will the authors be held liable for any damages
+* arising from the use of this software.
+*
+* Permission is granted to anyone to use this software for any purpose,
+* including commercial applications, and to alter it and redistribute it
+* freely, subject to the following restrictions:
+*
+* 1. The origin of this software must not be misrepresented; you must not
+*    claim that you wrote the original software. If you use this software
+*    in a product, an acknowledgment in the product documentation would
+*    be appreciated but is not required.
+*
+* 2. Altered source versions must be plainly marked as such, and must not
+*    be misrepresented as being the original software.
+*
+* 3. This notice may not be removed or altered from any source
+*    distribution.
+*
+* Marcus Geelnard
+* marcus.geelnard at home.se
+*************************************************************************/
+
+
+/*************************************************************************
+* Constants used for LZ77 coding
+*************************************************************************/
+
+/* Maximum offset (can be any size < 2^31). Lower values give faster
+   compression, while higher values gives better compression. The default
+   value of 100000 is quite high. Experiment to see what works best for
+   you. */
+#define LZ_MAX_OFFSET 100000
+
+/*************************************************************************
+*                           INTERNAL FUNCTIONS                           *
+*************************************************************************/
+
+
+/*************************************************************************
+* _LZ_StringCompare() - Return maximum length string match.
+*************************************************************************/
+
+static unsigned int _LZ_StringCompare( unsigned char * str1,
+  unsigned char * str2, unsigned int minlen, unsigned int maxlen )
+{
+    unsigned int len;
+
+    for( len = minlen; (len < maxlen) && (str1[len] == str2[len]); ++ len );
+
+    return len;
+}
+
+
+/*************************************************************************
+* _LZ_WriteVarSize() - Write unsigned integer with variable number of
+* bytes depending on value.
+*************************************************************************/
+
+static int _LZ_WriteVarSize( unsigned int x, unsigned char * buf )
+{
+    unsigned int y;
+    int num_bytes, i, b;
+
+    /* Determine number of bytes needed to store the number x */
+    y = x >> 3;
+    for( num_bytes = 5; num_bytes >= 2; -- num_bytes )
+    {
+        if( y & 0xfe000000 ) break;
+        y <<= 7;
+    }
+
+    /* Write all bytes, seven bits in each, with 8:th bit set for all */
+    /* but the last byte. */
+    for( i = num_bytes-1; i >= 0; -- i )
+    {
+        b = (x >> (i*7)) & 0x0000007f;
+        if( i > 0 )
+        {
+            b |= 0x00000080;
+        }
+        *buf ++ = (unsigned char) b;
+    }
+
+    /* Return number of bytes written */
+    return num_bytes;
+}
+
+
+/*************************************************************************
+* _LZ_ReadVarSize() - Read unsigned integer with variable number of
+* bytes depending on value.
+*************************************************************************/
+
+static int _LZ_ReadVarSize( unsigned int * x, unsigned char * buf )
+{
+    unsigned int y, b, num_bytes;
+
+    /* Read complete value (stop when byte contains zero in 8:th bit) */
+    y = 0;
+    num_bytes = 0;
+    do
+    {
+        b = (unsigned int) (*buf ++);
+        y = (y << 7) | (b & 0x0000007f);
+        ++ num_bytes;
+    }
+    while( b & 0x00000080 );
+
+    /* Store value in x */
+    *x = y;
+
+    /* Return number of bytes read */
+    return num_bytes;
+}
+
+
+
+/*************************************************************************
+*                            PUBLIC FUNCTIONS                            *
+*************************************************************************/
+
+
+/*************************************************************************
+* LZ_Compress() - Compress a block of data using an LZ77 coder.
+*  in     - Input (uncompressed) buffer.
+*  out    - Output (compressed) buffer. This buffer must be 0.4% larger
+*           than the input buffer, plus one byte.
+*  insize - Number of input bytes.
+* The function returns the size of the compressed data.
+*************************************************************************/
+
+int LZ_Compress( unsigned char *in, unsigned char *out,
+    unsigned int insize )
+{
+    unsigned char marker, symbol;
+    unsigned int  inpos, outpos, bytesleft, i;
+    unsigned int  maxoffset, offset, bestoffset;
+    unsigned int  maxlength, length, bestlength;
+    unsigned int  histogram[ 256 ];
+    unsigned char *ptr1, *ptr2;
+    
+
+    /* Do we have anything to compress? */
+    if( insize < 1 )
+    {
+        return 0;
+    }
+    
+
+    /* Create histogram */
+    for( i = 0; i < 256; ++ i )
+    {
+        histogram[ i ] = 0;
+    }
+    for( i = 0; i < insize; ++ i )
+    {
+        ++ histogram[ in[ i ] ];
+    }
+
+    /* Find the least common byte, and use it as the marker symbol */
+    marker = 0;
+    for( i = 1; i < 256; ++ i )
+    {
+        if( histogram[ i ] < histogram[ marker ] )
+        {
+            marker = i;
+        }
+    }
+
+    /* Remember the marker symbol for the decoder */
+    out[ 0 ] = marker;
+
+    /* Start of compression */
+    inpos = 0;
+    outpos = 1;
+
+    /* Main compression loop */
+    bytesleft = insize;
+    do
+    {
+        
+        
+        /* Determine most distant position */
+        if( inpos > LZ_MAX_OFFSET ) maxoffset = LZ_MAX_OFFSET;
+        else                        maxoffset = inpos;
+
+        /* Get pointer to current position */
+        ptr1 = &in[ inpos ];
+
+        /* Search history window for maximum length string match */
+        bestlength = 3;
+        bestoffset = 0;
+        for( offset = 3; offset <= maxoffset; ++ offset )
+        {
+            /* Get pointer to candidate string */
+            ptr2 = &ptr1[ -(int)offset ];
+
+            /* Quickly determine if this is a candidate (for speed) */
+            if( (ptr1[ 0 ] == ptr2[ 0 ]) &&
+                (ptr1[ bestlength ] == ptr2[ bestlength ]) )
+            {
+                /* Determine maximum length for this offset */
+                maxlength = (bytesleft < offset ? bytesleft : offset);
+
+                /* Count maximum length match at this offset */
+                length = _LZ_StringCompare( ptr1, ptr2, 0, maxlength );
+
+                /* Better match than any previous match? */
+                if( length > bestlength )
+                {
+                    bestlength = length;
+                    bestoffset = offset;
+                }
+            }
+        }
+
+        /* Was there a good enough match? */
+        if( (bestlength >= 8) ||
+            ((bestlength == 4) && (bestoffset <= 0x0000007f)) ||
+            ((bestlength == 5) && (bestoffset <= 0x00003fff)) ||
+            ((bestlength == 6) && (bestoffset <= 0x001fffff)) ||
+            ((bestlength == 7) && (bestoffset <= 0x0fffffff)) )
+        {
+            out[ outpos ++ ] = (unsigned char) marker;
+            outpos += _LZ_WriteVarSize( bestlength, &out[ outpos ] );
+            outpos += _LZ_WriteVarSize( bestoffset, &out[ outpos ] );
+            inpos += bestlength;
+            bytesleft -= bestlength;
+        }
+        else
+        {
+            /* Output single byte (or two bytes if marker byte) */
+            symbol = in[ inpos ++ ];
+            out[ outpos ++ ] = symbol;
+            if( symbol == marker )
+            {
+                out[ outpos ++ ] = 0;
+            }
+            -- bytesleft;
+        }
+    }
+    while( bytesleft > 3 );
+
+    /* Dump remaining bytes, if any */
+    while( inpos < insize )
+    {
+        if( in[ inpos ] == marker )
+        {
+            out[ outpos ++ ] = marker;
+            out[ outpos ++ ] = 0;
+        }
+        else
+        {
+            out[ outpos ++ ] = in[ inpos ];
+        }
+        ++ inpos;
+    }
+
+    return outpos;
+}
+
+
+/*************************************************************************
+* LZ_CompressFast() - Compress a block of data using an LZ77 coder.
+*  in     - Input (uncompressed) buffer.
+*  out    - Output (compressed) buffer. This buffer must be 0.4% larger
+*           than the input buffer, plus one byte.
+*  insize - Number of input bytes.
+*  work   - Pointer to a temporary buffer (internal working buffer), which
+*           must be able to hold (insize+65536) unsigned integers.
+* The function returns the size of the compressed data.
+*************************************************************************/
+
+int LZ_CompressFast( unsigned char *in, unsigned char *out,
+    unsigned int insize, unsigned int *work )
+{
+    unsigned char marker, symbol;
+    unsigned int  inpos, outpos, bytesleft, i, index, symbols;
+    unsigned int  offset, bestoffset;
+    unsigned int  maxlength, length, bestlength;
+    unsigned int  histogram[ 256 ], *lastindex, *jumptable;
+    unsigned char *ptr1, *ptr2;
+
+    /* Do we have anything to compress? */
+    if( insize < 1 )
+    {
+        return 0;
+    }
+
+    /* Assign arrays to the working area */
+    lastindex = work;
+    jumptable = &work[ 65536 ];
+
+    /* Build a "jump table". Here is how the jump table works:
+       jumptable[i] points to the nearest previous occurrence of the same
+       symbol pair as in[i]:in[i+1], so in[i] == in[jumptable[i]] and
+       in[i+1] == in[jumptable[i]+1], and so on... Following the jump table
+       gives a dramatic boost for the string search'n'match loop compared
+       to doing a brute force search. The jump table is built in O(n) time,
+       so it is a cheap operation in terms of time, but it is expensice in
+       terms of memory consumption. */
+    for( i = 0; i < 65536; ++ i )
+    {
+        lastindex[ i ] = 0xffffffff;
+    }
+    for( i = 0; i < insize-1; ++ i )
+    {
+        symbols = (((unsigned int)in[i]) << 8) | ((unsigned int)in[i+1]);
+        index = lastindex[ symbols ];
+        lastindex[ symbols ] = i;
+        jumptable[ i ] = index;
+    }
+    jumptable[ insize-1 ] = 0xffffffff;
+
+    /* Create histogram */
+    for( i = 0; i < 256; ++ i )
+    {
+        histogram[ i ] = 0;
+    }
+    for( i = 0; i < insize; ++ i )
+    {
+        ++ histogram[ in[ i ] ];
+    }
+
+    /* Find the least common byte, and use it as the marker symbol */
+    marker = 0;
+    for( i = 1; i < 256; ++ i )
+    {
+        if( histogram[ i ] < histogram[ marker ] )
+        {
+            marker = i;
+        }
+    }
+
+    /* Remember the marker symbol for the decoder */
+    out[ 0 ] = marker;
+
+    /* Start of compression */
+    inpos = 0;
+    outpos = 1;
+
+    /* Main compression loop */
+    bytesleft = insize;
+    do
+    {
+        /* Get pointer to current position */
+        ptr1 = &in[ inpos ];
+
+        /* Search history window for maximum length string match */
+        bestlength = 3;
+        bestoffset = 0;
+        index = jumptable[ inpos ];
+        while( (index != 0xffffffff) && ((inpos - index) < LZ_MAX_OFFSET) )
+        {
+            /* Get pointer to candidate string */
+            ptr2 = &in[ index ];
+
+            /* Quickly determine if this is a candidate (for speed) */
+            if( ptr2[ bestlength ] == ptr1[ bestlength ] )
+            {
+                /* Determine maximum length for this offset */
+                offset = inpos - index;
+                maxlength = (bytesleft < offset ? bytesleft : offset);
+
+                /* Count maximum length match at this offset */
+                length = _LZ_StringCompare( ptr1, ptr2, 2, maxlength );
+
+                /* Better match than any previous match? */
+                if( length > bestlength )
+                {
+                    bestlength = length;
+                    bestoffset = offset;
+                }
+            }
+
+            /* Get next possible index from jump table */
+            index = jumptable[ index ];
+        }
+
+        /* Was there a good enough match? */
+        if( (bestlength >= 8) ||
+            ((bestlength == 4) && (bestoffset <= 0x0000007f)) ||
+            ((bestlength == 5) && (bestoffset <= 0x00003fff)) ||
+            ((bestlength == 6) && (bestoffset <= 0x001fffff)) ||
+            ((bestlength == 7) && (bestoffset <= 0x0fffffff)) )
+        {
+            out[ outpos ++ ] = (unsigned char) marker;
+            outpos += _LZ_WriteVarSize( bestlength, &out[ outpos ] );
+            outpos += _LZ_WriteVarSize( bestoffset, &out[ outpos ] );
+            inpos += bestlength;
+            bytesleft -= bestlength;
+        }
+        else
+        {
+            /* Output single byte (or two bytes if marker byte) */
+            symbol = in[ inpos ++ ];
+            out[ outpos ++ ] = symbol;
+            if( symbol == marker )
+            {
+                out[ outpos ++ ] = 0;
+            }
+            -- bytesleft;
+        }
+    }
+    while( bytesleft > 3 );
+
+    /* Dump remaining bytes, if any */
+    while( inpos < insize )
+    {
+        if( in[ inpos ] == marker )
+        {
+            out[ outpos ++ ] = marker;
+            out[ outpos ++ ] = 0;
+        }
+        else
+        {
+            out[ outpos ++ ] = in[ inpos ];
+        }
+        ++ inpos;
+    }
+
+    return outpos;
+}
+
+
+/*************************************************************************
+* LZ_Uncompress() - Uncompress a block of data using an LZ77 decoder.
+*  in      - Input (compressed) buffer.
+*  out     - Output (uncompressed) buffer. This buffer must be large
+*            enough to hold the uncompressed data.
+*  insize  - Number of input bytes.
+*************************************************************************/
+
+void LZ_Uncompress( unsigned char *in, unsigned char *out,
+    unsigned int insize )
+{
+    unsigned char marker, symbol;
+    unsigned int  i, inpos, outpos, length, offset;
+
+    /* Do we have anything to uncompress? */
+    if( insize < 1 )
+    {
+        return;
+    }
+
+    /* Get marker symbol from input stream */
+    marker = in[ 0 ];
+    inpos = 1;
+
+    /* Main decompression loop */
+    outpos = 0;
+    do
+    {
+        symbol = in[ inpos ++ ];
+        if( symbol == marker )
+        {
+            /* We had a marker byte */
+            if( in[ inpos ] == 0 )
+            {
+                /* It was a single occurrence of the marker byte */
+                out[ outpos ++ ] = marker;
+                ++ inpos;
+            }
+            else
+            {
+                /* Extract true length and offset */
+                inpos += _LZ_ReadVarSize( &length, &in[ inpos ] );
+                inpos += _LZ_ReadVarSize( &offset, &in[ inpos ] );
+
+                /* Copy corresponding data from history window */
+                for( i = 0; i < length; ++ i )
+                {
+                    out[ outpos ] = out[ outpos - offset ];
+                    ++ outpos;
+                }
+            }
+        }
+        else
+        {
+            /* No marker, plain copy */
+            out[ outpos ++ ] = symbol;
+        }
+    }
+    while( inpos < insize );
+}