Simple light-weight LZ compression library.
Revision 1:0c9aeae76ab2, committed 2014-07-29
- Comitter:
- sigveseb
- Date:
- Tue Jul 29 14:02:53 2014 +0000
- Parent:
- 0:ce4be867658b
- Commit message:
- Change to cpp
Changed in this revision
diff -r ce4be867658b -r 0c9aeae76ab2 lz.c --- a/lz.c Thu Jul 17 14:16:11 2014 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,540 +0,0 @@ -/************************************************************************* -* Name: lz.c -* Author: Marcus Geelnard -* Description: LZ77 coder/decoder implementation. -* Reentrant: Yes -* -* The LZ77 compression scheme is a substitutional compression scheme -* proposed by Abraham Lempel and Jakob Ziv in 1977. It is very simple in -* its design, and uses no fancy bit level compression. -* -* This is my first attempt at an implementation of a LZ77 code/decoder. -* -* The principle of the LZ77 compression algorithm is to store repeated -* occurrences of strings as references to previous occurrences of the same -* string. The point is that the reference consumes less space than the -* string itself, provided that the string is long enough (in this -* implementation, the string has to be at least 4 bytes long, since the -* minimum coded reference is 3 bytes long). Also note that the term -* "string" refers to any kind of byte sequence (it does not have to be -* an ASCII string, for instance). -* -* The coder uses a brute force approach to finding string matches in the -* history buffer (or "sliding window", if you wish), which is very, very -* slow. I recon the complexity is somewhere between O(n^2) and O(n^3), -* depending on the input data. -* -* There is also a faster implementation that uses a large working buffer -* in which a "jump table" is stored, which is used to quickly find -* possible string matches (see the source code for LZ_CompressFast() for -* more information). The faster method is an order of magnitude faster, -* but still quite slow compared to other compression methods. -* -* The upside is that decompression is very fast, and the compression ratio -* is often very good. -* -* The reference to a string is coded as a (length,offset) pair, where the -* length indicates the length of the string, and the offset gives the -* offset from the current data position. To distinguish between string -* references and literal strings (uncompressed bytes), a string reference -* is preceded by a marker byte, which is chosen as the least common byte -* symbol in the input data stream (this marker byte is stored in the -* output stream as the first byte). -* -* Occurrences of the marker byte in the stream are encoded as the marker -* byte followed by a zero byte, which means that occurrences of the marker -* byte have to be coded with two bytes. -* -* The lengths and offsets are coded in a variable length fashion, allowing -* values of any magnitude (up to 4294967295 in this implementation). -* -* With this compression scheme, the worst case compression result is -* (257/256)*insize + 1. -* -*------------------------------------------------------------------------- -* Copyright (c) 2003-2006 Marcus Geelnard -* -* This software is provided 'as-is', without any express or implied -* warranty. In no event will the authors be held liable for any damages -* arising from the use of this software. -* -* Permission is granted to anyone to use this software for any purpose, -* including commercial applications, and to alter it and redistribute it -* freely, subject to the following restrictions: -* -* 1. The origin of this software must not be misrepresented; you must not -* claim that you wrote the original software. If you use this software -* in a product, an acknowledgment in the product documentation would -* be appreciated but is not required. -* -* 2. Altered source versions must be plainly marked as such, and must not -* be misrepresented as being the original software. -* -* 3. This notice may not be removed or altered from any source -* distribution. -* -* Marcus Geelnard -* marcus.geelnard at home.se -*************************************************************************/ - - -/************************************************************************* -* Constants used for LZ77 coding -*************************************************************************/ - -/* Maximum offset (can be any size < 2^31). Lower values give faster - compression, while higher values gives better compression. The default - value of 100000 is quite high. Experiment to see what works best for - you. */ -#define LZ_MAX_OFFSET 100000 - - - -/************************************************************************* -* INTERNAL FUNCTIONS * -*************************************************************************/ - - -/************************************************************************* -* _LZ_StringCompare() - Return maximum length string match. -*************************************************************************/ - -static unsigned int _LZ_StringCompare( unsigned char * str1, - unsigned char * str2, unsigned int minlen, unsigned int maxlen ) -{ - unsigned int len; - - for( len = minlen; (len < maxlen) && (str1[len] == str2[len]); ++ len ); - - return len; -} - - -/************************************************************************* -* _LZ_WriteVarSize() - Write unsigned integer with variable number of -* bytes depending on value. -*************************************************************************/ - -static int _LZ_WriteVarSize( unsigned int x, unsigned char * buf ) -{ - unsigned int y; - int num_bytes, i, b; - - /* Determine number of bytes needed to store the number x */ - y = x >> 3; - for( num_bytes = 5; num_bytes >= 2; -- num_bytes ) - { - if( y & 0xfe000000 ) break; - y <<= 7; - } - - /* Write all bytes, seven bits in each, with 8:th bit set for all */ - /* but the last byte. */ - for( i = num_bytes-1; i >= 0; -- i ) - { - b = (x >> (i*7)) & 0x0000007f; - if( i > 0 ) - { - b |= 0x00000080; - } - *buf ++ = (unsigned char) b; - } - - /* Return number of bytes written */ - return num_bytes; -} - - -/************************************************************************* -* _LZ_ReadVarSize() - Read unsigned integer with variable number of -* bytes depending on value. -*************************************************************************/ - -static int _LZ_ReadVarSize( unsigned int * x, unsigned char * buf ) -{ - unsigned int y, b, num_bytes; - - /* Read complete value (stop when byte contains zero in 8:th bit) */ - y = 0; - num_bytes = 0; - do - { - b = (unsigned int) (*buf ++); - y = (y << 7) | (b & 0x0000007f); - ++ num_bytes; - } - while( b & 0x00000080 ); - - /* Store value in x */ - *x = y; - - /* Return number of bytes read */ - return num_bytes; -} - - - -/************************************************************************* -* PUBLIC FUNCTIONS * -*************************************************************************/ - - -/************************************************************************* -* LZ_Compress() - Compress a block of data using an LZ77 coder. -* in - Input (uncompressed) buffer. -* out - Output (compressed) buffer. This buffer must be 0.4% larger -* than the input buffer, plus one byte. -* insize - Number of input bytes. -* The function returns the size of the compressed data. -*************************************************************************/ - -int LZ_Compress( unsigned char *in, unsigned char *out, - unsigned int insize ) -{ - unsigned char marker, symbol; - unsigned int inpos, outpos, bytesleft, i; - unsigned int maxoffset, offset, bestoffset; - unsigned int maxlength, length, bestlength; - unsigned int histogram[ 256 ]; - unsigned char *ptr1, *ptr2; - - /* Do we have anything to compress? */ - if( insize < 1 ) - { - return 0; - } - - /* Create histogram */ - for( i = 0; i < 256; ++ i ) - { - histogram[ i ] = 0; - } - for( i = 0; i < insize; ++ i ) - { - ++ histogram[ in[ i ] ]; - } - - /* Find the least common byte, and use it as the marker symbol */ - marker = 0; - for( i = 1; i < 256; ++ i ) - { - if( histogram[ i ] < histogram[ marker ] ) - { - marker = i; - } - } - - /* Remember the marker symbol for the decoder */ - out[ 0 ] = marker; - - /* Start of compression */ - inpos = 0; - outpos = 1; - - /* Main compression loop */ - bytesleft = insize; - do - { - /* Determine most distant position */ - if( inpos > LZ_MAX_OFFSET ) maxoffset = LZ_MAX_OFFSET; - else maxoffset = inpos; - - /* Get pointer to current position */ - ptr1 = &in[ inpos ]; - - /* Search history window for maximum length string match */ - bestlength = 3; - bestoffset = 0; - for( offset = 3; offset <= maxoffset; ++ offset ) - { - /* Get pointer to candidate string */ - ptr2 = &ptr1[ -(int)offset ]; - - /* Quickly determine if this is a candidate (for speed) */ - if( (ptr1[ 0 ] == ptr2[ 0 ]) && - (ptr1[ bestlength ] == ptr2[ bestlength ]) ) - { - /* Determine maximum length for this offset */ - maxlength = (bytesleft < offset ? bytesleft : offset); - - /* Count maximum length match at this offset */ - length = _LZ_StringCompare( ptr1, ptr2, 0, maxlength ); - - /* Better match than any previous match? */ - if( length > bestlength ) - { - bestlength = length; - bestoffset = offset; - } - } - } - - /* Was there a good enough match? */ - if( (bestlength >= 8) || - ((bestlength == 4) && (bestoffset <= 0x0000007f)) || - ((bestlength == 5) && (bestoffset <= 0x00003fff)) || - ((bestlength == 6) && (bestoffset <= 0x001fffff)) || - ((bestlength == 7) && (bestoffset <= 0x0fffffff)) ) - { - out[ outpos ++ ] = (unsigned char) marker; - outpos += _LZ_WriteVarSize( bestlength, &out[ outpos ] ); - outpos += _LZ_WriteVarSize( bestoffset, &out[ outpos ] ); - inpos += bestlength; - bytesleft -= bestlength; - } - else - { - /* Output single byte (or two bytes if marker byte) */ - symbol = in[ inpos ++ ]; - out[ outpos ++ ] = symbol; - if( symbol == marker ) - { - out[ outpos ++ ] = 0; - } - -- bytesleft; - } - } - while( bytesleft > 3 ); - - /* Dump remaining bytes, if any */ - while( inpos < insize ) - { - if( in[ inpos ] == marker ) - { - out[ outpos ++ ] = marker; - out[ outpos ++ ] = 0; - } - else - { - out[ outpos ++ ] = in[ inpos ]; - } - ++ inpos; - } - - return outpos; -} - - -/************************************************************************* -* LZ_CompressFast() - Compress a block of data using an LZ77 coder. -* in - Input (uncompressed) buffer. -* out - Output (compressed) buffer. This buffer must be 0.4% larger -* than the input buffer, plus one byte. -* insize - Number of input bytes. -* work - Pointer to a temporary buffer (internal working buffer), which -* must be able to hold (insize+65536) unsigned integers. -* The function returns the size of the compressed data. -*************************************************************************/ - -int LZ_CompressFast( unsigned char *in, unsigned char *out, - unsigned int insize, unsigned int *work ) -{ - unsigned char marker, symbol; - unsigned int inpos, outpos, bytesleft, i, index, symbols; - unsigned int offset, bestoffset; - unsigned int maxlength, length, bestlength; - unsigned int histogram[ 256 ], *lastindex, *jumptable; - unsigned char *ptr1, *ptr2; - - /* Do we have anything to compress? */ - if( insize < 1 ) - { - return 0; - } - - /* Assign arrays to the working area */ - lastindex = work; - jumptable = &work[ 65536 ]; - - /* Build a "jump table". Here is how the jump table works: - jumptable[i] points to the nearest previous occurrence of the same - symbol pair as in[i]:in[i+1], so in[i] == in[jumptable[i]] and - in[i+1] == in[jumptable[i]+1], and so on... Following the jump table - gives a dramatic boost for the string search'n'match loop compared - to doing a brute force search. The jump table is built in O(n) time, - so it is a cheap operation in terms of time, but it is expensice in - terms of memory consumption. */ - for( i = 0; i < 65536; ++ i ) - { - lastindex[ i ] = 0xffffffff; - } - for( i = 0; i < insize-1; ++ i ) - { - symbols = (((unsigned int)in[i]) << 8) | ((unsigned int)in[i+1]); - index = lastindex[ symbols ]; - lastindex[ symbols ] = i; - jumptable[ i ] = index; - } - jumptable[ insize-1 ] = 0xffffffff; - - /* Create histogram */ - for( i = 0; i < 256; ++ i ) - { - histogram[ i ] = 0; - } - for( i = 0; i < insize; ++ i ) - { - ++ histogram[ in[ i ] ]; - } - - /* Find the least common byte, and use it as the marker symbol */ - marker = 0; - for( i = 1; i < 256; ++ i ) - { - if( histogram[ i ] < histogram[ marker ] ) - { - marker = i; - } - } - - /* Remember the marker symbol for the decoder */ - out[ 0 ] = marker; - - /* Start of compression */ - inpos = 0; - outpos = 1; - - /* Main compression loop */ - bytesleft = insize; - do - { - /* Get pointer to current position */ - ptr1 = &in[ inpos ]; - - /* Search history window for maximum length string match */ - bestlength = 3; - bestoffset = 0; - index = jumptable[ inpos ]; - while( (index != 0xffffffff) && ((inpos - index) < LZ_MAX_OFFSET) ) - { - /* Get pointer to candidate string */ - ptr2 = &in[ index ]; - - /* Quickly determine if this is a candidate (for speed) */ - if( ptr2[ bestlength ] == ptr1[ bestlength ] ) - { - /* Determine maximum length for this offset */ - offset = inpos - index; - maxlength = (bytesleft < offset ? bytesleft : offset); - - /* Count maximum length match at this offset */ - length = _LZ_StringCompare( ptr1, ptr2, 2, maxlength ); - - /* Better match than any previous match? */ - if( length > bestlength ) - { - bestlength = length; - bestoffset = offset; - } - } - - /* Get next possible index from jump table */ - index = jumptable[ index ]; - } - - /* Was there a good enough match? */ - if( (bestlength >= 8) || - ((bestlength == 4) && (bestoffset <= 0x0000007f)) || - ((bestlength == 5) && (bestoffset <= 0x00003fff)) || - ((bestlength == 6) && (bestoffset <= 0x001fffff)) || - ((bestlength == 7) && (bestoffset <= 0x0fffffff)) ) - { - out[ outpos ++ ] = (unsigned char) marker; - outpos += _LZ_WriteVarSize( bestlength, &out[ outpos ] ); - outpos += _LZ_WriteVarSize( bestoffset, &out[ outpos ] ); - inpos += bestlength; - bytesleft -= bestlength; - } - else - { - /* Output single byte (or two bytes if marker byte) */ - symbol = in[ inpos ++ ]; - out[ outpos ++ ] = symbol; - if( symbol == marker ) - { - out[ outpos ++ ] = 0; - } - -- bytesleft; - } - } - while( bytesleft > 3 ); - - /* Dump remaining bytes, if any */ - while( inpos < insize ) - { - if( in[ inpos ] == marker ) - { - out[ outpos ++ ] = marker; - out[ outpos ++ ] = 0; - } - else - { - out[ outpos ++ ] = in[ inpos ]; - } - ++ inpos; - } - - return outpos; -} - - -/************************************************************************* -* LZ_Uncompress() - Uncompress a block of data using an LZ77 decoder. -* in - Input (compressed) buffer. -* out - Output (uncompressed) buffer. This buffer must be large -* enough to hold the uncompressed data. -* insize - Number of input bytes. -*************************************************************************/ - -void LZ_Uncompress( unsigned char *in, unsigned char *out, - unsigned int insize ) -{ - unsigned char marker, symbol; - unsigned int i, inpos, outpos, length, offset; - - /* Do we have anything to uncompress? */ - if( insize < 1 ) - { - return; - } - - /* Get marker symbol from input stream */ - marker = in[ 0 ]; - inpos = 1; - - /* Main decompression loop */ - outpos = 0; - do - { - symbol = in[ inpos ++ ]; - if( symbol == marker ) - { - /* We had a marker byte */ - if( in[ inpos ] == 0 ) - { - /* It was a single occurrence of the marker byte */ - out[ outpos ++ ] = marker; - ++ inpos; - } - else - { - /* Extract true length and offset */ - inpos += _LZ_ReadVarSize( &length, &in[ inpos ] ); - inpos += _LZ_ReadVarSize( &offset, &in[ inpos ] ); - - /* Copy corresponding data from history window */ - for( i = 0; i < length; ++ i ) - { - out[ outpos ] = out[ outpos - offset ]; - ++ outpos; - } - } - } - else - { - /* No marker, plain copy */ - out[ outpos ++ ] = symbol; - } - } - while( inpos < insize ); -}
diff -r ce4be867658b -r 0c9aeae76ab2 lz.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lz.cpp Tue Jul 29 14:02:53 2014 +0000 @@ -0,0 +1,542 @@ +/************************************************************************* +* Name: lz.c +* Author: Marcus Geelnard +* Description: LZ77 coder/decoder implementation. +* Reentrant: Yes +* +* The LZ77 compression scheme is a substitutional compression scheme +* proposed by Abraham Lempel and Jakob Ziv in 1977. It is very simple in +* its design, and uses no fancy bit level compression. +* +* This is my first attempt at an implementation of a LZ77 code/decoder. +* +* The principle of the LZ77 compression algorithm is to store repeated +* occurrences of strings as references to previous occurrences of the same +* string. The point is that the reference consumes less space than the +* string itself, provided that the string is long enough (in this +* implementation, the string has to be at least 4 bytes long, since the +* minimum coded reference is 3 bytes long). Also note that the term +* "string" refers to any kind of byte sequence (it does not have to be +* an ASCII string, for instance). +* +* The coder uses a brute force approach to finding string matches in the +* history buffer (or "sliding window", if you wish), which is very, very +* slow. I recon the complexity is somewhere between O(n^2) and O(n^3), +* depending on the input data. +* +* There is also a faster implementation that uses a large working buffer +* in which a "jump table" is stored, which is used to quickly find +* possible string matches (see the source code for LZ_CompressFast() for +* more information). The faster method is an order of magnitude faster, +* but still quite slow compared to other compression methods. +* +* The upside is that decompression is very fast, and the compression ratio +* is often very good. +* +* The reference to a string is coded as a (length,offset) pair, where the +* length indicates the length of the string, and the offset gives the +* offset from the current data position. To distinguish between string +* references and literal strings (uncompressed bytes), a string reference +* is preceded by a marker byte, which is chosen as the least common byte +* symbol in the input data stream (this marker byte is stored in the +* output stream as the first byte). +* +* Occurrences of the marker byte in the stream are encoded as the marker +* byte followed by a zero byte, which means that occurrences of the marker +* byte have to be coded with two bytes. +* +* The lengths and offsets are coded in a variable length fashion, allowing +* values of any magnitude (up to 4294967295 in this implementation). +* +* With this compression scheme, the worst case compression result is +* (257/256)*insize + 1. +* +*------------------------------------------------------------------------- +* Copyright (c) 2003-2006 Marcus Geelnard +* +* This software is provided 'as-is', without any express or implied +* warranty. In no event will the authors be held liable for any damages +* arising from the use of this software. +* +* Permission is granted to anyone to use this software for any purpose, +* including commercial applications, and to alter it and redistribute it +* freely, subject to the following restrictions: +* +* 1. The origin of this software must not be misrepresented; you must not +* claim that you wrote the original software. If you use this software +* in a product, an acknowledgment in the product documentation would +* be appreciated but is not required. +* +* 2. Altered source versions must be plainly marked as such, and must not +* be misrepresented as being the original software. +* +* 3. This notice may not be removed or altered from any source +* distribution. +* +* Marcus Geelnard +* marcus.geelnard at home.se +*************************************************************************/ + + +/************************************************************************* +* Constants used for LZ77 coding +*************************************************************************/ + +/* Maximum offset (can be any size < 2^31). Lower values give faster + compression, while higher values gives better compression. The default + value of 100000 is quite high. Experiment to see what works best for + you. */ +#define LZ_MAX_OFFSET 100000 + +/************************************************************************* +* INTERNAL FUNCTIONS * +*************************************************************************/ + + +/************************************************************************* +* _LZ_StringCompare() - Return maximum length string match. +*************************************************************************/ + +static unsigned int _LZ_StringCompare( unsigned char * str1, + unsigned char * str2, unsigned int minlen, unsigned int maxlen ) +{ + unsigned int len; + + for( len = minlen; (len < maxlen) && (str1[len] == str2[len]); ++ len ); + + return len; +} + + +/************************************************************************* +* _LZ_WriteVarSize() - Write unsigned integer with variable number of +* bytes depending on value. +*************************************************************************/ + +static int _LZ_WriteVarSize( unsigned int x, unsigned char * buf ) +{ + unsigned int y; + int num_bytes, i, b; + + /* Determine number of bytes needed to store the number x */ + y = x >> 3; + for( num_bytes = 5; num_bytes >= 2; -- num_bytes ) + { + if( y & 0xfe000000 ) break; + y <<= 7; + } + + /* Write all bytes, seven bits in each, with 8:th bit set for all */ + /* but the last byte. */ + for( i = num_bytes-1; i >= 0; -- i ) + { + b = (x >> (i*7)) & 0x0000007f; + if( i > 0 ) + { + b |= 0x00000080; + } + *buf ++ = (unsigned char) b; + } + + /* Return number of bytes written */ + return num_bytes; +} + + +/************************************************************************* +* _LZ_ReadVarSize() - Read unsigned integer with variable number of +* bytes depending on value. +*************************************************************************/ + +static int _LZ_ReadVarSize( unsigned int * x, unsigned char * buf ) +{ + unsigned int y, b, num_bytes; + + /* Read complete value (stop when byte contains zero in 8:th bit) */ + y = 0; + num_bytes = 0; + do + { + b = (unsigned int) (*buf ++); + y = (y << 7) | (b & 0x0000007f); + ++ num_bytes; + } + while( b & 0x00000080 ); + + /* Store value in x */ + *x = y; + + /* Return number of bytes read */ + return num_bytes; +} + + + +/************************************************************************* +* PUBLIC FUNCTIONS * +*************************************************************************/ + + +/************************************************************************* +* LZ_Compress() - Compress a block of data using an LZ77 coder. +* in - Input (uncompressed) buffer. +* out - Output (compressed) buffer. This buffer must be 0.4% larger +* than the input buffer, plus one byte. +* insize - Number of input bytes. +* The function returns the size of the compressed data. +*************************************************************************/ + +int LZ_Compress( unsigned char *in, unsigned char *out, + unsigned int insize ) +{ + unsigned char marker, symbol; + unsigned int inpos, outpos, bytesleft, i; + unsigned int maxoffset, offset, bestoffset; + unsigned int maxlength, length, bestlength; + unsigned int histogram[ 256 ]; + unsigned char *ptr1, *ptr2; + + + /* Do we have anything to compress? */ + if( insize < 1 ) + { + return 0; + } + + + /* Create histogram */ + for( i = 0; i < 256; ++ i ) + { + histogram[ i ] = 0; + } + for( i = 0; i < insize; ++ i ) + { + ++ histogram[ in[ i ] ]; + } + + /* Find the least common byte, and use it as the marker symbol */ + marker = 0; + for( i = 1; i < 256; ++ i ) + { + if( histogram[ i ] < histogram[ marker ] ) + { + marker = i; + } + } + + /* Remember the marker symbol for the decoder */ + out[ 0 ] = marker; + + /* Start of compression */ + inpos = 0; + outpos = 1; + + /* Main compression loop */ + bytesleft = insize; + do + { + + + /* Determine most distant position */ + if( inpos > LZ_MAX_OFFSET ) maxoffset = LZ_MAX_OFFSET; + else maxoffset = inpos; + + /* Get pointer to current position */ + ptr1 = &in[ inpos ]; + + /* Search history window for maximum length string match */ + bestlength = 3; + bestoffset = 0; + for( offset = 3; offset <= maxoffset; ++ offset ) + { + /* Get pointer to candidate string */ + ptr2 = &ptr1[ -(int)offset ]; + + /* Quickly determine if this is a candidate (for speed) */ + if( (ptr1[ 0 ] == ptr2[ 0 ]) && + (ptr1[ bestlength ] == ptr2[ bestlength ]) ) + { + /* Determine maximum length for this offset */ + maxlength = (bytesleft < offset ? bytesleft : offset); + + /* Count maximum length match at this offset */ + length = _LZ_StringCompare( ptr1, ptr2, 0, maxlength ); + + /* Better match than any previous match? */ + if( length > bestlength ) + { + bestlength = length; + bestoffset = offset; + } + } + } + + /* Was there a good enough match? */ + if( (bestlength >= 8) || + ((bestlength == 4) && (bestoffset <= 0x0000007f)) || + ((bestlength == 5) && (bestoffset <= 0x00003fff)) || + ((bestlength == 6) && (bestoffset <= 0x001fffff)) || + ((bestlength == 7) && (bestoffset <= 0x0fffffff)) ) + { + out[ outpos ++ ] = (unsigned char) marker; + outpos += _LZ_WriteVarSize( bestlength, &out[ outpos ] ); + outpos += _LZ_WriteVarSize( bestoffset, &out[ outpos ] ); + inpos += bestlength; + bytesleft -= bestlength; + } + else + { + /* Output single byte (or two bytes if marker byte) */ + symbol = in[ inpos ++ ]; + out[ outpos ++ ] = symbol; + if( symbol == marker ) + { + out[ outpos ++ ] = 0; + } + -- bytesleft; + } + } + while( bytesleft > 3 ); + + /* Dump remaining bytes, if any */ + while( inpos < insize ) + { + if( in[ inpos ] == marker ) + { + out[ outpos ++ ] = marker; + out[ outpos ++ ] = 0; + } + else + { + out[ outpos ++ ] = in[ inpos ]; + } + ++ inpos; + } + + return outpos; +} + + +/************************************************************************* +* LZ_CompressFast() - Compress a block of data using an LZ77 coder. +* in - Input (uncompressed) buffer. +* out - Output (compressed) buffer. This buffer must be 0.4% larger +* than the input buffer, plus one byte. +* insize - Number of input bytes. +* work - Pointer to a temporary buffer (internal working buffer), which +* must be able to hold (insize+65536) unsigned integers. +* The function returns the size of the compressed data. +*************************************************************************/ + +int LZ_CompressFast( unsigned char *in, unsigned char *out, + unsigned int insize, unsigned int *work ) +{ + unsigned char marker, symbol; + unsigned int inpos, outpos, bytesleft, i, index, symbols; + unsigned int offset, bestoffset; + unsigned int maxlength, length, bestlength; + unsigned int histogram[ 256 ], *lastindex, *jumptable; + unsigned char *ptr1, *ptr2; + + /* Do we have anything to compress? */ + if( insize < 1 ) + { + return 0; + } + + /* Assign arrays to the working area */ + lastindex = work; + jumptable = &work[ 65536 ]; + + /* Build a "jump table". Here is how the jump table works: + jumptable[i] points to the nearest previous occurrence of the same + symbol pair as in[i]:in[i+1], so in[i] == in[jumptable[i]] and + in[i+1] == in[jumptable[i]+1], and so on... Following the jump table + gives a dramatic boost for the string search'n'match loop compared + to doing a brute force search. The jump table is built in O(n) time, + so it is a cheap operation in terms of time, but it is expensice in + terms of memory consumption. */ + for( i = 0; i < 65536; ++ i ) + { + lastindex[ i ] = 0xffffffff; + } + for( i = 0; i < insize-1; ++ i ) + { + symbols = (((unsigned int)in[i]) << 8) | ((unsigned int)in[i+1]); + index = lastindex[ symbols ]; + lastindex[ symbols ] = i; + jumptable[ i ] = index; + } + jumptable[ insize-1 ] = 0xffffffff; + + /* Create histogram */ + for( i = 0; i < 256; ++ i ) + { + histogram[ i ] = 0; + } + for( i = 0; i < insize; ++ i ) + { + ++ histogram[ in[ i ] ]; + } + + /* Find the least common byte, and use it as the marker symbol */ + marker = 0; + for( i = 1; i < 256; ++ i ) + { + if( histogram[ i ] < histogram[ marker ] ) + { + marker = i; + } + } + + /* Remember the marker symbol for the decoder */ + out[ 0 ] = marker; + + /* Start of compression */ + inpos = 0; + outpos = 1; + + /* Main compression loop */ + bytesleft = insize; + do + { + /* Get pointer to current position */ + ptr1 = &in[ inpos ]; + + /* Search history window for maximum length string match */ + bestlength = 3; + bestoffset = 0; + index = jumptable[ inpos ]; + while( (index != 0xffffffff) && ((inpos - index) < LZ_MAX_OFFSET) ) + { + /* Get pointer to candidate string */ + ptr2 = &in[ index ]; + + /* Quickly determine if this is a candidate (for speed) */ + if( ptr2[ bestlength ] == ptr1[ bestlength ] ) + { + /* Determine maximum length for this offset */ + offset = inpos - index; + maxlength = (bytesleft < offset ? bytesleft : offset); + + /* Count maximum length match at this offset */ + length = _LZ_StringCompare( ptr1, ptr2, 2, maxlength ); + + /* Better match than any previous match? */ + if( length > bestlength ) + { + bestlength = length; + bestoffset = offset; + } + } + + /* Get next possible index from jump table */ + index = jumptable[ index ]; + } + + /* Was there a good enough match? */ + if( (bestlength >= 8) || + ((bestlength == 4) && (bestoffset <= 0x0000007f)) || + ((bestlength == 5) && (bestoffset <= 0x00003fff)) || + ((bestlength == 6) && (bestoffset <= 0x001fffff)) || + ((bestlength == 7) && (bestoffset <= 0x0fffffff)) ) + { + out[ outpos ++ ] = (unsigned char) marker; + outpos += _LZ_WriteVarSize( bestlength, &out[ outpos ] ); + outpos += _LZ_WriteVarSize( bestoffset, &out[ outpos ] ); + inpos += bestlength; + bytesleft -= bestlength; + } + else + { + /* Output single byte (or two bytes if marker byte) */ + symbol = in[ inpos ++ ]; + out[ outpos ++ ] = symbol; + if( symbol == marker ) + { + out[ outpos ++ ] = 0; + } + -- bytesleft; + } + } + while( bytesleft > 3 ); + + /* Dump remaining bytes, if any */ + while( inpos < insize ) + { + if( in[ inpos ] == marker ) + { + out[ outpos ++ ] = marker; + out[ outpos ++ ] = 0; + } + else + { + out[ outpos ++ ] = in[ inpos ]; + } + ++ inpos; + } + + return outpos; +} + + +/************************************************************************* +* LZ_Uncompress() - Uncompress a block of data using an LZ77 decoder. +* in - Input (compressed) buffer. +* out - Output (uncompressed) buffer. This buffer must be large +* enough to hold the uncompressed data. +* insize - Number of input bytes. +*************************************************************************/ + +void LZ_Uncompress( unsigned char *in, unsigned char *out, + unsigned int insize ) +{ + unsigned char marker, symbol; + unsigned int i, inpos, outpos, length, offset; + + /* Do we have anything to uncompress? */ + if( insize < 1 ) + { + return; + } + + /* Get marker symbol from input stream */ + marker = in[ 0 ]; + inpos = 1; + + /* Main decompression loop */ + outpos = 0; + do + { + symbol = in[ inpos ++ ]; + if( symbol == marker ) + { + /* We had a marker byte */ + if( in[ inpos ] == 0 ) + { + /* It was a single occurrence of the marker byte */ + out[ outpos ++ ] = marker; + ++ inpos; + } + else + { + /* Extract true length and offset */ + inpos += _LZ_ReadVarSize( &length, &in[ inpos ] ); + inpos += _LZ_ReadVarSize( &offset, &in[ inpos ] ); + + /* Copy corresponding data from history window */ + for( i = 0; i < length; ++ i ) + { + out[ outpos ] = out[ outpos - offset ]; + ++ outpos; + } + } + } + else + { + /* No marker, plain copy */ + out[ outpos ++ ] = symbol; + } + } + while( inpos < insize ); +}
diff -r ce4be867658b -r 0c9aeae76ab2 lz.h --- a/lz.h Thu Jul 17 14:16:11 2014 +0000 +++ b/lz.h Tue Jul 29 14:02:53 2014 +0000 @@ -32,9 +32,9 @@ #ifndef _lz_h_ #define _lz_h_ -#ifdef __cplusplus -extern "C" { -#endif +//#ifdef __cplusplus +//extern "C" { +//#endif /************************************************************************* @@ -49,8 +49,8 @@ unsigned int insize ); -#ifdef __cplusplus -} -#endif +//#ifdef __cplusplus +//} +//#endif #endif /* _lz_h_ */