CoCalc -- lz.c

GitHub Repository: CTCaer/hekate
Path: blob/master/tools/lz/lz.c
¹⁴⁷⁶ views
1
//
2
// Name:        lz.c
3
// Author:      Marcus Geelnard
4
// Description: LZ77 coder/decoder implementation.
5
// Reentrant:   Yes
6
// $ATH_LICENSE_NULL$
7
//
8
// The LZ77 compression scheme is a substitutional compression scheme
9
// proposed by Abraham Lempel and Jakob Ziv in 1977. It is very simple in
10
// its design, and uses no fancy bit level compression.
11
//
12
// This is my first attempt at an implementation of a LZ77 code/decoder.
13
//
14
// The principle of the LZ77 compression algorithm is to store repeated
15
// occurrences of strings as references to previous occurrences of the same
16
// string. The point is that the reference consumes less space than the
17
// string itself, provided that the string is long enough (in this
18
// implementation, the string has to be at least 4 bytes long, since the
19
// minimum coded reference is 3 bytes long). Also note that the term
20
// "string" refers to any kind of byte sequence (it does not have to be
21
// an ASCII string, for instance).
22
//
23
// The coder uses a brute force approach to finding string matches in the
24
// history buffer (or "sliding window", if you wish), which is very, very
25
// slow. I recon the complexity is somewhere between O(n^2) and O(n^3),
26
// depending on the input data.
27
//
28
// There is also a faster implementation that uses a large working buffer
29
// in which a "jump table" is stored, which is used to quickly find
30
// possible string matches (see the source code for LZ_CompressFast() for
31
// more information). The faster method is an order of magnitude faster,
32
// but still quite slow compared to other compression methods.
33
//
34
// The upside is that decompression is very fast, and the compression ratio
35
// is often very good.
36
//
37
// The reference to a string is coded as a (length,offset) pair, where the
38
// length indicates the length of the string, and the offset gives the
39
// offset from the current data position. To distinguish between string
40
// references and literal strings (uncompressed bytes), a string reference
41
// is preceded by a marker byte, which is chosen as the least common byte
42
// symbol in the input data stream (this marker byte is stored in the
43
// output stream as the first byte).
44
//
45
// Occurrences of the marker byte in the stream are encoded as the marker
46
// byte followed by a zero byte, which means that occurrences of the marker
47
// byte have to be coded with two bytes.
48
//
49
// The lengths and offsets are coded in a variable length fashion, allowing
50
// values of any magnitude (up to 4294967295 in this implementation).
51
//
52
// With this compression scheme, the worst case compression result is
53
// (257/256)*insize + 1.
54
//
55
//------------------------------------------------------------------------
56
// Copyright (c) 2003-2006 Marcus Geelnard
57
//
58
// This software is provided 'as-is', without any express or implied
59
// warranty. In no event will the authors be held liable for any damages
60
// arising from the use of this software.
61
//
62
// Permission is granted to anyone to use this software for any purpose,
63
// including commercial applications, and to alter it and redistribute it
64
// freely, subject to the following restrictions:
65
//
66
// 1. The origin of this software must not be misrepresented; you must not
67
//    claim that you wrote the original software. If you use this software
68
//    in a product, an acknowledgment in the product documentation would
69
//    be appreciated but is not required.
70
//
71
// 2. Altered source versions must be plainly marked as such, and must not
72
//    be misrepresented as being the original software.
73
//
74
// 3. This notice may not be removed or altered from any source
75
//    distribution.
76
//
77
// Marcus Geelnard
78
// marcus.geelnard at home.se
79
//
80

81
//
82
// This file has been altered from the original version.
83
//
84

85
/*************************************************************************
86
* Constants used for LZ77 coding
87
*************************************************************************/
88

89
/* Maximum offset (can be any size < 2^31). Lower values give faster
90
   compression, while higher values gives better compression. The default
91
   value of 100000 is quite high. Experiment to see what works best for
92
   you. */
93
#define LZ_MAX_OFFSET 100000
94

95

96

97
/*************************************************************************
98
*                           INTERNAL FUNCTIONS                           *
99
*************************************************************************/
100

101

102
/*************************************************************************
103
* _LZ_StringCompare() - Return maximum length string match.
104
*************************************************************************/
105

106
static unsigned int _LZ_StringCompare( unsigned char * str1,
107
  unsigned char * str2, unsigned int minlen, unsigned int maxlen )
108
{
109
    unsigned int len;
110

111
    for( len = minlen; (len < maxlen) && (str1[len] == str2[len]); ++ len );
112

113
    return len;
114
}
115

116

117
/*************************************************************************
118
* _LZ_WriteVarSize() - Write unsigned integer with variable number of
119
* bytes depending on value.
120
*************************************************************************/
121

122
static int _LZ_WriteVarSize( unsigned int x, unsigned char * buf )
123
{
124
    unsigned int y;
125
    int num_bytes, i, b;
126

127
    /* Determine number of bytes needed to store the number x */
128
    y = x >> 3;
129
    for( num_bytes = 5; num_bytes >= 2; -- num_bytes )
130
    {
131
        if( y & 0xfe000000 ) break;
132
        y <<= 7;
133
    }
134

135
    /* Write all bytes, seven bits in each, with 8:th bit set for all */
136
    /* but the last byte. */
137
    for( i = num_bytes-1; i >= 0; -- i )
138
    {
139
        b = (x >> (i*7)) & 0x0000007f;
140
        if( i > 0 )
141
        {
142
            b |= 0x00000080;
143
        }
144
        *buf ++ = (unsigned char) b;
145
    }
146

147
    /* Return number of bytes written */
148
    return num_bytes;
149
}
150

151

152
/*************************************************************************
153
* _LZ_ReadVarSize() - Read unsigned integer with variable number of
154
* bytes depending on value.
155
*************************************************************************/
156

157
static int _LZ_ReadVarSize( unsigned int * x, unsigned char * buf )
158
{
159
    unsigned int y, b, num_bytes;
160

161
    /* Read complete value (stop when byte contains zero in 8:th bit) */
162
    y = 0;
163
    num_bytes = 0;
164
    do
165
    {
166
        b = (unsigned int) (*buf ++);
167
        y = (y << 7) | (b & 0x0000007f);
168
        ++ num_bytes;
169
    }
170
    while( b & 0x00000080 );
171

172
    /* Store value in x */
173
    *x = y;
174

175
    /* Return number of bytes read */
176
    return num_bytes;
177
}
178

179

180

181
/*************************************************************************
182
*                            PUBLIC FUNCTIONS                            *
183
*************************************************************************/
184

185

186
/*************************************************************************
187
* LZ_Compress() - Compress a block of data using an LZ77 coder.
188
*  in     - Input (uncompressed) buffer.
189
*  out    - Output (compressed) buffer. This buffer must be 0.4% larger
190
*           than the input buffer, plus one byte.
191
*  insize - Number of input bytes.
192
* The function returns the size of the compressed data.
193
*************************************************************************/
194

195
int LZ_Compress( unsigned char *in, unsigned char *out,
196
    unsigned int insize )
197
{
198
    unsigned char marker, symbol;
199
    unsigned int  inpos, outpos, bytesleft, i;
200
    unsigned int  maxoffset, offset, bestoffset;
201
    unsigned int  maxlength, length, bestlength;
202
    unsigned int  histogram[ 256 ];
203
    unsigned char *ptr1, *ptr2;
204

205
    /* Do we have anything to compress? */
206
    if( insize < 1 )
207
    {
208
        return 0;
209
    }
210

211
    /* Create histogram */
212
    for( i = 0; i < 256; ++ i )
213
    {
214
        histogram[ i ] = 0;
215
    }
216
    for( i = 0; i < insize; ++ i )
217
    {
218
        ++ histogram[ in[ i ] ];
219
    }
220

221
    /* Find the least common byte, and use it as the marker symbol */
222
    marker = 0;
223
    for( i = 1; i < 256; ++ i )
224
    {
225
        if( histogram[ i ] < histogram[ marker ] )
226
        {
227
            marker = i;
228
        }
229
    }
230

231
    /* Remember the marker symbol for the decoder */
232
    out[ 0 ] = marker;
233

234
    /* Start of compression */
235
    inpos = 0;
236
    outpos = 1;
237

238
    /* Main compression loop */
239
    bytesleft = insize;
240
    do
241
    {
242
        /* Determine most distant position */
243
        if( inpos > LZ_MAX_OFFSET ) maxoffset = LZ_MAX_OFFSET;
244
        else                        maxoffset = inpos;
245

246
        /* Get pointer to current position */
247
        ptr1 = &in[ inpos ];
248

249
        /* Search history window for maximum length string match */
250
        bestlength = 3;
251
        bestoffset = 0;
252
        for( offset = 3; offset <= maxoffset; ++ offset )
253
        {
254
            /* Get pointer to candidate string */
255
            ptr2 = &ptr1[ -(int)offset ];
256

257
            /* Quickly determine if this is a candidate (for speed) */
258
            if( (ptr1[ 0 ] == ptr2[ 0 ]) &&
259
                (ptr1[ bestlength ] == ptr2[ bestlength ]) )
260
            {
261
                /* Determine maximum length for this offset */
262
                maxlength = (bytesleft < offset ? bytesleft : offset);
263

264
                /* Count maximum length match at this offset */
265
                length = _LZ_StringCompare( ptr1, ptr2, 0, maxlength );
266

267
                /* Better match than any previous match? */
268
                if( length > bestlength )
269
                {
270
                    bestlength = length;
271
                    bestoffset = offset;
272
                }
273
            }
274
        }
275

276
        /* Was there a good enough match? */
277
        if( (bestlength >= 8) ||
278
            ((bestlength == 4) && (bestoffset <= 0x0000007f)) ||
279
            ((bestlength == 5) && (bestoffset <= 0x00003fff)) ||
280
            ((bestlength == 6) && (bestoffset <= 0x001fffff)) ||
281
            ((bestlength == 7) && (bestoffset <= 0x0fffffff)) )
282
        {
283
            out[ outpos ++ ] = (unsigned char) marker;
284
            outpos += _LZ_WriteVarSize( bestlength, &out[ outpos ] );
285
            outpos += _LZ_WriteVarSize( bestoffset, &out[ outpos ] );
286
            inpos += bestlength;
287
            bytesleft -= bestlength;
288
        }
289
        else
290
        {
291
            /* Output single byte (or two bytes if marker byte) */
292
            symbol = in[ inpos ++ ];
293
            out[ outpos ++ ] = symbol;
294
            if( symbol == marker )
295
            {
296
                out[ outpos ++ ] = 0;
297
            }
298
            -- bytesleft;
299
        }
300
    }
301
    while( bytesleft > 3 );
302

303
    /* Dump remaining bytes, if any */
304
    while( inpos < insize )
305
    {
306
        if( in[ inpos ] == marker )
307
        {
308
            out[ outpos ++ ] = marker;
309
            out[ outpos ++ ] = 0;
310
        }
311
        else
312
        {
313
            out[ outpos ++ ] = in[ inpos ];
314
        }
315
        ++ inpos;
316
    }
317

318
    return outpos;
319
}
320

321

322
/*************************************************************************
323
* LZ_CompressFast() - Compress a block of data using an LZ77 coder.
324
*  in     - Input (uncompressed) buffer.
325
*  out    - Output (compressed) buffer. This buffer must be 0.4% larger
326
*           than the input buffer, plus one byte.
327
*  insize - Number of input bytes.
328
*  work   - Pointer to a temporary buffer (internal working buffer), which
329
*           must be able to hold (insize+65536) unsigned integers.
330
* The function returns the size of the compressed data.
331
*************************************************************************/
332

333
int LZ_CompressFast( unsigned char *in, unsigned char *out,
334
    unsigned int insize, unsigned int *work )
335
{
336
    unsigned char marker, symbol;
337
    unsigned int  inpos, outpos, bytesleft, i, index, symbols;
338
    unsigned int  offset, bestoffset;
339
    unsigned int  maxlength, length, bestlength;
340
    unsigned int  histogram[ 256 ], *lastindex, *jumptable;
341
    unsigned char *ptr1, *ptr2;
342

343
    /* Do we have anything to compress? */
344
    if( insize < 1 )
345
    {
346
        return 0;
347
    }
348

349
    /* Assign arrays to the working area */
350
    lastindex = work;
351
    jumptable = &work[ 65536 ];
352

353
    /* Build a "jump table". Here is how the jump table works:
354
       jumptable[i] points to the nearest previous occurrence of the same
355
       symbol pair as in[i]:in[i+1], so in[i] == in[jumptable[i]] and
356
       in[i+1] == in[jumptable[i]+1], and so on... Following the jump table
357
       gives a dramatic boost for the string search'n'match loop compared
358
       to doing a brute force search. The jump table is built in O(n) time,
359
       so it is a cheap operation in terms of time, but it is expensice in
360
       terms of memory consumption. */
361
    for( i = 0; i < 65536; ++ i )
362
    {
363
        lastindex[ i ] = 0xffffffff;
364
    }
365
    for( i = 0; i < insize-1; ++ i )
366
    {
367
        symbols = (((unsigned int)in[i]) << 8) | ((unsigned int)in[i+1]);
368
        index = lastindex[ symbols ];
369
        lastindex[ symbols ] = i;
370
        jumptable[ i ] = index;
371
    }
372
    jumptable[ insize-1 ] = 0xffffffff;
373

374
    /* Create histogram */
375
    for( i = 0; i < 256; ++ i )
376
    {
377
        histogram[ i ] = 0;
378
    }
379
    for( i = 0; i < insize; ++ i )
380
    {
381
        ++ histogram[ in[ i ] ];
382
    }
383

384
    /* Find the least common byte, and use it as the marker symbol */
385
    marker = 0;
386
    for( i = 1; i < 256; ++ i )
387
    {
388
        if( histogram[ i ] < histogram[ marker ] )
389
        {
390
            marker = i;
391
        }
392
    }
393

394
    /* Remember the marker symbol for the decoder */
395
    out[ 0 ] = marker;
396

397
    /* Start of compression */
398
    inpos = 0;
399
    outpos = 1;
400

401
    /* Main compression loop */
402
    bytesleft = insize;
403
    do
404
    {
405
        /* Get pointer to current position */
406
        ptr1 = &in[ inpos ];
407

408
        /* Search history window for maximum length string match */
409
        bestlength = 3;
410
        bestoffset = 0;
411
        index = jumptable[ inpos ];
412
        while( (index != 0xffffffff) && ((inpos - index) < LZ_MAX_OFFSET) )
413
        {
414
            /* Get pointer to candidate string */
415
            ptr2 = &in[ index ];
416

417
            /* Quickly determine if this is a candidate (for speed) */
418
            if( ptr2[ bestlength ] == ptr1[ bestlength ] )
419
            {
420
                /* Determine maximum length for this offset */
421
                offset = inpos - index;
422
                maxlength = (bytesleft < offset ? bytesleft : offset);
423

424
                /* Count maximum length match at this offset */
425
                length = _LZ_StringCompare( ptr1, ptr2, 2, maxlength );
426

427
                /* Better match than any previous match? */
428
                if( length > bestlength )
429
                {
430
                    bestlength = length;
431
                    bestoffset = offset;
432
                }
433
            }
434

435
            /* Get next possible index from jump table */
436
            index = jumptable[ index ];
437
        }
438

439
        /* Was there a good enough match? */
440
        if( (bestlength >= 8) ||
441
            ((bestlength == 4) && (bestoffset <= 0x0000007f)) ||
442
            ((bestlength == 5) && (bestoffset <= 0x00003fff)) ||
443
            ((bestlength == 6) && (bestoffset <= 0x001fffff)) ||
444
            ((bestlength == 7) && (bestoffset <= 0x0fffffff)) )
445
        {
446
            out[ outpos ++ ] = (unsigned char) marker;
447
            outpos += _LZ_WriteVarSize( bestlength, &out[ outpos ] );
448
            outpos += _LZ_WriteVarSize( bestoffset, &out[ outpos ] );
449
            inpos += bestlength;
450
            bytesleft -= bestlength;
451
        }
452
        else
453
        {
454
            /* Output single byte (or two bytes if marker byte) */
455
            symbol = in[ inpos ++ ];
456
            out[ outpos ++ ] = symbol;
457
            if( symbol == marker )
458
            {
459
                out[ outpos ++ ] = 0;
460
            }
461
            -- bytesleft;
462
        }
463
    }
464
    while( bytesleft > 3 );
465

466
    /* Dump remaining bytes, if any */
467
    while( inpos < insize )
468
    {
469
        if( in[ inpos ] == marker )
470
        {
471
            out[ outpos ++ ] = marker;
472
            out[ outpos ++ ] = 0;
473
        }
474
        else
475
        {
476
            out[ outpos ++ ] = in[ inpos ];
477
        }
478
        ++ inpos;
479
    }
480

481
    return outpos;
482
}
483

484

485
/*************************************************************************
486
* LZ_Uncompress() - Uncompress a block of data using an LZ77 decoder.
487
*  in      - Input (compressed) buffer.
488
*  out     - Output (uncompressed) buffer. This buffer must be large
489
*            enough to hold the uncompressed data.
490
*  insize  - Number of input bytes.
491
*************************************************************************/
492

493
int LZ_Uncompress( unsigned char *in, unsigned char *out,
494
    unsigned int insize )
495
{
496
    unsigned char marker, symbol;
497
    unsigned int  i, inpos, outpos, length, offset;
498

499
    /* Do we have anything to uncompress? */
500
    if( insize < 1 )
501
    {
502
        return 0;
503
    }
504

505
    /* Get marker symbol from input stream */
506
    marker = in[ 0 ];
507
    inpos = 1;
508

509
    /* Main decompression loop */
510
    outpos = 0;
511
    do
512
    {
513
        symbol = in[ inpos ++ ];
514
        if( symbol == marker )
515
        {
516
            /* We had a marker byte */
517
            if( in[ inpos ] == 0 )
518
            {
519
                /* It was a single occurrence of the marker byte */
520
                out[ outpos ++ ] = marker;
521
                ++ inpos;
522
            }
523
            else
524
            {
525
                /* Extract true length and offset */
526
                inpos += _LZ_ReadVarSize( &length, &in[ inpos ] );
527
                inpos += _LZ_ReadVarSize( &offset, &in[ inpos ] );
528

529
                /* Copy corresponding data from history window */
530
                for( i = 0; i < length; ++ i )
531
                {
532
                    out[ outpos ] = out[ outpos - offset ];
533
                    ++ outpos;
534
                }
535
            }
536
        }
537
        else
538
        {
539
            /* No marker, plain copy */
540
            out[ outpos ++ ] = symbol;
541
        }
542
    }
543
    while( inpos < insize );
544

545
    return outpos;
546
}
547

548
Product

Resources

Company