lib/Support/Compressor.cpp

   1 //===- lib/Support/Compressor.cpp -------------------------------*- C++ -*-===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file was developed by Reid Spencer and is distributed under the
   6 // University of Illinois Open Source License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This file implements the llvm::Compressor class, an abstraction for memory
  11 // block compression.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "llvm/Config/config.h"
  16 #include "llvm/Support/Compressor.h"
  17 #include "llvm/ADT/StringExtras.h"
  18 #include <cassert>
  19 #include <string>
  20 #include <ostream>
  21 #include "bzip2/bzlib.h"
  22 using namespace llvm;
  23
  24 enum CompressionTypes {
  25   COMP_TYPE_NONE  = '0',
  26   COMP_TYPE_BZIP2 = '2',
  27 };
  28
  29 static int getdata(char*& buffer, size_t &size,
  30                    llvm::Compressor::OutputDataCallback* cb, void* context) {
  31   buffer = 0;
  32   size = 0;
  33   int result = (*cb)(buffer, size, context);
  34   assert(buffer != 0 && "Invalid result from Compressor callback");
  35   assert(size != 0 && "Invalid result from Compressor callback");
  36   return result;
  37 }
  38
  39 static int getdata_uns(char*& buffer, unsigned &size,
  40                        llvm::Compressor::OutputDataCallback* cb, void* context)
  41 {
  42   size_t SizeOut;
  43   int Res = getdata(buffer, SizeOut, cb, context);
  44   size = SizeOut;
  45   return Res;
  46 }
  47
  48 //===----------------------------------------------------------------------===//
  49 //=== NULLCOMP - a compression like set of routines that just copies data
  50 //===            without doing any compression. This is provided so that if the
  51 //===            configured environment doesn't have a compression library the
  52 //===            program can still work, albeit using more data/memory.
  53 //===----------------------------------------------------------------------===//
  54
  55 struct NULLCOMP_stream {
  56   // User provided fields
  57   char*  next_in;
  58   size_t avail_in;
  59   char*  next_out;
  60   size_t avail_out;
  61
  62   // Information fields
  63   size_t output_count; // Total count of output bytes
  64 };
  65
  66 static void NULLCOMP_init(NULLCOMP_stream* s) {
  67   s->output_count = 0;
  68 }
  69
  70 static bool NULLCOMP_compress(NULLCOMP_stream* s) {
  71   assert(s && "Invalid NULLCOMP_stream");
  72   assert(s->next_in != 0);
  73   assert(s->next_out != 0);
  74   assert(s->avail_in >= 1);
  75   assert(s->avail_out >= 1);
  76
  77   if (s->avail_out >= s->avail_in) {
  78     ::memcpy(s->next_out, s->next_in, s->avail_in);
  79     s->output_count += s->avail_in;
  80     s->avail_out -= s->avail_in;
  81     s->next_in += s->avail_in;
  82     s->avail_in = 0;
  83     return true;
  84   } else {
  85     ::memcpy(s->next_out, s->next_in, s->avail_out);
  86     s->output_count += s->avail_out;
  87     s->avail_in -= s->avail_out;
  88     s->next_in += s->avail_out;
  89     s->avail_out = 0;
  90     return false;
  91   }
  92 }
  93
  94 static bool NULLCOMP_decompress(NULLCOMP_stream* s) {
  95   assert(s && "Invalid NULLCOMP_stream");
  96   assert(s->next_in != 0);
  97   assert(s->next_out != 0);
  98   assert(s->avail_in >= 1);
  99   assert(s->avail_out >= 1);
 100
 101   if (s->avail_out >= s->avail_in) {
 102     ::memcpy(s->next_out, s->next_in, s->avail_in);
 103     s->output_count += s->avail_in;
 104     s->avail_out -= s->avail_in;
 105     s->next_in += s->avail_in;
 106     s->avail_in = 0;
 107     return true;
 108   } else {
 109     ::memcpy(s->next_out, s->next_in, s->avail_out);
 110     s->output_count += s->avail_out;
 111     s->avail_in -= s->avail_out;
 112     s->next_in += s->avail_out;
 113     s->avail_out = 0;
 114     return false;
 115   }
 116 }
 117
 118 static void NULLCOMP_end(NULLCOMP_stream* strm) {
 119 }
 120
 121 namespace {
 122
 123 /// This structure is only used when a bytecode file is compressed.
 124 /// As bytecode is being decompressed, the memory buffer might need
 125 /// to be reallocated. The buffer allocation is handled in a callback
 126 /// and this structure is needed to retain information across calls
 127 /// to the callback.
 128 /// @brief An internal buffer object used for handling decompression
 129 struct BufferContext {
 130   char* buff;
 131   size_t size;
 132   BufferContext(size_t compressedSize) {
 133     // Null to indicate malloc of a new block
 134     buff = 0;
 135
 136     // Compute the initial length of the uncompression buffer. Note that this
 137     // is twice the length of the compressed buffer and will be doubled again
 138     // in the callback for an initial allocation of 4x compressedSize.  This
 139     // calculation is based on the typical compression ratio of bzip2 on LLVM
 140     // bytecode files which typically ranges in the 50%-75% range.   Since we
 141     // typically get at least 50%, doubling is insufficient. By using a 4x
 142     // multiplier on the first allocation, we minimize the impact of having to
 143     // copy the buffer on reallocation.
 144     size = compressedSize*2;
 145   }
 146
 147   /// trimTo - Reduce the size of the buffer down to the specified amount.  This
 148   /// is useful after have read in the bytecode file to discard extra unused
 149   /// memory.
 150   ///
 151   void trimTo(size_t NewSize) {
 152     buff = (char*)::realloc(buff, NewSize);
 153     size = NewSize;
 154   }
 155
 156   /// This function handles allocation of the buffer used for decompression of
 157   /// compressed bytecode files. It is called by Compressor::decompress which is
 158   /// called by BytecodeReader::ParseBytecode.
 159   static size_t callback(char*&buff, size_t &sz, void* ctxt){
 160     // Case the context variable to our BufferContext
 161     BufferContext* bc = reinterpret_cast<BufferContext*>(ctxt);
 162
 163     // Compute the new, doubled, size of the block
 164     size_t new_size = bc->size * 2;
 165
 166     // Extend or allocate the block (realloc(0,n) == malloc(n))
 167     char* new_buff = (char*) ::realloc(bc->buff, new_size);
 168
 169     // Figure out what to return to the Compressor. If this is the first call,
 170     // then bc->buff will be null. In this case we want to return the entire
 171     // buffer because there was no previous allocation.  Otherwise, when the
 172     // buffer is reallocated, we save the new base pointer in the
 173     // BufferContext.buff field but return the address of only the extension,
 174     // mid-way through the buffer (since its size was doubled). Furthermore,
 175     // the sz result must be 1/2 the total size of the buffer.
 176     if (bc->buff == 0 ) {
 177       buff = bc->buff = new_buff;
 178       sz = new_size;
 179     } else {
 180       bc->buff = new_buff;
 181       buff = new_buff + bc->size;
 182       sz = bc->size;
 183     }
 184
 185     // Retain the size of the allocated block
 186     bc->size = new_size;
 187
 188     // Make sure we fail (return 1) if we didn't get any memory.
 189     return (bc->buff == 0 ? 1 : 0);
 190   }
 191 };
 192
 193 } // end anonymous namespace
 194
 195
 196 namespace {
 197
 198 // This structure retains the context when compressing the bytecode file. The
 199 // WriteCompressedData function below uses it to keep track of the previously
 200 // filled chunk of memory (which it writes) and how many bytes have been
 201 // written.
 202 struct WriterContext {
 203   // Initialize the context
 204   WriterContext(std::ostream*OS, size_t CS)
 205     : chunk(0), sz(0), written(0), compSize(CS), Out(OS) {}
 206
 207   // Make sure we clean up memory
 208   ~WriterContext() {
 209     if (chunk)
 210       delete [] chunk;
 211   }
 212
 213   // Write the chunk
 214   void write(size_t size = 0) {
 215     size_t write_size = (size == 0 ? sz : size);
 216     Out->write(chunk,write_size);
 217     written += write_size;
 218     delete [] chunk;
 219     chunk = 0;
 220     sz = 0;
 221   }
 222
 223   // This function is a callback used by the Compressor::compress function to
 224   // allocate memory for the compression buffer. This function fulfills that
 225   // responsibility but also writes the previous (now filled) buffer out to the
 226   // stream.
 227   static size_t callback(char*& buffer, size_t &size, void* context) {
 228     // Cast the context to the structure it must point to.
 229     WriterContext* ctxt = reinterpret_cast<WriterContext*>(context);
 230
 231     // If there's a previously allocated chunk, it must now be filled with
 232     // compressed data, so we write it out and deallocate it.
 233     if (ctxt->chunk != 0 && ctxt->sz > 0 ) {
 234       ctxt->write();
 235     }
 236
 237     // Compute the size of the next chunk to allocate. We attempt to allocate
 238     // enough memory to handle the compression in a single memory allocation. In
 239     // general, the worst we do on compression of bytecode is about 50% so we
 240     // conservatively estimate compSize / 2 as the size needed for the
 241     // compression buffer. compSize is the size of the compressed data, provided
 242     // by WriteBytecodeToFile.
 243     size = ctxt->sz = ctxt->compSize / 2;
 244
 245     // Allocate the chunks
 246     buffer = ctxt->chunk = new char [size];
 247
 248     // We must return 1 if the allocation failed so that the Compressor knows
 249     // not to use the buffer pointer.
 250     return (ctxt->chunk == 0 ? 1 : 0);
 251   }
 252
 253   char* chunk;       // pointer to the chunk of memory filled by compression
 254   size_t sz;         // size of chunk
 255   size_t written;    // aggregate total of bytes written in all chunks
 256   size_t compSize;   // size of the uncompressed buffer
 257   std::ostream* Out; // The stream we write the data to.
 258 };
 259
 260 }  // end anonymous namespace
 261
 262 // Compress in one of three ways
 263 size_t Compressor::compress(const char* in, size_t size,
 264                             OutputDataCallback* cb, void* context) {
 265   assert(in && "Can't compress null buffer");
 266   assert(size && "Can't compress empty buffer");
 267   assert(cb && "Can't compress without a callback function");
 268
 269   size_t result = 0;
 270
 271   // For small files, we just don't bother compressing. bzip2 isn't very good
 272   // with tiny files and can actually make the file larger, so we just avoid
 273   // it altogether.
 274   if (size > 64*1024) {
 275     // Set up the bz_stream
 276     bz_stream bzdata;
 277     bzdata.bzalloc = 0;
 278     bzdata.bzfree = 0;
 279     bzdata.opaque = 0;
 280     bzdata.next_in = (char*)in;
 281     bzdata.avail_in = size;
 282     bzdata.next_out = 0;
 283     bzdata.avail_out = 0;
 284     switch ( BZ2_bzCompressInit(&bzdata, 5, 0, 100) ) {
 285       case BZ_CONFIG_ERROR: throw std::string("bzip2 library mis-compiled");
 286       case BZ_PARAM_ERROR:  throw std::string("Compressor internal error");
 287       case BZ_MEM_ERROR:    throw std::string("Out of memory");
 288       case BZ_OK:
 289       default:
 290         break;
 291     }
 292
 293     // Get a block of memory
 294     if (0 != getdata_uns(bzdata.next_out, bzdata.avail_out,cb,context)) {
 295       BZ2_bzCompressEnd(&bzdata);
 296       throw std::string("Can't allocate output buffer");
 297     }
 298
 299     // Put compression code in first byte
 300     (*bzdata.next_out++) = COMP_TYPE_BZIP2;
 301     bzdata.avail_out--;
 302
 303     // Compress it
 304     int bzerr = BZ_FINISH_OK;
 305     while (BZ_FINISH_OK == (bzerr = BZ2_bzCompress(&bzdata, BZ_FINISH))) {
 306       if (0 != getdata_uns(bzdata.next_out, bzdata.avail_out,cb,context)) {
 307         BZ2_bzCompressEnd(&bzdata);
 308         throw std::string("Can't allocate output buffer");
 309       }
 310     }
 311     switch (bzerr) {
 312       case BZ_SEQUENCE_ERROR:
 313       case BZ_PARAM_ERROR: throw std::string("Param/Sequence error");
 314       case BZ_FINISH_OK:
 315       case BZ_STREAM_END: break;
 316       default: throw std::string("Oops: ") + utostr(unsigned(bzerr));
 317     }
 318
 319     // Finish
 320     result = bzdata.total_out_lo32 + 1;
 321     if (sizeof(size_t) == sizeof(uint64_t))
 322       result |= static_cast<uint64_t>(bzdata.total_out_hi32) << 32;
 323
 324     BZ2_bzCompressEnd(&bzdata);
 325   } else {
 326     // Do null compression, for small files
 327     NULLCOMP_stream sdata;
 328     sdata.next_in = (char*)in;
 329     sdata.avail_in = size;
 330     NULLCOMP_init(&sdata);
 331
 332     if (0 != getdata(sdata.next_out, sdata.avail_out,cb,context)) {
 333       throw std::string("Can't allocate output buffer");
 334     }
 335
 336     *(sdata.next_out++) = COMP_TYPE_NONE;
 337     sdata.avail_out--;
 338
 339     while (!NULLCOMP_compress(&sdata)) {
 340       if (0 != getdata(sdata.next_out, sdata.avail_out,cb,context)) {
 341         throw std::string("Can't allocate output buffer");
 342       }
 343     }
 344
 345     result = sdata.output_count + 1;
 346     NULLCOMP_end(&sdata);
 347   }
 348   return result;
 349 }
 350
 351 size_t Compressor::compressToNewBuffer(const char* in, size_t size, char*&out) {
 352   BufferContext bc(size);
 353   size_t result = compress(in,size,BufferContext::callback,(void*)&bc);
 354   bc.trimTo(result);
 355   out = bc.buff;
 356   return result;
 357 }
 358
 359 size_t
 360 Compressor::compressToStream(const char*in, size_t size, std::ostream& out) {
 361   // Set up the context and writer
 362   WriterContext ctxt(&out, size / 2);
 363
 364   // Compress everything after the magic number (which we'll alter).
 365   size_t zipSize = Compressor::compress(in,size,
 366     WriterContext::callback, (void*)&ctxt);
 367
 368   if (ctxt.chunk) {
 369     ctxt.write(zipSize - ctxt.written);
 370   }
 371   return zipSize;
 372 }
 373
 374 // Decompress in one of three ways
 375 size_t Compressor::decompress(const char *in, size_t size,
 376                               OutputDataCallback* cb, void* context) {
 377   assert(in && "Can't decompress null buffer");
 378   assert(size > 1 && "Can't decompress empty buffer");
 379   assert(cb && "Can't decompress without a callback function");
 380
 381   size_t result = 0;
 382
 383   switch (*in++) {
 384     case COMP_TYPE_BZIP2: {
 385       // Set up the bz_stream
 386       bz_stream bzdata;
 387       bzdata.bzalloc = 0;
 388       bzdata.bzfree = 0;
 389       bzdata.opaque = 0;
 390       bzdata.next_in = (char*)in;
 391       bzdata.avail_in = size - 1;
 392       bzdata.next_out = 0;
 393       bzdata.avail_out = 0;
 394       switch ( BZ2_bzDecompressInit(&bzdata, 0, 0) ) {
 395         case BZ_CONFIG_ERROR: throw std::string("bzip2 library mis-compiled");
 396         case BZ_PARAM_ERROR:  throw std::string("Compressor internal error");
 397         case BZ_MEM_ERROR:    throw std::string("Out of memory");
 398         case BZ_OK:
 399         default:
 400           break;
 401       }
 402
 403       // Get a block of memory
 404       if (0 != getdata_uns(bzdata.next_out, bzdata.avail_out,cb,context)) {
 405         BZ2_bzDecompressEnd(&bzdata);
 406         throw std::string("Can't allocate output buffer");
 407       }
 408
 409       // Decompress it
 410       int bzerr = BZ_OK;
 411       while ( BZ_OK == (bzerr = BZ2_bzDecompress(&bzdata)) &&
 412               bzdata.avail_in != 0 ) {
 413         if (0 != getdata_uns(bzdata.next_out, bzdata.avail_out,cb,context)) {
 414           BZ2_bzDecompressEnd(&bzdata);
 415           throw std::string("Can't allocate output buffer");
 416         }
 417       }
 418
 419       switch (bzerr) {
 420         case BZ_PARAM_ERROR:  throw std::string("Compressor internal error");
 421         case BZ_MEM_ERROR:    throw std::string("Out of memory");
 422         case BZ_DATA_ERROR:   throw std::string("Data integrity error");
 423         case BZ_DATA_ERROR_MAGIC:throw std::string("Data is not BZIP2");
 424         case BZ_OK:           throw std::string("Insufficient input for bzip2");
 425         case BZ_STREAM_END: break;
 426         default: throw("Ooops");
 427       }
 428
 429
 430       // Finish
 431       result = bzdata.total_out_lo32;
 432       if (sizeof(size_t) == sizeof(uint64_t))
 433         result |= (static_cast<uint64_t>(bzdata.total_out_hi32) << 32);
 434       BZ2_bzDecompressEnd(&bzdata);
 435       break;
 436     }
 437
 438     case COMP_TYPE_NONE: {
 439       NULLCOMP_stream sdata;
 440       sdata.next_in = (char*)in;
 441       sdata.avail_in = size - 1;
 442       NULLCOMP_init(&sdata);
 443
 444       if (0 != getdata(sdata.next_out, sdata.avail_out,cb,context)) {
 445         throw std::string("Can't allocate output buffer");
 446       }
 447
 448       while (!NULLCOMP_decompress(&sdata)) {
 449         if (0 != getdata(sdata.next_out, sdata.avail_out,cb,context)) {
 450           throw std::string("Can't allocate output buffer");
 451         }
 452       }
 453
 454       result = sdata.output_count;
 455       NULLCOMP_end(&sdata);
 456       break;
 457     }
 458
 459     default:
 460       throw std::string("Unknown type of compressed data");
 461   }
 462
 463   return result;
 464 }
 465
 466 size_t
 467 Compressor::decompressToNewBuffer(const char* in, size_t size, char*&out) {
 468   BufferContext bc(size);
 469   size_t result = decompress(in,size,BufferContext::callback,(void*)&bc);
 470   out = bc.buff;
 471   return result;
 472 }
 473
 474 size_t
 475 Compressor::decompressToStream(const char*in, size_t size, std::ostream& out){
 476   // Set up the context and writer
 477   WriterContext ctxt(&out,size / 2);
 478
 479   // Decompress everything after the magic number (which we'll alter)
 480   size_t zipSize = Compressor::decompress(in,size,
 481     WriterContext::callback, (void*)&ctxt);
 482
 483   if (ctxt.chunk) {
 484     ctxt.write(zipSize - ctxt.written);
 485   }
 486   return zipSize;
 487 }
 488
 489 // vim: sw=2 ai