unit BWTCompressUnit; {------------------------------------------------------------------------------- Burrows Wheeler Transformation Block Compression Unit ------------------------------ reSource v2.6 Copyright (C) 1998-2001 Victor Kasenda / gruv http://go.to/gruv email: vickas@singnet.com.sg Desc: This is the class that brings all the engines together. It uses the FSortUnit, MTFEncoder, StrucAriEncoder. The whole compression for a block consists of: 1) Burrows Wheeler Transformation (Sort + Retrieve last column) 2) Move To Front encoding 3) Structured Arithmetic encoding Brief Explanation: 1) BWT is the trick to the high performance compression 2) Move to Front coding is done to transform the block into a series of numbers. The more frequantly appearing characters will thus be transformed to lower numbers, resulting a low numbers dominating the block (0 and 1s especially). This aids Arithmetic coding. 3) Arithmetic coding is performed with a structured or hierarchical model. Read the system doc for more information about the structured arithmetic model. For a more in depth discussion of the compression process, refer to the system doc. Usage: - just create the object and call CompressBlockToFile CompressBlockToFile writes out the data header and the data - to not use the structured arithmetic encoder, undefine USE_STRUC_ARI Notes: - read notes.txt for information about the block swapping technique used - certain debug procedures have been commented out to prevent hints - the general rule is pass only what is needed for the engine wrappers -------------------------------------------------------------------------------} (**) interface (**) uses // delphi Classes, Forms, SysUtils, Dialogs, // general OFile, StructsUnit, ArchiveFileUnit, ArchiveHeadersUnit, CRC32Unit, // engine RLEUnit, FSortUnit, MTFEncoderUnit, MTFDecoderUnit, BWTExpandUnit, FileStrucAriEncoderUnit, StreamStrucAriEncoderUnit, // base BWTBaseUnit; type TCompressor = class(TBWTBase) private //block1, block2: PBlock; index: PLongintBlock; // Debug {original_block: PBlock; recovered_block: PBlock;} {Expander: TExpander;} // Classes FastSorter: TFastSorter; MTFEncoder: TMTFEncoder; FileStrucAriEncoder: TFileStrucAriEncoder; StreamAriEncoder: TStreamAriEncoder; {RunLengthEncoder: TRunLengthEncoder;} // Main compression routines {procedure AllocateStructs; procedure FreeStructs;} procedure InitStructs; procedure SortBlock(var block_length: longint); procedure MTFGetTransformedBlock(var block_length, first_sym_index, virtual_char_index: longint); procedure AriEncodeBlock(ArchiveFile: TArchiveFile; block_length: longint); procedure AriEncodeBlockToStream(OutStream: TStream; block_length: longint; var OutSize: integer); procedure FillInBlockFromStream(Stream: TStream; var BlockLength: integer); {procedure GetTransformedBlock(var first_sym_index, virtual_char_index: longint); procedure MTFEncodeBlock;} {procedure RLEEncode;} // Debug {procedure DoBlockRecover; procedure DumpBlock(var b; bsize: longint; FileName: string); procedure DumpSortedBlock; procedure DumpTransformedBlock; procedure DumpRecoveredBlock; procedure CheckSortedBlock; {procedure CheckRecoveredBlock;} // Debug output {procedure DebugShowDoingSorting; procedure DebugShowDoingTransform;} {procedure DebugShowDoingMTF;} {procedure DebugShowDoingAriCompress;} public constructor Create; destructor Destroy; override; function GetInBlock: PBlock; // fill the inblock then compress it procedure CompressInBlockToFile(block_length: longint; ArchiveFile: TArchiveFile; var packed_size: integer); procedure CompressStream(InStream, OutStream: TStream); end; (**) implementation (**) uses ErrorUnit; constructor TCompressor.Create; begin inherited Create; //AllocateStructs; FastSorter := TFastSorter.Create; MTFEncoder := TMTFEncoder.create; FileStrucAriEncoder := TFileStrucAriEncoder.Create; StreamAriEncoder := TStreamAriEncoder.Create; {Debug} {Expander := TExpander.Create;} end; destructor TCompressor.Destroy; begin {Debug} {Expander.Free;} FileStrucAriEncoder.Free; MTFEncoder.Free; FastSorter.Free; //FreeStructs; inherited Destroy; end; (* {------------------------------------------------------------------------------- AllocateStructs --------------- Allocate memory for the block transformation and assign in_block and out_block -------------------------------------------------------------------------------} procedure TCompressor.AllocateStructs; begin New(block1); New(block2); New(index); // Debug {New(recovered_block); New(original_block);} end; {------------------------------------------------------------------------------- FreeStructs Free whatever memory that was allocated by AllocateStructs -------------------------------------------------------------------------------} procedure TCompressor.FreeStructs; begin // Debug {Dispose(original_block); Dispose(recovered_block);} Dispose(index); Dispose(block2); Dispose(block1); end; *) {------------------------------------------------------------------------------- InitStructs inits swap block structures. pass the block1 to be assigned inits the index. Assigns an index to every position in block. Each entry in index indicates the start of a string. -------------------------------------------------------------------------------} procedure TCompressor.InitStructs; begin // Assign block pointers for the swapblocks system // in_block was assigned when GetInBlock was called. it took block1 out_block := BlockMan.block2; index := BlockMan.longintblock1; end; function TCompressor.GetInBlock: PBlock; // fill the inblock then compress it begin in_block := BlockMan.block1; result := BlockMan.block1; end; {------------------------------------------------------------------------------- CompressBlockToFile ------------------- Writes out the data header + data IN Assertion: ArchiveFile has been seeked to the next write position OUT Assertion: ArchiveFile is seeked to the next output position -------------------------------------------------------------------------------} procedure TCompressor.CompressInBlockToFile; var data_header_offset, // offset of the data header next_free_pos: integer; // the next output position when ArchiveFile is returned ari_data_size: longword; // size of the arithmetic data crc: longword; // crc calculated for this block first_sym_index, virtual_char_index: longint; DataBlockHeader: TDataBlockHeader; // the data header begin {Compression process: Sort Transform Move To Front Ari Code} // reserve space for the block size first data_header_offset := ArchiveFile.Position; ArchiveFile.ReserveSpace(DATA_HEADER_SIZE); ArchiveFile.ResetBuffer; InitStructs; CalculateCRC32(in_block, block_length, crc); SortBlock(block_length); MTFGetTransformedBlock(block_length, first_sym_index, virtual_char_index); AriEncodeBlock(ArchiveFile, block_length); // save the current position next_free_pos := ArchiveFile.Position; // some calculations ari_data_size := next_free_pos - data_header_offset -DATA_HEADER_SIZE; // seek back to start of data block to write the data header of this block ArchiveFile.SmartSeek(data_header_offset, soFromBeginning); DataBlockHeader := TDataBlockHeader.Create; DataBlockHeader.crc32 := crc; DataBlockHeader.compressed_size := ari_data_size; DataBlockHeader.first_sym_index := first_sym_index; DataBlockHeader.virtual_char_index := virtual_char_index; DataBlockHeader.WriteToFile(ArchiveFile); DataBlockHeader.Free; // seek back to where we left off ArchiveFile.SmartSeek(next_free_pos, soFromBeginning); // allow screen update Application.ProcessMessages; // return values packed_size := ari_data_size; end; {------------------------------------------------------------------------------- CompressStream ------------------- Writes Compressed Data Only to OutStream. No block information is stored. IN Assertion: ArchiveFile has been seeked to the next write position OUT Assertion: ArchiveFile is seeked to the next output position -------------------------------------------------------------------------------} procedure TCompressor.CompressStream(InStream, OutStream: TStream); var BlockLength: integer; // actual size of data in block crc: longword; // crc calculated for this block AriDataSize: longint; // size of the arithmetic data first_sym_index, virtual_char_index: longint; DataBlockHeader: TDataBlockHeader; // the data header begin {Compression process: Sort Transform Move To Front Ari Code} GetInBlock; // init in_block. InitStructs; While (InStream.Position < InStream.Size) do begin FillInBlockFromStream(InStream, BlockLength); CalculateCRC32(in_block, BlockLength, crc); SortBlock(BlockLength); MTFGetTransformedBlock(BlockLength, first_sym_index, virtual_char_index); AriEncodeBlockToStream(OutStream, BlockLength, AriDataSize); end; (* InitStructs; CalculateCRC32(in_block, block_length, crc); SortBlock(block_length); MTFGetTransformedBlock(block_length, first_sym_index, virtual_char_index); AriEncodeBlock(ArchiveFile, block_length); // save the current position next_free_pos := ArchiveFile.Position; // some calculations ari_data_size := next_free_pos - data_header_offset -DATA_HEADER_SIZE; // seek back to start of data block to write the data header of this block ArchiveFile.SmartSeek(data_header_offset, soFromBeginning); DataBlockHeader := TDataBlockHeader.Create; with DataBlockHeader do begin crc32 := crc; compressed_size := ari_data_size; end; DataBlockHeader.first_sym_index := first_sym_index; DataBlockHeader.virtual_char_index := virtual_char_index; DataBlockHeader.WriteToFile(ArchiveFile); DataBlockHeader.Free; // seek back to where we left off ArchiveFile.SmartSeek(next_free_pos, soFromBeginning); // allow screen update Application.ProcessMessages; // return values packed_size := ari_data_size; *) end; procedure TCompressor.FillInBlockFromStream(Stream: TStream; var BlockLength: integer); begin BlockLength := Stream.Read(in_block^[0], BlockSize); end; {------------------------------------------------------------------------------- RLEEncode Run Length Encode the block for faster sorting. OUT Assertion: block_length is set to the new length -------------------------------------------------------------------------------} {procedure TCompressor.RLEEncode; var RLEEncoder: TRunLengthEncoder; begin RLEEncoder := TRunLengthEncoder.Create; RLEEncoder.EncodeBlock(in_block, out_block, block_length, block_length); RLEEncoder.Free; SwapBlocks; end;} {------------------------------------------------------------------------------- SortBlock -------------------------------------------------------------------------------} procedure TCompressor.SortBlock(var block_length: longint); var i: longint; begin for i := 0 to block_length-1 do index[i] := i; //DebugShowDoingSorting; FastSorter.SortBlock(in_block, index, block_length); // SadaSort adds a virtual char inc(block_length); // debug check {DumpSortedBlock;} {CheckSortedBlock;} // in_block is not changed, only Index is created. // swapblocks need not be called end; {------------------------------------------------------------------------------- GetTransformedBlock and MTF encode Get the last column l -------------------------------------------------------------------------------} procedure TCompressor.MTFGetTransformedBlock(var block_length, first_sym_index, virtual_char_index: longint); var i, j: longint; begin //DebugShowDoingTransform; MTFEncoder.Init; // mirror the last byte in block to block[-1], so when i = 0, block^[i-1] works in_block^[-1] := in_block^[block_length-1]; // sada sort account for vitual. don't pass it to the mtf. // we remove it from out_block and store its index. i := 0; // in_block index j := 0; // out_block index virtual_char_index := -2; while (i < block_length) do begin if (index[i] = 1) then first_sym_index := i; // the virtual char is accessed when in_block[-1] is accessed if ((index[i]-1) = -1) then virtual_char_index := j // we skip the virtual char else begin out_block[j] := MTFEncoder.Encode(in_block[index[i]-1]); inc(j); end; inc(i); end; if (virtual_char_index = -2) then begin // fatal error: virtual_char_index may not have been initialized at all ShowError('virtual_char_index not initialized.'); end; // we have taken out the virtual char, so we dec block_length dec(block_length); SwapBlocks; end; {------------------------------------------------------------------------------- AriEncodeBlock Notes: Arithmetic compress block and output block -------------------------------------------------------------------------------} procedure TCompressor.AriEncodeBlock(ArchiveFile: TArchiveFile; block_length: longint); begin //DebugShowDoingAriCompress; // FileStrucAriEncoder := TFileStrucAriEncoder.Create; FileStrucAriEncoder.EncodeBlock(ArchiveFile, in_block, block_length); // debug check {DecodeBlock(recovered_block, rsize); CompareBlocks(mtf_block, recovered_block, block_length, 'Decompression error.');} end; procedure TCompressor.AriEncodeBlockToStream(OutStream: TStream; block_length: longint; var OutSize: integer); begin StreamAriEncoder.EncodeBlock(OutStream, in_block, block_length, OutSize); end; (* procedure TCompressor.GetTransformedBlock(var first_sym_index, virtual_char_index: longint); var i, j: longint; begin DebugShowDoingTransform; // mirror the last byte in block to block[-1], so when i = 0, block^[i-1] works in_block^[-1] := in_block^[block_length-1]; // sada sort account for vitual. don't pass it to the mtf. // we remove it from out_block and store its index. i := 0; // in_block index j := 0; // out_block index virtual_char_index := -2; while (i < block_length) do begin if (index^[i] = 1) then first_sym_index := i; // the virtual char is accessed when in_block[-1] is accessed if ((index^[i]-1) = -1) then virtual_char_index := j // we skip the virtual char else begin out_block^[j] := in_block^[longint(index[i])-1]; inc(j); end; inc(i); end; //ShowMessage('Virtual char index: ' + IntToStr(virtual_char_index)); if (virtual_char_index = -2) then begin // fatal error: virtual_char_index may not have been initialized at all ShowError('virtual_char_index not initialized.'); end; // we have taken out the virtual char, so we dec block_length dec(block_length); // debug check {DumpTransformedBlock;} {DoBlockRecover; CheckRecoveredBlock;} SwapBlocks; end; {------------------------------------------------------------------------------- MTFEncodeBlock -------------------------------------------------------------------------------} procedure TCompressor.MTFEncodeBlock; var MTFEncoder: TMTFEncoder; {MTFDecoder: TMTFDecoder;} begin DebugShowDoingMTF; MTFEncoder := TMTFEncoder.create; MTFEncoder.EncodeBlock(in_block, out_block, block_length); MTFEncoder.free; SwapBlocks; // debug check {MTFDecoder := TMTFDecoder.create; MTFDecoder.DecodeBlock(mtf_block, recovered_block, block_length); MTFDecoder.free;} end; *) (******************************************************************************* Debuging routines *******************************************************************************) (* procedure TCompressor.DoBlockRecover; {var RecoveredBlockLength: Longint;} begin //Expander.ExpandBlock(block, recovered_block, first_sym_index, block_length, RecoveredBlockLength); //Expander.ExpandBlock(transformed_block, recovered_block, first_sym_index, block_length, RecoveredBlockLength); end; {------------------------------------------------------------------------------- DumpSortedBlock --------------- Dumps the data in block sorted in alphabetical order. Used to visually confirm the reliability of the sorting algorithm. -------------------------------------------------------------------------------} procedure TCompressor.DumpSortedBlock; var f: text; i: integer; begin AssignFile(f, 'c:\ctest\SortedBlockDump.txt'); Rewrite(f); writeln(f, 'Sorted Block Dump file'); writeln(f, 'reSource eXperimental (C) 1997 F-inc'); writeln(f, '======================================='); writeln(f, 'block_length: ', block_length); writeln(f, '======================================='); for i := 0 to block_length-1 do {if (index^[i] = block_length) then write(f, '?') else} //write(f, char(block^[index^[i]])); Close(f); end; {------------------------------------------------------------------------------- DumpBlock --------- Dumps the block, b to a file. Used by DumpTransformedBlock -------------------------------------------------------------------------------} procedure TCompressor.DumpBlock(var b; bsize: longint; FileName: string); var f: TOFile; begin f := TOFile.create(FileName); f.Rewrite(1); f.BlockWrite(b, block_length); f.free; end; procedure TCompressor.DumpRecoveredBlock; begin DumpBlock(recovered_block^, block_length, 'c:\ctest\out Recovered Block.txt'); end; {------------------------------------------------------------------------------- DumpTransformedBlock -------------------- Dumps the transformed block to file. This is actually L, or the last column in the transformation matrix. IN Assertion: DoBlockTransform was called. -------------------------------------------------------------------------------} procedure TCompressor.DumpTransformedBlock; begin // DumpBlock(block^, block_length, 'c:\ctest\out Transformed Block.txt'); end; *) {------------------------------------------------------------------------------- CheckSortedBlock ---------------- Checks the sorted block for ascending order. Only displays an error when one has occured. -------------------------------------------------------------------------------} (* procedure TCompressor.CheckSortedBlock; var i: integer; begin {Checks: INBLOCK Assertion: Index has been created} i := 1; while (i < block_length-1) and (in_block^[Index^[i]] >= in_block^[Index^[i-1]]) do inc(i); {An error has occured if i did not reach the end of block} if (i < block_length-1) then ShowError('Block not sorted correctly'); end; {------------------------------------------------------------------------------- CheckRecoveredBlock ------------------- Does a byte to byte comparison of the recovered block and the original block. Shows an error and the position where the first different byte was found. -------------------------------------------------------------------------------} procedure TCompressor.CheckRecoveredBlock; var i: longint; begin //DumpRecoveredBlock; {recovered_block must be the same as original block} for i := 0 to block_length-1 do if recovered_block^[i] <> original_block^[i] then begin ShowError('Recovered block differs from original block at ' + IntToStr(i)); break; end; {Alternate way of comparing using CompareMem. Position of difference start will not be shown. if not CompareMem(recovered_block, block, block_length-1) then ShowError('Recovered block differs from original block');} end; *) (******************************************************************************* Debug Output routines *******************************************************************************) {procedure TCompressor.DebugShowDoingSorting; begin if ConfigMan.ShowDebugForm then DebugForm.DoingSorting; end; procedure TCompressor.DebugShowDoingTransform; begin if ConfigMan.ShowDebugForm then DebugForm.DoingTransform; end;} {procedure TCompressor.DebugShowDoingMTF; begin if ConfigMan.ShowDebugForm then DebugForm.DoingMTF; end;} {procedure TCompressor.DebugShowDoingAriCompress; begin if ConfigMan.ShowDebugForm then DebugForm.DoingAriCompress; end;} end.