rsvk/Component/BWTCompressUnit.pas

692 lines
21 KiB
Plaintext

unit BWTCompressUnit;
{-------------------------------------------------------------------------------
Burrows Wheeler Transformation
Block Compression Unit
------------------------------
reSource v2.6
Copyright (C) 1998-2001 Victor Kasenda / gruv
http://go.to/gruv
email: vickas@singnet.com.sg
Desc:
This is the class that brings all the engines together.
It uses the FSortUnit, MTFEncoder, StrucAriEncoder.
The whole compression for a block consists of:
1) Burrows Wheeler Transformation (Sort + Retrieve last column)
2) Move To Front encoding
3) Structured Arithmetic encoding
Brief Explanation:
1) BWT is the trick to the high performance compression
2) Move to Front coding is done to transform the block into a series
of numbers. The more frequantly appearing characters will thus be
transformed to lower numbers, resulting a low numbers dominating the
block (0 and 1s especially). This aids Arithmetic coding.
3) Arithmetic coding is performed with a structured or hierarchical model.
Read the system doc for more information about the structured
arithmetic model.
For a more in depth discussion of the compression process, refer
to the system doc.
Usage:
- just create the object and call CompressBlockToFile
CompressBlockToFile writes out the data header and the data
- to not use the structured arithmetic encoder, undefine USE_STRUC_ARI
Notes:
- read notes.txt for information about the block swapping technique used
- certain debug procedures have been commented out to prevent hints
- the general rule is pass only what is needed for the engine wrappers
-------------------------------------------------------------------------------}
(**) interface (**)
uses // delphi
Classes, Forms, SysUtils, Dialogs,
// general
OFile, StructsUnit, ArchiveFileUnit, ArchiveHeadersUnit, CRC32Unit,
// engine
RLEUnit, FSortUnit, MTFEncoderUnit, MTFDecoderUnit, BWTExpandUnit,
FileStrucAriEncoderUnit, StreamStrucAriEncoderUnit,
// base
BWTBaseUnit;
type
TCompressor = class(TBWTBase)
private
//block1, block2: PBlock;
index: PLongintBlock;
// Debug
{original_block: PBlock;
recovered_block: PBlock;}
{Expander: TExpander;}
// Classes
FastSorter: TFastSorter;
MTFEncoder: TMTFEncoder;
FileStrucAriEncoder: TFileStrucAriEncoder;
StreamAriEncoder: TStreamAriEncoder;
{RunLengthEncoder: TRunLengthEncoder;}
// Main compression routines
{procedure AllocateStructs;
procedure FreeStructs;}
procedure InitStructs;
procedure SortBlock(var block_length: longint);
procedure MTFGetTransformedBlock(var block_length, first_sym_index, virtual_char_index: longint);
procedure AriEncodeBlock(ArchiveFile: TArchiveFile; block_length: longint);
procedure AriEncodeBlockToStream(OutStream: TStream; block_length: longint; var OutSize: integer);
procedure FillInBlockFromStream(Stream: TStream; var BlockLength: integer);
{procedure GetTransformedBlock(var first_sym_index, virtual_char_index: longint);
procedure MTFEncodeBlock;}
{procedure RLEEncode;}
// Debug
{procedure DoBlockRecover;
procedure DumpBlock(var b; bsize: longint; FileName: string);
procedure DumpSortedBlock;
procedure DumpTransformedBlock;
procedure DumpRecoveredBlock;
procedure CheckSortedBlock;
{procedure CheckRecoveredBlock;}
// Debug output
{procedure DebugShowDoingSorting;
procedure DebugShowDoingTransform;}
{procedure DebugShowDoingMTF;}
{procedure DebugShowDoingAriCompress;}
public
constructor Create;
destructor Destroy; override;
function GetInBlock: PBlock; // fill the inblock then compress it
procedure CompressInBlockToFile(block_length: longint; ArchiveFile: TArchiveFile;
var packed_size: integer);
procedure CompressStream(InStream, OutStream: TStream);
end;
(**) implementation (**)
uses ErrorUnit;
constructor TCompressor.Create;
begin
inherited Create;
//AllocateStructs;
FastSorter := TFastSorter.Create;
MTFEncoder := TMTFEncoder.create;
FileStrucAriEncoder := TFileStrucAriEncoder.Create;
StreamAriEncoder := TStreamAriEncoder.Create;
{Debug}
{Expander := TExpander.Create;}
end;
destructor TCompressor.Destroy;
begin
{Debug}
{Expander.Free;}
FileStrucAriEncoder.Free;
MTFEncoder.Free;
FastSorter.Free;
//FreeStructs;
inherited Destroy;
end;
(*
{-------------------------------------------------------------------------------
AllocateStructs
---------------
Allocate memory for the block transformation and assign in_block and out_block
-------------------------------------------------------------------------------}
procedure TCompressor.AllocateStructs;
begin
New(block1);
New(block2);
New(index);
// Debug
{New(recovered_block);
New(original_block);}
end;
{-------------------------------------------------------------------------------
FreeStructs
Free whatever memory that was allocated by AllocateStructs
-------------------------------------------------------------------------------}
procedure TCompressor.FreeStructs;
begin
// Debug
{Dispose(original_block);
Dispose(recovered_block);}
Dispose(index);
Dispose(block2);
Dispose(block1);
end;
*)
{-------------------------------------------------------------------------------
InitStructs
inits swap block structures.
pass the block1 to be assigned
inits the index.
Assigns an index to every position in block. Each entry in index indicates the
start of a string.
-------------------------------------------------------------------------------}
procedure TCompressor.InitStructs;
begin
// Assign block pointers for the swapblocks system
// in_block was assigned when GetInBlock was called. it took block1
out_block := BlockMan.block2;
index := BlockMan.longintblock1;
end;
function TCompressor.GetInBlock: PBlock; // fill the inblock then compress it
begin
in_block := BlockMan.block1;
result := BlockMan.block1;
end;
{-------------------------------------------------------------------------------
CompressBlockToFile
-------------------
Writes out the data header + data
IN Assertion: ArchiveFile has been seeked to the next write position
OUT Assertion: ArchiveFile is seeked to the next output position
-------------------------------------------------------------------------------}
procedure TCompressor.CompressInBlockToFile;
var
data_header_offset, // offset of the data header
next_free_pos: integer; // the next output position when ArchiveFile is returned
ari_data_size: longword; // size of the arithmetic data
crc: longword; // crc calculated for this block
first_sym_index, virtual_char_index: longint;
DataBlockHeader: TDataBlockHeader; // the data header
begin
{Compression process:
Sort
Transform
Move To Front
Ari Code}
// reserve space for the block size first
data_header_offset := ArchiveFile.Position;
ArchiveFile.ReserveSpace(DATA_HEADER_SIZE);
ArchiveFile.ResetBuffer;
InitStructs;
CalculateCRC32(in_block, block_length, crc);
SortBlock(block_length);
MTFGetTransformedBlock(block_length, first_sym_index, virtual_char_index);
AriEncodeBlock(ArchiveFile, block_length);
// save the current position
next_free_pos := ArchiveFile.Position;
// some calculations
ari_data_size := next_free_pos - data_header_offset -DATA_HEADER_SIZE;
// seek back to start of data block to write the data header of this block
ArchiveFile.SmartSeek(data_header_offset, soFromBeginning);
DataBlockHeader := TDataBlockHeader.Create;
DataBlockHeader.crc32 := crc;
DataBlockHeader.compressed_size := ari_data_size;
DataBlockHeader.first_sym_index := first_sym_index;
DataBlockHeader.virtual_char_index := virtual_char_index;
DataBlockHeader.WriteToFile(ArchiveFile);
DataBlockHeader.Free;
// seek back to where we left off
ArchiveFile.SmartSeek(next_free_pos, soFromBeginning);
// allow screen update
Application.ProcessMessages;
// return values
packed_size := ari_data_size;
end;
{-------------------------------------------------------------------------------
CompressStream
-------------------
Writes Compressed Data Only to OutStream.
No block information is stored.
IN Assertion: ArchiveFile has been seeked to the next write position
OUT Assertion: ArchiveFile is seeked to the next output position
-------------------------------------------------------------------------------}
procedure TCompressor.CompressStream(InStream, OutStream: TStream);
var
BlockLength: integer; // actual size of data in block
crc: longword; // crc calculated for this block
AriDataSize: longint; // size of the arithmetic data
first_sym_index, virtual_char_index: longint;
DataBlockHeader: TDataBlockHeader; // the data header
begin
{Compression process:
Sort
Transform
Move To Front
Ari Code}
GetInBlock; // init in_block.
InitStructs;
While (InStream.Position < InStream.Size) do
begin
FillInBlockFromStream(InStream, BlockLength);
CalculateCRC32(in_block, BlockLength, crc);
SortBlock(BlockLength);
MTFGetTransformedBlock(BlockLength, first_sym_index, virtual_char_index);
AriEncodeBlockToStream(OutStream, BlockLength, AriDataSize);
end;
(*
InitStructs;
CalculateCRC32(in_block, block_length, crc);
SortBlock(block_length);
MTFGetTransformedBlock(block_length, first_sym_index, virtual_char_index);
AriEncodeBlock(ArchiveFile, block_length);
// save the current position
next_free_pos := ArchiveFile.Position;
// some calculations
ari_data_size := next_free_pos - data_header_offset -DATA_HEADER_SIZE;
// seek back to start of data block to write the data header of this block
ArchiveFile.SmartSeek(data_header_offset, soFromBeginning);
DataBlockHeader := TDataBlockHeader.Create;
with DataBlockHeader do
begin
crc32 := crc;
compressed_size := ari_data_size;
end;
DataBlockHeader.first_sym_index := first_sym_index;
DataBlockHeader.virtual_char_index := virtual_char_index;
DataBlockHeader.WriteToFile(ArchiveFile);
DataBlockHeader.Free;
// seek back to where we left off
ArchiveFile.SmartSeek(next_free_pos, soFromBeginning);
// allow screen update
Application.ProcessMessages;
// return values
packed_size := ari_data_size;
*)
end;
procedure TCompressor.FillInBlockFromStream(Stream: TStream; var BlockLength: integer);
begin
BlockLength := Stream.Read(in_block^[0], BlockSize);
end;
{-------------------------------------------------------------------------------
RLEEncode
Run Length Encode the block for faster sorting.
OUT Assertion: block_length is set to the new length
-------------------------------------------------------------------------------}
{procedure TCompressor.RLEEncode;
var
RLEEncoder: TRunLengthEncoder;
begin
RLEEncoder := TRunLengthEncoder.Create;
RLEEncoder.EncodeBlock(in_block, out_block, block_length, block_length);
RLEEncoder.Free;
SwapBlocks;
end;}
{-------------------------------------------------------------------------------
SortBlock
-------------------------------------------------------------------------------}
procedure TCompressor.SortBlock(var block_length: longint);
var
i: longint;
begin
for i := 0 to block_length-1 do
index[i] := i;
//DebugShowDoingSorting;
FastSorter.SortBlock(in_block, index, block_length);
// SadaSort adds a virtual char
inc(block_length);
// debug check
{DumpSortedBlock;}
{CheckSortedBlock;}
// in_block is not changed, only Index is created.
// swapblocks need not be called
end;
{-------------------------------------------------------------------------------
GetTransformedBlock and MTF encode
Get the last column l
-------------------------------------------------------------------------------}
procedure TCompressor.MTFGetTransformedBlock(var block_length, first_sym_index, virtual_char_index: longint);
var
i, j: longint;
begin
//DebugShowDoingTransform;
MTFEncoder.Init;
// mirror the last byte in block to block[-1], so when i = 0, block^[i-1] works
in_block^[-1] := in_block^[block_length-1];
// sada sort account for vitual. don't pass it to the mtf.
// we remove it from out_block and store its index.
i := 0; // in_block index
j := 0; // out_block index
virtual_char_index := -2;
while (i < block_length) do
begin
if (index[i] = 1) then
first_sym_index := i;
// the virtual char is accessed when in_block[-1] is accessed
if ((index[i]-1) = -1) then
virtual_char_index := j // we skip the virtual char
else
begin
out_block[j] := MTFEncoder.Encode(in_block[index[i]-1]);
inc(j);
end;
inc(i);
end;
if (virtual_char_index = -2) then
begin
// fatal error: virtual_char_index may not have been initialized at all
ShowError('virtual_char_index not initialized.');
end;
// we have taken out the virtual char, so we dec block_length
dec(block_length);
SwapBlocks;
end;
{-------------------------------------------------------------------------------
AriEncodeBlock
Notes:
Arithmetic compress block and output block
-------------------------------------------------------------------------------}
procedure TCompressor.AriEncodeBlock(ArchiveFile: TArchiveFile; block_length: longint);
begin
//DebugShowDoingAriCompress;
// FileStrucAriEncoder := TFileStrucAriEncoder.Create;
FileStrucAriEncoder.EncodeBlock(ArchiveFile, in_block, block_length);
// debug check
{DecodeBlock(recovered_block, rsize);
CompareBlocks(mtf_block, recovered_block, block_length, 'Decompression error.');}
end;
procedure TCompressor.AriEncodeBlockToStream(OutStream: TStream; block_length: longint; var OutSize: integer);
begin
StreamAriEncoder.EncodeBlock(OutStream, in_block, block_length, OutSize);
end;
(*
procedure TCompressor.GetTransformedBlock(var first_sym_index, virtual_char_index: longint);
var
i, j: longint;
begin
DebugShowDoingTransform;
// mirror the last byte in block to block[-1], so when i = 0, block^[i-1] works
in_block^[-1] := in_block^[block_length-1];
// sada sort account for vitual. don't pass it to the mtf.
// we remove it from out_block and store its index.
i := 0; // in_block index
j := 0; // out_block index
virtual_char_index := -2;
while (i < block_length) do
begin
if (index^[i] = 1) then
first_sym_index := i;
// the virtual char is accessed when in_block[-1] is accessed
if ((index^[i]-1) = -1) then
virtual_char_index := j // we skip the virtual char
else
begin
out_block^[j] := in_block^[longint(index[i])-1];
inc(j);
end;
inc(i);
end;
//ShowMessage('Virtual char index: ' + IntToStr(virtual_char_index));
if (virtual_char_index = -2) then
begin
// fatal error: virtual_char_index may not have been initialized at all
ShowError('virtual_char_index not initialized.');
end;
// we have taken out the virtual char, so we dec block_length
dec(block_length);
// debug check
{DumpTransformedBlock;}
{DoBlockRecover;
CheckRecoveredBlock;}
SwapBlocks;
end;
{-------------------------------------------------------------------------------
MTFEncodeBlock
-------------------------------------------------------------------------------}
procedure TCompressor.MTFEncodeBlock;
var
MTFEncoder: TMTFEncoder;
{MTFDecoder: TMTFDecoder;}
begin
DebugShowDoingMTF;
MTFEncoder := TMTFEncoder.create;
MTFEncoder.EncodeBlock(in_block, out_block, block_length);
MTFEncoder.free;
SwapBlocks;
// debug check
{MTFDecoder := TMTFDecoder.create;
MTFDecoder.DecodeBlock(mtf_block, recovered_block, block_length);
MTFDecoder.free;}
end;
*)
(*******************************************************************************
Debuging routines
*******************************************************************************)
(*
procedure TCompressor.DoBlockRecover;
{var
RecoveredBlockLength: Longint;}
begin
//Expander.ExpandBlock(block, recovered_block, first_sym_index, block_length, RecoveredBlockLength);
//Expander.ExpandBlock(transformed_block, recovered_block, first_sym_index, block_length, RecoveredBlockLength);
end;
{-------------------------------------------------------------------------------
DumpSortedBlock
---------------
Dumps the data in block sorted in alphabetical order.
Used to visually confirm the reliability of the sorting algorithm.
-------------------------------------------------------------------------------}
procedure TCompressor.DumpSortedBlock;
var
f: text;
i: integer;
begin
AssignFile(f, 'c:\ctest\SortedBlockDump.txt');
Rewrite(f);
writeln(f, 'Sorted Block Dump file');
writeln(f, 'reSource eXperimental (C) 1997 F-inc');
writeln(f, '=======================================');
writeln(f, 'block_length: ', block_length);
writeln(f, '=======================================');
for i := 0 to block_length-1 do
{if (index^[i] = block_length) then
write(f, '?')
else}
//write(f, char(block^[index^[i]]));
Close(f);
end;
{-------------------------------------------------------------------------------
DumpBlock
---------
Dumps the block, b to a file.
Used by DumpTransformedBlock
-------------------------------------------------------------------------------}
procedure TCompressor.DumpBlock(var b; bsize: longint; FileName: string);
var
f: TOFile;
begin
f := TOFile.create(FileName);
f.Rewrite(1);
f.BlockWrite(b, block_length);
f.free;
end;
procedure TCompressor.DumpRecoveredBlock;
begin
DumpBlock(recovered_block^, block_length, 'c:\ctest\out Recovered Block.txt');
end;
{-------------------------------------------------------------------------------
DumpTransformedBlock
--------------------
Dumps the transformed block to file.
This is actually L, or the last column in the transformation matrix.
IN Assertion: DoBlockTransform was called.
-------------------------------------------------------------------------------}
procedure TCompressor.DumpTransformedBlock;
begin
// DumpBlock(block^, block_length, 'c:\ctest\out Transformed Block.txt');
end;
*)
{-------------------------------------------------------------------------------
CheckSortedBlock
----------------
Checks the sorted block for ascending order.
Only displays an error when one has occured.
-------------------------------------------------------------------------------}
(*
procedure TCompressor.CheckSortedBlock;
var
i: integer;
begin
{Checks: INBLOCK
Assertion: Index has been created}
i := 1;
while (i < block_length-1) and (in_block^[Index^[i]] >= in_block^[Index^[i-1]]) do
inc(i);
{An error has occured if i did not reach the end of block}
if (i < block_length-1) then
ShowError('Block not sorted correctly');
end;
{-------------------------------------------------------------------------------
CheckRecoveredBlock
-------------------
Does a byte to byte comparison of the recovered block and the original block.
Shows an error and the position where the first different byte was found.
-------------------------------------------------------------------------------}
procedure TCompressor.CheckRecoveredBlock;
var
i: longint;
begin
//DumpRecoveredBlock;
{recovered_block must be the same as original block}
for i := 0 to block_length-1 do
if recovered_block^[i] <> original_block^[i] then
begin
ShowError('Recovered block differs from original block at ' + IntToStr(i));
break;
end;
{Alternate way of comparing using CompareMem.
Position of difference start will not be shown.
if not CompareMem(recovered_block, block, block_length-1) then
ShowError('Recovered block differs from original block');}
end;
*)
(*******************************************************************************
Debug Output routines
*******************************************************************************)
{procedure TCompressor.DebugShowDoingSorting;
begin
if ConfigMan.ShowDebugForm then DebugForm.DoingSorting;
end;
procedure TCompressor.DebugShowDoingTransform;
begin
if ConfigMan.ShowDebugForm then DebugForm.DoingTransform;
end;}
{procedure TCompressor.DebugShowDoingMTF;
begin
if ConfigMan.ShowDebugForm then DebugForm.DoingMTF;
end;}
{procedure TCompressor.DebugShowDoingAriCompress;
begin
if ConfigMan.ShowDebugForm then DebugForm.DoingAriCompress;
end;}
end.