rsvk/Component/FSortUnit.pas

654 lines
16 KiB
Plaintext

unit FSortUnit;
{-------------------------------------------------------------------------------
Fast sorter unit
----------------
reSource v2.6
Copyright (C) 1998-2001 Victor Kasenda / gruv
http://go.to/gruv
email: vickas@singnet.com.sg
Fast sort unit.
Algos:
DJ Wheeler from his June 1989 report and
Kunihiko Sadakane's Suffix sort.
coded by gruv
Notes:
Sort the index, not the block.
SadaSort compares group indexes not block.
Sort rev 4:
Radix on symbol pairs.
Sadakane's Suffix sort.
------------------------------------------------------------------------------}
(**) interface (**)
uses SysUtils, Forms, dialogs, StructsUnit;
const
STRIDE = 4;
MAXDEPTH = 20;
NUMOVERSHOOT = MAXDEPTH + 100;
type
{THead = array[0..65535] of Longint;
PHead = ^THead;}
TFastSorter = class
private
block: PBlock; // block to sort
index: PLongintBlock; // index to the block to sort. each index pos is a string
block_length: longint; // length of the block
last_index: integer;
head: P64kBlock; // head of the linked list
link: PLongintBlock; // links in the linked list
//link_count: PHead; // Number of links in each head
//index_head: PHead; // start of each group in index
group: PLongintBlock; // group of suffix s
size: PLongintBlock;
{For SadaSort: from the paper
I -> index
V -> group
S -> size}
procedure RadixSortOnSymbolPairs;
procedure InitIndexFromLink;
procedure SadaSort;
procedure SortGroup(const stlo, sthi, depth: integer);
public
constructor Create;
destructor Destroy; override;
procedure SortBlock(const _block: PBlock; const _index: PLongintBlock; const _block_length: longint);
end;
(**) implementation (**)
uses ErrorUnit;
{-------------------------------------------------------------------------------
Create/Destroy
--------------
Allocates and frees the memory structures used for sorting.
-------------------------------------------------------------------------------}
constructor TFastSorter.Create;
begin
inherited Create;
{New(head);
New(link);
//New(link_count);
//New(index_head);
New(group);
New(size);}
end;
destructor TFastSorter.Destroy;
begin
{Dispose(size);
Dispose(group);
//Dispose(index_head);
//Dispose(link_count);
Dispose(link);
Dispose(head);}
inherited Destroy;
end;
{-------------------------------------------------------------------------------
SortBlock
---------
Main procedure to call.
Initializes the block then calls the respective procedures to sort the block.
-------------------------------------------------------------------------------}
procedure TFastSorter.SortBlock(const _block: PBlock; const _index: PLongintBlock; const _block_length: longint);
procedure Initialize;
var
i: integer;
begin
{Initialize}
block := _block;
index := _index;
block_length := _block_length;
last_index := block_length-1;
// sizes array need not be cleared. it will be init.
// assign block memory
// index -> longintblock1
head := BlockMan.k64Block;
link := BlockMan.longintblock2;
group := BlockMan.longintblock2;
size := BlockMan.longintblock3;
{Clear Arrays}
for i := 0 to 65535 do
head[i] := -1;
end;
//var
//head_idx, cur_head: longword;
//first_char: byte;
//i, numkeys, first_head: integer; // numkeys: total number of keys with first_char
//t: longword;
//totalbytes: integer; // for progress bar
begin
//totalbytes := 0;
Initialize;
RadixSortOnSymbolPairs; // fill up head and link
InitIndexFromLink; // get index in semi sorted order and index_head
SadaSort;
end;
{-------------------------------------------------------------------------------
RadixSortOnSymbolPairs
----------------------
Radix sort: Run through the block array in words to get the buckets and dump
the indexes into their respective bucket.
Initializes long_block with each long integer straddling 4 bytes.
OUT Assertion:
head/link are linked lists to the sort.
long_block is initialized
-------------------------------------------------------------------------------}
procedure TFastSorter.RadixSortOnSymbolPairs;
var
i: integer;
w: word;
begin
{Init w with the first character}
w := block^[0];
for i := 0 to last_index-1 do
begin
w := word(w shl 8) or block^[i+1];
{if there is no entry in head then set the pos as the head.
Otherwise link the pos in by making it the head and setting its link}
if (head^[w] = -1) then
begin
head^[w] := i;
link^[i] := -1;
{Set link^[i] to -1 as the terminator}
end
else
begin
link^[i] := head^[w];
head^[w] := i;
end;
end; {for}
end;
{-------------------------------------------------------------------------------
InitIndexFromLink
-----------------
Out Assertion:
Inits index, index_head and link_count.
index_head will point to the head of each symbol pair in index.
link_count is the count for each symbol pair corresponding in head.
index will contain a continuous list of indexes. These indexes are in groups
with their head pointed to by index_head and counts in link_count.
Quicksort will sort the index.
head no more used.
Desc:
This will run through the head array.
It will fill in the index_head with all valid entries from head.
It is therefore possible that index_head be smaller than head, because all
-1 entries are removed.
The current index position is then filled with the head value.
If there is a head, there may be links. So the links are filled in trailing
after the head value until a -1 terminator is reached.
Note:
link_count includes the head node and all other link nodes.
link_count corresponds to the new def. of head, NOT the old one.
link_count[i] is the count for index_head[i].
All -1 or 'no entries' in index_head have been removed. index_head is a continuous list
of heads in index.
The end of index_head is marked by a -1.
New:
use link and head to init index, index_head, link_count, size
index_pos walks through to fill in index with the semi sorted indexes.
after this, link and head are no more used.
link and group share the same memory location
-------------------------------------------------------------------------------}
procedure TFastSorter.InitIndexFromLink;
var
i, index_pos, {head_pos,} cur_node, t: longint;
group_num: integer;
//group_first_index: integer;
group_size: integer;
w: word;
begin
index_pos := 1; // start from 1 for virtual smallest character. for circular start from 0
//head_pos := 0;
// due to the last char being the smallest char, we must fill in manually
// link for that one.
// if actual last is 'e', then we get 'e$00' and we add to the head.
w := word(block[last_index] shl 8);
{if there is no entry in head then set the pos as the head.
Otherwise link the pos in by making it the head and setting its link}
if (head^[w] = -1) then
begin
head^[w] := last_index;
link^[last_index] := -1;
// Set link[i] to -1 as the terminator
end
else
begin
link^[last_index] := head^[w];
head^[w] := last_index;
end;
{go through each radix bucket}
for i := 0 to 65535 do
begin
cur_node := head^[i];
if (i = w) then
begin
// the link with the virtual smallest char is the first one
// we give it it's own group number, remove it from the linked list
// and continue as if this never happened
// cur_node is the index
// index_pos is the group number
Assert(cur_node = last_index);
index[index_pos] := cur_node;
size[index_pos] := 1;
// link and group share the same memory location. update cur_node then
// assign the group number because we'll never access that link again.
cur_node := link[cur_node]; // take out the memory contents
group[last_index] := index_pos; // override it
inc(index_pos);
end;
if (cur_node <> -1) then
begin
{Head now points to the head of a symbol pair linked list in index}
//index_head^[head_pos] := index_pos;
//link_count^[head_pos] := 0;
// walk the linked list
group_num := index_pos; // group_num is i
//group_first_index := cur_node;
group_size := 0;
repeat
// collate the nodes in index
index[index_pos] := cur_node;
t := cur_node; // save the cur_node
cur_node := link[cur_node];
// fill in the group number for index_pos
// override previous memory location in link with the group_num
group[t] := group_num; // group[index[index[pos]] or V[I[i]]
inc(index_pos);
inc(group_size); // inc(link_count[head_pos]);
until (cur_node = -1);
// fill in the group size in size[group_num]
size[group_num] := group_size;
//size[group_num] := link_count[head_pos];
//inc(head_pos);
end;
end;
//index_head^[head_pos] := -1;
// init the virtual smallest character
block[block_length] := 0;
index[0] := block_length;
size[0] := -1; // sorted, 1 char only
group[index[0]] := 0; // first group}
end;
{Notes:
group and index init from 1 to block_size
0 is the virtual smallest char. compare with index[0]=block_size should
be greater. note that index[0] may not contain block_size}
procedure TFastSorter.SortGroup(const stlo, sthi, depth: integer);
{Swap - swaps 2 values v1 and v2 }
procedure Swap(var v1, v2: longword); overload;
var
t: longword;
begin
t := v1;
v1 := v2;
v2 := t;
end;
{Swap - swaps 2 values v1 and v2 }
procedure Swap(var v1, v2: longint); overload;
var
t: longword;
begin
t := v1;
v1 := v2;
v2 := t;
end;
{Vector swap}
procedure VecSwap(p1, p2, n: longword);
{var
t: longword;}
begin
while (n > 0) do
begin
{Swap p1, p2}
{t := p1;
p1 := p2;
p2 := t;}
Swap(index[p1], index[p2]);
inc(p1); inc(p2); dec(n);
end;
end;
{Median of 3}
function Med3(a, b, c: byte): byte; overload;
var
t: byte;
begin
if (a > b) then
begin
{Swap a, b}
t := a; a := b; b := t;
end;
if (b > c) then
begin
{Swap b, c}
t := b;
b := c;
c := t;
end;
if (a > b) then b := a;
result := b;
end;
function Min(a, b: integer): integer;
begin
if (a < b) then
result := a
else
result := b;
end;
function Med3(a, b, c: longword): longword; overload;
var
t: longword;
begin
if (a > b) then
begin
{Swap a, b}
t := a; a := b; b := t;
end;
if (b > c) then
begin
{Swap b, c}
t := b;
b := c;
c := t;
end;
if (a > b) then b := a;
result := b;
end;
function Med3(a, b, c: integer): integer; overload;
var
t: integer;
begin
if (a > b) then
begin
{Swap a, b}
t := a; a := b; b := t;
end;
if (b > c) then
begin
{Swap b, c}
t := b;
b := c;
c := t;
end;
if (a > b) then b := a;
result := b;
end;
{function NormIdx(idx: integer): integer;
begin
repeat
if (idx > last_index) then
dec(idx, last_index)
else
begin
result := idx;
exit;
end;
until false;
end;}
procedure QSort3(lo, hi: integer);
{lo, hi: first and last element
Note: we will compare group numbers
the depth of comparison is constant througout the recursion}
var
a, b, c, d: integer; // may become negative?
r: integer;
med: integer; // byte
i, group_num: integer;
begin
if (hi-lo < 1) then
begin
// 1 item only. assign it a group
if (hi = lo) then
begin
group[index[hi]] := hi;
size[hi] := 1;
end;
exit;
end;
med := Med3(group[index[lo] + depth],
group[index[hi] + depth],
group[index[(lo + hi) shr 1] + depth]);
a := lo;
b := lo;
c := hi;
d := hi;
while true do
begin
{ = < }
{ find item greater than med, while swapping equal items to the left }
while (b <= c) and (group[index[b] + depth] <= med) do
begin
if (group[index[b] + depth] = med) then
begin
Swap(index[a], index[b]);
inc(a);
end;
inc(b);
end;
{ > = }
{ find item smaller than med, while swapping equal items to the right }
while (b <= c) and (group[index[c] + depth] >= med) do
begin
if (group[index[c] + depth] = med) then
begin
Swap(index[c], index[d]);
dec(d);
end;
dec(c);
end;
if (b > c) then break;
// swap b and c
Swap(index[b], index[c]);
inc(b);
dec(c);
end;
{b = c+1 once we are out}
Assert(b = c+1);
//if b <> (c+1) then ShowMessage('bc');
{final arrangment:
lo a c b d hi
d is next avail pos. d+1 to hi: = items
a is next avail pos. lo to a-1: = items}
{left centre right}
{swap enough to get left from '= <' to '< ='
a-lo: num of = items
b-a: num of < items
r gives the min items to swap}
r := min(a-lo, b-a);
VecSwap(lo, b-r, r);
{swap enough to get right from '> =' to '= >'
d-c: num of > items
hi-d: num of = items}
r := min(d-c, hi-d);
VecSwap(b, hi-r+1, r);
// sort from higher to lower
// for equal items update their group numbers to the same group
r := d-c; // num of '>' items
QSort3(hi-r+1, hi); // sort right
r := (a-lo) + (hi-d);
{QSort3(lo+b-a, lo+b-a+r-1); // sort middle}
group_num := lo+b-a;
for i := lo+b-a to lo+b-a+r-1 do // give the '=' items the same group number
group[index[i]] := group_num;
size[group_num] := r;
r := b-a; // size of '<' items
QSort3(lo, lo + r - 1); // sort left
end; {QSort3}
begin
QSort3(stlo, sthi);
end;
procedure TFastSorter.SadaSort;
var
i, k: integer;
first_i: integer;
group_size: integer;
begin
// sort unsorted groups
// go through the size array. anything with size 1 we ignore and add to the
// previous group size
// if first_i = -1 that means first_i not avail and next sorted group can
// be first_i
// blocksize has increased by 1 because of the vitual char
inc(block_length);
// keep sorting until all has been sorted
k := 2;
while (abs(size[0]) < (block_length-1)) do
begin
i := 0;//i := abs(size[0]);
first_i := -1;
repeat
if (size[i] < 0) then
begin
if (first_i = -1) then
begin
first_i := i; // we can add further sorted groups to this group
inc(i, abs(size[i])); // skip this group
end
else
begin
Assert(size[first_i] < 0);
inc(size[first_i], size[i]); // add to the first_i
inc(i, abs(size[i])); // skip, because it is sorted and group has been combined
end;
end
else if (size[i] = 1) then
begin
if (first_i = -1) then
begin
first_i := i; // we can add further sorted groups to this group
size[first_i] := -1; // make this the head sorted group
end
else
begin
Assert(size[first_i] < 0);
dec(size[first_i]); // add this group to the first_i
end;
inc(i);
end
else
begin
// group size > 1 sort it
group_size := size[i];
SortGroup(i, i + size[i]-1, k);
inc(i, group_size); // size[i] may change after sort group
first_i := -1;
end;
until (i >= block_length); // while (i < block_length-1)
k := k * 2;
end;
end;
end.