654 lines
16 KiB
Plaintext
654 lines
16 KiB
Plaintext
unit FSortUnit;
|
|
{-------------------------------------------------------------------------------
|
|
Fast sorter unit
|
|
----------------
|
|
reSource v2.6
|
|
Copyright (C) 1998-2001 Victor Kasenda / gruv
|
|
http://go.to/gruv
|
|
email: vickas@singnet.com.sg
|
|
|
|
Fast sort unit.
|
|
Algos:
|
|
DJ Wheeler from his June 1989 report and
|
|
Kunihiko Sadakane's Suffix sort.
|
|
|
|
coded by gruv
|
|
|
|
Notes:
|
|
Sort the index, not the block.
|
|
SadaSort compares group indexes not block.
|
|
|
|
Sort rev 4:
|
|
Radix on symbol pairs.
|
|
Sadakane's Suffix sort.
|
|
|
|
------------------------------------------------------------------------------}
|
|
|
|
|
|
(**) interface (**)
|
|
uses SysUtils, Forms, dialogs, StructsUnit;
|
|
|
|
const
|
|
STRIDE = 4;
|
|
MAXDEPTH = 20;
|
|
NUMOVERSHOOT = MAXDEPTH + 100;
|
|
|
|
|
|
type
|
|
{THead = array[0..65535] of Longint;
|
|
PHead = ^THead;}
|
|
|
|
|
|
TFastSorter = class
|
|
private
|
|
block: PBlock; // block to sort
|
|
|
|
index: PLongintBlock; // index to the block to sort. each index pos is a string
|
|
block_length: longint; // length of the block
|
|
|
|
last_index: integer;
|
|
|
|
head: P64kBlock; // head of the linked list
|
|
link: PLongintBlock; // links in the linked list
|
|
//link_count: PHead; // Number of links in each head
|
|
//index_head: PHead; // start of each group in index
|
|
|
|
group: PLongintBlock; // group of suffix s
|
|
size: PLongintBlock;
|
|
{For SadaSort: from the paper
|
|
I -> index
|
|
V -> group
|
|
S -> size}
|
|
|
|
procedure RadixSortOnSymbolPairs;
|
|
procedure InitIndexFromLink;
|
|
|
|
procedure SadaSort;
|
|
procedure SortGroup(const stlo, sthi, depth: integer);
|
|
public
|
|
constructor Create;
|
|
destructor Destroy; override;
|
|
|
|
procedure SortBlock(const _block: PBlock; const _index: PLongintBlock; const _block_length: longint);
|
|
end;
|
|
|
|
|
|
(**) implementation (**)
|
|
uses ErrorUnit;
|
|
|
|
{-------------------------------------------------------------------------------
|
|
Create/Destroy
|
|
--------------
|
|
|
|
Allocates and frees the memory structures used for sorting.
|
|
-------------------------------------------------------------------------------}
|
|
constructor TFastSorter.Create;
|
|
begin
|
|
inherited Create;
|
|
{New(head);
|
|
New(link);
|
|
//New(link_count);
|
|
//New(index_head);
|
|
|
|
New(group);
|
|
New(size);}
|
|
end;
|
|
|
|
destructor TFastSorter.Destroy;
|
|
begin
|
|
{Dispose(size);
|
|
Dispose(group);
|
|
|
|
//Dispose(index_head);
|
|
//Dispose(link_count);
|
|
Dispose(link);
|
|
Dispose(head);}
|
|
inherited Destroy;
|
|
end;
|
|
|
|
{-------------------------------------------------------------------------------
|
|
SortBlock
|
|
---------
|
|
|
|
Main procedure to call.
|
|
Initializes the block then calls the respective procedures to sort the block.
|
|
-------------------------------------------------------------------------------}
|
|
procedure TFastSorter.SortBlock(const _block: PBlock; const _index: PLongintBlock; const _block_length: longint);
|
|
|
|
procedure Initialize;
|
|
var
|
|
i: integer;
|
|
begin
|
|
{Initialize}
|
|
block := _block;
|
|
index := _index;
|
|
block_length := _block_length;
|
|
last_index := block_length-1;
|
|
|
|
// sizes array need not be cleared. it will be init.
|
|
|
|
// assign block memory
|
|
// index -> longintblock1
|
|
head := BlockMan.k64Block;
|
|
link := BlockMan.longintblock2;
|
|
group := BlockMan.longintblock2;
|
|
size := BlockMan.longintblock3;
|
|
|
|
{Clear Arrays}
|
|
for i := 0 to 65535 do
|
|
head[i] := -1;
|
|
|
|
|
|
end;
|
|
|
|
|
|
|
|
|
|
//var
|
|
//head_idx, cur_head: longword;
|
|
//first_char: byte;
|
|
//i, numkeys, first_head: integer; // numkeys: total number of keys with first_char
|
|
//t: longword;
|
|
//totalbytes: integer; // for progress bar
|
|
|
|
begin
|
|
//totalbytes := 0;
|
|
|
|
Initialize;
|
|
RadixSortOnSymbolPairs; // fill up head and link
|
|
InitIndexFromLink; // get index in semi sorted order and index_head
|
|
SadaSort;
|
|
end;
|
|
|
|
{-------------------------------------------------------------------------------
|
|
RadixSortOnSymbolPairs
|
|
----------------------
|
|
|
|
Radix sort: Run through the block array in words to get the buckets and dump
|
|
the indexes into their respective bucket.
|
|
|
|
Initializes long_block with each long integer straddling 4 bytes.
|
|
|
|
OUT Assertion:
|
|
head/link are linked lists to the sort.
|
|
long_block is initialized
|
|
-------------------------------------------------------------------------------}
|
|
procedure TFastSorter.RadixSortOnSymbolPairs;
|
|
var
|
|
i: integer;
|
|
w: word;
|
|
begin
|
|
{Init w with the first character}
|
|
w := block^[0];
|
|
|
|
for i := 0 to last_index-1 do
|
|
begin
|
|
w := word(w shl 8) or block^[i+1];
|
|
|
|
{if there is no entry in head then set the pos as the head.
|
|
Otherwise link the pos in by making it the head and setting its link}
|
|
if (head^[w] = -1) then
|
|
begin
|
|
head^[w] := i;
|
|
link^[i] := -1;
|
|
{Set link^[i] to -1 as the terminator}
|
|
end
|
|
else
|
|
begin
|
|
link^[i] := head^[w];
|
|
head^[w] := i;
|
|
end;
|
|
end; {for}
|
|
|
|
|
|
end;
|
|
|
|
|
|
|
|
{-------------------------------------------------------------------------------
|
|
InitIndexFromLink
|
|
-----------------
|
|
|
|
Out Assertion:
|
|
Inits index, index_head and link_count.
|
|
index_head will point to the head of each symbol pair in index.
|
|
link_count is the count for each symbol pair corresponding in head.
|
|
index will contain a continuous list of indexes. These indexes are in groups
|
|
with their head pointed to by index_head and counts in link_count.
|
|
Quicksort will sort the index.
|
|
head no more used.
|
|
|
|
Desc:
|
|
This will run through the head array.
|
|
It will fill in the index_head with all valid entries from head.
|
|
It is therefore possible that index_head be smaller than head, because all
|
|
-1 entries are removed.
|
|
|
|
The current index position is then filled with the head value.
|
|
If there is a head, there may be links. So the links are filled in trailing
|
|
after the head value until a -1 terminator is reached.
|
|
|
|
Note:
|
|
link_count includes the head node and all other link nodes.
|
|
link_count corresponds to the new def. of head, NOT the old one.
|
|
link_count[i] is the count for index_head[i].
|
|
|
|
All -1 or 'no entries' in index_head have been removed. index_head is a continuous list
|
|
of heads in index.
|
|
The end of index_head is marked by a -1.
|
|
|
|
New:
|
|
use link and head to init index, index_head, link_count, size
|
|
index_pos walks through to fill in index with the semi sorted indexes.
|
|
after this, link and head are no more used.
|
|
link and group share the same memory location
|
|
-------------------------------------------------------------------------------}
|
|
|
|
procedure TFastSorter.InitIndexFromLink;
|
|
var
|
|
i, index_pos, {head_pos,} cur_node, t: longint;
|
|
|
|
group_num: integer;
|
|
//group_first_index: integer;
|
|
group_size: integer;
|
|
|
|
w: word;
|
|
begin
|
|
index_pos := 1; // start from 1 for virtual smallest character. for circular start from 0
|
|
//head_pos := 0;
|
|
|
|
// due to the last char being the smallest char, we must fill in manually
|
|
// link for that one.
|
|
// if actual last is 'e', then we get 'e$00' and we add to the head.
|
|
w := word(block[last_index] shl 8);
|
|
{if there is no entry in head then set the pos as the head.
|
|
Otherwise link the pos in by making it the head and setting its link}
|
|
if (head^[w] = -1) then
|
|
begin
|
|
head^[w] := last_index;
|
|
link^[last_index] := -1;
|
|
// Set link[i] to -1 as the terminator
|
|
end
|
|
else
|
|
begin
|
|
link^[last_index] := head^[w];
|
|
head^[w] := last_index;
|
|
end;
|
|
|
|
|
|
|
|
{go through each radix bucket}
|
|
for i := 0 to 65535 do
|
|
begin
|
|
cur_node := head^[i];
|
|
|
|
if (i = w) then
|
|
begin
|
|
// the link with the virtual smallest char is the first one
|
|
// we give it it's own group number, remove it from the linked list
|
|
// and continue as if this never happened
|
|
// cur_node is the index
|
|
// index_pos is the group number
|
|
Assert(cur_node = last_index);
|
|
index[index_pos] := cur_node;
|
|
size[index_pos] := 1;
|
|
// link and group share the same memory location. update cur_node then
|
|
// assign the group number because we'll never access that link again.
|
|
cur_node := link[cur_node]; // take out the memory contents
|
|
group[last_index] := index_pos; // override it
|
|
inc(index_pos);
|
|
end;
|
|
|
|
|
|
if (cur_node <> -1) then
|
|
begin
|
|
{Head now points to the head of a symbol pair linked list in index}
|
|
//index_head^[head_pos] := index_pos;
|
|
//link_count^[head_pos] := 0;
|
|
|
|
// walk the linked list
|
|
group_num := index_pos; // group_num is i
|
|
//group_first_index := cur_node;
|
|
group_size := 0;
|
|
|
|
repeat
|
|
// collate the nodes in index
|
|
index[index_pos] := cur_node;
|
|
t := cur_node; // save the cur_node
|
|
cur_node := link[cur_node];
|
|
// fill in the group number for index_pos
|
|
// override previous memory location in link with the group_num
|
|
group[t] := group_num; // group[index[index[pos]] or V[I[i]]
|
|
|
|
inc(index_pos);
|
|
inc(group_size); // inc(link_count[head_pos]);
|
|
until (cur_node = -1);
|
|
|
|
// fill in the group size in size[group_num]
|
|
size[group_num] := group_size;
|
|
//size[group_num] := link_count[head_pos];
|
|
|
|
//inc(head_pos);
|
|
end;
|
|
end;
|
|
|
|
//index_head^[head_pos] := -1;
|
|
|
|
|
|
// init the virtual smallest character
|
|
block[block_length] := 0;
|
|
index[0] := block_length;
|
|
size[0] := -1; // sorted, 1 char only
|
|
group[index[0]] := 0; // first group}
|
|
end;
|
|
|
|
{Notes:
|
|
group and index init from 1 to block_size
|
|
0 is the virtual smallest char. compare with index[0]=block_size should
|
|
be greater. note that index[0] may not contain block_size}
|
|
procedure TFastSorter.SortGroup(const stlo, sthi, depth: integer);
|
|
|
|
{Swap - swaps 2 values v1 and v2 }
|
|
procedure Swap(var v1, v2: longword); overload;
|
|
var
|
|
t: longword;
|
|
begin
|
|
t := v1;
|
|
v1 := v2;
|
|
v2 := t;
|
|
end;
|
|
|
|
{Swap - swaps 2 values v1 and v2 }
|
|
procedure Swap(var v1, v2: longint); overload;
|
|
var
|
|
t: longword;
|
|
begin
|
|
t := v1;
|
|
v1 := v2;
|
|
v2 := t;
|
|
end;
|
|
|
|
{Vector swap}
|
|
procedure VecSwap(p1, p2, n: longword);
|
|
{var
|
|
t: longword;}
|
|
begin
|
|
while (n > 0) do
|
|
begin
|
|
{Swap p1, p2}
|
|
{t := p1;
|
|
p1 := p2;
|
|
p2 := t;}
|
|
Swap(index[p1], index[p2]);
|
|
|
|
inc(p1); inc(p2); dec(n);
|
|
end;
|
|
end;
|
|
|
|
{Median of 3}
|
|
function Med3(a, b, c: byte): byte; overload;
|
|
var
|
|
t: byte;
|
|
begin
|
|
if (a > b) then
|
|
begin
|
|
{Swap a, b}
|
|
t := a; a := b; b := t;
|
|
end;
|
|
if (b > c) then
|
|
begin
|
|
{Swap b, c}
|
|
t := b;
|
|
b := c;
|
|
c := t;
|
|
end;
|
|
if (a > b) then b := a;
|
|
result := b;
|
|
end;
|
|
|
|
function Min(a, b: integer): integer;
|
|
begin
|
|
if (a < b) then
|
|
result := a
|
|
else
|
|
result := b;
|
|
end;
|
|
|
|
function Med3(a, b, c: longword): longword; overload;
|
|
var
|
|
t: longword;
|
|
begin
|
|
if (a > b) then
|
|
begin
|
|
{Swap a, b}
|
|
t := a; a := b; b := t;
|
|
end;
|
|
if (b > c) then
|
|
begin
|
|
{Swap b, c}
|
|
t := b;
|
|
b := c;
|
|
c := t;
|
|
end;
|
|
if (a > b) then b := a;
|
|
result := b;
|
|
end;
|
|
|
|
function Med3(a, b, c: integer): integer; overload;
|
|
var
|
|
t: integer;
|
|
begin
|
|
if (a > b) then
|
|
begin
|
|
{Swap a, b}
|
|
t := a; a := b; b := t;
|
|
end;
|
|
if (b > c) then
|
|
begin
|
|
{Swap b, c}
|
|
t := b;
|
|
b := c;
|
|
c := t;
|
|
end;
|
|
if (a > b) then b := a;
|
|
result := b;
|
|
end;
|
|
|
|
{function NormIdx(idx: integer): integer;
|
|
begin
|
|
repeat
|
|
if (idx > last_index) then
|
|
dec(idx, last_index)
|
|
else
|
|
begin
|
|
result := idx;
|
|
exit;
|
|
end;
|
|
until false;
|
|
end;}
|
|
|
|
procedure QSort3(lo, hi: integer);
|
|
{lo, hi: first and last element
|
|
Note: we will compare group numbers
|
|
the depth of comparison is constant througout the recursion}
|
|
var
|
|
a, b, c, d: integer; // may become negative?
|
|
r: integer;
|
|
med: integer; // byte
|
|
i, group_num: integer;
|
|
begin
|
|
if (hi-lo < 1) then
|
|
begin
|
|
// 1 item only. assign it a group
|
|
if (hi = lo) then
|
|
begin
|
|
group[index[hi]] := hi;
|
|
size[hi] := 1;
|
|
end;
|
|
exit;
|
|
end;
|
|
|
|
med := Med3(group[index[lo] + depth],
|
|
group[index[hi] + depth],
|
|
group[index[(lo + hi) shr 1] + depth]);
|
|
|
|
|
|
a := lo;
|
|
b := lo;
|
|
c := hi;
|
|
d := hi;
|
|
|
|
while true do
|
|
begin
|
|
|
|
{ = < }
|
|
{ find item greater than med, while swapping equal items to the left }
|
|
while (b <= c) and (group[index[b] + depth] <= med) do
|
|
begin
|
|
if (group[index[b] + depth] = med) then
|
|
begin
|
|
Swap(index[a], index[b]);
|
|
inc(a);
|
|
end;
|
|
inc(b);
|
|
end;
|
|
|
|
{ > = }
|
|
{ find item smaller than med, while swapping equal items to the right }
|
|
while (b <= c) and (group[index[c] + depth] >= med) do
|
|
begin
|
|
if (group[index[c] + depth] = med) then
|
|
begin
|
|
Swap(index[c], index[d]);
|
|
dec(d);
|
|
end;
|
|
dec(c);
|
|
end;
|
|
|
|
if (b > c) then break;
|
|
|
|
// swap b and c
|
|
Swap(index[b], index[c]);
|
|
inc(b);
|
|
dec(c);
|
|
end;
|
|
|
|
{b = c+1 once we are out}
|
|
Assert(b = c+1);
|
|
//if b <> (c+1) then ShowMessage('bc');
|
|
|
|
{final arrangment:
|
|
lo a c b d hi
|
|
d is next avail pos. d+1 to hi: = items
|
|
a is next avail pos. lo to a-1: = items}
|
|
|
|
{left centre right}
|
|
|
|
{swap enough to get left from '= <' to '< ='
|
|
a-lo: num of = items
|
|
b-a: num of < items
|
|
r gives the min items to swap}
|
|
r := min(a-lo, b-a);
|
|
VecSwap(lo, b-r, r);
|
|
|
|
{swap enough to get right from '> =' to '= >'
|
|
d-c: num of > items
|
|
hi-d: num of = items}
|
|
r := min(d-c, hi-d);
|
|
VecSwap(b, hi-r+1, r);
|
|
|
|
// sort from higher to lower
|
|
// for equal items update their group numbers to the same group
|
|
|
|
r := d-c; // num of '>' items
|
|
QSort3(hi-r+1, hi); // sort right
|
|
|
|
r := (a-lo) + (hi-d);
|
|
{QSort3(lo+b-a, lo+b-a+r-1); // sort middle}
|
|
group_num := lo+b-a;
|
|
for i := lo+b-a to lo+b-a+r-1 do // give the '=' items the same group number
|
|
group[index[i]] := group_num;
|
|
size[group_num] := r;
|
|
|
|
r := b-a; // size of '<' items
|
|
QSort3(lo, lo + r - 1); // sort left
|
|
|
|
end; {QSort3}
|
|
|
|
begin
|
|
QSort3(stlo, sthi);
|
|
end;
|
|
|
|
procedure TFastSorter.SadaSort;
|
|
var
|
|
i, k: integer;
|
|
first_i: integer;
|
|
group_size: integer;
|
|
|
|
begin
|
|
// sort unsorted groups
|
|
// go through the size array. anything with size 1 we ignore and add to the
|
|
// previous group size
|
|
// if first_i = -1 that means first_i not avail and next sorted group can
|
|
// be first_i
|
|
|
|
// blocksize has increased by 1 because of the vitual char
|
|
inc(block_length);
|
|
|
|
// keep sorting until all has been sorted
|
|
k := 2;
|
|
while (abs(size[0]) < (block_length-1)) do
|
|
begin
|
|
i := 0;//i := abs(size[0]);
|
|
first_i := -1;
|
|
|
|
repeat
|
|
|
|
if (size[i] < 0) then
|
|
begin
|
|
if (first_i = -1) then
|
|
begin
|
|
first_i := i; // we can add further sorted groups to this group
|
|
inc(i, abs(size[i])); // skip this group
|
|
end
|
|
else
|
|
begin
|
|
Assert(size[first_i] < 0);
|
|
inc(size[first_i], size[i]); // add to the first_i
|
|
inc(i, abs(size[i])); // skip, because it is sorted and group has been combined
|
|
end;
|
|
end
|
|
else if (size[i] = 1) then
|
|
begin
|
|
if (first_i = -1) then
|
|
begin
|
|
first_i := i; // we can add further sorted groups to this group
|
|
size[first_i] := -1; // make this the head sorted group
|
|
end
|
|
else
|
|
begin
|
|
Assert(size[first_i] < 0);
|
|
dec(size[first_i]); // add this group to the first_i
|
|
end;
|
|
inc(i);
|
|
end
|
|
else
|
|
begin
|
|
// group size > 1 sort it
|
|
group_size := size[i];
|
|
SortGroup(i, i + size[i]-1, k);
|
|
|
|
inc(i, group_size); // size[i] may change after sort group
|
|
first_i := -1;
|
|
end;
|
|
until (i >= block_length); // while (i < block_length-1)
|
|
|
|
k := k * 2;
|
|
end;
|
|
|
|
end;
|
|
|
|
|
|
end.
|
|
|