unit FSortUnit; {------------------------------------------------------------------------------- Fast sorter unit ---------------- reSource v2.6 Copyright (C) 1998-2001 Victor Kasenda / gruv http://go.to/gruv email: vickas@singnet.com.sg Fast sort unit. Algos: DJ Wheeler from his June 1989 report and Kunihiko Sadakane's Suffix sort. coded by gruv Notes: Sort the index, not the block. SadaSort compares group indexes not block. Sort rev 4: Radix on symbol pairs. Sadakane's Suffix sort. ------------------------------------------------------------------------------} (**) interface (**) uses SysUtils, Forms, dialogs, StructsUnit; const STRIDE = 4; MAXDEPTH = 20; NUMOVERSHOOT = MAXDEPTH + 100; type {THead = array[0..65535] of Longint; PHead = ^THead;} TFastSorter = class private block: PBlock; // block to sort index: PLongintBlock; // index to the block to sort. each index pos is a string block_length: longint; // length of the block last_index: integer; head: P64kBlock; // head of the linked list link: PLongintBlock; // links in the linked list //link_count: PHead; // Number of links in each head //index_head: PHead; // start of each group in index group: PLongintBlock; // group of suffix s size: PLongintBlock; {For SadaSort: from the paper I -> index V -> group S -> size} procedure RadixSortOnSymbolPairs; procedure InitIndexFromLink; procedure SadaSort; procedure SortGroup(const stlo, sthi, depth: integer); public constructor Create; destructor Destroy; override; procedure SortBlock(const _block: PBlock; const _index: PLongintBlock; const _block_length: longint); end; (**) implementation (**) uses ErrorUnit; {------------------------------------------------------------------------------- Create/Destroy -------------- Allocates and frees the memory structures used for sorting. -------------------------------------------------------------------------------} constructor TFastSorter.Create; begin inherited Create; {New(head); New(link); //New(link_count); //New(index_head); New(group); New(size);} end; destructor TFastSorter.Destroy; begin {Dispose(size); Dispose(group); //Dispose(index_head); //Dispose(link_count); Dispose(link); Dispose(head);} inherited Destroy; end; {------------------------------------------------------------------------------- SortBlock --------- Main procedure to call. Initializes the block then calls the respective procedures to sort the block. -------------------------------------------------------------------------------} procedure TFastSorter.SortBlock(const _block: PBlock; const _index: PLongintBlock; const _block_length: longint); procedure Initialize; var i: integer; begin {Initialize} block := _block; index := _index; block_length := _block_length; last_index := block_length-1; // sizes array need not be cleared. it will be init. // assign block memory // index -> longintblock1 head := BlockMan.k64Block; link := BlockMan.longintblock2; group := BlockMan.longintblock2; size := BlockMan.longintblock3; {Clear Arrays} for i := 0 to 65535 do head[i] := -1; end; //var //head_idx, cur_head: longword; //first_char: byte; //i, numkeys, first_head: integer; // numkeys: total number of keys with first_char //t: longword; //totalbytes: integer; // for progress bar begin //totalbytes := 0; Initialize; RadixSortOnSymbolPairs; // fill up head and link InitIndexFromLink; // get index in semi sorted order and index_head SadaSort; end; {------------------------------------------------------------------------------- RadixSortOnSymbolPairs ---------------------- Radix sort: Run through the block array in words to get the buckets and dump the indexes into their respective bucket. Initializes long_block with each long integer straddling 4 bytes. OUT Assertion: head/link are linked lists to the sort. long_block is initialized -------------------------------------------------------------------------------} procedure TFastSorter.RadixSortOnSymbolPairs; var i: integer; w: word; begin {Init w with the first character} w := block^[0]; for i := 0 to last_index-1 do begin w := word(w shl 8) or block^[i+1]; {if there is no entry in head then set the pos as the head. Otherwise link the pos in by making it the head and setting its link} if (head^[w] = -1) then begin head^[w] := i; link^[i] := -1; {Set link^[i] to -1 as the terminator} end else begin link^[i] := head^[w]; head^[w] := i; end; end; {for} end; {------------------------------------------------------------------------------- InitIndexFromLink ----------------- Out Assertion: Inits index, index_head and link_count. index_head will point to the head of each symbol pair in index. link_count is the count for each symbol pair corresponding in head. index will contain a continuous list of indexes. These indexes are in groups with their head pointed to by index_head and counts in link_count. Quicksort will sort the index. head no more used. Desc: This will run through the head array. It will fill in the index_head with all valid entries from head. It is therefore possible that index_head be smaller than head, because all -1 entries are removed. The current index position is then filled with the head value. If there is a head, there may be links. So the links are filled in trailing after the head value until a -1 terminator is reached. Note: link_count includes the head node and all other link nodes. link_count corresponds to the new def. of head, NOT the old one. link_count[i] is the count for index_head[i]. All -1 or 'no entries' in index_head have been removed. index_head is a continuous list of heads in index. The end of index_head is marked by a -1. New: use link and head to init index, index_head, link_count, size index_pos walks through to fill in index with the semi sorted indexes. after this, link and head are no more used. link and group share the same memory location -------------------------------------------------------------------------------} procedure TFastSorter.InitIndexFromLink; var i, index_pos, {head_pos,} cur_node, t: longint; group_num: integer; //group_first_index: integer; group_size: integer; w: word; begin index_pos := 1; // start from 1 for virtual smallest character. for circular start from 0 //head_pos := 0; // due to the last char being the smallest char, we must fill in manually // link for that one. // if actual last is 'e', then we get 'e$00' and we add to the head. w := word(block[last_index] shl 8); {if there is no entry in head then set the pos as the head. Otherwise link the pos in by making it the head and setting its link} if (head^[w] = -1) then begin head^[w] := last_index; link^[last_index] := -1; // Set link[i] to -1 as the terminator end else begin link^[last_index] := head^[w]; head^[w] := last_index; end; {go through each radix bucket} for i := 0 to 65535 do begin cur_node := head^[i]; if (i = w) then begin // the link with the virtual smallest char is the first one // we give it it's own group number, remove it from the linked list // and continue as if this never happened // cur_node is the index // index_pos is the group number Assert(cur_node = last_index); index[index_pos] := cur_node; size[index_pos] := 1; // link and group share the same memory location. update cur_node then // assign the group number because we'll never access that link again. cur_node := link[cur_node]; // take out the memory contents group[last_index] := index_pos; // override it inc(index_pos); end; if (cur_node <> -1) then begin {Head now points to the head of a symbol pair linked list in index} //index_head^[head_pos] := index_pos; //link_count^[head_pos] := 0; // walk the linked list group_num := index_pos; // group_num is i //group_first_index := cur_node; group_size := 0; repeat // collate the nodes in index index[index_pos] := cur_node; t := cur_node; // save the cur_node cur_node := link[cur_node]; // fill in the group number for index_pos // override previous memory location in link with the group_num group[t] := group_num; // group[index[index[pos]] or V[I[i]] inc(index_pos); inc(group_size); // inc(link_count[head_pos]); until (cur_node = -1); // fill in the group size in size[group_num] size[group_num] := group_size; //size[group_num] := link_count[head_pos]; //inc(head_pos); end; end; //index_head^[head_pos] := -1; // init the virtual smallest character block[block_length] := 0; index[0] := block_length; size[0] := -1; // sorted, 1 char only group[index[0]] := 0; // first group} end; {Notes: group and index init from 1 to block_size 0 is the virtual smallest char. compare with index[0]=block_size should be greater. note that index[0] may not contain block_size} procedure TFastSorter.SortGroup(const stlo, sthi, depth: integer); {Swap - swaps 2 values v1 and v2 } procedure Swap(var v1, v2: longword); overload; var t: longword; begin t := v1; v1 := v2; v2 := t; end; {Swap - swaps 2 values v1 and v2 } procedure Swap(var v1, v2: longint); overload; var t: longword; begin t := v1; v1 := v2; v2 := t; end; {Vector swap} procedure VecSwap(p1, p2, n: longword); {var t: longword;} begin while (n > 0) do begin {Swap p1, p2} {t := p1; p1 := p2; p2 := t;} Swap(index[p1], index[p2]); inc(p1); inc(p2); dec(n); end; end; {Median of 3} function Med3(a, b, c: byte): byte; overload; var t: byte; begin if (a > b) then begin {Swap a, b} t := a; a := b; b := t; end; if (b > c) then begin {Swap b, c} t := b; b := c; c := t; end; if (a > b) then b := a; result := b; end; function Min(a, b: integer): integer; begin if (a < b) then result := a else result := b; end; function Med3(a, b, c: longword): longword; overload; var t: longword; begin if (a > b) then begin {Swap a, b} t := a; a := b; b := t; end; if (b > c) then begin {Swap b, c} t := b; b := c; c := t; end; if (a > b) then b := a; result := b; end; function Med3(a, b, c: integer): integer; overload; var t: integer; begin if (a > b) then begin {Swap a, b} t := a; a := b; b := t; end; if (b > c) then begin {Swap b, c} t := b; b := c; c := t; end; if (a > b) then b := a; result := b; end; {function NormIdx(idx: integer): integer; begin repeat if (idx > last_index) then dec(idx, last_index) else begin result := idx; exit; end; until false; end;} procedure QSort3(lo, hi: integer); {lo, hi: first and last element Note: we will compare group numbers the depth of comparison is constant througout the recursion} var a, b, c, d: integer; // may become negative? r: integer; med: integer; // byte i, group_num: integer; begin if (hi-lo < 1) then begin // 1 item only. assign it a group if (hi = lo) then begin group[index[hi]] := hi; size[hi] := 1; end; exit; end; med := Med3(group[index[lo] + depth], group[index[hi] + depth], group[index[(lo + hi) shr 1] + depth]); a := lo; b := lo; c := hi; d := hi; while true do begin { = < } { find item greater than med, while swapping equal items to the left } while (b <= c) and (group[index[b] + depth] <= med) do begin if (group[index[b] + depth] = med) then begin Swap(index[a], index[b]); inc(a); end; inc(b); end; { > = } { find item smaller than med, while swapping equal items to the right } while (b <= c) and (group[index[c] + depth] >= med) do begin if (group[index[c] + depth] = med) then begin Swap(index[c], index[d]); dec(d); end; dec(c); end; if (b > c) then break; // swap b and c Swap(index[b], index[c]); inc(b); dec(c); end; {b = c+1 once we are out} Assert(b = c+1); //if b <> (c+1) then ShowMessage('bc'); {final arrangment: lo a c b d hi d is next avail pos. d+1 to hi: = items a is next avail pos. lo to a-1: = items} {left centre right} {swap enough to get left from '= <' to '< =' a-lo: num of = items b-a: num of < items r gives the min items to swap} r := min(a-lo, b-a); VecSwap(lo, b-r, r); {swap enough to get right from '> =' to '= >' d-c: num of > items hi-d: num of = items} r := min(d-c, hi-d); VecSwap(b, hi-r+1, r); // sort from higher to lower // for equal items update their group numbers to the same group r := d-c; // num of '>' items QSort3(hi-r+1, hi); // sort right r := (a-lo) + (hi-d); {QSort3(lo+b-a, lo+b-a+r-1); // sort middle} group_num := lo+b-a; for i := lo+b-a to lo+b-a+r-1 do // give the '=' items the same group number group[index[i]] := group_num; size[group_num] := r; r := b-a; // size of '<' items QSort3(lo, lo + r - 1); // sort left end; {QSort3} begin QSort3(stlo, sthi); end; procedure TFastSorter.SadaSort; var i, k: integer; first_i: integer; group_size: integer; begin // sort unsorted groups // go through the size array. anything with size 1 we ignore and add to the // previous group size // if first_i = -1 that means first_i not avail and next sorted group can // be first_i // blocksize has increased by 1 because of the vitual char inc(block_length); // keep sorting until all has been sorted k := 2; while (abs(size[0]) < (block_length-1)) do begin i := 0;//i := abs(size[0]); first_i := -1; repeat if (size[i] < 0) then begin if (first_i = -1) then begin first_i := i; // we can add further sorted groups to this group inc(i, abs(size[i])); // skip this group end else begin Assert(size[first_i] < 0); inc(size[first_i], size[i]); // add to the first_i inc(i, abs(size[i])); // skip, because it is sorted and group has been combined end; end else if (size[i] = 1) then begin if (first_i = -1) then begin first_i := i; // we can add further sorted groups to this group size[first_i] := -1; // make this the head sorted group end else begin Assert(size[first_i] < 0); dec(size[first_i]); // add this group to the first_i end; inc(i); end else begin // group size > 1 sort it group_size := size[i]; SortGroup(i, i + size[i]-1, k); inc(i, group_size); // size[i] may change after sort group first_i := -1; end; until (i >= block_length); // while (i < block_length-1) k := k * 2; end; end; end.