From 060df74e674db97ec46299a6b91d8f3b8fff00aa Mon Sep 17 00:00:00 2001 From: Ricardo Markiewicz Date: Sun, 20 Jun 2004 23:50:08 +0000 Subject: [PATCH] Agrego un paper sobre tecnicas para preprocesar antes del BS. --- doc/BWT_paper.rtf | 278 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 278 insertions(+) create mode 100644 doc/BWT_paper.rtf diff --git a/doc/BWT_paper.rtf b/doc/BWT_paper.rtf new file mode 100644 index 0000000..697cd3a --- /dev/null +++ b/doc/BWT_paper.rtf @@ -0,0 +1,278 @@ +{\rtf1\ansi \deff4\deflang1033{\fonttbl{\f1\froman\fcharset2\fprq2 Symbol;}{\f3\froman\fcharset238\fprq1 Courier;}{\f4\froman\fcharset238\fprq2 Times New Roman CE;}{\f5\fswiss\fcharset238\fprq2 Arial CE;}} +{\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128;\red0\green128\blue128;\red0\green128\blue0; +\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;}{\stylesheet{\widctlpar \f4\fs20\lang1045 \snext0 Normal;}{\s1\keepn\widctlpar{\*\pn \pnlvl1\pndec\pnprev1\pnstart1\pnindent284 +{\pntxta .}}\b\f4\fs20\lang1024\kerning28 \sbasedon0\snext17 heading 1;}{\s2\fi-708\li992\sb240\sa60\keepn\widctlpar{\*\pn \pnlvl2\pndec\pnprev1\pnstart1\pnindent708\pnhang{\pntxta .}}\b\i\f5\lang1045 \sbasedon0\snext0 heading 2;}{ +\s3\fi-708\li1700\sb240\sa60\keepn\widctlpar{\*\pn \pnlvl3\pndec\pnprev1\pnstart1\pnindent708\pnhang{\pntxta .}}\f5\lang1045 \sbasedon0\snext0 heading 3;}{\s4\fi-708\li2408\sb240\sa60\keepn\widctlpar{\*\pn \pnlvl4\pndec\pnprev1\pnstart1\pnindent708\pnhang +{\pntxta .}}\b\f5\lang1045 \sbasedon0\snext0 heading 4;}{\s5\fi-708\li3116\sb240\sa60\widctlpar{\*\pn \pnlvl5\pndec\pnprev1\pnstart1\pnindent708\pnhang{\pntxta .}}\f5\fs22\lang1045 \sbasedon0\snext0 heading 5;}{ +\s6\fi-708\li3824\sb240\sa60\widctlpar{\*\pn \pnlvl6\pndec\pnprev1\pnstart1\pnindent708\pnhang{\pntxta .}}\i\f4\fs22\lang1045 \sbasedon0\snext0 heading 6;}{\s7\fi-708\li4532\sb240\sa60\widctlpar{\*\pn \pnlvl7\pndec\pnprev1\pnstart1\pnindent708\pnhang +{\pntxta .}}\f5\fs20\lang1045 \sbasedon0\snext0 heading 7;}{\s8\fi-708\li5240\sb240\sa60\widctlpar{\*\pn \pnlvl8\pndec\pnprev1\pnstart1\pnindent708\pnhang{\pntxta .}}\i\f5\fs20\lang1045 \sbasedon0\snext0 heading 8;}{ +\s9\fi-708\li5948\sb240\sa60\widctlpar{\*\pn \pnlvl9\pndec\pnprev1\pnstart1\pnindent708\pnhang{\pntxta .}}\b\i\f5\fs18\lang1045 \sbasedon0\snext0 heading 9;}{\*\cs10 \additive Default Paragraph Font;}{\s15\widctlpar\tqc\tx4536\tqr\tx9072 +\f4\fs20\lang1045 \sbasedon0\snext15 header;}{\s16\widctlpar\tqc\tx4536\tqr\tx9072 \f4\fs20\lang1045 \sbasedon0\snext16 footer;}{\s17\qj\fi284\widctlpar \f4\fs20\lang1045 \sbasedon0\snext17 Styl1;}}{\info{\title Introduction}{\author Szymon Grabowski} +{\operator Szymon Grabowski}{\creatim\yr1999\mo9\dy23\hr19\min24}{\revtim\yr2000\mo9\dy2\hr11\min47}{\version2}{\edmins1}{\nofpages11}{\nofwords3348}{\nofchars19087}{\*\company P\'a3}{\vern57443}} +\paperw11907\paperh16840\margl2438\margr2381\margt3345\margb3345 \facingp\deftab284\widowctrl\ftnbj\aenddoc\hyphhotz425\hyphcaps0\formshade \fet0\sectd \linex0\headery3345\footery709\colsx709\endnhere {\headerl \pard\plain \s15\widctlpar\tx284\tqr\tx9072 +\f4\fs20\lang1045 {\i \tab Szymon Grabowski +\par \tab Text Preprocessing for Burrows-Wheeler Block Sorting Compression +\par }\pard \s15\widctlpar\tqc\tx4536\tqr\tx9072 {\i ________________________________________________________________}_ +\par +\par +\par }{\headerr \pard\plain \s15\qc\widctlpar\tqc\tx4536\tqr\tx9072 \f4\fs20\lang1045 {\i VII Konferencja \ldblquote Sieci i Systemy Informatyczne \endash teoria, projekty, wdro\'bfenia\rdblquote +\par \'a3\'f3d\'9f, pa\'9fdziernik 1999 +\par _________________________________________________________________ +\par }\pard \s15\widctlpar\tqc\tx4536\tqr\tx9072 {\i +\par }}{\*\pnseclvl1\pnucrm\pnstart1\pnindent720\pnhang{\pntxta .}}{\*\pnseclvl2\pnucltr\pnstart1\pnindent720\pnhang{\pntxta .}}{\*\pnseclvl3\pndec\pnstart1\pnindent720\pnhang{\pntxta .}}{\*\pnseclvl4\pnlcltr\pnstart1\pnindent720\pnhang{\pntxta )}} +{\*\pnseclvl5\pndec\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}{\*\pnseclvl6\pnlcltr\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}{\*\pnseclvl7\pnlcrm\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}{\*\pnseclvl8 +\pnlcltr\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}{\*\pnseclvl9\pnlcrm\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}\pard\plain \li284\widctlpar \f4\fs20\lang1045 {\caps Szymon Grabowski}\line Katedra Informatyki Stosowanej Politechniki \'a3 +\'f3dzkiej +\par \pard \qc\widctlpar +\par +\par \pard \qc\li567\ri567\widctlpar {\b\caps Text preprocessing \line for Burrows-Wheeler BLOCK SORTING Compression +\par }{\caps +\par +\par +\par +\par }{\b Abstract +\par }\pard \qj\fi284\li568\ri567\widctlpar {\i +Burrows-Wheeler block sorting algorithm is one of the most attractive compression methods. In this paper we present several ideas attempting to improve text compression while retaining the speed. All of them are performed before actual coding in an inde +pendent preprocessing phase which yields flexibility of the coder selection. Experiments with bzip, a well known block sorting implementation, indicate 2-4 per cent compression gains for typical English texts. +\par }\pard \qj\li567\ri567\widctlpar +\par +\par {\pntext\pard\plain\b\fs20\lang1024\kerning28 1.\tab}\pard\plain \s1\keepn\widctlpar{\*\pn \pnlvl1\pndec\pnprev1\pnstart1\pnindent284 {\pntxta .}}\b\f4\fs20\lang1024\kerning28 Introduction +\par \pard\plain \s17\qj\fi284\widctlpar \f4\fs20\lang1045 +The Burrows-Wheeler transform (BWT) compression algorithm is a relatively new method, especially appropriate for text and other data with stable statistics. Although it has been only five years since the method was presented, several outstanding implemen +tations have already been released. In this paper we refer to the canonical post-BWT coding described in [1], however it should be mentioned this is not the only possible way. +\par The BWT transform takes all rotations of a given block of {\i n} + bytes and sorts them lexicographically. The sorting groups similar contexts together and the last byte of each string is the transform output. The reversibility of the transform is shown eg. in\~[1, 3]. It is noteworthy that in most implemen\- +tations the contexts consist of the bytes following the predicted character instead of more traditional preceding bytes. To compress the resulting data, the output of the transform is run through a Move-To-Front encoder and finally submitted to the arith +metic or Huffman coder. +\par Move-To-Front (MTF) coding outputs the rank of each byte in the sequence of its predecessors. Careful study of modelling and coding the output of BWT transform was presented in [3,\~4]. +\par Although conceptually simple, BWT-based compression algorithm (also called a block sorting coder or BS, for short) is not easy to be improved in general terms, that is, without much assumption for input data. Its advantage over more traditional Predictio +n by Partial Matching (PPM) coders lies in seam +less utilization of many similar contexts of different orders, the price is however lack of knowledge of actual contexts. What hurts most in block sorting coding is rapid changes in successive contexts resulting in high and costly MTF ranks. To relief r +apid context changes, contexts with similar probability distributions should be placed close together. Ways to find appropriate alphabet orders for given data were proposed in [2] as well as a hard coded heuristic order chosen for English text was there +presented. We use the heuristic order, in a slightly modified form, as one of the enhancements. +\par In this paper we also present another technique for removing certain rapid context changes introduced by line breaks and punctation marks used in typical text. +\par Converting capitals to lower case is also tested. It results in grouping intrinsically similar but formally different (and thus distant after BWT) contexts. +\par Yet another idea which we examine here is replacing frequent phrases in the given text with single characters outside the range of actually used characters. +\par Finally, we discuss a little the chances of successful coding of End-Of-Line characters. +\par \pard\plain \qj\widctlpar \f4\fs20\lang1045 +\par {\pntext\pard\plain\b\fs20\lang1024\kerning28 2.\tab}\pard\plain \s1\keepn\widctlpar{\*\pn \pnlvl1\pndec\pnprev1\pnstart1\pnindent284 {\pntxta .}}\b\f4\fs20\lang1024\kerning28 Capital conversion +\par \pard\plain \s17\qj\fi284\widctlpar \f4\fs20\lang1045 Capital letters preceding actual contexts usually result in high MTF ranks while their lower case equivalents would very often imply 0\rquote s in MTF ranks. For example, the (following) context +\ldblquote {\i he day...}\rdblquote predicts {\i t}\rquote s as well as {\i T}\rquote s. We found it beneficial to replace such capitals with an equivalent lower case letter preceded by a unique flag +. To improve compression, the conversion is omitted if the next character is not a lower case letter [6]. +\par \pard\plain \qj\fi284\widctlpar \f4\fs20\lang1045 +\par \pard\plain \s17\qj\fi284\widctlpar \f4\fs20\lang1045 Example: +\par \pard\plain \qj\fi284\li568\widctlpar \f4\fs20\lang1045 {\i The Title} {{\field{\*\fldinst SYMBOL 174 \\f "Symbol" \\s 10}{\fldrslt\f1\fs20}}} {\i #the #title} +\par \pard\plain \s17\qj\fi284\widctlpar \f4\fs20\lang1045 but \tab {\i THE TITLE} {{\field{\*\fldinst SYMBOL 174 \\f "Symbol" \\s 10}{\fldrslt\f1\fs20}}} {\i THE TITLE} (# \endash a flag) +\par \pard\plain \fi284\widctlpar \f4\fs20\lang1045 +\par \pard\plain \s17\qj\fi284\widctlpar \f4\fs20\lang1045 Why does it work? Instead of quite a high MTF rank, we usually obtain two lower ranks, with one of them often being zero. +\par Below is an example. Table 1 refers to the original text. The current context is shown in the last column. \lquote _\rquote denotes a blank space. +\par \pard\plain \qj\fi284\widctlpar\tqr\tx5245 \f4\fs20\lang1045 \page \tab {\expnd4\expndtw20 Table 1} +\par \trowd \trqc\trgaph71\trleft-71 \clbrdrt\brdrs\brdrw15 \clbrdrl\brdrs\brdrw15 \clbrdrb\brdrs\brdrw30 \cellx1063\clbrdrt\brdrs\brdrw15 \clbrdrb\brdrs\brdrw30 \cellx2197\clbrdrt\brdrs\brdrw15 \clbrdrb\brdrs\brdrw30 \clbrdrr\brdrs\brdrw15 \cellx3331 \pard +\qc\widctlpar\intbl the current character\rquote s +\par predecessor\cell current \line (predicted) character\cell following\line context\cell \pard \widctlpar\intbl \row \trowd \trqc\trgaph71\trleft-71 \clbrdrl\brdrs\brdrw15 \cellx1063\cellx2197\clbrdrr\brdrs\brdrw15 \cellx3331 \pard +\qc\fi-284\li284\widctlpar\intbl _\cell T\cell \pard \qc\widctlpar\intbl hen h...\cell \pard \widctlpar\intbl \row \pard \qc\widctlpar\intbl _\cell W\cell hen i...\cell \pard \widctlpar\intbl \row \pard \qc\widctlpar\intbl _\cell w\cell hen i...\cell +\pard \widctlpar\intbl \row \pard \qc\widctlpar\intbl _\cell w\cell hen i...\cell \pard \widctlpar\intbl \row \pard \qc\widctlpar\intbl _\cell t\cell hen i...\cell \pard \widctlpar\intbl \row \pard \qc\widctlpar\intbl _\cell t\cell hen s...\cell \pard +\widctlpar\intbl \row \pard \qc\widctlpar\intbl _\cell t\cell hen s...\cell \pard \widctlpar\intbl \row \trowd \trqc\trgaph71\trleft-71 \clbrdrl\brdrs\brdrw15 \clbrdrb\brdrs\brdrw15 \cellx1063\clbrdrb\brdrs\brdrw15 \cellx2197\clbrdrb\brdrs\brdrw15 +\clbrdrr\brdrs\brdrw15 \cellx3331 \pard \qc\widctlpar\intbl _\cell T\cell hen t...\cell \pard \widctlpar\intbl \row \pard \fi284\widctlpar +\par \pard\plain \s17\qj\fi284\widctlpar \f4\fs20\lang1045 The MTF rank for \lquote T\rquote in the last row is 3. +\par \pard\plain \fi284\widctlpar \f4\fs20\lang1045 +\par \pard\plain \s17\qj\fi284\widctlpar \f4\fs20\lang1045 After the conversion instead of \ldblquote {\i Then t...}\rdblquote , we obtain \ldblquote {\i #then t...}\rdblquote , \lquote {\i #}\rquote \-being a flag (see Table 2). +\par \pard\plain \fi284\widctlpar \f4\fs20\lang1045 +\par \pard \fi284\widctlpar\tqr\tx5812 \tab {\expnd4\expndtw20 Table 2} +\par \trowd \trqc\trgaph71\trleft-71 \clbrdrt\brdrs\brdrw15 \clbrdrl\brdrs\brdrw15 \clbrdrb\brdrs\brdrw30 \cellx1063\clbrdrt\brdrs\brdrw15 \clbrdrb\brdrs\brdrw30 \clbrdrr\brdrdb\brdrw30 \cellx2197\clbrdrt\brdrs\brdrw15 \clbrdrb\brdrs\brdrw30 \cellx3331\clbrdrt +\brdrs\brdrw15 \clbrdrb\brdrs\brdrw30 \clbrdrr\brdrs\brdrw15 \cellx4465 \pard \qc\widctlpar\intbl current character\cell context\cell current character\cell context\cell \pard \widctlpar\intbl \row \trowd \trqc\trgaph71\trleft-71 \clbrdrl\brdrs\brdrw15 +\cellx1063\clbrdrr\brdrdb\brdrw30 \cellx2197\cellx3331\clbrdrr\brdrs\brdrw15 \cellx4465 \pard \qc\fi-284\li284\widctlpar\intbl t\cell \pard \qc\widctlpar\intbl hen h...\cell \pard \qc\fi-284\li284\widctlpar\intbl #\cell \pard \qc\widctlpar\intbl then h... +\cell \pard \widctlpar\intbl \row \pard \qc\widctlpar\intbl t\cell hen i...\cell _\cell then i...\cell \pard \widctlpar\intbl \row \pard \qc\widctlpar\intbl t\cell hen s...\cell _\cell then s...\cell \pard \widctlpar\intbl \row \pard \qc\widctlpar\intbl t +\cell hen s...\cell _\cell then s...\cell \pard \widctlpar\intbl \row \trowd \trqc\trgaph71\trleft-71 \clbrdrl\brdrs\brdrw15 \clbrdrb\brdrs\brdrw15 \cellx1063\clbrdrb\brdrs\brdrw15 \clbrdrr\brdrdb\brdrw30 \cellx2197\clbrdrb\brdrs\brdrw15 \cellx3331 +\clbrdrb\brdrs\brdrw15 \clbrdrr\brdrs\brdrw15 \cellx4465 \pard \qc\widctlpar\intbl t\cell hen t...\cell #\cell then t...\cell \pard \widctlpar\intbl \row \pard \fi284\widctlpar +\par \pard\plain \s17\qj\fi284\widctlpar \f4\fs20\lang1045 The resulting MTF ranks are 0 (for \lquote t\rquote on the left) and 1 (for \lquote #\rquote + on the right). According to Fenwick [3], MTF codes of 0, 1 and 3 for English texts cost generally about 0.5, 3 and almost 5\~bit/symbol, respectively. It is clear now we can expect gain in the presented case. +\par We found that following the flag with a blank space in the described idea [9] is even slightly better on the overall, the improvement is however inconsistent, for example this is better on book1 and +book2 but worse on bib (all files from Calgary Corpus). We decided to use the extended idea for the tests in Section 7. +\par To the best of our knowledge, the described capital conversion idea has never been presented before. +\par \pard\plain \qj\widctlpar \f4\fs20\lang1045 +\par {\pntext\pard\plain\b\fs20\lang1024\kerning28 3.\tab}\pard\plain \s1\keepn\widctlpar{\*\pn \pnlvl1\pndec\pnprev1\pnstart1\pnindent284 {\pntxta .}}\b\f4\fs20\lang1024\kerning28 Space stuffing +\par \pard\plain \s17\qj\fi284\widctlpar \f4\fs20\lang1045 +Most text files have End-Of-Line characters (EOLs) followed by a letter. An EOL character (or the latter of the two EOL characters in CarriageReturn/LineFeed text style) is not predicted very well with its context. To alleviate it, in the preprocessing +p +hase we expand the data by adding a blank space at the beginning of each line. An added blank is predicted very well with the context starting with a full word and the character distributions for contexts starting with a letter are no longer adversely af +fected with the EOL character. In other words, it is better to sacrifice prediction of contexts starting with a blank space for improving prediction of contexts starting with almost any other character. +\par An additional little improvement is to refrain from adding spaces after EOLs if the next character is neither a letter nor a space [5]. It can help eg. on program sources with many lines starting with a tab character. +\par Some texts have almost almost all lines started with one or more spaces (eg. plays, poems). A careful implementation could detect such cases and then refrain from adding spaces after EOLs for the whole file/block. +\par \pard\plain \qj\widctlpar \f4\fs20\lang1045 +\par {\pntext\pard\plain\b\fs20\lang1024\kerning28 4.\tab}\pard\plain \s1\keepn\widctlpar{\*\pn \pnlvl1\pndec\pnprev1\pnstart1\pnindent284 {\pntxta .}}\b\f4\fs20\lang1024\kerning28 Phrase substitution +\par \pard\plain \s17\qj\fi284\widctlpar \f4\fs20\lang1045 Replacing frequent phrases in the original data with new symbols from an augmented alphabet before actual +compression is a known idea [8], we however are not aware of any application of this concept to block sorting compression. Teahan reports also increased memory requirements for PPM after phrase substitution due to expanded alphabet size. This should not + be a problem for BS, since most sorting algorithms for BWT transform require memory resources proportional to the block size, and not depending on its contents as long as a single symbol in the augmented alphabet occupies no more bits than in the origina +l alphabet. Plain ASCII text uses 7 bits only, so the natural choice was not to exceed 256 symbols in the expanded alphabet. Possible collisions (for non-\ldblquote pure ASCII\rdblquote + data) must be handled by data expansion. In a practical implementation for general usage, data type detection should be performed before using phrase replacement. For example, English text might be detected by finding a reasonable amount of the word +\ldblquote the\rdblquote in the given file/block; it is also recommended to estimate the rate of non-ASCII chara +cters in the file/block. If it does not exceed eg. 5 per cent, we could feel confident the phrase replacement will not hurt much. The set of phrases may be static (hard coded) or selected dynamically for given data. From a practical point of view, dyn +amic phrase substitution offers little for its increased complicacy and computational costs. We chose to test static phrase substitution only. Our phrase set was selected for English language, since it is estimated that 80% of all texts on the Internet +are in English [8]. +\par The advantage of phrase substitution in block sorting compression comes from removing certain runs of zeroes in MTF values as well as managing to comprise more actual data in one block. The negative impact is context decorrelation. For example, the stri +ng \ldblquote {\i tha...}\rdblquote might be placed far from the string \ldblquote {\i the...}\rdblquote if \ldblquote {\i the}\rdblquote + is in the set of substituted phrases. This undesirable phenomenon may be mitigated by proper alphabet reordering (see Section 5). +\par It is not easy to give a rule to finding the most appropriate phrases to be replaced. One simple and rather obvious remark is that certain frequent phrases may not really be good candidates for the phrase set. For example, selecting the phrase +\ldblquote {\i ther}\rdblquote (prefix of the word \ldblquote {\i there}\rdblquote ) seems inappropriate while the phrase \ldblquote {\i the}\rdblquote is to be selected. +\par Our set was found experimentally. We limited the set to phrases of length two (digrams), three (trigrams) and four. The text was scanned three times: in the first pass the phrases of length 4 were substitut +ed while in the last pass digrams were replaced. Using lookup tables enabled to perform the replacement very fast. All phrases consisted of lower case letters only and the 4-character phrases had all unique 3-character prefixes, so the memory requiremen +ts for the lookup tables were modest. +\par Below we present our set of phrases (Table 3). It consists of 9 quadgrams, 26 trigrams and 88 digrams. +\par \pard\plain \qj\widctlpar \f4\fs20\lang1045 +\par \pard \qr\widctlpar {\expnd4\expndtw20 Table 3} +\par \trowd \trqc\trgaph71\trleft-71 \clbrdrt\brdrs\brdrw15 \clbrdrl\brdrs\brdrw15 \clbrdrb\brdrs\brdrw30 \clbrdrr\brdrs\brdrw30 \cellx1403\clbrdrt\brdrs\brdrw15 \clbrdrb\brdrs\brdrw30 \clbrdrr\brdrs\brdrw30 \cellx2991\clbrdrt\brdrs\brdrw15 \clbrdrb +\brdrs\brdrw30 \clbrdrr\brdrs\brdrw15 \cellx6960 \pard \qj\widctlpar\intbl 4-char phrases{\b \cell }trigrams\cell digrams{\b \cell }\pard \widctlpar\intbl {\b \row }\trowd \trqc\trgaph71\trleft-71 \clbrdrl\brdrs\brdrw15 \clbrdrr\brdrs\brdrw30 \cellx1403 +\clbrdrr\brdrs\brdrw30 \cellx2991\clbrdrr\brdrs\brdrw15 \cellx6960 \pard \qj\widctlpar\intbl {\i that}\cell {\i all}, {\i and}, {\i but}, {\b \cell }{\i ac}, {\i ad}, {\i ai}, {\i al}, {\i am}, {\i an}, {\i ar}, {\i as}, {\i at},\cell \pard +\widctlpar\intbl {\b \row }\pard \qj\widctlpar\intbl {\i said}\cell {\i dow}, {\i for}, {\i had},\cell {\i ea}, {\i ec}, {\i ed}, {\i ee}, {\i el}, {\i en}, {\i er}, {\i es}, {\i et},\cell \pard \widctlpar\intbl {\b \row }\pard \qj\widctlpar\intbl {\i +with}\cell {\i hav}, {\i her}, {\i him},\cell {\i id}, {\i ie}, {\i ig}, {\i il}, {\i in}, {\i io}, {\i is}, {\i it},\cell \pard \widctlpar\intbl {\b \row }\pard \qj\widctlpar\intbl {\i have}\cell {\i his}, {\i man}, {\i mor},\cell {\i of},{\i ol},{\i + on},{\i oo},{\i or},{\i os},{\i ou},{\i ow},\cell \pard \widctlpar\intbl {\b \row }\pard \qj\widctlpar\intbl {\i this}\cell {\i not}, {\i now}, {\i one},\cell {\i ul}, {\i un}, {\i ur}, {\i us},{\i ba}, {\i be}, {\i ca}, {\i ce}, {\i co}, {\i ch}, +\cell \pard \widctlpar\intbl {\b \row }\pard \qj\widctlpar\intbl {\i from}\cell {\i she}, {\i the}, {\i was},\cell {\i de}, {\i di}, {\i ge}, {\i gh},{\i ha}, {\i he}, {\i hi}, {\i ho},\cell \pard \widctlpar\intbl {\b \row }\pard \qj\widctlpar\intbl {\i +whic\cell wer}, {\i whi}, {\i whe},\cell {\i ra}, {\i re}, {\i ri}, {\i ro}, {\i rs}, {\i la}, {\i le}, {\i li}, {\i lo}, {\i ld}, {\i ll}, {\i ly},\cell \pard \widctlpar\intbl {\b \row }\pard \qj\widctlpar\intbl {\i were\cell wit}, {\i you}, {\i any}, +\cell {\i se}, {\i si}, {\i so}, {\i sh}, {\i ss}, {\i st}, {\i ma}, {\i me}, {\i mi},\cell \pard \widctlpar\intbl {\b \row }\pard \qj\widctlpar\intbl {\i tion\cell are}\cell {\i ne},{\i nc},{\i nd}, {\i ng}, {\i nt},{\b }{\i pa}, {\i pe},{\i \cell +}\pard \widctlpar\intbl {\b \row }\trowd \trqc\trgaph71\trleft-71 \clbrdrl\brdrs\brdrw15 \clbrdrb\brdrs\brdrw15 \clbrdrr\brdrs\brdrw30 \cellx1403\clbrdrb\brdrs\brdrw15 \clbrdrr\brdrs\brdrw30 \cellx2991\clbrdrb\brdrs\brdrw15 \clbrdrr\brdrs\brdrw15 +\cellx6960 \pard \qj\widctlpar\intbl \cell \cell {\i ta}, {\i te}, {\i ti}, {\i to}, {\i th}, {\i tr}, {\i wa}, {\i ve}\cell \pard \widctlpar\intbl {\b \row }\pard \qj\widctlpar {\b +\par }\pard\plain \s17\qj\fi284\widctlpar \f4\fs20\lang1045 Two digrams from the selected set, \ldblquote {\i th}\rdblquote and \ldblquote {\i on}\rdblquote +, were split ie. each of them was substituted with one of two symbols depending on if the character preceeding the phrase were a blank or not. Actually, the substitution introduced 9+26+88+2=125 new symbols.{\b +\par }\pard\plain \qj\widctlpar \f4\fs20\lang1045 +\par {\pntext\pard\plain\b\fs20\lang1024\kerning28 5.\tab}\pard\plain \s1\keepn\widctlpar{\*\pn \pnlvl1\pndec\pnprev1\pnstart1\pnindent284 {\pntxta .}}\b\f4\fs20\lang1024\kerning28 Alphabet reordering +\par \pard\plain \s17\qj\fi284\widctlpar \f4\fs20\lang1045 +The degree of similarity of neighboring contexts depends on the sorting order. Any alphabet order is allowed as long as both the coder and the decoder know it. It turns out the the alphabetical ASCII order is not the best choice. More information on fi +nding a better alphabet + order, also dynamically for the given data, can be found in [2], to the best of our knowledge the first and so far only study of the aspect of alphabet reordering for BWT. In our tests we use a modified version of the heuristic order for English languag +e taken from that paper. Adaptive strategies there presented yield less consistant (and not really greater on the overall) improvements and their usage is limited due to the long time of the search. The heuristic order from the mentioned paper groups vo +wels, similar consonants, and punctation. For the lower case letters, the order is: \ldblquote aeioubcdgfhrlsmnpqjktwvxyz\rdblquote and is analogous for the upper case. The other groupings are \ldblquote ?!\rdblquote and \ldblquote +-,.\rdblquote +. It performs well on text, and, since the order is similar to the original ASCII order, it doesn\rquote t usually hurt much on non-text data. +\par Our modification to the heuristic order intermingles digrams chosen for the phrase substitution with original symbols (eg. \ldblquote {\i th}\rdblquote is put close to \lquote {\i t}\rquote ). We additionally group \ldblquote special characters +\rdblquote (punctation marks, EOL characters, space, capital flag etc.). +\par An intrinsically similar idea to grouping punctation, EOLs etc. together is adding spaces before each of several chosen punctation marks. It is clearly an extension of the idea of adding spaces after EOLs from Section 3. The key observation is that it i +s a widely respected convention not to precede a dot neither a comma with a blank space in typed text; instead they are followed with a blank. The convention is not equally strict for o +ther punctation marks, nevertheless it seems reasonable to precede some of them with a blank as well. +\par Which of those two almost equivalent concepts is better? We chose symbol grouping and gave up adding spaces before punctation for containing more actual data in BS blocks and slightly faster preprocessing. +\par +\par {\pntext\pard\plain\b\fs20\lang1024\kerning28 6.\tab}\pard\plain \s1\keepn\widctlpar{\*\pn \pnlvl1\pndec\pnprev1\pnstart1\pnindent284 {\pntxta .}}\b\f4\fs20\lang1024\kerning28 EOL coding +\par \pard\plain \s17\qj\fi284\widctlpar \f4\fs20\lang1045 End-Of-Line characters are hard to predict. If we had spaces instead of EOLs, most text files would be noticably more compressible. Hence comes the idea of replacing EOLs with spaces an +d grouping the EOLs together represented as line lengths in words (more precisely: each EOL is represented as the number of blank spaces since the previous EOL) [7]. Then all data (\ldblquote real\rdblquote + data and EOL information) together are submitted to BWT. This ide +a has unstable influence: on some files (eg. book1 from Calgary Corpus) it gives noticable improvement, on other files it hurts. The following order-1 entropy measurement heuristic was tested. The sum of sizes of order-1 arithmetically compressed actual + data with EOLs replaced with spaces and order\_ +1 arithmetically compressed EOL information was estimated and compared to the size of original data arithmetically compressed in the order-1 model. If and only if the sum of compressed sizes was lower, then th +e described EOL coding and reordering was performed. +\par The heuristic gave limited success. We decided to stop exploring the EOL coding idea but it looks promising for further +research. For example, the EOLs can be encoded separately with eg. order-1 arithmetic coder which seems to give better results (we however limited ourselves in this work to mere preproccessing only so that solution is out of our scope). Another interest +ing question is about proper representation of EOL information. In many text files line lengths (in characters) are similar (ie. fixed left and right margins are kept). This information could be explored. +\par We would like to mention that our result on book1 +with all ideas including the EOL coding and submitting all data to one BWT block was about 220300 bytes with bzip. This is, as can be seen below, over 2000 bytes additional gain. Book1 was however the only file from Calgary Corpus for which the EOL codi +ng was helpful. +\par \pard\plain \qj\fi284\widctlpar \f4\fs20\lang1045 +\par {\pntext\pard\plain\b\fs20\lang1024\kerning28 7.\tab}\pard\plain \s1\keepn\widctlpar{\*\pn \pnlvl1\pndec\pnprev1\pnstart1\pnindent284 {\pntxta .}}\b\f4\fs20\lang1024\kerning28 Test results +\par \pard\plain \s17\qj\fi284\widctlpar \f4\fs20\lang1045 The presented filters are applied in the following order: +\par {\pntext\pard\plain\f1\fs20 \'b7\tab}\pard \s17\qj\fi284\widctlpar{\*\pn \pnlvlblt\pnf1\pnstart1\pnindent-284\pnhang{\pntxtb \'b7}}\~capital conversion (to increase the number of phrase substitutions in the next step); +\par {\pntext\pard\plain\f1\fs20 \'b7\tab}\~phrase replacement; +\par {\pntext\pard\plain\f1\fs20 \'b7\tab}\~alphabet reordering (the new symbols for phrases are intermingled with original symbols); +\par {\pntext\pard\plain\f1\fs20 \'b7\tab}\~space stuffing (with respect to the new alphabet order) if EOL coding is to be omitted, +\par {\pntext\tab}\pard \s17\qj\widctlpar{\*\pn \pnlvlcont\pnf1\pnstart1\pnindent-284\pnhang{\pntxtb \'b7}} +\par {\pntext\tab}otherwise +\par {\pntext\tab} +\par {\pntext\pard\plain\f1\fs20 \'b7\tab}\pard \s17\qj\fi284\widctlpar{\*\pn \pnlvlblt\pnf1\pnstart1\pnindent-284\pnhang{\pntxtb \'b7}}\~ +EOL coding if the order-1 entropy check heuristic suggests it. If the decision is negative, space stuffing (ie. adding spaces after EOLs) may then be used. +\par \pard \s17\qj\fi284\widctlpar +\par We did not use the EOL coding in our test. +\par In Table 4 we present the results on 14 text files from the Calgary Corpus obtained with {\i bzip} and IMP. {\i Bzip}, by Julian Seward, is a well-known block sorting compressor, based on Fenwick\rquote +s research in this area; it is available with sources [10].\~\~IMP, from Technelysium Pty Ltd, is a very fast and relatively new implementation of block sorting compression [11]. The main difference between them is that {\i bzip} + uses arithmetic coder while IMP uses Huffman coder. Each of the tested files from any of the two following corpora was contained in one {\i bzip}\rquote s or IMP\rquote s block. +\par \pard \s17\qr\fi284\widctlpar \page {\expnd4\expndtw20 Table 4} +\par \trowd \trqc\trgaph142\trleft-142 \clbrdrt\brdrs\brdrw15 \clbrdrl\brdrs\brdrw15 \clbrdrb\brdrs\brdrw30 \clbrdrr\brdrdb\brdrw30 \cellx765\clbrdrt\brdrs\brdrw15 \clbrdrb\brdrs\brdrw30 \cellx1956\clbrdrt\brdrs\brdrw15 \clbrdrb\brdrs\brdrw30 \clbrdrr +\brdrs\brdrw15 \cellx3147\clbrdrt\brdrs\brdrw15 \clbrdrb\brdrs\brdrw30 \clbrdrr\brdrdb\brdrw30 \cellx3827\clbrdrt\brdrs\brdrw15 \clbrdrb\brdrs\brdrw30 \cellx5018\clbrdrt\brdrs\brdrw15 \clbrdrb\brdrs\brdrw30 \clbrdrr\brdrs\brdrw15 \cellx6209\clbrdrt +\brdrs\brdrw15 \clbrdrb\brdrs\brdrw30 \clbrdrr\brdrs\brdrw15 \cellx6889 \pard\plain \widctlpar\intbl \f4\fs20\lang1045 {\b file\cell }\pard \qr\widctlpar\intbl {\b bzip (original data)\cell bzip\line (after\line filtering)\cell }\pard \qc\widctlpar\intbl +{\b gain (%)\cell }\pard \qr\widctlpar\intbl {\b IMP -2 +\par (original\line data)\cell IMP -2\line (after filtering)\cell }\pard \qc\widctlpar\intbl {\b gain (%)\cell }\pard \widctlpar\intbl {\b \row }\trowd \trqc\trgaph142\trleft-142 \clbrdrl\brdrs\brdrw15 \clbrdrr\brdrdb\brdrw30 \cellx765\cellx1956\clbrdrr +\brdrs\brdrw15 \cellx3147\clbrdrr\brdrdb\brdrw30 \cellx3827\cellx5018\clbrdrr\brdrs\brdrw15 \cellx6209\clbrdrr\brdrs\brdrw15 \cellx6889 \pard \widctlpar\intbl bib\cell \pard \qr\widctlpar\intbl 27097\cell {\f5\cf1 26839}\cell \pard \qc\widctlpar\intbl { +\f5\cf1 0.95\cell }\pard \qr\widctlpar\intbl {\f5\cf1 27549}\cell {\f5\cf1 27264}\cell \pard \qc\widctlpar\intbl {\f5\cf1 1.03}\cell \pard \widctlpar\intbl \row \pard \widctlpar\intbl book1\cell \pard \qr\widctlpar\intbl 230247\cell {\f5\cf1 222567}\cell +\pard \qc\widctlpar\intbl {\f5\cf1 3.34}\cell \pard \qr\widctlpar\intbl {\f5\cf1 232103}\cell {\f5\cf1 223962}\cell \pard \qc\widctlpar\intbl {\f5\cf1 3.51}\cell \pard \widctlpar\intbl \row \pard \widctlpar\intbl book2\cell \pard \qr\widctlpar\intbl +155944\cell {\f5\cf1 150291}\cell \pard \qc\widctlpar\intbl {\f5\cf1 3.63}\cell \pard \qr\widctlpar\intbl {\f5\cf1 157155}\cell {\f5\cf1 150933}\cell \pard \qc\widctlpar\intbl {\f5\cf1 3.96}\cell \pard \widctlpar\intbl \row \pard \widctlpar\intbl news +\cell \pard \qr\widctlpar\intbl 118112\cell {\f5\cf1 114927}\cell \pard \qc\widctlpar\intbl {\f5\cf1 2.70}\cell \pard \qr\widctlpar\intbl {\f5\cf1 118598}\cell {\f5\cf1 115009}\cell \pard \qc\widctlpar\intbl {\f5\cf1 3.03}\cell \pard \widctlpar\intbl +\row \pard \widctlpar\intbl paper1\cell \pard \qr\widctlpar\intbl 16360\cell {\f5\cf1 15772}\cell \pard \qc\widctlpar\intbl {\f5\cf1 3.59}\cell \pard \qr\widctlpar\intbl {\f5\cf1 16645}\cell {\f5\cf1 16073}\cell \pard \qc\widctlpar\intbl {\f5\cf1 3.44} +\cell \pard \widctlpar\intbl \row \pard \widctlpar\intbl paper2\cell \pard \qr\widctlpar\intbl 24826\cell {\f5\cf1 23839}\cell \pard \qc\widctlpar\intbl {\f5\cf1 3.98}\cell \pard \qr\widctlpar\intbl {\f5\cf1 25203}\cell {\f5\cf1 24152}\cell \pard +\qc\widctlpar\intbl {\f5\cf1 4.17}\cell \pard \widctlpar\intbl \row \pard \widctlpar\intbl paper3\cell \pard \qr\widctlpar\intbl 15704\cell {\f5\cf1 15194}\cell \pard \qc\widctlpar\intbl {\f5\cf1 3.25}\cell \pard \qr\widctlpar\intbl {\f5\cf1 16005}\cell { +\f5\cf1 15481}\cell \pard \qc\widctlpar\intbl {\f5\cf1 3.27}\cell \pard \widctlpar\intbl \row \pard \widctlpar\intbl paper4\cell \pard \qr\widctlpar\intbl 5072\cell {\f5\cf1 4881}\cell \pard \qc\widctlpar\intbl {\f5\cf1 3.77}\cell \pard +\qr\widctlpar\intbl {\f5\cf1 5291}\cell {\f5\cf1 5089}\cell \pard \qc\widctlpar\intbl {\f5\cf1 3.82}\cell \pard \widctlpar\intbl \row \pard \widctlpar\intbl paper5\cell \pard \qr\widctlpar\intbl 4704\cell {\f5\cf1 4570}\cell \pard \qc\widctlpar\intbl { +\f5\cf1 2.85}\cell \pard \qr\widctlpar\intbl {\f5\cf1 4934}\cell {\f5\cf1 4760}\cell \pard \qc\widctlpar\intbl {\f5\cf1 3.53}\cell \pard \widctlpar\intbl \row \pard \widctlpar\intbl paper6\cell \pard \qr\widctlpar\intbl 12132\cell {\f5\cf1 11666}\cell +\pard \qc\widctlpar\intbl {\f5\cf1 3.84}\cell \pard \qr\widctlpar\intbl {\f5\cf1 12480}\cell {\f5\cf1 11981}\cell \pard \qc\widctlpar\intbl {\f5\cf1 4.00}\cell \pard \widctlpar\intbl \row \pard \widctlpar\intbl progc\cell \pard \qr\widctlpar\intbl 12379 +\cell {\f5\cf1 12086}\cell \pard \qc\widctlpar\intbl {\f5\cf1 2.37}\cell \pard \qr\widctlpar\intbl {\f5\cf1 12668}\cell {\f5\cf1 12377}\cell \pard \qc\widctlpar\intbl {\f5\cf1 2.30}\cell \pard \widctlpar\intbl \row \pard \widctlpar\intbl progl\cell \pard +\qr\widctlpar\intbl 15387\cell {\f5\cf1 15023}\cell \pard \qc\widctlpar\intbl {\f5\cf1 2.37}\cell \pard \qr\widctlpar\intbl {\f5\cf1 15774}\cell {\f5\cf1 15435}\cell \pard \qc\widctlpar\intbl {\f5\cf1 2.15}\cell \pard \widctlpar\intbl \row \pard +\widctlpar\intbl progp\cell \pard \qr\widctlpar\intbl 10533\cell {\f5\cf1 10376}\cell \pard \qc\widctlpar\intbl {\f5\cf1 1.49}\cell \pard \qr\widctlpar\intbl {\f5\cf1 10855}\cell {\f5\cf1 10708}\cell \pard \qc\widctlpar\intbl {\f5\cf1 1.35}\cell \pard +\widctlpar\intbl \row \pard \widctlpar\intbl trans\cell \pard \qr\widctlpar\intbl 17488\cell {\f5\cf1 17142}\cell \pard \qc\widctlpar\intbl {\f5\cf1 1.98}\cell \pard \qr\widctlpar\intbl {\f5\cf1 17801}\cell {\f5\cf1 17499}\cell \pard \qc\widctlpar\intbl { +\f5\cf1 1.70}\cell \pard \widctlpar\intbl \row \pard \widctlpar\intbl {\i \cell }\pard \qr\widctlpar\intbl {\i \cell \cell }\pard \qc\widctlpar\intbl {\i \cell }\pard \qr\widctlpar\intbl {\i \cell \cell }\pard \qc\widctlpar\intbl {\i \cell }\pard +\widctlpar\intbl {\i \row }\pard \widctlpar\intbl total\cell \pard \qr\widctlpar\intbl {\field\flddirty{\*\fldinst =SUM(ABOVE) }{\fldrslt {\lang1024 665985}}}\cell {\field{\*\fldinst =SUM(above) }{\fldrslt {\lang1024 645173}}}\cell \pard +\qc\widctlpar\intbl {\f5\cf1 3.12}\cell \pard \qr\widctlpar\intbl {\field{\*\fldinst =SUM(ABOVE) }{\fldrslt {\lang1024 673061}}}\cell {\field\flddirty{\*\fldinst =SUM(above) }{\fldrslt {\lang1024 650723}}}\cell \pard \qc\widctlpar\intbl {\f5\cf1 3.32} +\cell \pard \widctlpar\intbl \row \trowd \trqc\trgaph142\trleft-142 \clbrdrl\brdrs\brdrw15 \clbrdrb\brdrs\brdrw15 \clbrdrr\brdrdb\brdrw30 \cellx765\clbrdrb\brdrs\brdrw15 \cellx1956\clbrdrb\brdrs\brdrw15 \clbrdrr\brdrs\brdrw15 \cellx3147\clbrdrb +\brdrs\brdrw15 \clbrdrr\brdrdb\brdrw30 \cellx3827\clbrdrb\brdrs\brdrw15 \cellx5018\clbrdrb\brdrs\brdrw15 \clbrdrr\brdrs\brdrw15 \cellx6209\clbrdrb\brdrs\brdrw15 \clbrdrr\brdrs\brdrw15 \cellx6889 \pard \widctlpar\intbl avg.\cell \pard \qr\widctlpar\intbl +\cell \cell \pard \qc\widctlpar\intbl {\f5\cf1 2.86}\cell \pard \qr\widctlpar\intbl \cell \cell \pard \qc\widctlpar\intbl {\f5\cf1 2.95}\cell \pard \widctlpar\intbl \row \pard \widctlpar +\par \pard\plain \s17\qj\fi284\widctlpar \f4\fs20\lang1045 As it can be seen, compression was never hurt on tested files. The overall gain is about 3 per cent, a little greater for IMP. On plain text +s the improvement for IMP was usually greater (eg. book1, book2, news), on more structural texts (eg. progc, progp, progl, trans) the opposite seemed to be true. In other words, IMP was more sensitive to text preprocessing. +\par With both compressors more gain was obtained for plain, human language files than for more structural, \ldblquote artificial\rdblquote files. +\par Since the presented filtering was tuned to the Calgary Corpus files, with slightly more attention to plain text (book1) than to other text types, it should be int +eresting to evaluate the performance of filtering on another set of files. We chose the Canterbury Corpus (Table\~5). Again, we skip the files containing characters with ASCII codes\~>\~127. +\par \pard\plain \qr\widctlpar \f4\fs20\lang1045 {\b \page }{\expnd4\expndtw20 Table 5}{\b +\par }\trowd \trqc\trgaph142\trleft-142 \clbrdrt\brdrs\brdrw15 \clbrdrl\brdrs\brdrw15 \clbrdrb\brdrs\brdrw30 \clbrdrr\brdrdb\brdrw30 \cellx879\clbrdrt\brdrs\brdrw15 \clbrdrb\brdrs\brdrw30 \cellx2041\clbrdrt\brdrs\brdrw15 \clbrdrb\brdrs\brdrw30 \clbrdrr +\brdrs\brdrw15 \cellx3203\clbrdrt\brdrs\brdrw15 \clbrdrb\brdrs\brdrw30 \clbrdrr\brdrdb\brdrw30 \cellx3883\clbrdrt\brdrs\brdrw15 \clbrdrb\brdrs\brdrw30 \cellx5045\clbrdrt\brdrs\brdrw15 \clbrdrb\brdrs\brdrw30 \clbrdrr\brdrs\brdrw15 \cellx6207\clbrdrt +\brdrs\brdrw15 \clbrdrb\brdrs\brdrw30 \clbrdrr\brdrs\brdrw15 \cellx6887 \pard \widctlpar\intbl {\b file\cell }\pard \qr\widctlpar\intbl {\b bzip (original data)\cell bzip\line (after\line filtering)\cell }\pard \qc\widctlpar\intbl {\b gain (%)\cell +}\pard \qr\widctlpar\intbl {\b IMP -2 +\par (original\line data)\cell IMP -2\line (after filtering)\cell }\pard \qc\widctlpar\intbl {\b gain (%)\cell }\pard \widctlpar\intbl {\b \row }\trowd \trqc\trgaph142\trleft-142 \clbrdrl\brdrs\brdrw15 \clbrdrr\brdrdb\brdrw30 \cellx879\cellx2041\clbrdrr +\brdrs\brdrw15 \cellx3203\clbrdrr\brdrdb\brdrw30 \cellx3883\cellx5045\clbrdrr\brdrs\brdrw15 \cellx6207\clbrdrr\brdrs\brdrw15 \cellx6887 \pard \widctlpar\intbl alice29\cell \pard \qr\widctlpar\intbl {\f5\cf1 42823}\cell {\f5\cf1 41159}\cell {\f5\cf1 3.89 +\cell 43321\cell 41601\cell 3.97\cell }\pard \widctlpar\intbl \row \pard \widctlpar\intbl fields\cell \pard \qr\widctlpar\intbl 2911\cell 2829\cell {\f5\cf1 2.82\cell }3114\cell 3076\cell {\f5\cf1 1.22\cell }\pard \widctlpar\intbl \row \pard +\widctlpar\intbl lcet10\cell \pard \qr\widctlpar\intbl {\f5\cf1 106629\cell 102394\cell 3.97\cell 107208\cell 102711\cell 4.19\cell }\pard \widctlpar\intbl \row \pard \widctlpar\intbl plrabn12\cell \pard \qr\widctlpar\intbl {\f5\cf1 144223\cell 141491 +\cell 1.89\cell 145467\cell 142102\cell 2.31\cell }\pard \widctlpar\intbl \row \pard \widctlpar\intbl cp\cell \pard \qr\widctlpar\intbl {\f5\cf1 7517\cell 7350\cell 2.22\cell 7750\cell 7577\cell 2.23\cell }\pard \widctlpar\intbl \row \pard +\widctlpar\intbl grammar\cell \pard \qr\widctlpar\intbl {\f5\cf1 1184\cell 1161\cell 1.94\cell 1348\cell 1338\cell 0.74\cell }\pard \widctlpar\intbl \row \pard \widctlpar\intbl xargs.1\cell \pard \qr\widctlpar\intbl {\f5\cf1 1655\cell 1595\cell 3.63\cell +1819\cell 1769\cell 2.75\cell }\pard \widctlpar\intbl \row \pard \widctlpar\intbl asyoulik\cell \pard \qr\widctlpar\intbl {\f5\cf1 39315\cell 38454\cell 2.19\cell 39823\cell 38857\cell 2.43\cell }\pard \widctlpar\intbl \row \pard \widctlpar\intbl \cell +\pard \qr\widctlpar\intbl \cell \cell \pard \qc\widctlpar\intbl \cell \pard \qr\widctlpar\intbl \cell \cell \pard \qc\widctlpar\intbl \cell \pard \widctlpar\intbl \row \pard \widctlpar\intbl total\cell \pard \qr\widctlpar\intbl {\field{\*\fldinst =SUM +(ABOVE) }{\fldrslt {\lang1024 346257}}}\cell {\field{\*\fldinst =SUM(above) }{\fldrslt {\lang1024 336433}}}\cell \pard \qc\widctlpar\intbl {\f5\cf1 2.84}\cell \pard \qr\widctlpar\intbl {\field{\*\fldinst =SUM(ABOVE) }{\fldrslt {\lang1024 349850}}}\cell +{\field{\*\fldinst =SUM(above) }{\fldrslt {\lang1024 339031}}}\cell \pard \qc\widctlpar\intbl {\f5\cf1 3.09}\cell \pard \widctlpar\intbl \row \trowd \trqc\trgaph142\trleft-142 \clbrdrl\brdrs\brdrw15 \clbrdrb\brdrs\brdrw15 \clbrdrr\brdrdb\brdrw30 +\cellx879\clbrdrb\brdrs\brdrw15 \cellx2041\clbrdrb\brdrs\brdrw15 \clbrdrr\brdrs\brdrw15 \cellx3203\clbrdrb\brdrs\brdrw15 \clbrdrr\brdrdb\brdrw30 \cellx3883\clbrdrb\brdrs\brdrw15 \cellx5045\clbrdrb\brdrs\brdrw15 \clbrdrr\brdrs\brdrw15 \cellx6207\clbrdrb +\brdrs\brdrw15 \clbrdrr\brdrs\brdrw15 \cellx6887 \pard \widctlpar\intbl avg.\cell \pard \qr\widctlpar\intbl \cell \cell \pard \qc\widctlpar\intbl {\f5\cf1 2.82}\cell \pard \qr\widctlpar\intbl \cell \cell \pard \qc\widctlpar\intbl {\f5\cf1 2.48}\cell +\pard \widctlpar\intbl \row \pard \widctlpar {\b +\par }As before, most gain is achieved on plain texts (lcet10, alice29).{\b +\par +\par {\pntext\pard\plain\b\fs20\lang1024\kerning28 8.\tab}}\pard\plain \s1\keepn\widctlpar{\*\pn \pnlvl1\pndec\pnprev1\pnstart1\pnindent284 {\pntxta .}}\b\f4\fs20\lang1024\kerning28 Conclusions +\par \pard\plain \s17\qj\fi284\widctlpar \f4\fs20\lang1045 +The appeal of BWT-based compression comes from its speed comparable to typical LZ coders combined with superior compression rates for text data, only several per cent worse than those achieved with the best existing text compressors today, RKIVE by Malcol +m Taylor [12] and BOA by Ian Sutton [13]. We showed three ways to further improve text compression of blocksorters. \ldblquote Context alleviation\rdblquote + (space stuffing and capital conversion) and phrase substitution are of more general usage. The former helps to PPM a +nd the latter helps to both PPM and, above all, LZ schemes. Alphabet reordering, which we used for additional slight gain, is BWT specific technique, although we noticed a slight improvement also for the {\i associative coder} + by George Buyanovsky (ACB), a very interesting algorithm having much in common with blocksorters [14]. +\par The overall gain for {\i bzip} and for IMP, two notable block sorting implementations, was about 3 per cent, which is a significant step toward the best known compression algorithms. Our p +reprocessing was fast and the overall compression speed was almost unaffected. In special cases, the speed could even be increased since the phrase substitution outputs a noticably smaller volume of data for further coding. A too large set of substitute +d phrases may hurt compression while making it faster in the same time. It is clear that fact raises a speed / compression trade-off. We also would like to stress that the considered ideas are independent from the used block sorting coder, in the sense +we may perceive the actual coder as a black box. Hence the simplicity and ease of application comes. +\par The main difficulty with our ideas is the fact they are not general. Context alleviation helps with most human languages but not necessarily with other structured texts like computer programs in various languages. The alphabet order was optimized for En +glish, although, for the positive side, some key observations supporting the chosen symbol permutation are common for many languages. Phrase replacement is +also language specific. Moreover, the most frequent phrases for different texts in one language may vary significantly. On the other hand, shorter phrases, particularly digrams, are more or less equally common within one language. Our experiments showe +d that replacing phrases longer than 3 characters does not give much additional benefit, so we restricted ourselves to digrams, trigrams and a few 4-character phrases only. It also enabled an easy and fast implementation of this idea. +\par In practical applicat +ion, for each of the ideas lacking robustness, it is recommended to perform a preliminary scan with a quick heuristic trying to determine if the idea in question is applicable. Several simple hints were given in appropriate parts of this paper but they w +ere not implemented. Our purpose was to demonstrate that the presented simple ideas often give significant benefits for compression and are worth further exploration rather than dealing with heuristics for handling special cases. Therefore the question +about the range of applicability of those ideas remains for further research. +\par \pard\plain \qj\widctlpar \f4\fs20\lang1045 +\par {\b Acknowledgements +\par }\pard \qj\fi284\widctlpar The author would like to thank Malcolm Taylor, Uwe Herklotz, George Lyapko, Ian Sutton and Vadim Yoockin for valuable suggestions, ideas and interesting discussion. +\par \pard \qj\widctlpar +\par {\b References +\par }\pard \qj\fi-284\li284\widctlpar [1]\tab M.\~Burrows and D.\~J.\~Wheeler, {\i A block-sorting Lossless Data Compression Algorithm}, SRC Research Report 124, Digital Systems Research Center, Palo Alto, CA, May 1994. Also available at {\f3 +http://gatekeeper.dec.com/\line pub/DEC/SRC/research-reports/abstracts/src-rr-124.html}. +\par [2]\tab B.\~Chapin and S.\~R.\~Tate, {\i Higher Compression from the Burrows-Wheeler Trans\-form by Modified Sorting}, University of North Texas, Department of Computer Science, 1998. Also available {\f3 at http://www.cs.unt.edu/\line +~srt/papers/bwtsort.pdf}. +\par \page [3]\tab P.\~Fenwick, {\i Experiments with a Block Sorting Text Compression Algorithm}, The University of Auckland, Department of Computer Science, Tech.\~Rep. 111, May 1995. Also available at {\f3 ftp://ftp.cs.auckland.ac.nz/out/peter-f/\line +TechRep111.ps}. +\par [4]\~P.\~Fenwick, {\i Block Sorting Text Compression - Final Report}, The University of Auckland, Department of Computer Science, Tech.\~Rep. 130, April 1996. Also available at {\f3 ftp://ftp.cs.auckland.ac.nz/out/peter-f/\line TechRep130.ps}. +\par [5] U.\~Herklotz, private correspondence, 1999. +\par [6]\tab G.\~Lyapko, private correspondence, 1999. +\par [7]\tab M.\~Taylor, private correspondence, 1998. +\par [8]\tab B.\~Teahan, {\i Modelling English text}, PhD thesis, Department of Computer Science, The University of Waikato, Hamilton, New Zealand, May 1998. +\par [9] V.\~Yoockin, private correspondence, 1999. +\par \pard \fi-284\li284\widctlpar [10] Bzip, version 0.21, sources available at {\f3 ftp://ftp.elf.stuba.sk/pub/pc/pack/bzip021.tgz}\line or {\f3 ftp://ftp.elf.stuba.sk/pub/pc/pack/bzip021.zip}. +\par \pard \qj\fi-284\li284\widctlpar [11] IMP, version 1.1, available at {\f3 http://www.technelysium.com.au/}. +\par \pard \fi-284\li284\widctlpar [12] RKIVE, version 1.92 beta 1, available at {\f3 http://www.geocities.com/SiliconValley/Peaks/9463/}. +\par \pard \qj\fi-284\li284\widctlpar [13]\~BOA, version 0.58 beta, available at {\f3 ftp://ftp.elf.stuba.sk/pub/pc/\line pack/boa058.zip}. +\par [14]\~ACB version 2.00c, available at {\f3 ftp://ftp.elf.stuba.sk/pub/pc/\line pack/acb_200c.zip}. +\par } \ No newline at end of file -- 2.43.0