src/blocksorting/bs.c

   1 /*----------------------------------------------------------------------------
   2  *                   jacu - Just Another Compression Utility
   3  *----------------------------------------------------------------------------
   4  * This file is part of jacu.
   5  *
   6  * jacu is free software; you can redistribute it and/or modify it under the
   7  * terms of the GNU General Public License as published by the Free Software
   8  * Foundation; either version 2 of the License, or (at your option) any later
   9  * version.
  10  *
  11  * jacu is distributed in the hope that it will be useful, but WITHOUT ANY
  12  * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  13  * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  14  * details.
  15  *
  16  * You should have received a copy of the GNU General Public License along
  17  * with jacu; if not, write to the Free Software Foundation, Inc., 59 Temple
  18  * Place, Suite 330, Boston, MA  02111-1307  USA
  19  *----------------------------------------------------------------------------
  20  */
  21
  22
  23 #include "bs.h"
  24 #include <stdlib.h>
  25 #include <ctype.h>
  26
  27 /* Block Sorting Optimizado en memoria! */
  28
  29 typedef struct _dict_ {
  30         char *word;
  31         int len;
  32         int hits;
  33 } t_Diccionario;
  34
  35 #define PALABRA(a) {a, sizeof(a)-1, 0}
  36
  37 #define DICT_SIZE sizeof(dic)/sizeof(t_Diccionario)
  38
  39 /* ASCII Numero 3 como caracter de escape */
  40 #define ESCAPE_CHARACTER 0x3
  41
  42 /* Diccionario */
  43 t_Diccionario dic[] = {
  44         /* Preposiciones (no todas) */
  45         PALABRA(" ante"),
  46         PALABRA(" bajo"),
  47         PALABRA(" cabe"),
  48         PALABRA(" con"),
  49         PALABRA(" contra"),
  50         /* PALABRA(" de"),  XXX No se si conviene, solo ahorra 1 byte y puede romper localidad */
  51         PALABRA(" desde"),
  52         PALABRA(" hasta"),
  53         PALABRA(" hacia"),
  54         PALABRA(" para"),
  55         PALABRA(" por"),
  56         PALABRA(" según"),
  57         PALABRA(" sin"),
  58         PALABRA(" sobre"),
  59         PALABRA(" tras"),
  60         PALABRA(" mediante"),
  61         PALABRA(" durante"),
  62
  63         /* Articulos */
  64         PALABRA(" el "),
  65         PALABRA(" la "),
  66         PALABRA(" ella "),
  67         PALABRA(" del "),
  68         PALABRA(" los "),
  69         PALABRA(" las "),
  70         PALABRA(" lo "),
  71         PALABRA(" que"),
  72         PALABRA(" una "),
  73         PALABRA(" también"),
  74         PALABRA(" tambien"),
  75         PALABRA(" cuando"),
  76         PALABRA(" pero"),
  77         PALABRA(" todo"),
  78         /* Otras de test */
  79         PALABRA(" si "),
  80         PALABRA(" mas "),
  81         PALABRA(" más "),
  82         PALABRA(" porque"),
  83         PALABRA(" entonces"),
  84         PALABRA(" siempre"),
  85         PALABRA(" segundo"),
  86         PALABRA(" programa"),
  87         PALABRA(" existe"),
  88         PALABRA(" recien"),
  89         PALABRA(" máximo"),
  90         PALABRA(" mínimo"),
  91         PALABRA(" casi"),
  92         PALABRA(" sección"),
  93         PALABRA(" informe"),
  94         PALABRA(" Informe"),
  95         PALABRA(" acción"),
  96         PALABRA(" perdedor"),
  97         PALABRA(" existencia"),
  98         PALABRA(" aire"),
  99         PALABRA(" árbol"),
 100         PALABRA(" prueba"),
 101         PALABRA(" muestra"),
 102         PALABRA(" animal"),
 103         PALABRA(" suerte"),
 104         PALABRA(" portador"),
 105         PALABRA(" molesto"),
 106         PALABRA(" cielo"),
 107         PALABRA(" impulso"),
 108         PALABRA(" alcohol"),
 109         PALABRA(" seguido"),
 110         PALABRA(" permiso"),
 111         PALABRA(" cuarto"),
 112         PALABRA(" brillante"),
 113         PALABRA(" tener"),
 114         PALABRA(" ningún"),
 115         PALABRA(" fácil"),
 116         /* Algunas cosas de ingles para mejorar otros textos */
 117         PALABRA(" as "),
 118         PALABRA(" will "),
 119         PALABRA(" with "),
 120         PALABRA(" this "),
 121         PALABRA(" program "),
 122         PALABRA(" are "),
 123         PALABRA(" The "),
 124         PALABRA(" it "),
 125         PALABRA(" which "),
 126         PALABRA(" for "),
 127         PALABRA(" be "),
 128         PALABRA(" that "),
 129         PALABRA(" is "),
 130         PALABRA(" and "),
 131         PALABRA(" to "),
 132         PALABRA(" of "),
 133         PALABRA(" the ")
 134 };
 135
 136 typedef struct _bs_decode_t_ {
 137         char c;
 138         Uint32 sig;
 139 } t_BlockSortDecode;
 140
 141 char es_menor(char *data, t_BlockSort *bs, int i, int j);
 142
 143 int _compare(const void *d1, const void *d2) {
 144         t_BlockSortDecode *s1, *s2;
 145
 146         s1 = (t_BlockSortDecode *)d1;
 147         s2 = (t_BlockSortDecode *)d2;
 148
 149         /* Hack si es el mismo caracter, dejo como mejor la de menor sig */
 150         if (s1->c == s2->c) {
 151                 return (s1->sig - s2->sig);
 152         }
 153
 154         return (s1->c - s2->c);
 155 }
 156
 157 int __compare(const void *d1, const void *d2) {
 158         t_BlockSortData *s1, *s2;
 159
 160         s1 = (t_BlockSortData *)d1;
 161         s2 = (t_BlockSortData *)d2;
 162
 163         return  es_menor(s1->bs->data, s1->bs, s1->pos_inicial, s2->pos_inicial);
 164 }
 165
 166 char es_menor(char *data, t_BlockSort *bs, int i, int j)
 167 {
 168         Uint32 pi, pj, k;
 169
 170         for(k=0; k<bs->len; k++) {
 171                 pi = (i+k)%bs->len;
 172                 pj = (j+k)%bs->len;
 173
 174                 if (data[pi] > data[pj]) return 1; /* El primero es mayor */
 175                 if (data[pi] < data[pj]) return -1; /* El primero es menor */
 176         }
 177
 178         /* Son iguales! */
 179         return 0;
 180 }
 181
 182 void generar_array(char *data, t_BlockSort *bs)
 183 {
 184         int i;
 185         for(i=0; i<bs->len; i++) {
 186                 bs->array[i].pos_inicial = i;
 187                 bs->array[i].pos_final = (i+bs->len-1)%bs->len;
 188                 bs->array[i].ord = (i==0)?1:0;
 189                 bs->array[i].bs = bs;
 190         }
 191 }
 192
 193 void ordenar_array(char *data, t_BlockSort *bs)
 194 {
 195         qsort(bs->array, bs->len, sizeof(t_BlockSortData), __compare);
 196 }
 197
 198 int generar_salida(char *data, t_BlockSort *bs, char *salida)
 199 {
 200         Uint32 i, k;
 201         char *out;
 202
 203         /* Dejo lugar para guardar el K */
 204         out = salida + sizeof(Uint32);
 205
 206         k=-1;
 207         for(i=0; i<bs->len; i++) {
 208                 out[i] = data[bs->array[i].pos_final];
 209                 if (bs->array[i].ord == 1) k = i;
 210         }
 211         return k;
 212 }
 213
 214 void bs_solve(char *in, char *out, t_BlockSort *bs, Uint32 *k, Uint32 leido)
 215 {
 216         unsigned int l;
 217         l = bs->len;
 218         /* Hack para pedasos menores a la pagina */
 219         if (leido < bs->len) bs->len = leido;
 220         bs->data = in;
 221
 222         generar_array(in, bs);
 223         ordenar_array(in, bs);
 224         (*k) = generar_salida(in, bs, out);
 225
 226         /* Guardo el k y el tamaño en el array */
 227         memcpy(out, k, sizeof(Uint32));
 228         bs->len = l;
 229 }
 230
 231 void bs_restore(char *dst, char *c, Uint32 k, Uint32 len)
 232 {
 233         Uint32 i, current;
 234         t_BlockSortDecode *in;
 235
 236         in = malloc(sizeof(t_BlockSortDecode)*len);
 237
 238         for(i=0; i<len; i++) {
 239                 in[i].c = c[i];
 240                 in[i].sig = i;
 241         }
 242
 243         qsort(in, len, sizeof(t_BlockSortDecode), _compare);
 244
 245         current = k;
 246         i=0;
 247         do {
 248                 dst[i++] = in[current].c;
 249                 current = in[current].sig;
 250         } while (current != k);
 251         free(in);
 252 }
 253
 254 t_BlockSort *bs_create(Uint32 len)
 255 {
 256         t_BlockSort *tmp;
 257
 258         tmp = malloc(sizeof(t_BlockSort));
 259         tmp->array = malloc(sizeof(t_BlockSortData)*len);
 260         tmp->len = len;
 261
 262         return tmp;
 263 }
 264
 265
 266 void bs_destroy(t_BlockSort *bs)
 267 {
 268         free(bs->array);
 269         free(bs);
 270 }
 271
 272
 273 void bs_EOL(char *data, Uint32 pagesize, Uint32 *j)
 274 {
 275         /* Trato de agregar 4 espacios antes y 4 despues de un \n */
 276         int i = (*j);
 277
 278         /* Verifico poder hacerlo */
 279         if ((i+9) >= pagesize) return; /* No pude! */
 280
 281         data[i++] = ' ';
 282         data[i++] = ' ';
 283         data[i++] = ' ';
 284         data[i++] = ' ';
 285         data[i++] = ' ';
 286         data[i++] = ' ';
 287         data[i++] = ' ';
 288         data[i++] = ' ';
 289         data[i++] = ' ';
 290
 291         (*j) += 9;
 292 }
 293
 294 char check_hint(char c)
 295 {
 296         static int current_pos = 0;
 297         char hits = 0;
 298         int i;
 299         char hit_pos = -1;
 300
 301         for(i=0; i<DICT_SIZE; i++) {
 302                 /* Veo de no pasarme */
 303                 if (current_pos < dic[i].len) {
 304                         /* Si la palabra tiene suficientes hints */
 305                         if (dic[i].hits == current_pos) {
 306                                 if (dic[i].word[current_pos] == c) {
 307                                         dic[i].hits++;
 308                                         hits++;
 309                                         if (dic[i].hits == dic[i].len) {
 310                                                 hit_pos = i;
 311                                         }
 312                                 } else {
 313                                         /* Esta palabra ya no tiene posibilidades */
 314                                         dic[i].hits = 0;
 315                                 }
 316                         }
 317                 } else {
 318                         dic[i].hits = 0;
 319                 }
 320         }
 321
 322         current_pos++;
 323
 324         if (hits == 0) {
 325                 current_pos = 0;
 326                 return -1; /* No hay hints ! */
 327         }
 328         if (hits > 1) return -2; /* Tengo mas de 1 hint! */
 329
 330         if (hit_pos == -1) return -2; /* Tengo 1 solo hit pero no es completo */
 331
 332         /* Tengo 1 solo hint !!!! */
 333         current_pos = 0;
 334         return hit_pos;
 335 }
 336
 337 void bs_clean_dic()
 338 {
 339         int i;
 340         for(i=0; i<DICT_SIZE; i++) {
 341                 dic[i].hits = 0;
 342         }
 343 }
 344
 345 int bs_readblock(FILE *fp, char *data, Uint32 pagesize, int usar_dic)
 346 {
 347         Uint32 i=0;
 348         char c, hint;
 349         char buffer[100];
 350         int j, buffer_pos=0;
 351
 352         while ((!feof(fp)) && ((i+buffer_pos) < pagesize)) {
 353                 c = fgetc(fp);
 354
 355                 if (usar_dic != 1) {
 356                         data[i++] = c;
 357                         continue;
 358                 }
 359
 360                 if (c == ESCAPE_CHARACTER) {
 361                         data[i++] = c;
 362                         data[i++] = 0xF;
 363                         bs_clean_dic();
 364                         continue;
 365                 }
 366
 367                 hint = check_hint(c);
 368
 369                 switch (hint) {
 370                         case -1:
 371                                 /* No hay hints, vacio el buffer y luego saco el c */
 372                                 for(j=0; j<buffer_pos; j++)
 373                                         data[i++] = buffer[j];
 374
 375                                 data[i++] = c;
 376                                 buffer_pos = 0;
 377                                 bs_clean_dic();
 378                         break;
 379                         case -2:
 380                                 /* Tengo mas de 1 hit, tengo posibilidades, guardo este char en el buffer */
 381                                 buffer[buffer_pos++] = c;
 382                         break;
 383                         default:
 384                                 /* Me retornaron un numero positivo!!, eso quiere decir que ese numero
 385                                  * es la posicion dentro del diccionario de la palabra que puedo reemplazar
 386                                  */
 387                                 data[i++] = ESCAPE_CHARACTER;
 388                                 /* Trato de hacer que el caracter sea comun en textos, para que no me rompa
 389                                  * la localidad
 390                                  */
 391                                 data[i++] = hint;
 392                                 bs_clean_dic();
 393                                 /* El buffer no lo necesito mas */
 394                                 buffer_pos = 0;
 395                 }
 396         }
 397
 398         for(j=0; j<buffer_pos; j++)
 399                 data[i++] = buffer[j];
 400         /* Saco un EOF que lee de mas */
 401         if (i<pagesize) i--;
 402
 403         return i;
 404 }
 405
 406 char *bs_finalblock(char *data, Uint32 len, Uint32 *new_size)
 407 {
 408         Uint32 size=0, i, j;
 409         char *out = NULL;
 410         unsigned char tmp;
 411
 412         for(i=0; i < len; i++) {
 413                 if (data[i] == ESCAPE_CHARACTER) {
 414                         i++;
 415                         if (data[i] == 0xF) {
 416                                 /* Solo tengo que dejar el ESCAPE_CHARACTER */
 417                                 size++;
 418                                 out = (char *)realloc(out, size*sizeof(char));
 419                                 out[size-1] = data[i];
 420                                 /* Salteo ese caracter */
 421                         } else {
 422                                 /* Va una palabra!! */
 423                                 tmp = data[i];
 424                                 size += dic[(size_t)tmp].len;
 425                                 out = (char *)realloc(out, size*sizeof(char));
 426                                 for(j=0; j<dic[(size_t)tmp].len; j++) {
 427                                         out[size-dic[(size_t)tmp].len+j] = dic[(size_t)tmp].word[j];
 428                                 }
 429                                 /* Salteo el caracter siguiente al de escape */
 430                         }
 431                 } else {
 432                         size++;
 433                         out = (char *)realloc(out, size*sizeof(char));
 434                         out[size-1] = data[i];
 435                 }
 436         }
 437
 438         (*new_size) = size;
 439         return out;
 440 }
 441