Cambios minimos, no se si entraran en la impresion :(

[z.facultad/75.06/jacu.git] / src / blocksorting / bs.c
diff --git a/src/blocksorting/bs.c b/src/blocksorting/bs.c

index 5c9be2a74934e71769595f960617569ec41f25a6..c6ad0917f60a7bf884b9fceba1ab1183973d3e4a 100644 (file)
--- a/src/blocksorting/bs.c
+++ b/src/blocksorting/bs.c
@@ -1,12 +1,141 @@
+/*----------------------------------------------------------------------------
+ *                   jacu - Just Another Compression Utility
+ *----------------------------------------------------------------------------
+ * This file is part of jacu.
+ *
+ * jacu is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU General Public License as published by the Free Software
+ * Foundation; either version 2 of the License, or (at your option) any later
+ * version.
+ *
+ * jacu is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with jacu; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place, Suite 330, Boston, MA  02111-1307  USA
+ *----------------------------------------------------------------------------
+ */
+
  
  #include "bs.h"
  #include <stdlib.h>
+#include <ctype.h>
  
  /* Block Sorting Optimizado en memoria! */
  
+typedef struct _dict_ {
+       char *word;
+       int len;
+       int hits;
+} t_Diccionario;
+
+#define PALABRA(a) {a, sizeof(a)-1, 0}
+
+#define DICT_SIZE sizeof(dic)/sizeof(t_Diccionario)
+
+/* ASCII Numero 3 como caracter de escape */
+#define ESCAPE_CHARACTER 0x3
+
+/* Diccionario */
+t_Diccionario dic[] = {
+       /* Preposiciones (no todas) */
+       PALABRA(" ante"),
+       PALABRA(" bajo"),
+       PALABRA(" cabe"),
+       PALABRA(" con"),
+       PALABRA(" contra"),
+       /* PALABRA(" de"),  XXX No se si conviene, solo ahorra 1 byte y puede romper localidad */
+       PALABRA(" desde"),
+       PALABRA(" hasta"),
+       PALABRA(" hacia"),
+       PALABRA(" para"),
+       PALABRA(" por"),
+       PALABRA(" según"),
+       PALABRA(" sin"),
+       PALABRA(" sobre"),
+       PALABRA(" tras"),
+       PALABRA(" mediante"),
+       PALABRA(" durante"),
+
+       /* Articulos */
+       PALABRA(" el "),
+       PALABRA(" la "),
+       PALABRA(" ella "),
+       PALABRA(" del "),
+       PALABRA(" los "),
+       PALABRA(" las "),
+       PALABRA(" lo "),
+       PALABRA(" que"),
+       PALABRA(" una "),
+       PALABRA(" también"),
+       PALABRA(" tambien"),
+       PALABRA(" cuando"),
+       PALABRA(" pero"),
+       PALABRA(" todo"),
+       /* Otras de test */
+       PALABRA(" si "),
+       PALABRA(" mas "),
+       PALABRA(" más "),
+       PALABRA(" porque"),
+       PALABRA(" entonces"),
+       PALABRA(" siempre"),
+       PALABRA(" segundo"),
+       PALABRA(" programa"),
+       PALABRA(" existe"),
+       PALABRA(" recien"),
+       PALABRA(" máximo"),
+       PALABRA(" mínimo"),
+       PALABRA(" casi"),
+       PALABRA(" sección"),
+       PALABRA(" informe"),
+       PALABRA(" Informe"),
+       PALABRA(" acción"),
+       PALABRA(" perdedor"),
+       PALABRA(" existencia"),
+       PALABRA(" aire"),
+       PALABRA(" árbol"),
+       PALABRA(" prueba"),
+       PALABRA(" muestra"),
+       PALABRA(" animal"),
+       PALABRA(" suerte"),
+       PALABRA(" portador"),
+       PALABRA(" molesto"),
+       PALABRA(" cielo"),
+       PALABRA(" impulso"),
+       PALABRA(" alcohol"),
+       PALABRA(" seguido"),
+       PALABRA(" permiso"),
+       PALABRA(" cuarto"),
+       PALABRA(" brillante"),
+       PALABRA(" tener"),
+       PALABRA(" ningún"),
+       PALABRA(" fácil"),
+       /* Algunas cosas de ingles para mejorar otros textos */
+       PALABRA(" as "),
+       PALABRA(" will "),
+       PALABRA(" with "),
+       PALABRA(" this "),
+       PALABRA(" program "),
+       PALABRA(" are "),
+       PALABRA(" The "),
+       PALABRA(" it "),
+       PALABRA(" which "),
+       PALABRA(" for "),
+       PALABRA(" be "),
+       PALABRA(" that "),
+       PALABRA(" is "),
+       PALABRA(" and "),
+       PALABRA(" to "),
+       PALABRA(" of "),
+       PALABRA(" the ")
+};
+
  typedef struct _bs_decode_t_ {
         char c;
-       Uint32 pos;
+       Uint32 sig;
  } t_BlockSortDecode;
  
  char es_menor(char *data, t_BlockSort *bs, int i, int j);
@@ -17,6 +146,11 @@ int _compare(const void *d1, const void *d2) {
         s1 = (t_BlockSortDecode *)d1;
         s2 = (t_BlockSortDecode *)d2;
  
+       /* Hack si es el mismo caracter, dejo como mejor la de menor sig */
+       if (s1->c == s2->c) {
+               return (s1->sig - s2->sig);
+       }
+
         return (s1->c - s2->c);
  }
  
@@ -26,7 +160,7 @@ int __compare(const void *d1, const void *d2) {
         s1 = (t_BlockSortData *)d1;
         s2 = (t_BlockSortData *)d2;
  
-       return es_menor(s1->bs->data, s1->bs, s1->pos_inicial, s2->pos_inicial);
+       return  es_menor(s1->bs->data, s1->bs, s1->pos_inicial, s2->pos_inicial);
  }
  
  char es_menor(char *data, t_BlockSort *bs, int i, int j)
@@ -61,26 +195,16 @@ void ordenar_array(char *data, t_BlockSort *bs)
         qsort(bs->array, bs->len, sizeof(t_BlockSortData), __compare);
  }
  
-void print_(char *data, Uint32 pos, Uint32 len)
-{
-       Uint32 i;
-
-       for(i=0; i<len; i++)
-               printf("%c", data[(pos+i)%len]);
-       printf("\n");
-}
-
  int generar_salida(char *data, t_BlockSort *bs, char *salida)
  {
         Uint32 i, k;
         char *out;
  
-       /* Dejo lugar para guardar el k y el tamaño de este bloque */
-       out = salida + sizeof(Uint32)*2;
+       /* Dejo lugar para guardar el K */
+       out = salida + sizeof(Uint32);
  
         k=-1;
         for(i=0; i<bs->len; i++) {
-               /* print_(data, bs->array[i].pos_inicial, bs->len); */
                 out[i] = data[bs->array[i].pos_final];
                 if (bs->array[i].ord == 1) k = i;
         }
@@ -99,9 +223,8 @@ void bs_solve(char *in, char *out, t_BlockSort *bs, Uint32 *k, Uint32 leido)
         ordenar_array(in, bs);
         (*k) = generar_salida(in, bs, out);
  
-       /* Guardo el k y el tamaño en el array */
-       memcpy(out, &leido, sizeof(Uint32));
-       memcpy(out+sizeof(Uint32), k, sizeof(Uint32));
+       /* Guardo el k y el tamaño en el array */       
+       memcpy(out, k, sizeof(Uint32));
         bs->len = l;
  }
  
@@ -114,7 +237,7 @@ void bs_restore(char *dst, char *c, Uint32 k, Uint32 len)
  
         for(i=0; i<len; i++) {
                 in[i].c = c[i];
-               in[i].pos = i;
+               in[i].sig = i;
         }
  
         qsort(in, len, sizeof(t_BlockSortDecode), _compare);
@@ -123,7 +246,7 @@ void bs_restore(char *dst, char *c, Uint32 k, Uint32 len)
         i=0;
         do {
                 dst[i++] = in[current].c;
-               current = in[current].pos;
+               current = in[current].sig;
         } while (current != k);
         free(in);
  }
@@ -146,3 +269,173 @@ void bs_destroy(t_BlockSort *bs)
         free(bs);
  }
  
+
+void bs_EOL(char *data, Uint32 pagesize, Uint32 *j)
+{
+       /* Trato de agregar 4 espacios antes y 4 despues de un \n */
+       int i = (*j);
+
+       /* Verifico poder hacerlo */
+       if ((i+9) >= pagesize) return; /* No pude! */
+       
+       data[i++] = ' ';
+       data[i++] = ' ';
+       data[i++] = ' ';
+       data[i++] = ' ';
+       data[i++] = ' ';
+       data[i++] = ' ';
+       data[i++] = ' ';
+       data[i++] = ' ';
+       data[i++] = ' ';
+
+       (*j) += 9;
+}
+
+char check_hint(char c)
+{
+       static int current_pos = 0;
+       char hits = 0;
+       int i;
+       char hit_pos = -1;
+
+       for(i=0; i<DICT_SIZE; i++) {
+               /* Veo de no pasarme */
+               if (current_pos < dic[i].len) {
+                       /* Si la palabra tiene suficientes hints */
+                       if (dic[i].hits == current_pos) {
+                               if (dic[i].word[current_pos] == c) {
+                                       dic[i].hits++;
+                                       hits++;
+                                       if (dic[i].hits == dic[i].len) {
+                                               hit_pos = i;
+                                       }
+                               } else {
+                                       /* Esta palabra ya no tiene posibilidades */
+                                       dic[i].hits = 0;
+                               }
+                       }
+               } else {
+                       dic[i].hits = 0;
+               }
+       }
+
+       current_pos++;
+
+       if (hits == 0) {
+               current_pos = 0;
+               return -1; /* No hay hints ! */
+       }
+       if (hits > 1) return -2; /* Tengo mas de 1 hint! */
+
+       if (hit_pos == -1) return -2; /* Tengo 1 solo hit pero no es completo */
+
+       /* Tengo 1 solo hint !!!! */
+       current_pos = 0;
+       return hit_pos;
+}
+
+void bs_clean_dic()
+{
+       int i;
+       for(i=0; i<DICT_SIZE; i++) {
+               dic[i].hits = 0;
+       }
+}
+
+int bs_readblock(FILE *fp, char *data, Uint32 pagesize, int usar_dic)
+{
+       Uint32 i=0;
+       char c, hint;
+       char buffer[100];
+       int j, buffer_pos=0;
+
+       while ((!feof(fp)) && ((i+buffer_pos) < pagesize)) {
+               c = fgetc(fp);
+               
+               if (usar_dic != 1) {
+                       data[i++] = c;
+                       continue;
+               }
+
+               if (c == ESCAPE_CHARACTER) {
+                       data[i++] = c;
+                       data[i++] = 0xF;
+                       bs_clean_dic();
+                       continue;
+               }
+
+               hint = check_hint(c);
+
+               switch (hint) {
+                       case -1:
+                               /* No hay hints, vacio el buffer y luego saco el c */
+                               for(j=0; j<buffer_pos; j++)
+                                       data[i++] = buffer[j];
+
+                               data[i++] = c;
+                               buffer_pos = 0;
+                               bs_clean_dic();
+                       break;
+                       case -2:
+                               /* Tengo mas de 1 hit, tengo posibilidades, guardo este char en el buffer */
+                               buffer[buffer_pos++] = c;
+                       break;
+                       default:
+                               /* Me retornaron un numero positivo!!, eso quiere decir que ese numero
+                                * es la posicion dentro del diccionario de la palabra que puedo reemplazar
+                                */
+                               data[i++] = ESCAPE_CHARACTER;
+                               /* Trato de hacer que el caracter sea comun en textos, para que no me rompa
+                                * la localidad
+                                */
+                               data[i++] = hint;
+                               bs_clean_dic();
+                               /* El buffer no lo necesito mas */
+                               buffer_pos = 0;
+               }
+       }
+
+       for(j=0; j<buffer_pos; j++)
+               data[i++] = buffer[j];
+       /* Saco un EOF que lee de mas */
+       if (i<pagesize) i--;
+
+       return i;
+}
+
+char *bs_finalblock(char *data, Uint32 len, Uint32 *new_size)
+{
+       Uint32 size=0, i, j;
+       char *out = NULL;
+       unsigned char tmp;
+
+       for(i=0; i < len; i++) {
+               if (data[i] == ESCAPE_CHARACTER) {
+                       i++;
+                       if (data[i] == 0xF) {
+                               /* Solo tengo que dejar el ESCAPE_CHARACTER */
+                               size++;
+                               out = (char *)realloc(out, size*sizeof(char));
+                               out[size-1] = data[i];
+                               /* Salteo ese caracter */
+                       } else {
+                               /* Va una palabra!! */
+                               tmp = data[i];
+                               size += dic[(size_t)tmp].len;
+                               out = (char *)realloc(out, size*sizeof(char));
+                               for(j=0; j<dic[(size_t)tmp].len; j++) {
+                                       out[size-dic[(size_t)tmp].len+j] = dic[(size_t)tmp].word[j];
+                               }
+                               /* Salteo el caracter siguiente al de escape */
+                       }
+               } else {
+                       size++;
+                       out = (char *)realloc(out, size*sizeof(char));
+                       out[size-1] = data[i];
+               }
+       }
+
+       (*new_size) = size;
+       return out;
+}
+