mbyte.c

   1 /*
   2  * Copyright (C) 2000 Edmund Grimley Evans <edmundo@rano.org>
   3  *
   4  *     This program is free software; you can redistribute it and/or modify
   5  *     it under the terms of the GNU General Public License as published by
   6  *     the Free Software Foundation; either version 2 of the License, or
   7  *     (at your option) any later version.
   8  *
   9  *     This program is distributed in the hope that it will be useful,
  10  *     but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  *     GNU General Public License for more details.
  13  *
  14  *     You should have received a copy of the GNU General Public License
  15  *     along with this program; if not, write to the Free Software
  16  *     Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  17  */
  18
  19 /*
  20  * Japanese support by TAKIZAWA Takashi <taki@luna.email.ne.jp>.
  21  */
  22
  23 #if HAVE_CONFIG_H
  24 # include "config.h"
  25 #endif
  26
  27 #include "mutt.h"
  28 #include "mbyte.h"
  29 #include "charset.h"
  30
  31 #include <errno.h>
  32
  33 #include <ctype.h>
  34
  35 #ifndef EILSEQ
  36 #define EILSEQ EINVAL
  37 #endif
  38
  39 int Charset_is_utf8 = 0;
  40 #ifndef HAVE_WC_FUNCS
  41 static int charset_is_ja = 0;
  42 static iconv_t charset_to_utf8 = (iconv_t)(-1);
  43 static iconv_t charset_from_utf8 = (iconv_t)(-1);
  44 #endif
  45
  46 void mutt_set_charset (char *charset)
  47 {
  48   char buffer[STRING];
  49
  50   mutt_canonical_charset (buffer, sizeof (buffer), charset);
  51
  52   Charset_is_utf8 = 0;
  53 #ifndef HAVE_WC_FUNCS
  54   charset_is_ja = 0;
  55   if (charset_to_utf8 != (iconv_t)(-1))
  56   {
  57     iconv_close (charset_to_utf8);
  58     charset_to_utf8 = (iconv_t)(-1);
  59   }
  60   if (charset_from_utf8 != (iconv_t)(-1))
  61   {
  62     iconv_close (charset_from_utf8);
  63     charset_from_utf8 = (iconv_t)(-1);
  64   }
  65 #endif
  66
  67   if (mutt_is_utf8 (buffer))
  68     Charset_is_utf8 = 1;
  69 #ifndef HAVE_WC_FUNCS
  70   else if (!ascii_strcasecmp(buffer, "euc-jp") || !ascii_strcasecmp(buffer, "shift_jis")
  71         || !ascii_strcasecmp(buffer, "cp932") || !ascii_strcasecmp(buffer, "eucJP-ms"))
  72   {
  73     charset_is_ja = 1;
  74
  75     /* Note flags=0 to skip charset-hooks: User masters the $charset
  76      * name, and we are sure of our "utf-8" constant. So there is no
  77      * possibility of wrong name that we would want to try to correct
  78      * with a charset-hook. Or rather: If $charset was wrong, we would
  79      * want to try to correct... $charset directly.
  80      */
  81     charset_to_utf8 = mutt_iconv_open ("utf-8", charset, 0);
  82     charset_from_utf8 = mutt_iconv_open (charset, "utf-8", 0);
  83   }
  84 #endif
  85
  86 #if defined(HAVE_BIND_TEXTDOMAIN_CODESET) && defined(ENABLE_NLS)
  87   bind_textdomain_codeset(PACKAGE, buffer);
  88 #endif
  89 }
  90
  91 #ifndef HAVE_WC_FUNCS
  92
  93 /*
  94  * For systems that don't have them, we provide here our own
  95  * implementations of wcrtomb(), mbrtowc(), iswprint() and wcwidth().
  96  * Instead of using the locale, as these functions normally would,
  97  * we use Mutt's Charset variable. We support 3 types of charset:
  98  * (1) For 8-bit charsets, wchar_t uses the same encoding as char.
  99  * (2) For UTF-8, wchar_t uses UCS.
 100  * (3) For stateless Japanese encodings, we use UCS and convert
 101  *     via UTF-8 using iconv.
 102  * Unfortunately, we can't handle non-stateless encodings.
 103  */
 104
 105 static size_t wcrtomb_iconv (char *s, wchar_t wc, iconv_t cd)
 106 {
 107   char buf[MB_LEN_MAX+1];
 108   ICONV_CONST char *ib;
 109   char *ob;
 110   size_t ibl, obl, r;
 111
 112   if (s)
 113   {
 114     ibl = mutt_wctoutf8 (buf, wc, sizeof (buf));
 115     if (ibl == (size_t)(-1))
 116       return (size_t)(-1);
 117     ib = buf;
 118     ob = s;
 119     obl = MB_LEN_MAX;
 120     r = iconv (cd, &ib, &ibl, &ob, &obl);
 121   }
 122   else
 123   {
 124     ib = "";
 125     ibl = 1;
 126     ob = buf;
 127     obl = sizeof (buf);
 128     r = iconv (cd, &ib, &ibl, &ob, &obl);
 129   }
 130   return ob - s;
 131 }
 132
 133 size_t wcrtomb (char *s, wchar_t wc, mbstate_t *ps)
 134 {
 135   /* We only handle stateless encodings, so we can ignore ps. */
 136
 137   if (Charset_is_utf8)
 138     return mutt_wctoutf8 (s, wc, MB_LEN_MAX);
 139   else if (charset_from_utf8 != (iconv_t)(-1))
 140     return wcrtomb_iconv (s, wc, charset_from_utf8);
 141   else
 142   {
 143     if (!s)
 144       return 1;
 145     if (wc < 0x100)
 146     {
 147       *s = wc;
 148       return 1;
 149     }
 150     errno = EILSEQ;
 151     return (size_t)(-1);
 152   }
 153 }
 154
 155 size_t mbrtowc_iconv (wchar_t *pwc, const char *s, size_t n,
 156                       mbstate_t *ps, iconv_t cd)
 157 {
 158   static mbstate_t mbstate;
 159   ICONV_CONST char *ib, *ibmax;
 160   char *ob, *t;
 161   size_t ibl, obl, k, r;
 162   char bufi[8], bufo[6];
 163
 164   if (!n)
 165     return (size_t)(-2);
 166
 167   t = memchr (ps, 0, sizeof (*ps));
 168   k = t ? (t - (char *)ps) : sizeof (*ps);
 169   if (k > sizeof (bufi))
 170     k = 0;
 171   if (k)
 172   {
 173     /* use the buffer for input */
 174     memcpy (bufi, ps, k);
 175     ib = bufi;
 176     ibmax = bufi + (k + n < sizeof (bufi) ? k + n : sizeof (bufi));
 177     memcpy (bufi + k, s, ibmax - bufi - k);
 178   }
 179   else
 180   {
 181     /* use the real input */
 182     ib = (ICONV_CONST char*) s;
 183     ibmax = (ICONV_CONST char*) s + n;
 184   }
 185
 186   ob = bufo;
 187   obl = sizeof (bufo);
 188   ibl = 1;
 189
 190   for (;;)
 191   {
 192     r = iconv (cd, &ib, &ibl, &ob, &obl);
 193     if (ob > bufo && (!k || ib > bufi + k))
 194     {
 195       /* we have a character */
 196       memset (ps, 0, sizeof (*ps));
 197       utf8rtowc (pwc, bufo, ob - bufo, &mbstate);
 198       return (pwc && *pwc) ? (ib - (k ? bufi + k : s)) : 0;
 199     }
 200     else if (!r || (r == (size_t)(-1) && errno == EINVAL))
 201     {
 202       if (ib + ibl < ibmax)
 203         /* try using more input */
 204         ++ibl;
 205       else if (k && ib > bufi + k && bufi + k + n > ibmax)
 206       {
 207         /* switch to using real input */
 208         ib = (ICONV_CONST char*) s + (ib - bufi - k);
 209         ibmax = (ICONV_CONST char*) s + n;
 210         k = 0;
 211         ++ibl;
 212       }
 213       else
 214       {
 215         /* save the state and give up */
 216         memset (ps, 0, sizeof (*ps));
 217         if (ibl <= sizeof (mbstate_t)) /* need extra condition here! */
 218           memcpy (ps, ib, ibl);
 219         return (size_t)(-2);
 220       }
 221     }
 222     else
 223     {
 224       /* bad input */
 225       errno = EILSEQ;
 226       return (size_t)(-1);
 227     }
 228   }
 229 }
 230
 231 size_t mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
 232 {
 233   static mbstate_t mbstate;
 234
 235   if (!ps)
 236     ps = &mbstate;
 237
 238   if (Charset_is_utf8)
 239     return utf8rtowc (pwc, s, n, ps);
 240   else if (charset_to_utf8 != (iconv_t)(-1))
 241     return mbrtowc_iconv (pwc, s, n, ps, charset_to_utf8);
 242   else
 243   {
 244     if (!s)
 245     {
 246       memset(ps, 0, sizeof(*ps));
 247       return 0;
 248     }
 249     if (!n)
 250       return (size_t)-2;
 251     if (pwc)
 252       *pwc = (wchar_t)(unsigned char)*s;
 253     return (*s != 0);
 254   }
 255 }
 256
 257 int iswprint (wint_t wc)
 258 {
 259   if (Charset_is_utf8 || charset_is_ja)
 260     return ((0x20 <= wc && wc < 0x7f) || 0xa0 <= wc);
 261   else
 262     return (0 <= wc && wc < 256) ? IsPrint (wc) : 0;
 263 }
 264
 265 int iswspace (wint_t wc)
 266 {
 267   if (Charset_is_utf8 || charset_is_ja)
 268     return (9 <= wc && wc <= 13) || wc == 32;
 269   else
 270     return (0 <= wc && wc < 256) ? isspace (wc) : 0;
 271 }
 272
 273 static wint_t towupper_ucs (wint_t x)
 274 {
 275   /* Only works for x < 0x130 */
 276   if ((0x60 < x && x < 0x7b) || (0xe0 <= x && x < 0xff && x != 0xf7))
 277     return x - 32;
 278   else if (0x100 <= x && x < 0x130)
 279     return x & ~1;
 280   else if (x == 0xb5)
 281     return 0x39c;
 282   else if (x == 0xff)
 283     return 0x178;
 284   else
 285     return x;
 286 }
 287
 288 static int iswupper_ucs (wint_t x)
 289 {
 290   /* Only works for x < 0x130 */
 291   if ((0x60 < x && x < 0x7b) || (0xe0 <= x && x < 0xff && x != 0xf7))
 292     return 0;
 293   else if ((0x40 < x && x < 0x5b) || (0xbf < x && x < 0xde))
 294     return 1;
 295   else if (0x100 <= x && x < 0x130)
 296     return 1;
 297   else if (x == 0xb5)
 298     return 1;
 299   else if (x == 0xff)
 300     return 0;
 301   else
 302     return 0;
 303 }
 304
 305 static wint_t towlower_ucs (wint_t x)
 306 {
 307   /* Only works for x < 0x130 */
 308   if ((0x40 < x && x < 0x5b) || (0xc0 <= x && x < 0xdf && x != 0xd7))
 309     return x + 32;
 310   else if (0x100 <= x && x < 0x130)
 311     return x | 1;
 312   else
 313     return x;
 314 }
 315
 316 static int iswalnum_ucs (wint_t wc)
 317 {
 318   /* Only works for x < 0x220 */
 319   if (wc >= 0x100)
 320     return 1;
 321   else if (wc < 0x30)
 322     return 0;
 323   else if (wc < 0x3a)
 324     return 1;
 325   else if (wc < 0xa0)
 326     return (0x40 < (wc & ~0x20) && (wc & ~0x20) < 0x5b);
 327   else if (wc < 0xc0)
 328     return (wc == 0xaa || wc == 0xb5 || wc == 0xba);
 329   else
 330     return !(wc == 0xd7 || wc == 0xf7);
 331 }
 332
 333 static int iswalpha_ucs (wint_t wc)
 334 {
 335   /* Only works for x < 0x220 */
 336   if (wc >= 0x100)
 337     return 1;
 338   else if (wc < 0x3a)
 339     return 0;
 340   else if (wc < 0xa0)
 341     return (0x40 < (wc & ~0x20) && (wc & ~0x20) < 0x5b);
 342   else if (wc < 0xc0)
 343     return (wc == 0xaa || wc == 0xb5 || wc == 0xba);
 344   else
 345     return !(wc == 0xd7 || wc == 0xf7);
 346 }
 347
 348 wint_t towupper (wint_t wc)
 349 {
 350   if (Charset_is_utf8 || charset_is_ja)
 351     return towupper_ucs (wc);
 352   else
 353     return (0 <= wc && wc < 256) ? toupper (wc) : wc;
 354 }
 355
 356 wint_t towlower (wint_t wc)
 357 {
 358   if (Charset_is_utf8 || charset_is_ja)
 359     return towlower_ucs (wc);
 360   else
 361     return (0 <= wc && wc < 256) ? tolower (wc) : wc;
 362 }
 363
 364 int iswalnum (wint_t wc)
 365 {
 366   if (Charset_is_utf8 || charset_is_ja)
 367     return iswalnum_ucs (wc);
 368   else
 369     return (0 <= wc && wc < 256) ? isalnum (wc) : 0;
 370 }
 371
 372 int iswalpha (wint_t wc)
 373 {
 374   if (Charset_is_utf8 || charset_is_ja)
 375     return iswalpha_ucs (wc);
 376   else
 377     return (0 <= wc && wc < 256) ? isalpha (wc) : 0;
 378 }
 379
 380 int iswupper (wint_t wc)
 381 {
 382   if (Charset_is_utf8 || charset_is_ja)
 383     return iswupper_ucs (wc);
 384   else
 385     return (0 <= wc && wc < 256) ? isupper (wc) : 0;
 386 }
 387
 388 /*
 389  * l10n for Japanese:
 390  *   Symbols, Greek and Cyrillic in JIS X 0208, Japanese Kanji
 391  *   Character Set, have a column width of 2.
 392  */
 393 int wcwidth_ja (wchar_t ucs)
 394 {
 395   if (ucs >= 0x3021)
 396     return -1; /* continue with the normal check */
 397   /* a rough range for quick check */
 398   if ((ucs >= 0x00a1 && ucs <= 0x00fe) || /* Latin-1 Supplement */
 399       (ucs >= 0x0391 && ucs <= 0x0451) || /* Greek and Cyrillic */
 400       (ucs >= 0x2010 && ucs <= 0x266f) || /* Symbols */
 401       (ucs >= 0x3000 && ucs <= 0x3020))   /* CJK Symbols and Punctuation */
 402     return 2;
 403   else
 404     return -1;
 405 }
 406
 407 int wcwidth_ucs(wchar_t ucs);
 408
 409 int wcwidth (wchar_t wc)
 410 {
 411   if (!Charset_is_utf8)
 412   {
 413     if (!charset_is_ja)
 414     {
 415       /* 8-bit case */
 416       if (!wc)
 417         return 0;
 418       else if ((0 <= wc && wc < 256) && IsPrint (wc))
 419         return 1;
 420       else
 421         return -1;
 422     }
 423     else
 424     {
 425       /* Japanese */
 426       int k = wcwidth_ja (wc);
 427       if (k != -1)
 428         return k;
 429     }
 430   }
 431   return wcwidth_ucs (wc);
 432 }
 433
 434 size_t utf8rtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *_ps)
 435 {
 436   static wchar_t mbstate;
 437   wchar_t *ps = (wchar_t *)_ps;
 438   size_t k = 1;
 439   unsigned char c;
 440   wchar_t wc;
 441   int count;
 442
 443   if (!ps)
 444     ps = &mbstate;
 445
 446   if (!s)
 447   {
 448     *ps = 0;
 449     return 0;
 450   }
 451   if (!n)
 452     return (size_t)-2;
 453
 454   if (!*ps)
 455   {
 456     c = (unsigned char)*s;
 457     if (c < 0x80)
 458     {
 459       if (pwc)
 460         *pwc = c;
 461       return (c != 0);
 462     }
 463     else if (c < 0xc2)
 464     {
 465       errno = EILSEQ;
 466       return (size_t)-1;
 467     }
 468     else if (c < 0xe0)
 469       wc = ((c & 0x1f) << 6) + (count = 0);
 470     else if (c < 0xf0)
 471       wc = ((c & 0x0f) << 12) + (count = 1);
 472     else if (c < 0xf8)
 473       wc = ((c & 0x07) << 18) + (count = 2);
 474     else if (c < 0xfc)
 475       wc = ((c & 0x03) << 24) + (count = 3);
 476     else if (c < 0xfe)
 477       wc = ((c & 0x01) << 30) + (count = 4);
 478     else
 479     {
 480       errno = EILSEQ;
 481       return (size_t)-1;
 482     }
 483     ++s, --n, ++k;
 484   }
 485   else
 486   {
 487     wc = *ps & 0x7fffffff;
 488     count = wc & 7; /* if count > 4 it will be caught below */
 489   }
 490
 491   for (; n; ++s, --n, ++k)
 492   {
 493     c = (unsigned char)*s;
 494     if (0x80 <= c && c < 0xc0)
 495     {
 496       wc |= (c & 0x3f) << (6 * count);
 497       if (!count)
 498       {
 499         if (pwc)
 500           *pwc = wc;
 501         *ps = 0;
 502         return wc ? k : 0;
 503       }
 504       --count, --wc;
 505       if (!(wc >> (11+count*5)))
 506       {
 507         errno = count < 4 ? EILSEQ : EINVAL;
 508         return (size_t)-1;
 509       }
 510     }
 511     else
 512     {
 513       errno = EILSEQ;
 514       return (size_t)-1;
 515     }
 516   }
 517   *ps = wc;
 518   return (size_t)-2;
 519 }
 520
 521 #endif /* !HAVE_WC_FUNCS */
 522
 523 wchar_t replacement_char (void)
 524 {
 525   return Charset_is_utf8 ? 0xfffd : '?';
 526 }
 527
 528 int mutt_filter_unprintable (char **s)
 529 {
 530   BUFFER *b = NULL;
 531   wchar_t wc;
 532   size_t k, k2;
 533   char scratch[MB_LEN_MAX + 1];
 534   char *p = *s;
 535   mbstate_t mbstate1, mbstate2;
 536
 537   if (!(b = mutt_buffer_init (b)))
 538     return -1;
 539   memset (&mbstate1, 0, sizeof (mbstate1));
 540   memset (&mbstate2, 0, sizeof (mbstate2));
 541   for (; (k = mbrtowc (&wc, p, MB_LEN_MAX, &mbstate1)); p += k)
 542   {
 543     if (k == (size_t)(-1) || k == (size_t)(-2))
 544     {
 545       k = 1;
 546       memset (&mbstate1, 0, sizeof (mbstate1));
 547       wc = replacement_char();
 548     }
 549     if (!IsWPrint (wc))
 550       wc = '?';
 551     k2 = wcrtomb (scratch, wc, &mbstate2);
 552     scratch[k2] = '\0';
 553     mutt_buffer_addstr (b, scratch);
 554   }
 555   FREE (s);  /* __FREE_CHECKED__ */
 556   *s = b->data ? b->data : safe_calloc (1, 1);
 557   FREE (&b);
 558   return 0;
 559 }
 560