mbyte.c

   1 /*
   2  * Copyright (C) 2000 Edmund Grimley Evans <edmundo@rano.org>
   3  *
   4  *     This program is free software; you can redistribute it and/or modify
   5  *     it under the terms of the GNU General Public License as published by
   6  *     the Free Software Foundation; either version 2 of the License, or
   7  *     (at your option) any later version.
   8  *
   9  *     This program is distributed in the hope that it will be useful,
  10  *     but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  *     GNU General Public License for more details.
  13  *
  14  *     You should have received a copy of the GNU General Public License
  15  *     along with this program; if not, write to the Free Software
  16  *     Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  17  */
  18
  19 /*
  20  * Japanese support by TAKIZAWA Takashi <taki@luna.email.ne.jp>.
  21  */
  22
  23 #if HAVE_CONFIG_H
  24 # include "config.h"
  25 #endif
  26
  27 #include "mutt.h"
  28 #include "mbyte.h"
  29 #include "charset.h"
  30
  31 #include <errno.h>
  32
  33 #include <ctype.h>
  34
  35 #ifndef EILSEQ
  36 #define EILSEQ EINVAL
  37 #endif
  38
/* Set to 1 by mutt_set_charset() when the canonical charset name is
 * "utf-8", and to 0 otherwise. */
int Charset_is_utf8 = 0;
#ifndef HAVE_WC_FUNCS
/* Set to 1 by mutt_set_charset() when the charset is one of the stateless
 * Japanese encodings it recognises, and to 0 otherwise. */
static int charset_is_ja = 0;
/* iconv descriptors opened by mutt_set_charset() for the Japanese case;
 * (iconv_t)(-1) means "not open". */
static iconv_t charset_to_utf8 = (iconv_t)(-1);
static iconv_t charset_from_utf8 = (iconv_t)(-1);
#endif
  45
  46 void mutt_set_charset (char *charset)
  47 {
  48   char buffer[STRING];
  49
  50   mutt_canonical_charset (buffer, sizeof (buffer), charset);
  51
  52   Charset_is_utf8 = 0;
  53 #ifndef HAVE_WC_FUNCS
  54   charset_is_ja = 0;
  55   if (charset_to_utf8 != (iconv_t)(-1))
  56   {
  57     iconv_close (charset_to_utf8);
  58     charset_to_utf8 = (iconv_t)(-1);
  59   }
  60   if (charset_from_utf8 != (iconv_t)(-1))
  61   {
  62     iconv_close (charset_from_utf8);
  63     charset_from_utf8 = (iconv_t)(-1);
  64   }
  65 #endif
  66
  67   if (!strcmp(buffer, "utf-8"))
  68     Charset_is_utf8 = 1;
  69 #ifndef HAVE_WC_FUNCS
  70   else if (!ascii_strcasecmp(buffer, "euc-jp") || !ascii_strcasecmp(buffer, "shift_jis")
  71         || !ascii_strcasecmp(buffer, "cp932") || !ascii_strcasecmp(buffer, "eucJP-ms"))
  72   {
  73     charset_is_ja = 1;
  74
  75     /* Note flags=0 to skip charset-hooks: User masters the $charset
  76      * name, and we are sure of our "utf-8" constant. So there is no
  77      * possibility of wrong name that we would want to try to correct
  78      * with a charset-hook. Or rather: If $charset was wrong, we would
  79      * want to try to correct... $charset directly.
  80      */
  81     charset_to_utf8 = mutt_iconv_open ("utf-8", charset, 0);
  82     charset_from_utf8 = mutt_iconv_open (charset, "utf-8", 0);
  83   }
  84 #endif
  85
  86 #if defined(HAVE_BIND_TEXTDOMAIN_CODESET) && defined(ENABLE_NLS)
  87   bind_textdomain_codeset(PACKAGE, buffer);
  88 #endif
  89 }
  90
  91 #ifndef HAVE_WC_FUNCS
  92
  93 /*
  94  * For systems that don't have them, we provide here our own
  95  * implementations of wcrtomb(), mbrtowc(), iswprint() and wcwidth().
  96  * Instead of using the locale, as these functions normally would,
  97  * we use Mutt's Charset variable. We support 3 types of charset:
  98  * (1) For 8-bit charsets, wchar_t uses the same encoding as char.
  99  * (2) For UTF-8, wchar_t uses UCS.
 100  * (3) For stateless Japanese encodings, we use UCS and convert
 101  *     via UTF-8 using iconv.
 102  * Unfortunately, we can't handle non-stateless encodings.
 103  */
 104
 105 static size_t wcrtomb_iconv (char *s, wchar_t wc, iconv_t cd)
 106 {
 107   char buf[MB_LEN_MAX+1];
 108   ICONV_CONST char *ib;
 109   char *ob;
 110   size_t ibl, obl, r;
 111
 112   if (s)
 113   {
 114     ibl = mutt_wctoutf8 (buf, wc, sizeof (buf));
 115     if (ibl == (size_t)(-1))
 116       return (size_t)(-1);
 117     ib = buf;
 118     ob = s;
 119     obl = MB_LEN_MAX;
 120     r = iconv (cd, &ib, &ibl, &ob, &obl);
 121   }
 122   else
 123   {
 124     ib = "";
 125     ibl = 1;
 126     ob = buf;
 127     obl = sizeof (buf);
 128     r = iconv (cd, &ib, &ibl, &ob, &obl);
 129   }
 130   return ob - s;
 131 }
 132
 133 size_t wcrtomb (char *s, wchar_t wc, mbstate_t *ps)
 134 {
 135   /* We only handle stateless encodings, so we can ignore ps. */
 136
 137   if (Charset_is_utf8)
 138     return mutt_wctoutf8 (s, wc, MB_LEN_MAX);
 139   else if (charset_from_utf8 != (iconv_t)(-1))
 140     return wcrtomb_iconv (s, wc, charset_from_utf8);
 141   else
 142   {
 143     if (!s)
 144       return 1;
 145     if (wc < 0x100)
 146     {
 147       *s = wc;
 148       return 1;
 149     }
 150     errno = EILSEQ;
 151     return (size_t)(-1);
 152   }
 153 }
 154
 155 size_t mbrtowc_iconv (wchar_t *pwc, const char *s, size_t n,
 156                       mbstate_t *ps, iconv_t cd)
 157 {
 158   static mbstate_t mbstate;
 159   ICONV_CONST char *ib, *ibmax;
 160   char *ob, *t;
 161   size_t ibl, obl, k, r;
 162   char bufi[8], bufo[6];
 163
 164   if (!n)
 165     return (size_t)(-2);
 166
 167   t = memchr (ps, 0, sizeof (*ps));
 168   k = t ? (t - (char *)ps) : sizeof (*ps);
 169   if (k > sizeof (bufi))
 170     k = 0;
 171   if (k)
 172   {
 173     /* use the buffer for input */
 174     memcpy (bufi, ps, k);
 175     ib = bufi;
 176     ibmax = bufi + (k + n < sizeof (bufi) ? k + n : sizeof (bufi));
 177     memcpy (bufi + k, s, ibmax - bufi - k);
 178   }
 179   else
 180   {
 181     /* use the real input */
 182     ib = (ICONV_CONST char*) s;
 183     ibmax = (ICONV_CONST char*) s + n;
 184   }
 185
 186   ob = bufo;
 187   obl = sizeof (bufo);
 188   ibl = 1;
 189
 190   for (;;)
 191   {
 192     r = iconv (cd, &ib, &ibl, &ob, &obl);
 193     if (ob > bufo && (!k || ib > bufi + k))
 194     {
 195       /* we have a character */
 196       memset (ps, 0, sizeof (*ps));
 197       utf8rtowc (pwc, bufo, ob - bufo, &mbstate);
 198       return (pwc && *pwc) ? (ib - (k ? bufi + k : s)) : 0;
 199     }
 200     else if (!r || (r == (size_t)(-1) && errno == EINVAL))
 201     {
 202       if (ib + ibl < ibmax)
 203         /* try using more input */
 204         ++ibl;
 205       else if (k && ib > bufi + k && bufi + k + n > ibmax)
 206       {
 207         /* switch to using real input */
 208         ib = (ICONV_CONST char*) s + (ib - bufi - k);
 209         ibmax = (ICONV_CONST char*) s + n;
 210         k = 0;
 211         ++ibl;
 212       }
 213       else
 214       {
 215         /* save the state and give up */
 216         memset (ps, 0, sizeof (*ps));
 217         if (ibl <= sizeof (mbstate_t)) /* need extra condition here! */
 218           memcpy (ps, ib, ibl);
 219         return (size_t)(-2);
 220       }
 221     }
 222     else
 223     {
 224       /* bad input */
 225       errno = EILSEQ;
 226       return (size_t)(-1);
 227     }
 228   }
 229 }
 230
 231 size_t mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
 232 {
 233   static mbstate_t mbstate;
 234
 235   if (!ps)
 236     ps = &mbstate;
 237
 238   if (Charset_is_utf8)
 239     return utf8rtowc (pwc, s, n, ps);
 240   else if (charset_to_utf8 != (iconv_t)(-1))
 241     return mbrtowc_iconv (pwc, s, n, ps, charset_to_utf8);
 242   else
 243   {
 244     if (!s)
 245     {
 246       memset(ps, 0, sizeof(*ps));
 247       return 0;
 248     }
 249     if (!n)
 250       return (size_t)-2;
 251     if (pwc)
 252       *pwc = (wchar_t)(unsigned char)*s;
 253     return (*s != 0);
 254   }
 255 }
 256
 257 int iswprint (wint_t wc)
 258 {
 259   if (Charset_is_utf8 || charset_is_ja)
 260     return ((0x20 <= wc && wc < 0x7f) || 0xa0 <= wc);
 261   else
 262     return (0 <= wc && wc < 256) ? IsPrint (wc) : 0;
 263 }
 264
 265 int iswspace (wint_t wc)
 266 {
 267   if (Charset_is_utf8 || charset_is_ja)
 268     return (9 <= wc && wc <= 13) || wc == 32;
 269   else
 270     return (0 <= wc && wc < 256) ? isspace (wc) : 0;
 271 }
 272
 273 static wint_t towupper_ucs (wint_t x)
 274 {
 275   /* Only works for x < 0x130 */
 276   if ((0x60 < x && x < 0x7b) || (0xe0 <= x && x < 0xff && x != 0xf7))
 277     return x - 32;
 278   else if (0x100 <= x && x < 0x130)
 279     return x & ~1;
 280   else if (x == 0xb5)
 281     return 0x39c;
 282   else if (x == 0xff)
 283     return 0x178;
 284   else
 285     return x;
 286 }
 287
 288 static int iswupper_ucs (wint_t x)
 289 {
 290   /* Only works for x < 0x130 */
 291   if ((0x60 < x && x < 0x7b) || (0xe0 <= x && x < 0xff && x != 0xf7))
 292     return 1;
 293   else if (0x100 <= x && x < 0x130)
 294     return 1;
 295   else if (x == 0xb5)
 296     return 1;
 297   else if (x == 0xff)
 298     return 1;
 299   else
 300     return 0;
 301 }
 302
 303 static wint_t towlower_ucs (wint_t x)
 304 {
 305   /* Only works for x < 0x130 */
 306   if ((0x40 < x && x < 0x5b) || (0xc0 <= x && x < 0xdf && x != 0xd7))
 307     return x + 32;
 308   else if (0x100 <= x && x < 0x130)
 309     return x | 1;
 310   else
 311     return x;
 312 }
 313
 314 static int iswalnum_ucs (wint_t wc)
 315 {
 316   /* Only works for x < 0x220 */
 317   if (wc >= 0x100)
 318     return 1;
 319   else if (wc < 0x30)
 320     return 0;
 321   else if (wc < 0x3a)
 322     return 1;
 323   else if (wc < 0xa0)
 324     return (0x40 < (wc & ~0x20) && (wc & ~0x20) < 0x5b);
 325   else if (wc < 0xc0)
 326     return (wc == 0xaa || wc == 0xb5 || wc == 0xba);
 327   else
 328     return !(wc == 0xd7 || wc == 0xf7);
 329 }
 330
 331 static int iswalpha_ucs (wint_t wc)
 332 {
 333   /* Only works for x < 0x220 */
 334   if (wc >= 0x100)
 335     return 1;
 336   else if (wc < 0x3a)
 337     return 0;
 338   else if (wc < 0xa0)
 339     return (0x40 < (wc & ~0x20) && (wc & ~0x20) < 0x5b);
 340   else if (wc < 0xc0)
 341     return (wc == 0xaa || wc == 0xb5 || wc == 0xba);
 342   else
 343     return !(wc == 0xd7 || wc == 0xf7);
 344 }
 345
 346 wint_t towupper (wint_t wc)
 347 {
 348   if (Charset_is_utf8 || charset_is_ja)
 349     return towupper_ucs (wc);
 350   else
 351     return (0 <= wc && wc < 256) ? toupper (wc) : wc;
 352 }
 353
 354 wint_t towlower (wint_t wc)
 355 {
 356   if (Charset_is_utf8 || charset_is_ja)
 357     return towlower_ucs (wc);
 358   else
 359     return (0 <= wc && wc < 256) ? tolower (wc) : wc;
 360 }
 361
 362 int iswalnum (wint_t wc)
 363 {
 364   if (Charset_is_utf8 || charset_is_ja)
 365     return iswalnum_ucs (wc);
 366   else
 367     return (0 <= wc && wc < 256) ? isalnum (wc) : 0;
 368 }
 369
 370 int iswalpha (wint_t wc)
 371 {
 372   if (Charset_is_utf8 || charset_is_ja)
 373     return iswalpha_ucs (wc);
 374   else
 375     return (0 <= wc && wc < 256) ? isalpha (wc) : 0;
 376 }
 377
 378 int iswupper (wint_t wc)
 379 {
 380   if (Charset_is_utf8 || charset_is_ja)
 381     return iswupper_ucs (wc);
 382   else
 383     return (0 <= wc && wc < 256) ? isupper (wc) : 0;
 384 }
 385
 386 /*
 387  * l10n for Japanese:
 388  *   Symbols, Greek and Cyrillic in JIS X 0208, Japanese Kanji
 389  *   Character Set, have a column width of 2.
 390  */
 391 int wcwidth_ja (wchar_t ucs)
 392 {
 393   if (ucs >= 0x3021)
 394     return -1; /* continue with the normal check */
 395   /* a rough range for quick check */
 396   if ((ucs >= 0x00a1 && ucs <= 0x00fe) || /* Latin-1 Supplement */
 397       (ucs >= 0x0391 && ucs <= 0x0451) || /* Greek and Cyrillic */
 398       (ucs >= 0x2010 && ucs <= 0x266f) || /* Symbols */
 399       (ucs >= 0x3000 && ucs <= 0x3020))   /* CJK Symbols and Punctuation */
 400     return 2;
 401   else
 402     return -1;
 403 }
 404
 405 int wcwidth_ucs(wchar_t ucs);
 406
 407 int wcwidth (wchar_t wc)
 408 {
 409   if (!Charset_is_utf8)
 410   {
 411     if (!charset_is_ja)
 412     {
 413       /* 8-bit case */
 414       if (!wc)
 415         return 0;
 416       else if ((0 <= wc && wc < 256) && IsPrint (wc))
 417         return 1;
 418       else
 419         return -1;
 420     }
 421     else
 422     {
 423       /* Japanese */
 424       int k = wcwidth_ja (wc);
 425       if (k != -1)
 426         return k;
 427     }
 428   }
 429   return wcwidth_ucs (wc);
 430 }
 431
 432 size_t utf8rtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *_ps)
 433 {
 434   static wchar_t mbstate;
 435   wchar_t *ps = (wchar_t *)_ps;
 436   size_t k = 1;
 437   unsigned char c;
 438   wchar_t wc;
 439   int count;
 440
 441   if (!ps)
 442     ps = &mbstate;
 443
 444   if (!s)
 445   {
 446     *ps = 0;
 447     return 0;
 448   }
 449   if (!n)
 450     return (size_t)-2;
 451
 452   if (!*ps)
 453   {
 454     c = (unsigned char)*s;
 455     if (c < 0x80)
 456     {
 457       if (pwc)
 458         *pwc = c;
 459       return (c != 0);
 460     }
 461     else if (c < 0xc2)
 462     {
 463       errno = EILSEQ;
 464       return (size_t)-1;
 465     }
 466     else if (c < 0xe0)
 467       wc = ((c & 0x1f) << 6) + (count = 0);
 468     else if (c < 0xf0)
 469       wc = ((c & 0x0f) << 12) + (count = 1);
 470     else if (c < 0xf8)
 471       wc = ((c & 0x07) << 18) + (count = 2);
 472     else if (c < 0xfc)
 473       wc = ((c & 0x03) << 24) + (count = 3);
 474     else if (c < 0xfe)
 475       wc = ((c & 0x01) << 30) + (count = 4);
 476     else
 477     {
 478       errno = EILSEQ;
 479       return (size_t)-1;
 480     }
 481     ++s, --n, ++k;
 482   }
 483   else
 484   {
 485     wc = *ps & 0x7fffffff;
 486     count = wc & 7; /* if count > 4 it will be caught below */
 487   }
 488
 489   for (; n; ++s, --n, ++k)
 490   {
 491     c = (unsigned char)*s;
 492     if (0x80 <= c && c < 0xc0)
 493     {
 494       wc |= (c & 0x3f) << (6 * count);
 495       if (!count)
 496       {
 497         if (pwc)
 498           *pwc = wc;
 499         *ps = 0;
 500         return wc ? k : 0;
 501       }
 502       --count, --wc;
 503       if (!(wc >> (11+count*5)))
 504       {
 505         errno = count < 4 ? EILSEQ : EINVAL;
 506         return (size_t)-1;
 507       }
 508     }
 509     else
 510     {
 511       errno = EILSEQ;
 512       return (size_t)-1;
 513     }
 514   }
 515   *ps = wc;
 516   return (size_t)-2;
 517 }
 518
 519 #endif /* !HAVE_WC_FUNCS */
 520
 521 wchar_t replacement_char (void)
 522 {
 523   return Charset_is_utf8 ? 0xfffd : '?';
 524 }
 525
 526 int mutt_filter_unprintable (char **s)
 527 {
 528   BUFFER *b = NULL;
 529   wchar_t wc;
 530   size_t k, k2;
 531   char scratch[MB_LEN_MAX + 1];
 532   char *p = *s;
 533   mbstate_t mbstate1, mbstate2;
 534
 535   if (!(b = mutt_buffer_init (b)))
 536     return -1;
 537   memset (&mbstate1, 0, sizeof (mbstate1));
 538   memset (&mbstate2, 0, sizeof (mbstate2));
 539   for (; (k = mbrtowc (&wc, p, MB_LEN_MAX, &mbstate1)); p += k)
 540   {
 541     if (k == (size_t)(-1) || k == (size_t)(-2))
 542     {
 543       k = 1;
 544       memset (&mbstate1, 0, sizeof (mbstate1));
 545       wc = replacement_char();
 546     }
 547     if (!IsWPrint (wc))
 548       wc = '?';
 549     k2 = wcrtomb (scratch, wc, &mbstate2);
 550     scratch[k2] = '\0';
 551     mutt_buffer_addstr (b, scratch);
 552   }
 553   FREE (s);  /* __FREE_CHECKED__ */
 554   *s = b->data ? b->data : safe_calloc (1, 1);
 555   FREE (&b);
 556   return 0;
 557 }
 558