2 * Copyright (C) 2000 Edmund Grimley Evans <edmundo@rano.org>
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 * Japanese support by TAKIZAWA Takashi <taki@luna.email.ne.jp>.
/* Flag read by the wide-character routines in this file to select the
 * UCS/UTF-8 code paths (for example, replacement_char() returns 0xfffd
 * when it is non-zero). */
39 int Charset_is_utf8 = 0;
/* Flag tested together with Charset_is_utf8 by the isw*()/tow*()
 * replacements to select their UCS code paths.
 * NOTE(review): presumably set for the Japanese charsets compared in
 * mutt_set_charset(); the assignment itself is not visible here. */
41 static int charset_is_ja = 0;
/* iconv descriptor opened by mutt_set_charset() with "utf-8" as the first
 * argument and the configured charset as the second; (iconv_t)(-1) marks
 * it as not open. */
42 static iconv_t charset_to_utf8 = (iconv_t)(-1);
/* The reverse descriptor (configured charset first, "utf-8" second);
 * (iconv_t)(-1) marks it as not open. */
43 static iconv_t charset_from_utf8 = (iconv_t)(-1);
/*
 * Make CHARSET the charset used by the wide-character routines below.
 *
 * The name is canonicalised into a local buffer, any iconv descriptors
 * left open by an earlier call are closed and reset to (iconv_t)(-1), and
 * the canonical name is then classified: UTF-8 (via mutt_is_utf8()), one
 * of the Japanese encodings euc-jp / shift_jis / cp932 / eucJP-ms, or
 * anything else.  Descriptors to and from UTF-8 are opened with
 * mutt_iconv_open(), and the gettext codeset is bound to the canonical
 * name when bind_textdomain_codeset() and NLS are both available.
 *
 * NOTE(review): this listing omits several lines of the function (braces,
 * the flag assignments, preprocessor guards); confirm the exact branch
 * structure against the full source before relying on this summary.
 */
46 void mutt_set_charset (char *charset)
50 mutt_canonical_charset (buffer, sizeof (buffer), charset);
55 if (charset_to_utf8 != (iconv_t)(-1))
57 iconv_close (charset_to_utf8);
58 charset_to_utf8 = (iconv_t)(-1);
60 if (charset_from_utf8 != (iconv_t)(-1))
62 iconv_close (charset_from_utf8);
63 charset_from_utf8 = (iconv_t)(-1);
67 if (mutt_is_utf8 (buffer))
70 else if (!ascii_strcasecmp(buffer, "euc-jp") || !ascii_strcasecmp(buffer, "shift_jis")
71 || !ascii_strcasecmp(buffer, "cp932") || !ascii_strcasecmp(buffer, "eucJP-ms"))
75 /* Note flags=0 to skip charset-hooks: User masters the $charset
76 * name, and we are sure of our "utf-8" constant. So there is no
77 * possibility of wrong name that we would want to try to correct
78 * with a charset-hook. Or rather: If $charset was wrong, we would
79 * want to try to correct... $charset directly.
81 charset_to_utf8 = mutt_iconv_open ("utf-8", charset, 0);
82 charset_from_utf8 = mutt_iconv_open (charset, "utf-8", 0);
86 #if defined(HAVE_BIND_TEXTDOMAIN_CODESET) && defined(ENABLE_NLS)
87 bind_textdomain_codeset(PACKAGE, buffer);
94 * For systems that don't have them, we provide here our own
95 * implementations of wcrtomb(), mbrtowc(), iswprint() and wcwidth().
96 * Instead of using the locale, as these functions normally would,
97 * we use Mutt's Charset variable. We support 3 types of charset:
98 * (1) For 8-bit charsets, wchar_t uses the same encoding as char.
99 * (2) For UTF-8, wchar_t uses UCS.
100 * (3) For stateless Japanese encodings, we use UCS and convert
101 * via UTF-8 using iconv.
102 * Unfortunately, we can't handle non-stateless encodings.
/*
 * Convert the wide character WC to the multibyte encoding produced by
 * iconv descriptor CD, going through UTF-8: WC is first encoded into a
 * local buffer with mutt_wctoutf8(), and that buffer is then handed to
 * iconv().
 *
 * NOTE(review): only part of the body is visible in this listing.  The
 * output-buffer set-up, the return statements and the reason for the
 * second iconv() call are not shown; verify against the full source.
 */
105 static size_t wcrtomb_iconv (char *s, wchar_t wc, iconv_t cd)
107 char buf[MB_LEN_MAX+1];
108 ICONV_CONST char *ib;
114 ibl = mutt_wctoutf8 (buf, wc, sizeof (buf));
/* mutt_wctoutf8() reports failure as (size_t)(-1) */
115 if (ibl == (size_t)(-1))
120 r = iconv (cd, &ib, &ibl, &ob, &obl);
128 r = iconv (cd, &ib, &ibl, &ob, &obl);
/*
 * Replacement wcrtomb() driven by Mutt's charset state instead of the
 * locale.  The visible branches encode WC straight to UTF-8 with
 * mutt_wctoutf8(), or convert through the charset_from_utf8 iconv
 * descriptor when that descriptor is open.
 *
 * NOTE(review): the condition guarding the first return and the final
 * fallback branch are not visible in this listing.
 */
133 size_t wcrtomb (char *s, wchar_t wc, mbstate_t *ps)
135 /* We only handle stateless encodings, so we can ignore ps. */
138 return mutt_wctoutf8 (s, wc, MB_LEN_MAX);
139 else if (charset_from_utf8 != (iconv_t)(-1))
140 return wcrtomb_iconv (s, wc, charset_from_utf8);
/*
 * mbrtowc() work-alike for a charset that iconv descriptor CD converts
 * to UTF-8.  Input bytes left over from an earlier incomplete call are
 * kept in *PS as a run of non-zero bytes; they are copied in front of the
 * new input S (at most N bytes) in a small local buffer, iconv() converts
 * to UTF-8, and utf8rtowc() decodes the result into *PWC.
 *
 * NOTE(review): several lines of this function are missing from the
 * listing (braces, the retry loop, the failure-path returns), so the
 * comments below describe only the statements that are visible.
 */
155 size_t mbrtowc_iconv (wchar_t *pwc, const char *s, size_t n,
156 mbstate_t *ps, iconv_t cd)
/* scratch state passed to utf8rtowc() when decoding the iconv output */
158 static mbstate_t mbstate;
159 ICONV_CONST char *ib, *ibmax;
161 size_t ibl, obl, k, r;
/* bufi: saved + new input bytes; bufo: UTF-8 produced by iconv() */
162 char bufi[8], bufo[6];
/* k = number of bytes in *ps before its first zero byte, i.e. how many
 * input bytes were saved by a previous call */
167 t = memchr (ps, 0, sizeof (*ps));
168 k = t ? (t - (char *)ps) : sizeof (*ps);
169 if (k > sizeof (bufi))
173 /* use the buffer for input */
174 memcpy (bufi, ps, k);
/* append as much of s as fits after the k saved bytes */
176 ibmax = bufi + (k + n < sizeof (bufi) ? k + n : sizeof (bufi));
177 memcpy (bufi + k, s, ibmax - bufi - k);
181 /* use the real input */
182 ib = (ICONV_CONST char*) s;
183 ibmax = (ICONV_CONST char*) s + n;
192 r = iconv (cd, &ib, &ibl, &ob, &obl);
/* output was produced and, if saved bytes were in play, at least one
 * byte of the new input was consumed as well */
193 if (ob > bufo && (!k || ib > bufi + k))
195 /* we have a character */
196 memset (ps, 0, sizeof (*ps));
197 utf8rtowc (pwc, bufo, ob - bufo, &mbstate);
/* bytes of s consumed, or 0 when pwc is NULL or the character is L'\0' */
198 return (pwc && *pwc) ? (ib - (k ? bufi + k : s)) : 0;
/* iconv() either converted nothing yet or stopped on an incomplete
 * sequence (EINVAL) */
200 else if (!r || (r == (size_t)(-1) && errno == EINVAL))
202 if (ib + ibl < ibmax)
203 /* try using more input */
205 else if (k && ib > bufi + k && bufi + k + n > ibmax)
207 /* switch to using real input */
208 ib = (ICONV_CONST char*) s + (ib - bufi - k);
209 ibmax = (ICONV_CONST char*) s + n;
215 /* save the state and give up */
216 memset (ps, 0, sizeof (*ps));
/* keep the unconverted tail in *ps only if it fits there */
217 if (ibl <= sizeof (mbstate_t)) /* need extra condition here! */
218 memcpy (ps, ib, ibl);
/*
 * Replacement mbrtowc() driven by Mutt's charset state instead of the
 * locale.  The visible branches decode UTF-8 with utf8rtowc(), or go
 * through mbrtowc_iconv() when the charset_to_utf8 descriptor is open;
 * the remaining statements reset *PS and store the first byte of S,
 * taken as an unsigned char, as the wide character.
 *
 * NOTE(review): the conditions around the first return and the 8-bit
 * fallback (including its return values and the use of the static
 * mbstate) are not visible in this listing.
 */
231 size_t mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
233 static mbstate_t mbstate;
239 return utf8rtowc (pwc, s, n, ps);
240 else if (charset_to_utf8 != (iconv_t)(-1))
241 return mbrtowc_iconv (pwc, s, n, ps, charset_to_utf8);
246 memset(ps, 0, sizeof(*ps));
252 *pwc = (wchar_t)(unsigned char)*s;
257 int iswprint (wint_t wc)
259 if (Charset_is_utf8 || charset_is_ja)
260 return ((0x20 <= wc && wc < 0x7f) || 0xa0 <= wc);
262 return (0 <= wc && wc < 256) ? IsPrint (wc) : 0;
265 int iswspace (wint_t wc)
267 if (Charset_is_utf8 || charset_is_ja)
268 return (9 <= wc && wc <= 13) || wc == 32;
270 return (0 <= wc && wc < 256) ? isspace (wc) : 0;
/*
 * Upper-case mapping for UCS code points, used by towupper() when the
 * charset is handled as UCS.
 * NOTE(review): the values returned by each branch are not visible in
 * this listing.
 */
273 static wint_t towupper_ucs (wint_t x)
275 /* Only works for x < 0x130 */
/* 0x61..0x7a, or 0xe0..0xfe except 0xf7 */
276 if ((0x60 < x && x < 0x7b) || (0xe0 <= x && x < 0xff && x != 0xf7))
/* 0x100..0x12f */
278 else if (0x100 <= x && x < 0x130)
/*
 * Upper-case test for UCS code points, used by iswupper() when the
 * charset is handled as UCS.
 * NOTE(review): the values returned by each branch are not visible in
 * this listing.
 */
288 static int iswupper_ucs (wint_t x)
290 /* Only works for x < 0x130 */
/* same ranges that towupper_ucs() tests first: 0x61..0x7a, or 0xe0..0xfe
 * except 0xf7 */
291 if ((0x60 < x && x < 0x7b) || (0xe0 <= x && x < 0xff && x != 0xf7))
/* 0x41..0x5a, or 0xc0..0xdd */
293 else if ((0x40 < x && x < 0x5b) || (0xbf < x && x < 0xde))
/* 0x100..0x12f */
295 else if (0x100 <= x && x < 0x130)
/*
 * Lower-case mapping for UCS code points, used by towlower() when the
 * charset is handled as UCS.
 * NOTE(review): the values returned by each branch are not visible in
 * this listing.
 */
305 static wint_t towlower_ucs (wint_t x)
307 /* Only works for x < 0x130 */
/* 0x41..0x5a, or 0xc0..0xde except 0xd7 */
308 if ((0x40 < x && x < 0x5b) || (0xc0 <= x && x < 0xdf && x != 0xd7))
/* 0x100..0x12f */
310 else if (0x100 <= x && x < 0x130)
/*
 * Alphanumeric test for UCS code points, used by iswalnum() when the
 * charset is handled as UCS.
 * NOTE(review): the conditions that choose between the returns below are
 * not visible in this listing.
 */
316 static int iswalnum_ucs (wint_t wc)
318 /* Only works for x < 0x220 */
/* clearing bit 0x20 folds 0x61..0x7a onto 0x41..0x5a, so one range test
 * covers both */
326 return (0x40 < (wc & ~0x20) && (wc & ~0x20) < 0x5b);
/* accepts exactly 0xaa, 0xb5 and 0xba */
328 return (wc == 0xaa || wc == 0xb5 || wc == 0xba);
/* accepts everything except 0xd7 and 0xf7 */
330 return !(wc == 0xd7 || wc == 0xf7);
/*
 * Alphabetic test for UCS code points, used by iswalpha() when the
 * charset is handled as UCS.
 * NOTE(review): the conditions that choose between the returns below are
 * not visible in this listing.
 */
333 static int iswalpha_ucs (wint_t wc)
335 /* Only works for x < 0x220 */
/* clearing bit 0x20 folds 0x61..0x7a onto 0x41..0x5a, so one range test
 * covers both */
341 return (0x40 < (wc & ~0x20) && (wc & ~0x20) < 0x5b);
/* accepts exactly 0xaa, 0xb5 and 0xba */
343 return (wc == 0xaa || wc == 0xb5 || wc == 0xba);
/* accepts everything except 0xd7 and 0xf7 */
345 return !(wc == 0xd7 || wc == 0xf7);
348 wint_t towupper (wint_t wc)
350 if (Charset_is_utf8 || charset_is_ja)
351 return towupper_ucs (wc);
353 return (0 <= wc && wc < 256) ? toupper (wc) : wc;
356 wint_t towlower (wint_t wc)
358 if (Charset_is_utf8 || charset_is_ja)
359 return towlower_ucs (wc);
361 return (0 <= wc && wc < 256) ? tolower (wc) : wc;
364 int iswalnum (wint_t wc)
366 if (Charset_is_utf8 || charset_is_ja)
367 return iswalnum_ucs (wc);
369 return (0 <= wc && wc < 256) ? isalnum (wc) : 0;
372 int iswalpha (wint_t wc)
374 if (Charset_is_utf8 || charset_is_ja)
375 return iswalpha_ucs (wc);
377 return (0 <= wc && wc < 256) ? isalpha (wc) : 0;
380 int iswupper (wint_t wc)
382 if (Charset_is_utf8 || charset_is_ja)
383 return iswupper_ucs (wc);
385 return (0 <= wc && wc < 256) ? isupper (wc) : 0;
390 * Symbols, Greek and Cyrillic in JIS X 0208, Japanese Kanji
391 * Character Set, have a column width of 2.
/*
 * Column-width pre-check for UCS when a Japanese charset is in use; it is
 * called from wcwidth().  A result of -1 tells the caller to continue with
 * the normal width check.
 * NOTE(review): the condition guarding the first return and the values
 * returned after the range test are not visible in this listing.
 */
393 int wcwidth_ja (wchar_t ucs)
396 return -1; /* continue with the normal check */
397 /* a rough range for quick check */
398 if ((ucs >= 0x00a1 && ucs <= 0x00fe) || /* Latin-1 Supplement */
399 (ucs >= 0x0391 && ucs <= 0x0451) || /* Greek and Cyrillic */
400 (ucs >= 0x2010 && ucs <= 0x266f) || /* Symbols */
401 (ucs >= 0x3000 && ucs <= 0x3020)) /* CJK Symbols and Punctuation */
/* UCS width routine; only declared here, its definition is not in view. */
407 int wcwidth_ucs(wchar_t ucs);
/*
 * Replacement wcwidth() driven by Mutt's charset state.  The visible
 * statements distinguish the non-UTF-8 case, test byte values with
 * IsPrint(), consult wcwidth_ja() and finally fall back to wcwidth_ucs().
 * NOTE(review): the bodies of the individual branches (and how the
 * wcwidth_ja() result k is used) are not visible in this listing.
 */
409 int wcwidth (wchar_t wc)
411 if (!Charset_is_utf8)
418 else if ((0 <= wc && wc < 256) && IsPrint (wc))
426 int k = wcwidth_ja (wc);
431 return wcwidth_ucs (wc);
/*
 * Restartable UTF-8 decoder with an mbrtowc()-style signature: reads up
 * to N bytes from S and assembles a wide character for *PWC, keeping a
 * partially decoded character in the caller's state between calls.
 *
 * NOTE(review): many lines of this function are missing from the listing
 * (braces, the lead-byte range tests, the return statements), so the
 * comments below describe only the statements that are visible.
 */
434 size_t utf8rtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *_ps)
436 static wchar_t mbstate;
/* the caller's mbstate_t storage is accessed as a single wchar_t */
437 wchar_t *ps = (wchar_t *)_ps;
456 c = (unsigned char)*s;
/* lead byte: keep its payload bits at their final position and set
 * count; the "+ (count = N)" term also places N in the low bits of wc */
469 wc = ((c & 0x1f) << 6) + (count = 0);
471 wc = ((c & 0x0f) << 12) + (count = 1);
473 wc = ((c & 0x07) << 18) + (count = 2);
475 wc = ((c & 0x03) << 24) + (count = 3);
477 wc = ((c & 0x01) << 30) + (count = 4);
/* resuming: reload the partial value and its counter from the state */
487 wc = *ps & 0x7fffffff;
488 count = wc & 7; /* if count > 4 it will be caught below */
491 for (; n; ++s, --n, ++k)
493 c = (unsigned char)*s;
/* bytes 0x80..0xbf contribute six payload bits each */
494 if (0x80 <= c && c < 0xc0)
496 wc |= (c & 0x3f) << (6 * count);
/* NOTE(review): looks like a test for an over-long encoding (no bits set
 * at or above position 11 + 5 * count) -- confirm */
505 if (!(wc >> (11+count*5)))
507 errno = count < 4 ? EILSEQ : EINVAL;
521 #endif /* !HAVE_WC_FUNCS */
523 wchar_t replacement_char (void)
525 return Charset_is_utf8 ? 0xfffd : '?';
/*
 * Rebuild the string *S one multibyte character at a time: each character
 * is decoded with mbrtowc(), undecodable input is replaced by
 * replacement_char(), and the result is re-encoded with wcrtomb() and
 * appended to a freshly initialised buffer.  The old *S is freed and *S
 * is pointed at the buffer's data, or at an empty allocated string when
 * the buffer holds no data.
 *
 * NOTE(review): several lines are missing from this listing (local
 * declarations, the printability filtering, termination of `scratch`,
 * the return value); confirm the details against the full source.
 */
528 int mutt_filter_unprintable (char **s)
533 char scratch[MB_LEN_MAX + 1];
/* separate conversion states for decoding and for re-encoding */
535 mbstate_t mbstate1, mbstate2;
537 if (!(b = mutt_buffer_init (b)))
539 memset (&mbstate1, 0, sizeof (mbstate1));
540 memset (&mbstate2, 0, sizeof (mbstate2));
/* the loop ends when mbrtowc() returns 0 */
541 for (; (k = mbrtowc (&wc, p, MB_LEN_MAX, &mbstate1)); p += k)
/* (size_t)(-1) and (size_t)(-2) are mbrtowc()'s failure results */
543 if (k == (size_t)(-1) || k == (size_t)(-2))
546 memset (&mbstate1, 0, sizeof (mbstate1));
547 wc = replacement_char();
551 k2 = wcrtomb (scratch, wc, &mbstate2);
553 mutt_buffer_addstr (b, scratch);
555 FREE (s); /* __FREE_CHECKED__ */
556 *s = b->data ? b->data : safe_calloc (1, 1);