root/trunk/src/charset.c

Revision 4220, 56.0 kB (checked in by miyoshi, 8 months ago)

Sync up with Emacs22.2.

  • Property svn:eol-style set to native
Line 
1 /* Basic multilingual character support.
2    Copyright (C) 2001, 2002, 2003, 2004, 2005,
3                  2006, 2007, 2008 Free Software Foundation, Inc.
4    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
5      2005, 2006, 2007, 2008
6      National Institute of Advanced Industrial Science and Technology (AIST)
7      Registration Number H14PRO021
8
9 This file is part of GNU Emacs.
10
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 3, or (at your option)
14 any later version.
15
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING.  If not, write to
23 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
24 Boston, MA 02110-1301, USA.  */
25
26 /* At first, see the document in `charset.h' to understand the code in
27    this file.  */
28
29 #ifdef emacs
30 #include <config.h>
31 #endif
32
33 #include <stdio.h>
34
35 #ifdef emacs
36
37 #include <sys/types.h>
38 #include "lisp.h"
39 #include "buffer.h"
40 #include "charset.h"
41 #include "composite.h"
42 #include "coding.h"
43 #include "disptab.h"
44
45 #else  /* not emacs */
46
47 #include "mulelib.h"
48
49 #endif /* emacs */
50
51 Lisp_Object Qcharset, Qascii, Qeight_bit_control, Qeight_bit_graphic;
52 Lisp_Object Qunknown;
53
54 /* Declaration of special leading-codes.  */
55 EMACS_INT leading_code_private_11; /* for private DIMENSION1 of 1-column */
56 EMACS_INT leading_code_private_12; /* for private DIMENSION1 of 2-column */
57 EMACS_INT leading_code_private_21; /* for private DIMENSION2 of 1-column */
58 EMACS_INT leading_code_private_22; /* for private DIMENSION2 of 2-column */
59
60 /* Declaration of special charsets.  The values are set by
61    Fsetup_special_charsets.  */
62 int charset_latin_iso8859_1;    /* ISO8859-1 (Latin-1) */
63 int charset_jisx0208_1978;      /* JISX0208.1978 (Japanese Kanji old set) */
64 int charset_jisx0208;           /* JISX0208.1983 (Japanese Kanji) */
65 int charset_katakana_jisx0201;  /* JISX0201.Kana (Japanese Katakana) */
66 int charset_latin_jisx0201;     /* JISX0201.Roman (Japanese Roman) */
67 int charset_big5_1;             /* Big5 Level 1 (Chinese Traditional) */
68 int charset_big5_2;             /* Big5 Level 2 (Chinese Traditional) */
69 int charset_mule_unicode_0100_24ff; /* presumably Unicode U+0100..U+24FF -- confirm */
70 int charset_mule_unicode_2500_33ff; /* presumably Unicode U+2500..U+33FF -- confirm */
71 int charset_mule_unicode_e000_ffff; /* presumably Unicode U+E000..U+FFFF -- confirm */
72
73 Lisp_Object Qcharset_table;
74
75 /* A char-table containing information about each character set.  */
76 Lisp_Object Vcharset_table;
77
78 /* A vector of charset symbols indexed by charset-id.  This is used
79    only for returning a charset symbol from C functions.  */
80 Lisp_Object Vcharset_symbol_table;
81
82 /* A list of all charset symbols ever defined.  */
83 Lisp_Object Vcharset_list;
84
85 /* Vector of all translation tables ever defined.
86    The ID of a translation table is used to index this vector.  */
87 Lisp_Object Vtranslation_table_vector;
88
89 /* A char-table for characters which may invoke auto-filling.  */
90 Lisp_Object Vauto_fill_chars;
91
92 Lisp_Object Qauto_fill_chars;
93
94 /* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD (presumably indexed by a head byte value -- confirm in charset.h).  */
95 int bytes_by_char_head[256];
96 int width_by_char_head[256];
97
98 /* Mapping table from ISO2022's charset (specified by DIMENSION,
99    CHARS, and FINAL-CHAR; presumably indexed in that order -- confirm in charset.h) to Emacs' charset.  */
100 int iso_charset_table[2][2][128];
101
102 /* Variables used locally in the macro FETCH_MULTIBYTE_CHAR.  */
103 unsigned char *_fetch_multibyte_char_p;
104 int _fetch_multibyte_char_len;
105
106 /* Offset to add to a non-ASCII value when inserting it.  */
107 EMACS_INT nonascii_insert_offset;
108
109 /* Translation table for converting non-ASCII unibyte characters
110    to multibyte codes, or nil.  */
111 Lisp_Object Vnonascii_translation_table;
112
113 /* List of all possible generic characters.  */
114 Lisp_Object Vgeneric_character_list;
115
116 /* Report C as an invalid character by calling error with a message that shows C in decimal, octal and hexadecimal.  */
117 void
118 invalid_character (c)
119      int c;
120 {
121   error ("Invalid character: %d, #o%o, #x%x", c, c, c);
122 }
123
124 /* Parse the multibyte form at STR and fetch information of the
125    character at STR.  Set BYTES to the byte length the character occupies
126    (BYTES_BY_CHAR_HEAD of the first byte), and CHARSET, C1, C2 to proper values of the character.  LENGTH is never referenced.  */
127 /* C2 is assigned only for 3-byte forms whose head byte is below LEADING_CODE_PRIVATE_11 and for 4-byte forms; STR is evaluated several times.  */
128 #define SPLIT_MULTIBYTE_SEQ(str, length, bytes, charset, c1, c2)             \
129   do {                                                                       \
130     (c1) = *(str);                                                           \
131     (bytes) = BYTES_BY_CHAR_HEAD (c1);                                       \
132     if ((bytes) == 1)                                                        \
133       (charset) = ASCII_BYTE_P (c1) ? CHARSET_ASCII : CHARSET_8_BIT_GRAPHIC; \
134     else if ((bytes) == 2)                                                   \
135       {                                                                      \
136         if ((c1) == LEADING_CODE_8_BIT_CONTROL)                              \
137           (charset) = CHARSET_8_BIT_CONTROL, (c1) = (str)[1] - 0x20;         \
138         else                                                                 \
139           (charset) = (c1), (c1) = (str)[1] & 0x7F;                          \
140       }                                                                      \
141     else if ((bytes) == 3)                                                   \
142       {                                                                      \
143         if ((c1) < LEADING_CODE_PRIVATE_11)                                  \
144           (charset) = (c1), (c1) = (str)[1] & 0x7F, (c2) = (str)[2] & 0x7F;  \
145         else                                                                 \
146           (charset) = (str)[1], (c1) = (str)[2] & 0x7F;                      \
147       }                                                                      \
148     else                                                                     \
149       (charset) = (str)[1], (c1) = (str)[2] & 0x7F, (c2) = (str)[3] & 0x7F;  \
150   } while (0)
151
152 /* 1 if CHARSET, C1, and C2 compose a valid character, else 0.  C1 must be
153    in 0..0x7F for ASCII, 0x80..0x9F for 8-bit control, 0x80..0xFF for 8-bit
154    graphic, and 0x20..0x7F otherwise; C2 is checked against 0x20..0x7F only when CHARSET_DIMENSION is not 1.  Note that this intentionally allows invalid components, such as 0xA0 0xA0, because there exist many files that contain
155    such invalid byte sequences, especially in EUC-GB.  The arguments are evaluated more than once.  */
156 #define CHAR_COMPONENTS_VALID_P(charset, c1, c2)        \
157   ((charset) == CHARSET_ASCII                           \
158    ? ((c1) >= 0 && (c1) <= 0x7F)                        \
159    : ((charset) == CHARSET_8_BIT_CONTROL                \
160       ? ((c1) >= 0x80 && (c1) <= 0x9F)                  \
161       : ((charset) == CHARSET_8_BIT_GRAPHIC             \
162          ? ((c1) >= 0x80 && (c1) <= 0xFF)               \
163          : (CHARSET_DIMENSION (charset) == 1            \
164             ? ((c1) >= 0x20 && (c1) <= 0x7F)            \
165             : ((c1) >= 0x20 && (c1) <= 0x7F             \
166                && (c2) >= 0x20 && (c2) <= 0x7F)))))
167
168 /* Store multi-byte form of the character C in STR.  The caller should
169    allocate at least 4-byte area at STR in advance.  Returns the
170    length of the multi-byte form.  If C is an invalid character code,
171    return -1.  */
172 /* -1 is returned when C carries modifier bits but is not single-byte without them, when CHAR_VALID_P rejects C, or when a position code is in 1..31; in the last case the leading bytes have already been stored at STR.  */
173 int
174 char_to_string_1 (c, str)
175      int c;
176      unsigned char *str;
177 {
178   unsigned char *p = str;
179
180   if (c & CHAR_MODIFIER_MASK)   /* This includes the case C is negative.  */
181     {
182       /* Multibyte character can't have a modifier bit.  */
183       if (! SINGLE_BYTE_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
184         return -1;
185
186       /* For Meta, Shift, and Control modifiers, we need special care.  */
187       if (c & CHAR_META)
188         {
189           /* Move the meta bit to the right place for a string.  */
190           c = (c & ~CHAR_META) | 0x80;
191         }
192       if (c & CHAR_SHIFT)
193         {
194           /* Shift modifier is valid only with [A-Za-z].  */
195           if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
196             c &= ~CHAR_SHIFT;   /* Already upper case: just drop the Shift bit.  */
197           else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
198             c = (c & ~CHAR_SHIFT) - ('a' - 'A');   /* Drop Shift and convert to upper case.  */
199         }
200       if (c & CHAR_CTL)
201         {
202           /* Simulate the code in lread.c.  */
203           /* Allow `\C- ' and `\C-?'.  */
204           if (c == (CHAR_CTL | ' '))
205             c = 0;
206           else if (c == (CHAR_CTL | '?'))
207             c = 127;
208           /* ASCII control chars are made from letters (both cases),
209              as well as the non-letters within 0100...0137.  */
210           else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
211             c &= (037 | (~0177 & ~CHAR_CTL));   /* Keep the low five bits and the bits above 0177 other than CHAR_CTL.  */
212           else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
213             c &= (037 | (~0177 & ~CHAR_CTL));   /* Same masking as for letters.  */
214         }
215
216       /* If C still has any modifier bits, just ignore it.  */
217       c &= ~CHAR_MODIFIER_MASK;
218     }
219
220   if (SINGLE_BYTE_CHAR_P (c))
221     {
222       if (ASCII_BYTE_P (c) || c >= 0xA0)
223         *p++ = c;   /* Stored as a single byte.  */
224       else
225         {
226           *p++ = LEADING_CODE_8_BIT_CONTROL;
227           *p++ = c + 0x20;   /* Second byte is C offset by 0x20.  */
228         }
229     }
230   else if (CHAR_VALID_P (c, 0))
231     {
232       int charset, c1, c2;
233
234       SPLIT_CHAR (c, charset, c1, c2);
235
236       if (charset >= LEADING_CODE_EXT_11)   /* Such charset IDs are preceded by one of the LEADING_CODE_PRIVATE_* bytes.  */
237         *p++ = (charset < LEADING_CODE_EXT_12
238                 ? LEADING_CODE_PRIVATE_11
239                 : (charset < LEADING_CODE_EXT_21
240                    ? LEADING_CODE_PRIVATE_12
241                    : (charset < LEADING_CODE_EXT_22
242                       ? LEADING_CODE_PRIVATE_21
243                       : LEADING_CODE_PRIVATE_22)));
244       *p++ = charset;
245       if ((c1 > 0 && c1 < 32) || (c2 > 0 && c2 < 32))   /* Checked only after the leading bytes were written.  */
246         return -1;
247       if (c1)
248         {
249           *p++ = c1 | 0x80;   /* Position codes are stored with the high bit set.  */
250           if (c2 > 0)
251             *p++ = c2 | 0x80;
252         }
253     }
254   else
255     return -1;
256
257   return (p - str);
258 }
259
260
261 /* Store multi-byte form of the character C in STR.  The caller should
262    allocate at least 4-byte area at STR in advance.  Returns the
263    length of the multi-byte form.  If C is an invalid character code,
264    signal an error.
265
266    Use macro `CHAR_STRING (C, STR)' instead of calling this function
267    directly if C can be an ASCII character.  */
268 /* The work is done by char_to_string_1; when that returns -1, invalid_character is called with C.  */
269 int
270 char_to_string (c, str)
271      int c;
272      unsigned char *str;
273 {
274   int len;
275   len = char_to_string_1 (c, str);
276   if (len == -1)
277     invalid_character (c);
278   return len;
279 }
280
281
282 /* Return the non-ASCII character corresponding to multi-byte form at
283    STR of length LEN.  If ACTUAL_LEN is not NULL, store the byte
284    length of the multibyte form in *ACTUAL_LEN.
285
286    Use macros STRING_CHAR or STRING_CHAR_AND_LENGTH instead of calling
287    this function directly if you want to handle ASCII characters as
288    well.  */
289 /* NOTE(review): C2 stays uninitialized when SPLIT_MULTIBYTE_SEQ does not assign it; presumably MAKE_CHAR ignores C2 in those cases -- confirm.  */
290 int
291 string_to_char (str, len, actual_len)
292      const unsigned char *str;
293      int len, *actual_len;
294 {
295   int c, bytes, charset, c1, c2;
296
297   SPLIT_MULTIBYTE_SEQ (str, len, bytes, charset, c1, c2);
298   c = MAKE_CHAR (charset, c1, c2);
299   if (actual_len)
300     *actual_len = bytes;
301   return c;
302 }
303
304 /* Return the length of the multi-byte form at string STR of length LEN,
305    i.e. the value PARSE_MULTIBYTE_SEQ stores in BYTES.  Use the macro MULTIBYTE_FORM_LENGTH instead.  */
306 int
307 multibyte_form_length (str, len)
308      const unsigned char *str;
309      int len;
310 {
311   int bytes;
312
313   PARSE_MULTIBYTE_SEQ (str, len, bytes);
314   return bytes;
315 }
316
317 /* Check multibyte form at string STR of length LEN and set variables
318    pointed to by CHARSET, C1, and C2 to charset and position codes of the
319    character at STR, and return 0.  If there's no multibyte character,
320    return -1.  This should be used only in the macro SPLIT_STRING
321    which checks range of STR in advance.  */
322 /* -1 is returned, with the outputs left unwritten, when the parsed charset is CHARSET_ASCII.  When SPLIT_MULTIBYTE_SEQ assigns no second position code, *C2 receives the initial -1 converted to unsigned char.  */
323 int
324 split_string (str, len, charset, c1, c2)
325      const unsigned char *str;
326      unsigned char *c1, *c2;
327      int len, *charset;
328 {
329   register int bytes, cs, code1, code2 = -1;
330
331   SPLIT_MULTIBYTE_SEQ (str, len, bytes, cs, code1, code2);
332   if (cs == CHARSET_ASCII)
333     return -1;
334   *charset = cs;
335   *c1 = code1;
336   *c2 = code2;
337   return 0;
338 }
339
340 /* Return 1 if character C has a valid printable glyph, else 0.
341    Use the macro CHAR_PRINTABLE_P instead.  */
342 int
343 char_printable_p (c)
344      int c;
345 {
346   int charset, c1, c2;
347
348   if (ASCII_BYTE_P (c))
349     return 1;
350   else if (SINGLE_BYTE_CHAR_P (c))
351     return 0;
352   else if (c >= MAX_CHAR)
353     return 0;
354 /* From here on C is split into charset and position codes; the charset must be defined and the codes in range.  */
355   SPLIT_CHAR (c, charset, c1, c2);
356   if (! CHARSET_DEFINED_P (charset))
357     return 0;
358   if (CHARSET_CHARS (charset) == 94   /* 94-char sets also exclude codes 32 and 127.  */
359       ? c1 <= 32 || c1 >= 127
360       : c1 < 32)
361     return 0;
362   if (CHARSET_DIMENSION (charset) == 2   /* The second code is checked the same way, for dimension 2 only.  */
363       && (CHARSET_CHARS (charset) == 94
364           ? c2 <= 32 || c2 >= 127
365           : c2 < 32))
366     return 0;
367   return 1;
368 }
369
370 /* Translate character C by translation table TABLE.  If C
371    is negative, translate a character specified by CHARSET, C1, and C2
372    (C1 and C2 are code points of the character).  If no translation is
373    found in TABLE (TABLE is not a char-table, or its entry for C is not a natural number), return C.  */
374 int
375 translate_char (table, c, charset, c1, c2)
376      Lisp_Object table;
377      int c, charset, c1, c2;
378 {
379   Lisp_Object ch;
380   int alt_charset, alt_c1, alt_c2, dimension;
381 /* Build C from its components when the caller passed a negative C.  */
382   if (c < 0) c = MAKE_CHAR (charset, (c1 & 0x7F) , (c2 & 0x7F));
383   if (!CHAR_TABLE_P (table)
384       || (ch = Faref (table, make_number (c)), !NATNUMP (ch)))
385     return c;
386
387   SPLIT_CHAR (XFASTINT (ch), alt_charset, alt_c1, alt_c2);
388   dimension = CHARSET_DIMENSION (alt_charset);
389   if ((dimension == 1 && alt_c1 > 0) || (dimension == 2 && alt_c2 > 0))
390     /* CH is not a generic character, just return it.  */
391     return XFASTINT (ch);
392
393   /* Since CH is a generic character, we must return a specific
394      character which has the same position codes as C from CH.  */
395   if (charset < 0)   /* Components were not supplied; derive them from C.  */
396     SPLIT_CHAR (c, charset, c1, c2);
397   if (dimension != CHARSET_DIMENSION (charset))
398     /* We can't make such a character because of dimension mismatch.  */
399     return c;
400   return MAKE_CHAR (alt_charset, c1, c2);
401 }
402
403 /* Convert the unibyte character C to multibyte based on
404    Vnonascii_translation_table or nonascii_insert_offset.  If they can't
405    convert C to a valid multibyte character, convert it based on
406    DEFAULT_NONASCII_INSERT_OFFSET which makes C a Latin-1 character.  */
407 /* C outside 0200..0377 is returned unchanged; when the translation table is nil, C in 0200..0237 is returned unchanged as well.  */
408 int
409 unibyte_char_to_multibyte (c)
410      int c;
411 {
412   if (c < 0400 && c >= 0200)
413     {
414       int c_save = c;
415
416       if (! NILP (Vnonascii_translation_table))
417         {
418           c = XINT (Faref (Vnonascii_translation_table, make_number (c)));   /* NOTE(review): no integer check on the Faref result before XINT -- confirm entries are always integers.  */
419           if (c >= 0400 && ! char_valid_p (c, 0))
420             c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
421         }
422       else if (c >= 0240 && nonascii_insert_offset > 0)
423         {
424           c += nonascii_insert_offset;
425           if (c < 0400 || ! char_valid_p (c, 0))   /* The offset did not yield a valid multibyte character.  */
426             c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
427         }
428       else if (c >= 0240)
429         c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
430     }
431   return c;
432 }
433
434
435 /* Convert the multibyte character C to unibyte 8-bit character based
436    on Vnonascii_translation_table or nonascii_insert_offset.  If
437    REV_TBL is non-nil, it should be a reverse table of
438    Vnonascii_translation_table, i.e. what is given by:
439      Fchar_table_extra_slot (Vnonascii_translation_table, make_number (0))  */
440 /* C is returned unchanged when SINGLE_BYTE_CHAR_P (C) holds.  The fallback (C & 0177) + 0200 is used when the table result is >= 256, or when the offset result is < 128 or >= 256.  */
441 int
442 multibyte_char_to_unibyte (c, rev_tbl)
443      int c;
444      Lisp_Object rev_tbl;
445 {
446   if (!SINGLE_BYTE_CHAR_P (c))
447     {
448       int c_save = c;
449
450       if (! CHAR_TABLE_P (rev_tbl)
451           && CHAR_TABLE_P (Vnonascii_translation_table))
452         rev_tbl = Fchar_table_extra_slot (Vnonascii_translation_table,
453                                           make_number (0));
454       if (CHAR_TABLE_P (rev_tbl))
455         {
456           Lisp_Object temp;
457           temp = Faref (rev_tbl, make_number (c));
458           if (INTEGERP (temp))   /* A non-integer entry leaves C as it was.  */
459             c = XINT (temp);
460           if (c >= 256)
461             c = (c_save & 0177) + 0200;
462         }
463       else
464         {
465           if (nonascii_insert_offset > 0)
466             c -= nonascii_insert_offset;
467           if (c < 128 || c >= 256)
468             c = (c_save & 0177) + 0200;
469         }
470     }
471
472   return c;
473 }
474
475
476 /* Update the table Vcharset_table with the given arguments (see the
477    document of `define-charset' for the meaning of each argument).
478    Several other table contents are also updated.  The caller should
479    check the validity of CHARSET-ID and the remaining arguments in
480    advance.  */
481
482 void
483 update_charset_table (charset_id, dimension, chars, width, direction,
484                       iso_final_char, iso_graphic_plane,
485                       short_name, long_name, description)
486      Lisp_Object charset_id, dimension, chars, width, direction;
487      Lisp_Object iso_final_char, iso_graphic_plane;
488      Lisp_Object short_name, long_name, description;
489 {
490   int charset = XINT (charset_id);
491   int bytes;
492   unsigned char leading_code_base, leading_code_ext;
493
494   if (NILP (CHARSET_TABLE_ENTRY (charset)))
495     CHARSET_TABLE_ENTRY (charset)
496       = Fmake_vector (make_number (CHARSET_MAX_IDX), Qnil);
497
498   if (NILP (long_name))
499     long_name = short_name;
500   if (NILP (description))
501     description = long_name;
502
503   /* Get byte length of multibyte form, base leading-code, and
504      extended leading-code of the charset.  See the comment under the
505      title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h.  */
506   bytes = XINT (dimension);
507   if (charset < MIN_CHARSET_PRIVATE_DIMENSION1)
508     {
509       /* Official charset, it doesn't have an extended leading-code.  */
510       if (charset != CHARSET_ASCII && charset != CHARSET_8_BIT_GRAPHIC)
511         bytes += 1; /* For a base leading-code.  */
512       leading_code_base = charset;
513       leading_code_ext = 0;
514     }
515   else
516     {
517       /* Private charset.  */
518       bytes += 2; /* For base and extended leading-codes.  */
519       leading_code_base
520         = (charset < LEADING_CODE_EXT_12
521            ? LEADING_CODE_PRIVATE_11
522            : (charset < LEADING_CODE_EXT_21
523               ? LEADING_CODE_PRIVATE_12
524               : (charset < LEADING_CODE_EXT_22
525                  ? LEADING_CODE_PRIVATE_21
526                  : LEADING_CODE_PRIVATE_22)));
527       leading_code_ext = charset;
528       if (BYTES_BY_CHAR_HEAD (leading_code_base) != bytes)
529         error ("Invalid dimension for the charset-ID %d", charset);
530     }
531
532   CHARSET_TABLE_INFO (charset, CHARSET_ID_IDX) = charset_id;
533   CHARSET_TABLE_INFO (charset, CHARSET_BYTES_IDX) = make_number (bytes);
534   CHARSET_TABLE_INFO (charset, CHARSET_DIMENSION_IDX) = dimension;
535   CHARSET_TABLE_INFO (charset, CHARSET_CHARS_IDX) = chars;
536   CHARSET_TABLE_INFO (charset, CHARSET_WIDTH_IDX) = width;
537   CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX) = direction;
538   CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_BASE_IDX)
539     = make_number (leading_code_base);
540   CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_EXT_IDX)
541     = make_number (leading_code_ext);
542   CHARSET_TABLE_INFO (charset, CHARSET_ISO_FINAL_CHAR_IDX) = iso_final_char;
543   CHARSET_TABLE_INFO (charset, CHARSET_ISO_GRAPHIC_PLANE_IDX)
544     = iso_graphic_plane;
545   CHARSET_TABLE_INFO (charset, CHARSET_SHORT_NAME_IDX) = short_name;
546   CHARSET_TABLE_INFO (charset, CHARSET_LONG_NAME_IDX) = long_name;
547   CHARSET_TABLE_INFO (charset, CHARSET_DESCRIPTION_IDX) = description;
548   CHARSET_TABLE_INFO (charset, CHARSET_PLIST_IDX) = Qnil;
549
550   {
551     /* If we have already defined a charset which has the same
552        DIMENSION, CHARS and ISO-FINAL-CHAR but the different
553        DIRECTION, we must update the entry REVERSE-CHARSET of both
554        charsets.  If there's no such charset, the value of the entry
555        is set to nil.  */
556     int i;
557
558     for (i = 0; i <= MAX_CHARSET; i++)
559       if (!NILP (CHARSET_TABLE_ENTRY (i)))
560         {
561           if (CHARSET_DIMENSION (i) == XINT (dimension)
562               && CHARSET_CHARS (i) == XINT (chars)
563               && CHARSET_ISO_FINAL_CHAR (i) == XINT (iso_final_char)
564               && CHARSET_DIRECTION (i) != XINT (direction))
565             {
566               CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
567                 = make_number (i);
568<