root/trunk/lisp/language/chinese.el

Revision 4220, 11.6 kB (checked in by miyoshi, 9 months ago)

Sync up with Emacs22.2.

  • Property svn:eol-style set to LF
Line 
1 ;;; chinese.el --- support for Chinese -*- coding: iso-2022-7bit; -*-
2
3 ;; Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
4 ;;   Free Software Foundation, Inc.
5 ;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
6 ;;   2005, 2006, 2007, 2008
7 ;;   National Institute of Advanced Industrial Science and Technology (AIST)
8 ;;   Registration Number H14PRO021
9
10 ;; Keywords: multilingual, Chinese
11
12 ;; This file is part of GNU Emacs.
13
14 ;; GNU Emacs is free software; you can redistribute it and/or modify
15 ;; it under the terms of the GNU General Public License as published by
16 ;; the Free Software Foundation; either version 3, or (at your option)
17 ;; any later version.
18
19 ;; GNU Emacs is distributed in the hope that it will be useful,
20 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
21 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
22 ;; GNU General Public License for more details.
23
24 ;; You should have received a copy of the GNU General Public License
25 ;; along with GNU Emacs; see the file COPYING.  If not, write to the
26 ;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
27 ;; Boston, MA 02110-1301, USA.
28
29 ;;; Commentary:
30
31 ;; For Chinese, three character sets GB2312, BIG5, and CNS11643 are
32 ;; supported.
33
34 ;;; Code:
35
36 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
37 ;;; Chinese (general)
38 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
39
40 (make-coding-system
41  'iso-2022-cn 2 ?C
42  "ISO 2022 based 7bit encoding for Chinese GB and CNS (MIME:ISO-2022-CN)."
43  '(ascii
44    (nil chinese-gb2312 chinese-cns11643-1)
45    (nil chinese-cns11643-2)
46    nil
47    nil ascii-eol ascii-cntl seven locking-shift single-shift nil nil nil
48    init-bol)
49  '((safe-charsets ascii chinese-gb2312 chinese-cns11643-1 chinese-cns11643-2)
50    (mime-charset . iso-2022-cn)))
51
52 (define-coding-system-alias 'chinese-iso-7bit 'iso-2022-cn)
53
54 (make-coding-system
55  'iso-2022-cn-ext 2 ?C
56  "ISO 2022 based 7bit encoding for Chinese GB and CNS (MIME:ISO-2022-CN-EXT)."
57  '(ascii
58    (nil chinese-gb2312 chinese-cns11643-1)
59    (nil chinese-cns11643-2)
60    (nil chinese-cns11643-3 chinese-cns11643-4 chinese-cns11643-5
61         chinese-cns11643-6 chinese-cns11643-7)
62    nil ascii-eol ascii-cntl seven locking-shift single-shift nil nil nil
63    init-bol)
64  '((safe-charsets ascii chinese-gb2312 chinese-cns11643-1 chinese-cns11643-2
65                   chinese-cns11643-3 chinese-cns11643-4 chinese-cns11643-5
66                   chinese-cns11643-6 chinese-cns11643-7)
67    (mime-charset . iso-2022-cn-ext)))
68
69
70 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
71 ;;; Chinese GB2312 (simplified)
72 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
73
74 (make-coding-system
75  'chinese-iso-8bit 2 ?c
76  "ISO 2022 based EUC encoding for Chinese GB2312 (MIME:GB2312)."
77  '(ascii chinese-gb2312 nil nil
78    nil ascii-eol ascii-cntl nil nil nil nil)
79  '((safe-charsets ascii chinese-gb2312)
80    (mime-charset . gb2312)))
81
82 (define-coding-system-alias 'cn-gb-2312 'chinese-iso-8bit)
83 (define-coding-system-alias 'euc-china 'chinese-iso-8bit)
84 (define-coding-system-alias 'euc-cn 'chinese-iso-8bit)
85 (define-coding-system-alias 'cn-gb 'chinese-iso-8bit)
86 (define-coding-system-alias 'gb2312 'chinese-iso-8bit)
87 (define-coding-system-alias 'cp936 'chinese-iso-8bit)
88
89 (make-coding-system
90  'chinese-hz 0 ?z
91  "Hz/ZW 7-bit encoding for Chinese GB2312 (MIME:HZ-GB-2312)."
92  nil
93  '((safe-charsets ascii chinese-gb2312)
94    (mime-charset . hz-gb-2312)
95    (post-read-conversion . post-read-decode-hz)
96    (pre-write-conversion . pre-write-encode-hz)))
97
98 (define-coding-system-alias 'hz-gb-2312 'chinese-hz)
99 (define-coding-system-alias 'hz 'chinese-hz)
100
101 (defun post-read-decode-hz (len)
102   (let ((pos (point))
103         (buffer-modified-p (buffer-modified-p))
104         last-coding-system-used)
105     (prog1
106         (decode-hz-region pos (+ pos len))
107       (set-buffer-modified-p buffer-modified-p))))
108
109 (defun pre-write-encode-hz (from to)
110   (let ((buf (current-buffer)))
111     (set-buffer (generate-new-buffer " *temp*"))
112     (if (stringp from)
113         (insert from)
114       (insert-buffer-substring buf from to))
115     (let (last-coding-system-used)
116       (encode-hz-region 1 (point-max)))
117     nil))
118
119 (set-language-info-alist
120  "Chinese-GB" '((charset chinese-gb2312 chinese-sisheng)
121                 (coding-system chinese-iso-8bit iso-2022-cn chinese-hz)
122                 (coding-priority chinese-iso-8bit chinese-big5 iso-2022-cn)
123                 (input-method . "chinese-py-punct")
124                 (features china-util)
125                 (sample-text . "Chinese ($AVPND(B,$AFUM(;0(B,$A::So(B)    $ADc:C(B")
126                 (documentation . "Support for Chinese GB2312 character set.")
127                 (tutorial . "TUTORIAL.cn"))
128  '("Chinese"))
129
130 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
131 ;; Chinese BIG5 (traditional)
132 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
133
134 (make-coding-system
135  'chinese-big5 3 ?B
136  "BIG5 8-bit encoding for Chinese (MIME:Big5)."
137  nil
138  '((safe-charsets ascii chinese-big5-1 chinese-big5-2)
139    (mime-charset . big5)
140    (charset-origin-alist (chinese-big5-1  "BIG5" encode-big5-char)
141                          (chinese-big5-2  "BIG5" encode-big5-char))))
142
143 (define-coding-system-alias 'big5 'chinese-big5)
144 (define-coding-system-alias 'cn-big5 'chinese-big5)
145 (define-coding-system-alias 'cp950 'chinese-big5)
146
147 ;; Big5 font requires special encoding.
148 (define-ccl-program ccl-encode-big5-font
149   `(0
150     ;; In:  R0:chinese-big5-1 or chinese-big5-2
151     ;;      R1:position code 1
152     ;;      R2:position code 2
153     ;; Out: R1:font code point 1
154     ;;      R2:font code point 2
155     ((r2 = ((((r1 - ?\x21) * 94) + r2) - ?\x21))
156      (if (r0 == ,(charset-id 'chinese-big5-2)) (r2 += 6280))
157      (r1 = ((r2 / 157) + ?\xA1))
158      (r2 %= 157)
159      (if (r2 < ?\x3F) (r2 += ?\x40) (r2 += ?\x62))))
160   "CCL program to encode a Big5 code to code point of Big5 font.")
161
162 (setq font-ccl-encoder-alist
163       (cons (cons "big5" ccl-encode-big5-font) font-ccl-encoder-alist))
164
165 (set-language-info-alist
166  "Chinese-BIG5" '((charset chinese-big5-1 chinese-big5-2)
167                   (coding-system chinese-big5 chinese-iso-7bit)
168                   (coding-priority chinese-big5 iso-2022-cn chinese-iso-8bit)
169                   (input-method . "chinese-py-punct-b5")
170                   (features china-util)
171                   (sample-text . "Cantonese ($(0GnM$(B,$(0N]0*Hd(B) $(0*/=((B, $(0+$)p(B")
172                   (documentation . "Support for Chinese Big5 character set.")
173                   (tutorial . "TUTORIAL.zh"))
174  '("Chinese"))
175
176 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
177 ;; Chinese CNS11643 (traditional)
178 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
179
180 (defvar big5-to-cns (make-translation-table)
181   "Translation table for encoding to `euc-tw'.")
182 ;; Could have been done by china-util loaded before.
183 (unless (get 'big5-to-cns 'translation-table)
184   (define-translation-table 'big5-to-cns big5-to-cns))
185
186 (define-ccl-program ccl-decode-euc-tw
187   ;; CNS plane 1 needs either two or four bytes in EUC-TW encoding;
188   ;; CNS planes 2 to 7 always need four bytes.  In internal encoding of
189   ;; Emacs, CNS planes 1 and 2 need three bytes, and planes 3 to 7 need
190   ;; four bytes.  Thus a buffer magnification value of 2 (for both
191   ;; encoding and decoding) is sufficient.
192   `(2
193     ;; we don't have enough registers to hold all charset-ids
194     ((r4 = ,(charset-id 'chinese-cns11643-1))
195      (r5 = ,(charset-id 'chinese-cns11643-2))
196      (r6 = ,(charset-id 'chinese-cns11643-3))
197      (loop
198       (read-if (r0 < #x80)
199           ;; ASCII
200           (write-repeat r0)
201         ;; not ASCII
202         (if (r0 == #x8E)
203             ;; single shift
204             (read-if (r1 < #xA1)
205                 ;; invalid byte
206                 ((write r0)
207                  (write-repeat r1))
208               (if (r1 > #xA7)
209                   ;; invalid plane
210                   ((write r0)
211                    (write-repeat r1))
212                 ;; OK, we have a plane
213                 (read-if (r2 < #xA1)
214                     ;; invalid first byte
215                     ((write r0 r1)
216                      (write-repeat r2))
217                   (read-if (r3 < #xA1)
218                       ;; invalid second byte
219                       ((write r0 r1 r2)
220                        (write-repeat r3))
221                     ;; CNS 1-7, finally
222                     ((branch (r1 - #xA1)
223                       (r1 = r4)
224                       (r1 = r5)
225                       (r1 = r6)
226                       (r1 = ,(charset-id 'chinese-cns11643-4))
227                       (r1 = ,(charset-id 'chinese-cns11643-5))
228                       (r1 = ,(charset-id 'chinese-cns11643-6))
229                       (r1 = ,(charset-id 'chinese-cns11643-7)))
230                      (r2 = ((((r2 - #x80) << 7) + r3) - #x80))
231                      (write-multibyte-character r1 r2)
232                      (repeat))))))
233           ;; standard EUC
234           (if (r0 < #xA1)
235               ;; invalid first byte
236               (write-repeat r0)
237             (read-if (r1 < #xA1)
238                 ;; invalid second byte
239                 ((write r0)
240                  (write-repeat r1))
241               ;; CNS 1, finally
242               ((r1 = ((((r0 - #x80) << 7) + r1) - #x80))
243                (write-multibyte-character r4 r1)
244                (repeat)))))))))
245   "CCL program to decode EUC-TW encoding."
246 )
247
248 (define-ccl-program ccl-encode-euc-tw
249   `(2
250     ;; we don't have enough registers to hold all charset-ids
251     ((r2 = ,(charset-id 'ascii))
252      (r3 = ,(charset-id 'chinese-big5-1))
253      (r4 = ,(charset-id 'chinese-big5-2))
254      (r5 = ,(charset-id 'chinese-cns11643-1))
255      (r6 = ,(charset-id 'chinese-cns11643-2))
256      (loop
257       (read-multibyte-character r0 r1)
258       (if (r0 == r2)
259           (write-repeat r1)
260         (;; Big 5 encoded characters are first translated to CNS
261          (if (r0 == r3)
262              (translate-character big5-to-cns r0 r1)
263            (if (r0 == r4)
264                (translate-character big5-to-cns r0 r1)))
265          (if (r0 == r5)
266              (r0 = #xA1)
267            (if (r0 == r6)
268                (r0 = #xA2)
269              (if (r0 == ,(charset-id 'chinese-cns11643-3))
270                  (r0 = #xA3)
271                (if (r0 == ,(charset-id 'chinese-cns11643-4))
272                    (r0 = #xA4)
273                  (if (r0 == ,(charset-id 'chinese-cns11643-5))
274                      (r0 = #xA5)
275                    (if (r0 == ,(charset-id 'chinese-cns11643-6))
276                        (r0 = #xA6)
277                      (if (r0 == ,(charset-id 'chinese-cns11643-7))
278                          (r0 = #xA7)
279                        ;; not CNS.  We use a dummy character which
280                        ;; can't occur in EUC-TW encoding to indicate
281                        ;; this.
282                        (write-repeat #xFF))))))))))
283       (if (r0 != #xA1)
284           ;; single shift and CNS plane
285           ((write #x8E)
286            (write r0)))
287       (write ((r1 >> 7) + #x80))
288       (write ((r1 % #x80) + #x80))
289       (repeat))))
290   "CCL program to encode EUC-TW encoding."
291 )
292
293 (defun euc-tw-pre-write-conversion (beg end)
294   "Semi-dummy pre-write function effectively to autoload china-util."
295   ;; Ensure translation table is loaded.
296   (require 'china-util)
297   ;; Don't do this again.
298   (coding-system-put 'euc-tw 'pre-write-conversion nil)
299   nil)
300
301 (make-coding-system
302   'euc-tw 4 ?Z
303   "ISO 2022 based EUC encoding for Chinese CNS11643.
304 Big5 encoding is accepted for input also (which is then converted to CNS)."
305   '(ccl-decode-euc-tw . ccl-encode-euc-tw)
306   '((safe-charsets ascii
307                    chinese-big5-1
308                    chinese-big5-2
309                    chinese-cns11643-1
310                    chinese-cns11643-2
311                    chinese-cns11643-3
312                    chinese-cns11643-4
313                    chinese-cns11643-5
314                    chinese-cns11643-6
315                    chinese-cns11643-7)
316     (valid-codes (0 . 255))
317     (pre-write-conversion . euc-tw-pre-write-conversion)))
318
319 (define-coding-system-alias 'euc-taiwan 'euc-tw)
320
321 (set-language-info-alist
322  "Chinese-CNS" '((charset chinese-cns11643-1 chinese-cns11643-2
323                           chinese-cns11643-3 chinese-cns11643-4
324                           chinese-cns11643-5 chinese-cns11643-6
325                           chinese-cns11643-7)
326                  (coding-system iso-2022-cn euc-tw)
327                  (coding-priority iso-2022-cn euc-tw chinese-big5
328                                   chinese-iso-8bit)
329                  (features china-util)
330                  (input-method . "chinese-cns-quick")
331                  (documentation . "\
332 Support for Chinese CNS character sets.  Note that the EUC-TW coding system
333 accepts Big5 for input also (which is then converted to CNS)."))
334  '("Chinese"))
335
336 (set-language-info-alist
337  "Chinese-EUC-TW" '((charset chinese-cns11643-1 chinese-cns11643-2
338                              chinese-cns11643-3 chinese-cns11643-4
339                              chinese-cns11643-5 chinese-cns11643-6
340                              chinese-cns11643-7 chinese-big5-1 chinese-big5-2)
341                     (coding-system euc-tw iso-2022-cn)
342                     (coding-priority euc-tw chinese-big5 iso-2022-cn
343                                      chinese-iso-8bit)
344                     (features china-util)
345                     (input-method . "chinese-cns-quick")
346                     (documentation . "\
347 Support for Chinese, prefering the EUC-TW character set.  Note that
348 the EUC-TW coding system accepts Big5 for input also (which is then
349 converted to CNS)."))
350  '("Chinese"))
351
352 (provide 'chinese)
353
354 ;;; arch-tag: b82fcf7a-84f6-4e0b-b38c-1742dac0e09f
355 ;;; chinese.el ends here
356
Note: See TracBrowser for help on using the browser.