src/coding.c

/* [<][>][^][v][top][bottom][index][help] */
This source file includes following definitions.
detect_coding_XXX
decode_coding_XXXX
encode_coding_XXX
encode_inhibit_flag
inhibit_flag
growable_destination
record_conversion_result
coding_set_source
coding_change_source
coding_set_destination
coding_change_destination
coding_alloc_by_realloc
coding_alloc_by_making_gap
alloc_destination
detect_coding_utf_8
decode_coding_utf_8
encode_coding_utf_8
detect_coding_utf_16
decode_coding_utf_16
encode_coding_utf_16
detect_coding_emacs_mule
emacs_mule_char
emacs_mule_finish_composition
decode_coding_emacs_mule
encode_coding_emacs_mule
setup_iso_safe_charsets
detect_coding_iso_2022
finish_composition
decode_coding_iso_2022
encode_invocation_designation
encode_designation_at_bol
encode_coding_iso_2022
detect_coding_sjis
detect_coding_big5
decode_coding_sjis
decode_coding_big5
encode_coding_sjis
encode_coding_big5
detect_coding_ccl
decode_coding_ccl
encode_coding_ccl
decode_coding_raw_text
encode_coding_raw_text
detect_coding_charset
decode_coding_charset
encode_coding_charset
setup_coding_system
coding_charset_list
coding_system_charset_list
raw_text_coding_system
raw_text_coding_system_p
coding_inherit_eol_type
complement_process_encoding_system
check_ascii
check_utf_8
utf8_string_p
make_string_from_utf8
detect_eol
adjust_coding_eol_type
detect_coding
decode_eol
get_translation_table
get_translation
produce_chars
produce_composition
produce_charset
produce_annotation
decode_coding
handle_composition_annotation
handle_charset_annotation
consume_chars
encode_coding
code_conversion_restore
code_conversion_save
coding_restore_undo_list
decode_coding_gap
decode_coding_object
encode_coding_object
preferred_coding_system
from_unicode
from_unicode_buffer
to_unicode
DEFUN
DEFUN
DEFUN
detect_coding_system
char_encodable_p
code_convert_region
string_ascii_p
code_convert_string
code_convert_string_norecord
get_buffer_gap_address
get_char_bytes
encode_string_utf_8
decode_string_utf_8
convert_string_nocopy
decode_file_name
encode_file_name_1
encode_file_name
DEFUN
DEFUN
DEFUN
DEFUN
DEFUN
DEFUN
DEFUN
DEFUN
make_subsidiaries
DEFUN
DEFUN
DEFUN
DEFUN
init_coding_once
syms_of_coding
reset_coding_after_pdumper_load
     1 /* Coding system handler (conversion, detection, etc).
     2    Copyright (C) 2001-2023 Free Software Foundation, Inc.
     3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
     4      2005, 2006, 2007, 2008, 2009, 2010, 2011
     5      National Institute of Advanced Industrial Science and Technology (AIST)
     6      Registration Number H14PRO021
     7    Copyright (C) 2003
     8      National Institute of Advanced Industrial Science and Technology (AIST)
     9      Registration Number H13PRO009
    10 
    11 This file is part of GNU Emacs.
    12 
    13 GNU Emacs is free software: you can redistribute it and/or modify
    14 it under the terms of the GNU General Public License as published by
    15 the Free Software Foundation, either version 3 of the License, or (at
    16 your option) any later version.
    17 
    18 GNU Emacs is distributed in the hope that it will be useful,
    19 but WITHOUT ANY WARRANTY; without even the implied warranty of
    20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    21 GNU General Public License for more details.
    22 
    23 You should have received a copy of the GNU General Public License
    24 along with GNU Emacs.  If not, see <https://www.gnu.org/licenses/>.  */
    25 
    26 /*** TABLE OF CONTENTS ***
    27 
    28   0. General comments
    29   1. Preamble
    30   2. Emacs' internal format (emacs-utf-8) handlers
    31   3. UTF-8 handlers
    32   4. UTF-16 handlers
    33   5. Charset-base coding systems handlers
    34   6. emacs-mule (old Emacs' internal format) handlers
    35   7. ISO2022 handlers
    36   8. Shift-JIS and BIG5 handlers
    37   9. CCL handlers
    38   10. C library functions
    39   11. Emacs Lisp library functions
    40   12. Postamble
    41 
    42 */
    43 
    44 /*** 0. General comments ***
    45 
    46 
    47 CODING SYSTEM
    48 
    49   A coding system is an object for an encoding mechanism that contains
    50   information about how to convert byte sequences to character
    51   sequences and vice versa.  When we say "decode", it means converting
    52   a byte sequence of a specific coding system into a character
    53   sequence that is represented by Emacs' internal coding system
    54   `emacs-utf-8', and when we say "encode", it means converting a
    55   character sequence of emacs-utf-8 to a byte sequence of a specific
    56   coding system.
    57 
    58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
    59   the C level, a coding system is represented by a vector of attributes
    60   stored in the hash table Vcoding_system_hash_table.  The conversion
    61   from coding system symbol to attributes vector is done by looking up
    62   Vcoding_system_hash_table by the symbol.
    63 
    64   Coding systems are classified into the following types depending on
    65   the encoding mechanism.  Here's a brief description of the types.
    66 
    67   o UTF-8
    68 
    69   o UTF-16
    70 
    71   o Charset-base coding system
    72 
    73   A coding system defined by one or more (coded) character sets.
    74   Decoding and encoding are done by a code converter defined for each
    75   character set.
    76 
    77   o Old Emacs internal format (emacs-mule)
    78 
    79   The coding system adopted by old versions of Emacs (20 and 21).
    80 
    81   o ISO2022-base coding system
    82 
    83   The most famous coding system for multiple character sets.  X's
    84   Compound Text, various EUCs (Extended Unix Code), and coding systems
    85   used in the Internet communication such as ISO-2022-JP are all
    86   variants of ISO2022.
    87 
    88   o SJIS (or Shift-JIS or MS-Kanji-Code)
    89 
    90   A coding system to encode character sets: ASCII, JISX0201, and
    91   JISX0208.  Widely used for PC's in Japan.  Details are described in
    92   section 8.
    93 
    94   o BIG5
    95 
    96   A coding system to encode character sets: ASCII and Big5.  Widely
    97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
    98   described in section 8.  In this file, when we write "big5" (all
    99   lowercase), we mean the coding system, and when we write "Big5"
   100   (capitalized), we mean the character set.
   101 
   102   o CCL
   103 
   104   If a user wants to decode/encode text encoded in a coding system
   105   not listed above, they can supply a decoder and an encoder for it in
   106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
   107   program while decoding/encoding.
   108 
   109   o Raw-text
   110 
   111   A coding system for text containing raw eight-bit data.  Emacs
   112   treats each byte of source text as a character (except for
   113   end-of-line conversion).
   114 
   115   o No-conversion
   116 
   117   Like raw text, but don't do end-of-line conversion.
   118 
   119 
   120 END-OF-LINE FORMAT
   121 
   122   How text end-of-line is encoded depends on operating system.  For
   123   instance, Unix's format is just one byte of LF (line-feed) code,
   124   whereas DOS's format is two-byte sequence of `carriage-return' and
   125   `line-feed' codes.  Classic Mac OS's format is usually one byte of
   126   `carriage-return'.
   127 
   128   Since text character encoding and end-of-line encoding are
   129   independent, any coding system described above can take any format
   130   of end-of-line (except for no-conversion).
   131 
   132 STRUCT CODING_SYSTEM
   133 
   134   Before using a coding system for code conversion (i.e. decoding and
   135   encoding), we setup a structure of type `struct coding_system'.
   136   This structure keeps various information about a specific code
   137   conversion (e.g. the location of source and destination data).
   138 
   139 */
   140 
   141 /* COMMON MACROS */
   142 
   143 
   144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
   145 
   146   These functions check if a byte sequence specified as a source in
   147   CODING conforms to the format of XXX, and update the members of
   148   DETECT_INFO.
   149 
   150   Return true if the byte sequence conforms to XXX.
   151 
   152   Below is the template of these functions.  */
   153 
   154 #if 0
   155 static bool
   156 detect_coding_XXX (struct coding_system *coding,
   157                    struct coding_detection_info *detect_info)
   158 {
   159   const unsigned char *src = coding->source;
   160   const unsigned char *src_end = coding->source + coding->src_bytes;
   161   bool multibytep = coding->src_multibyte;
   162   ptrdiff_t consumed_chars = 0;
   163   int found = 0;
   164   ...;
   165 
   166   while (1)
   167     {
   168       /* Get one byte from the source.  If the source is exhausted, jump
   169          to no_more_source:.  */
   170       ONE_MORE_BYTE (c);
   171 
   172       if (! __C_conforms_to_XXX___ (c))
   173         break;
   174       if (! __C_strongly_suggests_XXX__ (c))
   175         found = CATEGORY_MASK_XXX;
   176     }
   177   /* The byte sequence is invalid for XXX.  */
   178   detect_info->rejected |= CATEGORY_MASK_XXX;
   179   return 0;
   180 
   181  no_more_source:
   182   /* The source exhausted successfully.  */
   183   detect_info->found |= found;
   184   return 1;
   185 }
   186 #endif
   187 
   188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
   189 
   190   These functions decode a byte sequence specified as a source by
   191   CODING.  The resulting multibyte text goes to a place pointed to by
   192   CODING->charbuf, the length of which should not exceed
   193   CODING->charbuf_size.
   194 
   195   These functions set the information of original and decoded texts in
   196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
   197   They also set CODING->result to one of CODING_RESULT_XXX indicating
   198   how the decoding is finished.
   199 
   200   Below is the template of these functions.  */
   201 
   202 #if 0
   203 static void
   204 decode_coding_XXXX (struct coding_system *coding)
   205 {
   206   const unsigned char *src = coding->source + coding->consumed;
   207   const unsigned char *src_end = coding->source + coding->src_bytes;
   208   /* SRC_BASE remembers the start position in source in each loop.
   209      The loop will be exited when there's not enough source code, or
   210      when there's no room in CHARBUF for a decoded character.  */
   211   const unsigned char *src_base;
   212   /* A buffer to produce decoded characters.  */
   213   int *charbuf = coding->charbuf + coding->charbuf_used;
   214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
   215   bool multibytep = coding->src_multibyte;
   216 
   217   while (1)
   218     {
   219       src_base = src;
   220       if (charbuf < charbuf_end)
   221         /* No more room to produce a decoded character.  */
   222         break;
   223       ONE_MORE_BYTE (c);
   224       /* Decode it. */
   225     }
   226 
   227  no_more_source:
   228   if (src_base < src_end
   229       && coding->mode & CODING_MODE_LAST_BLOCK)
   230     /* If the source ends by partial bytes to construct a character,
   231        treat them as eight-bit raw data.  */
   232     while (src_base < src_end && charbuf < charbuf_end)
   233       *charbuf++ = *src_base++;
   234   /* Remember how many bytes and characters we consumed.  If the
   235      source is multibyte, the bytes and chars are not identical.  */
   236   coding->consumed = coding->consumed_char = src_base - coding->source;
   237   /* Remember how many characters we produced.  */
   238   coding->charbuf_used = charbuf - coding->charbuf;
   239 }
   240 #endif
   241 
   242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
   243 
   244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
   245   internal multibyte format by CODING.  The resulting byte sequence
   246   goes to a place pointed to by DESTINATION, the length of which
   247   should not exceed DST_BYTES.
   248 
   249   These functions set the information of original and encoded texts in
   250   the members produced, produced_char, consumed, and consumed_char of
   251   the structure *CODING.  They also set the member result to one of
   252   CODING_RESULT_XXX indicating how the encoding finished.
   253 
   254   DST_BYTES zero means that source area and destination area are
   255   overlapped, which means that we can produce an encoded text until it
   256   reaches at the head of not-yet-encoded source text.
   257 
   258   Below is a template of these functions.  */
   259 #if 0
   260 static void
   261 encode_coding_XXX (struct coding_system *coding)
   262 {
   263   bool multibytep = coding->dst_multibyte;
   264   int *charbuf = coding->charbuf;
   265   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
   266   unsigned char *dst = coding->destination + coding->produced;
   267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
   268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
   269   ptrdiff_t produced_chars = 0;
   270 
   271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
   272     {
   273       int c = *charbuf;
   274       /* Encode C into DST, and increment DST.  */
   275     }
   276  label_no_more_destination:
   277   /* How many chars and bytes we produced.  */
   278   coding->produced_char += produced_chars;
   279   coding->produced = dst - coding->destination;
   280 }
   281 #endif
   282 
   283 
   284 /*** 1. Preamble ***/
   285 
   286 #include <config.h>
   287 
   288 #ifdef HAVE_WCHAR_H
   289 #include <wchar.h>
   290 #endif /* HAVE_WCHAR_H */
   291 
   292 #include "lisp.h"
   293 #include "character.h"
   294 #include "buffer.h"
   295 #include "charset.h"
   296 #include "ccl.h"
   297 #include "composite.h"
   298 #include "coding.h"
   299 #include "termhooks.h"
   300 #include "pdumper.h"
   301 
   302 Lisp_Object Vcoding_system_hash_table;
   303 
   304 /* Coding-systems are handed between Emacs Lisp programs and C internal
   305    routines by the following three variables.
          NOTE(review): this sentence looks stale -- each declaration below
          carries its own comment and none is described as such a hand-over
          variable; confirm which variables were meant.  */
   306 /* Coding system to be used to encode text for terminal display when
   307    terminal coding system is nil.  */
   308 struct coding_system safe_terminal_coding;
   309 
   310 /* Two special coding systems.  */
   311 static Lisp_Object Vsjis_coding_system;
   312 static Lisp_Object Vbig5_coding_system;
   313 
   314 /* ISO2022 section */
   315 
       /* Apply XFIXNUM to element REG of the `coding_attr_iso_initial'
          attribute in CODING's attribute vector.  */
   316 #define CODING_ISO_INITIAL(coding, reg)                 \
   317   (XFIXNUM (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
   318                      coding_attr_iso_initial),          \
   319                reg)))
   320 
   321 
       /* Yield CODING->safe_charsets[CHARSET_ID], or -1 when CHARSET_ID is
          greater than CODING->max_charset_id or the stored value is 255.  */
   322 #define CODING_ISO_REQUEST(coding, charset_id)          \
   323   (((charset_id) <= (coding)->max_charset_id            \
   324     ? ((coding)->safe_charsets[charset_id] != 255       \
   325        ? (coding)->safe_charsets[charset_id]            \
   326        : -1)                                            \
   327     : -1))
   328 
   329 
       /* Accessors for the members of CODING->spec.iso_2022.  */
   330 #define CODING_ISO_FLAGS(coding)        \
   331   ((coding)->spec.iso_2022.flags)
   332 #define CODING_ISO_DESIGNATION(coding, reg)     \
   333   ((coding)->spec.iso_2022.current_designation[reg])
   334 #define CODING_ISO_INVOCATION(coding, plane)    \
   335   ((coding)->spec.iso_2022.current_invocation[plane])
   336 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
   337   ((coding)->spec.iso_2022.single_shifting)
   338 #define CODING_ISO_BOL(coding)  \
   339   ((coding)->spec.iso_2022.bol)
       /* Yield -1 when the invocation recorded for PLANE is negative;
          otherwise yield the designation of the register that is currently
          invoked to PLANE.  */
   340 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
   341   (CODING_ISO_INVOCATION (coding, plane) < 0 ? -1       \
   342    : CODING_ISO_DESIGNATION (coding, CODING_ISO_INVOCATION (coding, plane)))
       /* Note that this one yields the address of the member, not its
          value.  */
   343 #define CODING_ISO_CMP_STATUS(coding)   \
   344   (&(coding)->spec.iso_2022.cmp_status)
   345 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
   346   ((coding)->spec.iso_2022.ctext_extended_segment_len)
   347 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
   348   ((coding)->spec.iso_2022.embedded_utf_8)
   349 
   350 /* Control characters of ISO2022.  */
   351                         /* code */      /* function */
   352 #define ISO_CODE_SO     0x0E            /* shift-out */
   353 #define ISO_CODE_SI     0x0F            /* shift-in */
   354 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
   355 #define ISO_CODE_ESC    0x1B            /* escape */
   356 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
   357 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
   358 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
   359 
   360 /* Each 1-byte code of ISO2022 is classified into one of the
   361    following classes.  */
   362 enum iso_code_class_type
   363   {
   364     ISO_control_0,              /* Control codes in the range
   365                                    0x00..0x1F and 0x7F, except for the
   366                                    following 4 codes.
                                          NOTE(review): 0x7F is also claimed
                                          by ISO_0x20_or_0x7F below; confirm
                                          which class it is actually given.  */
   367     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
   368     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
   369     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
   370     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
   371     ISO_control_1,              /* Control codes in the range
   372                                    0x80..0x9F, except for the
   373                                    following 3 codes.  */
   374     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
   375     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
   376     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
   377     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
   378     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
   379     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
   380     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
   381   };
   382 
   383 /** The macros CODING_ISO_FLAG_XXX define the flag bits of the
   384     `iso-flags' attribute of an iso2022 coding system.  */
   385 
   386 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
   387    instead of the correct short-form sequence (e.g. ESC $ A).  */
   388 #define CODING_ISO_FLAG_LONG_FORM       0x0001
   389 
   390 /* If set, reset graphic planes and registers at end-of-line to the
   391    initial state.  */
   392 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
   393 
   394 /* If set, reset graphic planes and registers before any control
   395    characters to the initial state.  */
   396 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
   397 
   398 /* If set, encode by 7-bit environment.  */
   399 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
   400 
   401 /* If set, use locking-shift function.  */
   402 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
   403 
   404 /* If set, use single-shift function.  Overwrite
   405    CODING_ISO_FLAG_LOCKING_SHIFT.  */
   406 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
   407 
   408 /* If set, use designation escape sequence.  */
   409 #define CODING_ISO_FLAG_DESIGNATION     0x0040
   410 
   411 /* If set, produce revision number sequence.  */
   412 #define CODING_ISO_FLAG_REVISION        0x0080
   413 
   414 /* If set, produce ISO6429's direction specifying sequence.  */
   415 #define CODING_ISO_FLAG_DIRECTION       0x0100
   416 
   417 /* If set, assume designation states are reset at beginning of line on
   418    output.  */
   419 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
   420 
   421 /* If set, designation sequence should be placed at beginning of line
   422    on output.  */
   423 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
   424 
   425 /* If set, do not encode unsafe characters on output.  */
   426 #define CODING_ISO_FLAG_SAFE            0x0800
   427 
   428 /* If set, extra latin codes (128..159) are accepted as a valid code
   429    on input.  */
   430 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
   431 
       /* NOTE(review): the remaining flags carry no description here; their
          meaning should be confirmed from their uses before relying on
          them.  Bit 0x4000 is kept free by the commented-out definition.  */
   432 #define CODING_ISO_FLAG_COMPOSITION     0x2000
   433 
   434 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
   435 
   436 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
   437 
   438 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
   439 
   440 #define CODING_ISO_FLAG_LEVEL_4         0x20000
   441 
   442 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
   443 
   444 /* A character to be produced on output if encoding of the original
   445    character is prohibited by CODING_ISO_FLAG_SAFE.  */
   446 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
   447 
   448 /* UTF-8 section */
       /* Accessor for CODING->spec.utf_8_bom.  */
   449 #define CODING_UTF_8_BOM(coding)        \
   450   ((coding)->spec.utf_8_bom)
   451 
   452 /* UTF-16 section */
       /* Accessors for the members of CODING->spec.utf_16.  */
   453 #define CODING_UTF_16_BOM(coding)       \
   454   ((coding)->spec.utf_16.bom)
   455 
   456 #define CODING_UTF_16_ENDIAN(coding)    \
   457   ((coding)->spec.utf_16.endian)
   458 
   459 #define CODING_UTF_16_SURROGATE(coding) \
   460   ((coding)->spec.utf_16.surrogate)
   461 
   462 
   463 /* CCL section */
       /* Each of these fetches one attribute from the attribute vector of
          CODING; CODING_CCL_VALIDS additionally applies SDATA to it.  */
   464 #define CODING_CCL_DECODER(coding)      \
   465   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
   466 #define CODING_CCL_ENCODER(coding)      \
   467   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
   468 #define CODING_CCL_VALIDS(coding)                                          \
   469   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
   470 
   471 /* Index for each coding category in `coding_categories'.  Each value
          below coding_category_undecided is also used as the bit position
          of the matching CATEGORY_MASK_XXX flag, and coding_category_max
          is the size of the tables indexed by this type.  */
   472 
   473 enum coding_category
   474   {
   475     coding_category_iso_7,
   476     coding_category_iso_7_tight,
   477     coding_category_iso_8_1,
   478     coding_category_iso_8_2,
   479     coding_category_iso_7_else,
   480     coding_category_iso_8_else,
   481     coding_category_utf_8_auto,
   482     coding_category_utf_8_nosig,
   483     coding_category_utf_8_sig,
   484     coding_category_utf_16_auto,
   485     coding_category_utf_16_be,
   486     coding_category_utf_16_le,
   487     coding_category_utf_16_be_nosig,
   488     coding_category_utf_16_le_nosig,
   489     coding_category_charset,
   490     coding_category_sjis,
   491     coding_category_big5,
   492     coding_category_ccl,
   493     coding_category_emacs_mule,
   494     /* All above are targets of code detection.  */
   495     coding_category_raw_text,
   496     coding_category_undecided,
   497     coding_category_max
   498   };
   499 
   500 /* Definitions of flag bits used in detect_coding_XXXX.  Each mask is
          a single bit whose position is the corresponding value of
          `enum coding_category'.  */
   501 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
   502 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
   503 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
   504 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
   505 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
   506 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
   507 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
   508 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
   509 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
   510 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
   511 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
   512 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
   513 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
   514 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
   515 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
   516 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
   517 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
   518 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
   519 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
   520 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
   521 
   522 /* This value is returned if detect_coding_mask () finds nothing other
   523    than ASCII characters.  It is the union of every mask defined above
          except CATEGORY_MASK_RAW_TEXT.  */
   524 #define CATEGORY_MASK_ANY               \
   525   (CATEGORY_MASK_ISO_7                  \
   526    | CATEGORY_MASK_ISO_7_TIGHT          \
   527    | CATEGORY_MASK_ISO_8_1              \
   528    | CATEGORY_MASK_ISO_8_2              \
   529    | CATEGORY_MASK_ISO_7_ELSE           \
   530    | CATEGORY_MASK_ISO_8_ELSE           \
   531    | CATEGORY_MASK_UTF_8_AUTO           \
   532    | CATEGORY_MASK_UTF_8_NOSIG          \
   533    | CATEGORY_MASK_UTF_8_SIG            \
   534    | CATEGORY_MASK_UTF_16_AUTO          \
   535    | CATEGORY_MASK_UTF_16_BE            \
   536    | CATEGORY_MASK_UTF_16_LE            \
   537    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
   538    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
   539    | CATEGORY_MASK_CHARSET              \
   540    | CATEGORY_MASK_SJIS                 \
   541    | CATEGORY_MASK_BIG5                 \
   542    | CATEGORY_MASK_CCL                  \
   543    | CATEGORY_MASK_EMACS_MULE)
   544 
   545 
       /* Unions of the single-category masks, grouped by family.  */
   546 #define CATEGORY_MASK_ISO_7BIT \
   547   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
   548 
   549 #define CATEGORY_MASK_ISO_8BIT \
   550   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
   551 
   552 #define CATEGORY_MASK_ISO_ELSE \
   553   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
   554 
   555 #define CATEGORY_MASK_ISO_ESCAPE        \
   556   (CATEGORY_MASK_ISO_7                  \
   557    | CATEGORY_MASK_ISO_7_TIGHT          \
   558    | CATEGORY_MASK_ISO_7_ELSE           \
   559    | CATEGORY_MASK_ISO_8_ELSE)
   560 
   561 #define CATEGORY_MASK_ISO       \
   562   (  CATEGORY_MASK_ISO_7BIT     \
   563      | CATEGORY_MASK_ISO_8BIT   \
   564      | CATEGORY_MASK_ISO_ELSE)
   565 
   566 #define CATEGORY_MASK_UTF_16            \
   567   (CATEGORY_MASK_UTF_16_AUTO            \
   568    | CATEGORY_MASK_UTF_16_BE            \
   569    | CATEGORY_MASK_UTF_16_LE            \
   570    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
   571    | CATEGORY_MASK_UTF_16_LE_NOSIG)
   572 
   573 #define CATEGORY_MASK_UTF_8     \
   574   (CATEGORY_MASK_UTF_8_AUTO     \
   575    | CATEGORY_MASK_UTF_8_NOSIG  \
   576    | CATEGORY_MASK_UTF_8_SIG)
   577 
   578 /* Table of coding categories (Lisp symbols).  This variable is for
   579    internal use only.  */
   580 static Lisp_Object Vcoding_category_table;
   581 
   582 /* Table of coding-categories ordered by priority.  */
   583 static enum coding_category coding_priorities[coding_category_max];
   584 
   585 /* Nth element is a coding context for the coding system bound to the
   586    Nth coding category, N being a value of `enum coding_category'.  */
   587 static struct coding_system coding_categories[coding_category_max];
   588 
   589 /* Encode a flag that can be nil, something else, or t as -1, 0, 1.  */
   590 
   591 static int
   592 encode_inhibit_flag (Lisp_Object flag)
   593 {
   594   return NILP (flag) ? -1 : EQ (flag, Qt);
   595 }
   596 
   597 /* True if the value of ENCODED_FLAG says a flag should be treated as set.
   598    1 means yes, -1 means no, 0 means ask the user variable VAR.  */
   599 
   600 static bool
   601 inhibit_flag (int encoded_flag, bool var)
   602 {
   603   return 0 < encoded_flag + var;
   604 }
   605 
       /* Set ATTRS to the attribute vector of CODING (CODING_ID_ATTRS of
          its id) and CHARSET_LIST to CODING_ATTR_CHARSET_LIST of that
          vector.  ATTRS and CHARSET_LIST must be assignable lvalues.  */
   606 #define CODING_GET_INFO(coding, attrs, charset_list)    \
   607   do {                                                  \
   608     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
   609     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
   610   } while (false)
   611 
   612 /* True if CODING's destination can be grown.  */
   613 
   614 static bool
   615 growable_destination (struct coding_system *coding)
   616 {
   617   return STRINGP (coding->dst_object) || BUFFERP (coding->dst_object);
   618 }
   619 
   620 
   621 /* Safely get one byte from the source text pointed by SRC which ends
   622    at SRC_END, and set C to that byte.  If there are not enough bytes
   623    in the source, it jumps to 'no_more_source', after recording
   624    CODING_RESULT_INSUFFICIENT_SRC when SRC_BASE < SRC.  If MULTIBYTEP
   625    and the byte has its high bit set: a lead byte of 0xC0 or 0xC1 is
   626    merged with the following byte into C; for any other lead byte, C
   627    is set to the negative value of the character code and
          CODING_RESULT_INVALID_SRC is recorded.  CONSUMED_CHARS is
          incremented once per successful invocation.  The caller should
          declare and set these variables appropriately in advance:
               src, src_base, src_end, multibytep, coding, consumed_chars
          NOTE(review): the byte following a 0xC0/0xC1 lead byte is read
          without comparing SRC against SRC_END.  */
   628 
   629 #define ONE_MORE_BYTE(c)                                \
   630   do {                                                  \
   631     if (src == src_end)                                 \
   632       {                                                 \
   633         if (src_base < src)                             \
   634           record_conversion_result                      \
   635             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
   636         goto no_more_source;                            \
   637       }                                                 \
   638     c = *src++;                                         \
   639     if (multibytep && (c & 0x80))                       \
   640       {                                                 \
   641         if ((c & 0xFE) == 0xC0)                         \
   642           c = ((c & 1) << 6) | *src++;                  \
   643         else                                            \
   644           {                                             \
   645             src--;                                      \
   646             c = - string_char_advance (&src);           \
   647             record_conversion_result                    \
   648               (coding, CODING_RESULT_INVALID_SRC);      \
   649           }                                             \
   650       }                                                 \
   651     consumed_chars++;                                   \
   652   } while (0)
   653 
   654 /* Safely get two bytes from the source text pointed by SRC which ends
   655    at SRC_END, and set C1 and C2 to those bytes while skipping the
   656    heading multibyte characters.  If there are not enough bytes in the
   657    source, it jumps to 'no_more_source'.  If MULTIBYTEP, a lead byte
   658    of 0xC0 or 0xC1 is merged with the following byte; any other
   659    multibyte character found for C1 is skipped and C1 is fetched
   660    again, whereas one found for C2 sets C2 to -1 without advancing
          SRC past the rest of that character.  Unlike ONE_MORE_BYTE, this
          macro records no conversion result and does not touch
          CONSUMED_CHARS.  The caller should declare and set these
          variables appropriately in advance:
   661         src, src_end, multibytep
   662    It is intended that this macro is used in detect_coding_utf_16.  */
   663 
   664 #define TWO_MORE_BYTES(c1, c2)                          \
   665   do {                                                  \
   666     do {                                                \
   667       if (src == src_end)                               \
   668         goto no_more_source;                            \
   669       c1 = *src++;                                      \
   670       if (multibytep && (c1 & 0x80))                    \
   671         {                                               \
   672           if ((c1 & 0xFE) == 0xC0)                      \
   673             c1 = ((c1 & 1) << 6) | *src++;              \
   674           else                                          \
   675             {                                           \
   676               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
   677               c1 = -1;                                  \
   678             }                                           \
   679         }                                               \
   680     } while (c1 < 0);                                   \
   681     if (src == src_end)                                 \
   682       goto no_more_source;                              \
   683     c2 = *src++;                                        \
   684     if (multibytep && (c2 & 0x80))                      \
   685       {                                                 \
   686         if ((c2 & 0xFE) == 0xC0)                        \
   687           c2 = ((c2 & 1) << 6) | *src++;                \
   688         else                                            \
   689           c2 = -1;                                      \
   690       }                                                 \
   691   } while (0)
   692 
   693 
   694 /* Store a byte C in the place pointed by DST and increment DST to the
   695    next free point, and increment PRODUCED_CHARS.  The caller should
   696    assure that C is 0..127, and declare and set the variable `dst'
   697    appropriately in advance.
   698 */
   699 
   700 
   701 #define EMIT_ONE_ASCII_BYTE(c)  \
   702   do {                          \
   703     produced_chars++;           \
   704     *dst++ = (c);               \
   705   } while (0)
   706 
   707 
   708 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  */
   709 
   710 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
   711   do {                                  \
   712     produced_chars += 2;                \
   713     *dst++ = (c1), *dst++ = (c2);       \
   714   } while (0)
   715 
   716 
   717 /* Store a byte C in the place pointed by DST and increment DST to the
   718    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
   719    store in an appropriate multibyte form.  The caller should
   720    declare and set the variables `dst' and `multibytep' appropriately
   721    in advance.  */
   722 
   723 #define EMIT_ONE_BYTE(c)                \
   724   do {                                  \
   725     produced_chars++;                   \
   726     if (multibytep)                     \
   727       {                                 \
   728         unsigned ch = (c);              \
   729         if (ch >= 0x80)                 \
   730           ch = BYTE8_TO_CHAR (ch);      \
   731         dst += CHAR_STRING (ch, dst);   \
   732       }                                 \
   733     else                                \
   734       *dst++ = (c);                     \
   735   } while (0)
   736 
   737 
   738 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
   739 
   740 #define EMIT_TWO_BYTES(c1, c2)          \
   741   do {                                  \
   742     produced_chars += 2;                \
   743     if (multibytep)                     \
   744       {                                 \
   745         unsigned ch;                    \
   746                                         \
   747         ch = (c1);                      \
   748         if (ch >= 0x80)                 \
   749           ch = BYTE8_TO_CHAR (ch);      \
   750         dst += CHAR_STRING (ch, dst);   \
   751         ch = (c2);                      \
   752         if (ch >= 0x80)                 \
   753           ch = BYTE8_TO_CHAR (ch);      \
   754         dst += CHAR_STRING (ch, dst);   \
   755       }                                 \
   756     else                                \
   757       {                                 \
   758         *dst++ = (c1);                  \
   759         *dst++ = (c2);                  \
   760       }                                 \
   761   } while (0)
   762 
   763 
   764 #define EMIT_THREE_BYTES(c1, c2, c3)    \
   765   do {                                  \
   766     EMIT_ONE_BYTE (c1);                 \
   767     EMIT_TWO_BYTES (c2, c3);            \
   768   } while (0)
   769 
   770 
   771 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
   772   do {                                          \
   773     EMIT_TWO_BYTES (c1, c2);                    \
   774     EMIT_TWO_BYTES (c3, c4);                    \
   775   } while (0)
   776 
   777 
   778 static void
   779 record_conversion_result (struct coding_system *coding,
   780                           enum coding_result_code result)
   781 {
   782   coding->result = result;
   783   switch (result)
   784     {
   785     case CODING_RESULT_INSUFFICIENT_SRC:
   786       Vlast_code_conversion_error = Qinsufficient_source;
   787       break;
   788     case CODING_RESULT_INVALID_SRC:
   789       Vlast_code_conversion_error = Qinvalid_source;
   790       break;
   791     case CODING_RESULT_INTERRUPT:
   792       Vlast_code_conversion_error = Qinterrupted;
   793       break;
   794     case CODING_RESULT_INSUFFICIENT_DST:
   795       /* Don't record this error in Vlast_code_conversion_error
   796          because it happens just temporarily and is resolved when the
   797          whole conversion is finished.  */
   798       break;
   799     case CODING_RESULT_SUCCESS:
   800       break;
   801     default:
   802       Vlast_code_conversion_error = intern ("Unknown error");
   803     }
   804 }
   805 
   806 /* These wrapper macros are used to preserve validity of pointers into
   807    buffer text across calls to decode_char, encode_char, etc, which
   808    could cause relocation of buffers if it loads a charset map,
   809    because loading a charset map allocates large structures.  */
   810 
   811 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
   812   do {                                                                       \
   813     ptrdiff_t offset;                                                        \
   814                                                                              \
   815     charset_map_loaded = 0;                                                  \
   816     c = DECODE_CHAR (charset, code);                                         \
   817     if (charset_map_loaded                                                   \
   818         && (offset = coding_change_source (coding)))                         \
   819       {                                                                      \
   820         src += offset;                                                       \
   821         src_base += offset;                                                  \
   822         src_end += offset;                                                   \
   823       }                                                                      \
   824   } while (0)
   825 
   826 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
   827   do {                                                                  \
   828     ptrdiff_t offset;                                                   \
   829                                                                         \
   830     charset_map_loaded = 0;                                             \
   831     code = ENCODE_CHAR (charset, c);                                    \
   832     if (charset_map_loaded                                              \
   833         && (offset = coding_change_destination (coding)))               \
   834       {                                                                 \
   835         dst += offset;                                                  \
   836         dst_end += offset;                                              \
   837       }                                                                 \
   838   } while (0)
   839 
   840 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
   841   do {                                                                  \
   842     ptrdiff_t offset;                                                   \
   843                                                                         \
   844     charset_map_loaded = 0;                                             \
   845     charset = char_charset (c, charset_list, code_return);              \
   846     if (charset_map_loaded                                              \
   847         && (offset = coding_change_destination (coding)))               \
   848       {                                                                 \
   849         dst += offset;                                                  \
   850         dst_end += offset;                                              \
   851       }                                                                 \
   852   } while (0)
   853 
   854 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
   855   do {                                                                  \
   856     ptrdiff_t offset;                                                   \
   857                                                                         \
   858     charset_map_loaded = 0;                                             \
   859     result = CHAR_CHARSET_P (c, charset);                               \
   860     if (charset_map_loaded                                              \
   861         && (offset = coding_change_destination (coding)))               \
   862       {                                                                 \
   863         dst += offset;                                                  \
   864         dst_end += offset;                                              \
   865       }                                                                 \
   866   } while (0)
   867 
   868 
   869 /* If there are at least BYTES length of room at dst, allocate memory
   870    for coding->destination and update dst and dst_end.  We don't have
   871    to take care of coding->source which will be relocated.  It is
   872    handled by calling coding_set_source in encode_coding.  */
   873 
   874 #define ASSURE_DESTINATION(bytes)                               \
   875   do {                                                          \
   876     if (dst + (bytes) >= dst_end)                               \
   877       {                                                         \
   878         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
   879                                                                 \
   880         dst = alloc_destination (coding, more_bytes, dst);      \
   881         dst_end = coding->destination + coding->dst_bytes;      \
   882       }                                                         \
   883   } while (0)
   884 
   885 
/* Store multibyte form of the character C in P, and advance P to the
   end of the multibyte form.  This used to be like adding CHAR_STRING
   without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
   MAYBE_UNIFY_CHAR in CHAR_STRING.

   P is advanced by the value CHAR_STRING returns.  P appears twice in
   the expansion (as the assignment target and as an argument), so it
   must be a modifiable lvalue without side effects.  */

#define CHAR_STRING_ADVANCE_NO_UNIFY(c, p) ((p) += CHAR_STRING (c, p))
   892 
/* Return the character code of character whose multibyte form is at
   P, and advance P to the end of the multibyte form.  This used to be
   like string_char_advance without ever calling MAYBE_UNIFY_CHAR, but
   nowadays string_char_advance doesn't call MAYBE_UNIFY_CHAR.

   The address of P is passed to string_char_advance, so P must be an
   lvalue.  */

#define STRING_CHAR_ADVANCE_NO_UNIFY(p) string_char_advance (&(p))
   899 
   900 /* Set coding->source from coding->src_object.  */
   901 
   902 static void
   903 coding_set_source (struct coding_system *coding)
   904 {
   905   if (BUFFERP (coding->src_object))
   906     {
   907       struct buffer *buf = XBUFFER (coding->src_object);
   908 
   909       if (coding->src_pos < 0)
   910         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
   911       else
   912         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
   913     }
   914   else if (STRINGP (coding->src_object))
   915     {
   916       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
   917     }
   918   else
   919     {
   920       /* Otherwise, the source is C string and is never relocated
   921          automatically.  Thus we don't have to update anything.  */
   922     }
   923 }
   924 
   925 
   926 /* Set coding->source from coding->src_object, and return how many
   927    bytes coding->source was changed.  */
   928 
   929 static ptrdiff_t
   930 coding_change_source (struct coding_system *coding)
   931 {
   932   const unsigned char *orig = coding->source;
   933   coding_set_source (coding);
   934   return coding->source - orig;
   935 }
   936 
   937 
   938 /* Set coding->destination from coding->dst_object.  */
   939 
   940 static void
   941 coding_set_destination (struct coding_system *coding)
   942 {
   943   if (BUFFERP (coding->dst_object))
   944     {
   945       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
   946         {
   947           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
   948           coding->dst_bytes = (GAP_END_ADDR
   949                                - (coding->src_bytes - coding->consumed)
   950                                - coding->destination);
   951         }
   952       else
   953         {
   954           /* We are sure that coding->dst_pos_byte is before the gap
   955              of the buffer. */
   956           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
   957                                  + coding->dst_pos_byte - BEG_BYTE);
   958           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
   959                                - coding->destination);
   960         }
   961     }
   962   else
   963     {
   964       /* Otherwise, the destination is C string and is never relocated
   965          automatically.  Thus we don't have to update anything.  */
   966     }
   967 }
   968 
   969 
   970 /* Set coding->destination from coding->dst_object, and return how
   971    many bytes coding->destination was changed.  */
   972 
   973 static ptrdiff_t
   974 coding_change_destination (struct coding_system *coding)
   975 {
   976   const unsigned char *orig = coding->destination;
   977   coding_set_destination (coding);
   978   return coding->destination - orig;
   979 }
   980 
   981 
   982 static void
   983 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
   984 {
   985   ptrdiff_t newbytes;
   986   if (INT_ADD_WRAPV (coding->dst_bytes, bytes, &newbytes)
   987       || SIZE_MAX < newbytes)
   988     string_overflow ();
   989   coding->destination = xrealloc (coding->destination, newbytes);
   990   coding->dst_bytes = newbytes;
   991 }
   992 
/* Enlarge the gap of the destination buffer of CODING by BYTES.

   GAP_HEAD_USED is the number of bytes at the head of the gap that
   are already in use; alloc_destination passes the distance from the
   destination buffer's gap start to the current output pointer.  It
   is only used when the source and destination are the same object,
   in which case the globals of the current buffer (GPT, GAP_SIZE, Z,
   ZV and their byte counterparts) are adjusted around the call to
   make_gap.  Otherwise make_gap_1 is applied to the destination
   buffer directly.  */

static void
coding_alloc_by_making_gap (struct coding_system *coding,
                            ptrdiff_t gap_head_used, ptrdiff_t bytes)
{
  if (EQ (coding->src_object, coding->dst_object))
    {
      /* The gap may contain the produced data at the head and not-yet
         consumed data at the tail.  To preserve those data, we at
         first make the gap size to zero, then increase the gap
         size.  */
      ptrdiff_t add = GAP_SIZE;

      /* Pretend the whole old gap is buffer text: move the gap start
         past the used head, set the gap size to zero, and extend the
         end positions by the old gap size.  */
      GPT += gap_head_used, GPT_BYTE += gap_head_used;
      GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
      make_gap (bytes);
      /* Undo the adjustments in reverse, so the gap again covers the
         old gap plus whatever make_gap added.  */
      GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
      GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
    }
  else
    make_gap_1 (XBUFFER (coding->dst_object), bytes);
}
  1014 
  1015 
  1016 static unsigned char *
  1017 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
  1018                    unsigned char *dst)
  1019 {
  1020   ptrdiff_t offset = dst - coding->destination;
  1021 
  1022   if (BUFFERP (coding->dst_object))
  1023     {
  1024       struct buffer *buf = XBUFFER (coding->dst_object);
  1025 
  1026       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
  1027     }
  1028   else
  1029     coding_alloc_by_realloc (coding, nbytes);
  1030   coding_set_destination (coding);
  1031   dst = coding->destination + offset;
  1032   return dst;
  1033 }
  1034 
  1035 /** Macros for annotations.  */
  1036 
  1037 /* An annotation data is stored in the array coding->charbuf in this
  1038    format:
  1039      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
  1040    LENGTH is the number of elements in the annotation.
  1041    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
  1042    NCHARS is the number of characters in the text annotated.
  1043 
  1044    The format of the following elements depend on ANNOTATION_MASK.
  1045 
   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
  1048      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
  1049 
  1050    NBYTES is the number of bytes specified in the header part of
  1051    old-style emacs-mule encoding, or 0 for the other kind of
  1052    composition.
  1053 
  1054    METHOD is one of enum composition_method.
  1055 
  1056    Optional COMPOSITION-COMPONENTS are characters and composition
  1057    rules.
  1058 
  1059    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
  1060    follows.
  1061 
  1062    If ANNOTATION_MASK is 0, this annotation is just a space holder to
  1063    recover from an invalid annotation, and should be skipped by
  1064    produce_annotation.  */
  1065 
/* Maximum length of the header of annotation data, counted in
   charbuf elements.  ADD_COMPOSITION_DATA writes the longest header
   defined here: the three common elements plus NBYTES and METHOD.  */
#define MAX_ANNOTATION_LENGTH 5
  1068 
  1069 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  1070   do {                                                  \
  1071     *(buf)++ = -(len);                                  \
  1072     *(buf)++ = (mask);                                  \
  1073     *(buf)++ = (nchars);                                \
  1074     coding->annotated = 1;                              \
  1075   } while (0);
  1076 
  1077 #define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  1078   do {                                                                      \
  1079     ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
  1080     *buf++ = nbytes;                                                        \
  1081     *buf++ = method;                                                        \
  1082   } while (0)
  1083 
  1084 
  1085 #define ADD_CHARSET_DATA(buf, nchars, id)                               \
  1086   do {                                                                  \
  1087     ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
  1088     *buf++ = id;                                                        \
  1089   } while (0)
  1090 
  1091 
/* Bitmasks for coding->eol_seen.  Each kind of line ending has its
   own bit, so the values can be combined with bitwise OR;
   EOL_SEEN_NONE is the empty set.  */

#define EOL_SEEN_NONE   0
#define EOL_SEEN_LF     1
#define EOL_SEEN_CR     2
#define EOL_SEEN_CRLF   4
  1098 
  1099 
  1100 /*** 2. Emacs' internal format (emacs-utf-8) ***/
  1101 
  1102 
  1103 
  1104 
  1105 /*** 3. UTF-8 ***/
  1106 
  1107 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
  1108    Return true if a text is encoded in UTF-8.  */
  1109 
/* Classify one octet C of a UTF-8 byte sequence by its high bits.  */

/* C is a complete one-octet character (below 0x80).  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
/* C is a continuation octet (bit pattern 10xxxxxx).  */
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
/* C starts a two-octet sequence (110xxxxx).  */
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
/* C starts a three-octet sequence (1110xxxx).  */
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
/* C starts a four-octet sequence (11110xxx).  */
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
/* C starts a five-octet sequence (111110xx).  */
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)

/* The three octets of the UTF-8 signature (BOM), i.e. U+FEFF encoded
   in UTF-8.  */
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
  1120 
  1121 /* Unlike the other detect_coding_XXX, this function counts the number
  1122    of characters and checks the EOL format.  */
  1123 
  1124 static bool
  1125 detect_coding_utf_8 (struct coding_system *coding,
  1126                      struct coding_detection_info *detect_info)
  1127 {
  1128   const unsigned char *src = coding->source, *src_base;
  1129   const unsigned char *src_end = coding->source + coding->src_bytes;
  1130   bool multibytep = coding->src_multibyte;
  1131   ptrdiff_t consumed_chars = 0;
  1132   bool bom_found = 0;
  1133   ptrdiff_t nchars = coding->head_ascii;
  1134 
  1135   detect_info->checked |= CATEGORY_MASK_UTF_8;
  1136   /* A coding system of this category is always ASCII compatible.  */
  1137   src += nchars;
  1138 
  1139   if (src == coding->source     /* BOM should be at the head.  */
  1140       && src + 3 < src_end      /* BOM is 3-byte long.  */
  1141       && src[0] == UTF_8_BOM_1
  1142       && src[1] == UTF_8_BOM_2
  1143       && src[2] == UTF_8_BOM_3)
  1144     {
  1145       bom_found = 1;
  1146       src += 3;
  1147       nchars++;
  1148     }
  1149 
  1150   while (1)
  1151     {
  1152       int c, c1, c2, c3, c4;
  1153 
  1154       src_base = src;
  1155       ONE_MORE_BYTE (c);
  1156       if (c < 0 || UTF_8_1_OCTET_P (c))
  1157         {
  1158           nchars++;
  1159           if (c == '\r')
  1160             {
  1161               if (src < src_end && *src == '\n')
  1162                 {
  1163                   src++;
  1164                   nchars++;
  1165                 }
  1166             }
  1167           continue;
  1168         }
  1169       ONE_MORE_BYTE (c1);
  1170       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
  1171         break;
  1172       if (UTF_8_2_OCTET_LEADING_P (c))
  1173         {
  1174           nchars++;
  1175           continue;
  1176         }
  1177       ONE_MORE_BYTE (c2);
  1178       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
  1179         break;
  1180       if (UTF_8_3_OCTET_LEADING_P (c))
  1181         {
  1182           nchars++;
  1183           continue;
  1184         }
  1185       ONE_MORE_BYTE (c3);
  1186       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
  1187         break;
  1188       if (UTF_8_4_OCTET_LEADING_P (c))
  1189         {
  1190           nchars++;
  1191           continue;
  1192         }
  1193       ONE_MORE_BYTE (c4);
  1194       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
  1195         break;
  1196       if (UTF_8_5_OCTET_LEADING_P (c)
  1197           /* If we ever need to increase MAX_CHAR, the below may need
  1198              to be reviewed.  */
  1199           && c < MAX_MULTIBYTE_LEADING_CODE)
  1200         {
  1201           nchars++;
  1202           continue;
  1203         }
  1204       break;
  1205     }
  1206   detect_info->rejected |= CATEGORY_MASK_UTF_8;
  1207   return 0;
  1208 
  1209  no_more_source:
  1210   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
  1211     {
  1212       detect_info->rejected |= CATEGORY_MASK_UTF_8;
  1213       return 0;
  1214     }
  1215   if (bom_found)
  1216     {
  1217       /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
  1218       detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
  1219     }
  1220   else
  1221     {
  1222       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
  1223       if (nchars < src_end - coding->source)
  1224         /* The found characters are less than source bytes, which
  1225            means that we found a valid non-ASCII characters.  */
  1226         detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
  1227     }
  1228   coding->detected_utf8_bytes = src_base - coding->source;
  1229   coding->detected_utf8_chars = nchars;
  1230   return 1;
  1231 }
  1232 
  1233 
/* Decode UTF-8 bytes of CODING->source, starting at offset
   CODING->consumed, into character codes appended to CODING->charbuf
   after its first CODING->charbuf_used elements.

   If the BOM setting of CODING is not utf_without_bom, a leading
   EF BB BF sequence is skipped.  In every case the setting is
   switched to utf_without_bom afterwards, so the check is made only
   once per CODING.

   A byte sequence that is not accepted (bad continuation byte,
   overlong form, surrogate, code above MAX_CHAR, unsupported leading
   byte) is not dropped: decoding restarts at its first byte, which
   is stored alone (through BYTE8_TO_CHAR unless it is ASCII), and
   then continues with the following byte.

   On return CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used describe the progress made up to the last
   completely decoded element.  */

static void
decode_coding_utf_8 (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
  bool multibytep = coding->src_multibyte;
  enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
  bool eol_dos
    = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* When EOL_DOS, the byte read ahead after a CR, or -1 if none is
     pending.  */
  int byte_after_cr = -1;

  if (bom != utf_without_bom)
    {
      int c1, c2, c3;

      /* Try to read a three-octet sequence; rewind to SRC_BASE unless
         it is exactly the BOM.  */
      src_base = src;
      ONE_MORE_BYTE (c1);
      if (! UTF_8_3_OCTET_LEADING_P (c1))
        src = src_base;
      else
        {
          ONE_MORE_BYTE (c2);
          if (! UTF_8_EXTRA_OCTET_P (c2))
            src = src_base;
          else
            {
              ONE_MORE_BYTE (c3);
              if (! UTF_8_EXTRA_OCTET_P (c3))
                src = src_base;
              else
                {
                  if ((c1 != UTF_8_BOM_1)
                      || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
                    src = src_base;
                  else
                    CODING_UTF_8_BOM (coding) = utf_without_bom;
                }
            }
        }
    }
  /* Whether or not a BOM was found, never look for one again.  */
  CODING_UTF_8_BOM (coding) = utf_without_bom;

  while (1)
    {
      int c, c1, c2, c3, c4, c5;

      /* Checkpoint: everything before SRC_BASE has been completely
         decoded.  The exit code below reports progress up to here.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* No room left.  Give back the byte that was read ahead
             after a CR, so that it is read again next time.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      /* In the simple case, rapidly handle ordinary characters */
      if (multibytep && ! eol_dos
          && charbuf < charbuf_end - 6 && src < src_end - 6)
        {
          /* Copy ASCII bytes four at a time; stop at the first byte
             with the high bit set, or near the end of either
             buffer.  */
          while (charbuf < charbuf_end - 6 && src < src_end - 6)
            {
              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;

              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;

              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;

              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;
            }
          /* If we handled at least one character, restart the main loop.  */
          if (src != src_base)
            continue;
        }

      /* Use the byte read ahead after a CR, if any, before reading
         a new one.  */
      if (byte_after_cr >= 0)
        c1 = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c1);
      if (c1 < 0)
        {
          /* A negative value from ONE_MORE_BYTE is a negated
             character code; store that character.  */
          c = - c1;
        }
      else if (UTF_8_1_OCTET_P (c1))
        {
          /* Read the byte after a CR now.  If the source ends here,
             the CR itself is not counted as consumed, because
             SRC_BASE still points at it.  */
          if (eol_dos && c1 == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          c = c1;
        }
      else
        {
          ONE_MORE_BYTE (c2);
          if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
            goto invalid_code;
          if (UTF_8_2_OCTET_LEADING_P (c1))
            {
              c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
              /* Reject overlong sequences here and below.  Encoders
                 producing them are incorrect, they can be misleading,
                 and they mess up read/write invariance.  */
              if (c < 128)
                goto invalid_code;
            }
          else
            {
              ONE_MORE_BYTE (c3);
              if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
                goto invalid_code;
              if (UTF_8_3_OCTET_LEADING_P (c1))
                {
                  c = (((c1 & 0xF) << 12)
                       | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
                  if (c < 0x800
                      || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
                    goto invalid_code;
                }
              else
                {
                  ONE_MORE_BYTE (c4);
                  if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
                    goto invalid_code;
                  if (UTF_8_4_OCTET_LEADING_P (c1))
                    {
                    c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
                         | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
                    if (c < 0x10000)
                      goto invalid_code;
                    }
                  else
                    {
                      ONE_MORE_BYTE (c5);
                      if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
                        goto invalid_code;
                      if (UTF_8_5_OCTET_LEADING_P (c1))
                        {
                          c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
                               | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
                               | (c5 & 0x3F));
                          /* Five-octet forms are accepted only for
                             codes from 0x200000 up to MAX_CHAR.  */
                          if ((c > MAX_CHAR) || (c < 0x200000))
                            goto invalid_code;
                        }
                      else
                        goto invalid_code;
                    }
                }
            }
        }

      *charbuf++ = c;
      continue;

    invalid_code:
      /* Rewind to the start of the bad sequence and store its first
         byte alone.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
    }

  /* No `goto' in this function targets the label below, so it is
     presumably reached from inside ONE_MORE_BYTE when the source is
     exhausted; the `break' above also falls through to it.  */
 no_more_source:
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
  1421 
  1422 
  1423 bool
  1424 encode_coding_utf_8 (struct coding_system *coding)
  1425 {
  1426   bool multibytep = coding->dst_multibyte;
  1427   int *charbuf = coding->charbuf;
  1428   int *charbuf_end = charbuf + coding->charbuf_used;
  1429   unsigned char *dst = coding->destination + coding->produced;
  1430   unsigned char *dst_end = coding->destination + coding->dst_bytes;
  1431   ptrdiff_t produced_chars = 0;
  1432   int c;
  1433 
  1434   if (CODING_UTF_8_BOM (coding) != utf_without_bom)
  1435     {
  1436       ASSURE_DESTINATION (3);
  1437       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
  1438       CODING_UTF_8_BOM (coding) = utf_without_bom;
  1439     }
  1440 
  1441   if (multibytep)
  1442     {
  1443       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
  1444 
  1445       while (charbuf < charbuf_end)
  1446         {
  1447           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
  1448 
  1449           ASSURE_DESTINATION (safe_room);
  1450           c = *charbuf++;
  1451           if (CHAR_BYTE8_P (c))
  1452             {
  1453               c = CHAR_TO_BYTE8 (c);
  1454               EMIT_ONE_BYTE (c);
  1455             }
  1456           else
  1457             {
  1458               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
  1459               for (p = str; p < pend; p++)
  1460                 EMIT_ONE_BYTE (*p);
  1461             }
  1462         }
  1463     }
  1464   else
  1465     {
  1466       int safe_room = MAX_MULTIBYTE_LENGTH;
  1467 
  1468       while (charbuf < charbuf_end)
  1469         {
  1470           ASSURE_DESTINATION (safe_room);
  1471           c = *charbuf++;
  1472           if (CHAR_BYTE8_P (c))
  1473             *dst++ = CHAR_TO_BYTE8 (c);
  1474           else
  1475             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
  1476         }
  1477       produced_chars = dst - (coding->destination + coding->produced);
  1478     }
  1479   record_conversion_result (coding, CODING_RESULT_SUCCESS);
  1480   coding->produced_char += produced_chars;
  1481   coding->produced = dst - coding->destination;
  1482   return 0;
  1483 }
  1484 
  1485 
  1486 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
  1487    Return true if a text is encoded in one of UTF-16 based coding systems.
       
          CATEGORY_MASK_UTF_16 is always added to DETECT_INFO->checked.
          The first two bytes decide what happens next:
       
            0xFF 0xFE  mark UTF_16_LE and UTF_16_AUTO as found and reject
                       UTF_16_BE and both NOSIG categories;
            0xFE 0xFF  mark UTF_16_BE and UTF_16_AUTO as found and reject
                       UTF_16_LE and both NOSIG categories;
            otherwise  reject the three signature-based categories and
                       scan the rest of the text, counting distinct
                       byte values at even and at odd offsets.
       
          The value 1 is returned only when control reaches the
          `no_more_source' label; every explicit return in the body
          returns 0.  */
  1488 
  1489 static bool
  1490 detect_coding_utf_16 (struct coding_system *coding,
  1491                       struct coding_detection_info *detect_info)
  1492 {
  1493   const unsigned char *src = coding->source;
  1494   const unsigned char *src_end = coding->source + coding->src_bytes;
  1495   bool multibytep = coding->src_multibyte;
  1496   int c1, c2;
  1497 
  1498   detect_info->checked |= CATEGORY_MASK_UTF_16;
         /* In the last block an odd CODING->src_chars rejects every
            UTF-16 category at once.  */
  1499   if (coding->mode & CODING_MODE_LAST_BLOCK
  1500       && (coding->src_chars & 1))
  1501     {
  1502       detect_info->rejected |= CATEGORY_MASK_UTF_16;
  1503       return 0;
  1504     }
  1505 
         /* NOTE(review): TWO_MORE_BYTES is defined outside this chunk;
            presumably it jumps to `no_more_source' when the source is
            exhausted -- confirm against its definition.  */
  1506   TWO_MORE_BYTES (c1, c2);
  1507   if ((c1 == 0xFF) && (c2 == 0xFE))
  1508     {
  1509       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
  1510                              | CATEGORY_MASK_UTF_16_AUTO);
  1511       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
  1512                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
  1513                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
  1514     }
  1515   else if ((c1 == 0xFE) && (c2 == 0xFF))
  1516     {
  1517       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
  1518                              | CATEGORY_MASK_UTF_16_AUTO);
  1519       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
  1520                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
  1521                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
  1522     }
  1523   else if (c2 < 0)
  1524     {
             /* A negative second byte rejects every UTF-16 category.  */
  1525       detect_info->rejected |= CATEGORY_MASK_UTF_16;
  1526       return 0;
  1527     }
  1528   else
  1529     {
  1530       /* We check the dispersion of Eth and Oth bytes where E is even and
  1531          O is odd.  If both are high, we assume binary data.*/
  1532       unsigned char e[256], o[256];
  1533       unsigned e_num = 1, o_num = 1;
  1534 
             /* E and O record which byte values have been seen at even
                and odd offsets; the first pair is entered right away,
                which is why both counters start at 1.  */
  1535       memset (e, 0, 256);
  1536       memset (o, 0, 256);
  1537       e[c1] = 1;
  1538       o[c2] = 1;
  1539 
             /* No signature was seen, so the signature-based categories
                are out.  */
  1540       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
  1541                                 |CATEGORY_MASK_UTF_16_BE
  1542                                 | CATEGORY_MASK_UTF_16_LE);
  1543 
             /* Scan until every UTF-16 category has been rejected or a
                negative second byte is read.  */
  1544       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
  1545              != CATEGORY_MASK_UTF_16)
  1546         {
  1547           TWO_MORE_BYTES (c1, c2);
  1548           if (c2 < 0)
  1549             break;
  1550           if (! e[c1])
  1551             {
  1552               e[c1] = 1;
  1553               e_num++;
                     /* 128 or more distinct even-offset bytes reject
                        UTF_16_BE_NOSIG.  */
  1554               if (e_num >= 128)
  1555                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
  1556             }
  1557           if (! o[c2])
  1558             {
  1559               o[c2] = 1;
  1560               o_num++;
                     /* 128 or more distinct odd-offset bytes reject
                        UTF_16_LE_NOSIG.  */
  1561               if (o_num >= 128)
  1562                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
  1563             }
  1564         }
  1565       return 0;
  1566     }
  1567 
         /* The two signature branches above fall through to here.  */
  1568  no_more_source:
  1569   return 1;
  1570 }
  1571 
  1572 static void
  1573 decode_coding_utf_16 (struct coding_system *coding)
  1574 {
  1575   const unsigned char *src = coding->source + coding->consumed;
  1576   const unsigned char *src_end = coding->source + coding->src_bytes;
  1577   const unsigned char *src_base;
  1578   int *charbuf = coding->charbuf + coding->charbuf_used;
  1579   /* We may produces at most 3 chars in one loop.  */
  1580   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
  1581   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
  1582   bool multibytep = coding->src_multibyte;
  1583   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
  1584   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
  1585   int surrogate = CODING_UTF_16_SURROGATE (coding);
  1586   bool eol_dos
  1587     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  1588   int byte_after_cr1 = -1, byte_after_cr2 = -1;
  1589 
  1590   if (bom == utf_with_bom)
  1591     {
  1592       int c, c1, c2;
  1593 
  1594       src_base = src;
  1595       ONE_MORE_BYTE (c1);
  1596       ONE_MORE_BYTE (c2);
  1597       c = (c1 << 8) | c2;
  1598 
  1599       if (endian == utf_16_big_endian
  1600           ? c != 0xFEFF : c != 0xFFFE)
  1601         {
  1602           /* The first two bytes are not BOM.  Treat them as bytes
  1603              for a normal character.  */
  1604           src = src_base;
  1605         }
  1606       CODING_UTF_16_BOM (coding) = utf_without_bom;
  1607     }
  1608   else if (bom == utf_detect_bom)
  1609     {
  1610       /* We have already tried to detect BOM and failed in
  1611          detect_coding.  */
  1612       CODING_UTF_16_BOM (coding) = utf_without_bom;
  1613     }
  1614 
  1615   while (1)
  1616     {
  1617       int c, c1, c2;
  1618 
  1619       src_base = src;
  1620       consumed_chars_base = consumed_chars;
  1621 
  1622       if (charbuf >= charbuf_end)
  1623         {
  1624           if (byte_after_cr1 >= 0)
  1625             src_base -= 2;
  1626           break;
  1627         }
  1628 
  1629       if (byte_after_cr1 >= 0)
  1630         c1 = byte_after_cr1, byte_after_cr1 = -1;
  1631       else
  1632         ONE_MORE_BYTE (c1);
  1633       if (c1 < 0)
  1634         {
  1635           *charbuf++ = -c1;
  1636           continue;
  1637         }
  1638       if (byte_after_cr2 >= 0)
  1639         c2 = byte_after_cr2, byte_after_cr2 = -1;
  1640       else
  1641         ONE_MORE_BYTE (c2);
  1642       if (c2 < 0)
  1643         {
  1644           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
  1645           *charbuf++ = -c2;
  1646           continue;
  1647         }
  1648       c = (endian == utf_16_big_endian
  1649            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
  1650 
  1651       if (surrogate)
  1652         {
  1653           if (! UTF_16_LOW_SURROGATE_P (c))
  1654             {
  1655               if (endian == utf_16_big_endian)
  1656                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
  1657               else
  1658                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
  1659               *charbuf++ = c1;
  1660               *charbuf++ = c2;
  1661               if (UTF_16_HIGH_SURROGATE_P (c))
  1662                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
  1663               else
  1664                 *charbuf++ = c;
  1665             }
  1666           else
  1667             {
  1668               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
  1669               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
  1670               *charbuf++ = 0x10000 + c;
  1671             }
  1672         }
  1673       else
  1674         {
  1675           if (UTF_16_HIGH_SURROGATE_P (c))
  1676             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
  1677           else
  1678             {
  1679               if (eol_dos && c == '\r')
  1680                 {
  1681                   ONE_MORE_BYTE (byte_after_cr1);
  1682                   ONE_MORE_BYTE (byte_after_cr2);
  1683                 }
  1684               *charbuf++ = c;
  1685             }
  1686         }
  1687     }
  1688 
  1689  no_more_source:
  1690   coding->consumed_char += consumed_chars_base;
  1691   coding->consumed = src_base - coding->source;
  1692   coding->charbuf_used = charbuf - coding->charbuf;
  1693 }
  1694 
  1695 static bool
  1696 encode_coding_utf_16 (struct coding_system *coding)
  1697 {
  1698   bool multibytep = coding->dst_multibyte;
  1699   int *charbuf = coding->charbuf;
  1700   int *charbuf_end = charbuf + coding->charbuf_used;
  1701   unsigned char *dst = coding->destination + coding->produced;
  1702   unsigned char *dst_end = coding->destination + coding->dst_bytes;
  1703   int safe_room = 8;
  1704   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
  1705   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
  1706   ptrdiff_t produced_chars = 0;
  1707   int c;
  1708 
  1709   if (bom != utf_without_bom)
  1710     {
  1711       ASSURE_DESTINATION (safe_room);
  1712       if (big_endian)
  1713         EMIT_TWO_BYTES (0xFE, 0xFF);
  1714       else
  1715         EMIT_TWO_BYTES (0xFF, 0xFE);
  1716       CODING_UTF_16_BOM (coding) = utf_without_bom;
  1717     }
  1718 
  1719   while (charbuf < charbuf_end)
  1720     {
  1721       ASSURE_DESTINATION (safe_room);
  1722       c = *charbuf++;
  1723       if (c > MAX_UNICODE_CHAR)
  1724         c = coding->default_char;
  1725 
  1726       if (c < 0x10000)
  1727         {
  1728           if (big_endian)
  1729             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
  1730           else
  1731             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
  1732         }
  1733       else
  1734         {
  1735           int c1, c2;
  1736 
  1737           c -= 0x10000;
  1738           c1 = (c >> 10) + 0xD800;
  1739           c2 = (c & 0x3FF) + 0xDC00;
  1740           if (big_endian)
  1741             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
  1742           else
  1743             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
  1744         }
  1745     }
  1746   record_conversion_result (coding, CODING_RESULT_SUCCESS);
  1747   coding->produced = dst - coding->destination;
  1748   coding->produced_char += produced_chars;
  1749   return 0;
  1750 }
  1751 
  1752 
  1753 /*** 6. Old Emacs' internal format (emacs-mule) ***/
  1754 
  1755 /* Emacs' internal format for representation of multiple character
  1756    sets is a kind of multi-byte encoding, i.e. characters are
  1757    represented by variable-length sequences of one-byte codes.
  1758 
  1759    ASCII characters and control characters (e.g. `tab', `newline') are
  1760    represented by one-byte sequences which are their ASCII codes, in
  1761    the range 0x00 through 0x7F.
  1762 
  1763    8-bit characters of the range 0x80..0x9F are represented by
  1764    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
  1765    code + 0x20).
  1766 
  1767    8-bit characters of the range 0xA0..0xFF are represented by
  1768    one-byte sequences which are their 8-bit code.
  1769 
  1770    The other characters are represented by a sequence of `base
  1771    leading-code', optional `extended leading-code', and one or two
  1772    `position-code's.  The length of the sequence is determined by the
  1773    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
  1774    whereas extended leading-code and position-code take the range 0xA0
  1775    through 0xFF.  See `charset.h' for more details about leading-code
  1776    and position-code.
  1777 
  1778    --- CODE RANGE of Emacs' internal format ---
  1779    character set        range
  1780    -------------        -----
  1781    ascii                0x00..0x7F
  1782    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
  1783    eight-bit-graphic    0xA0..0xFF
  1784    ELSE                 0x81..0x9D + [0xA0..0xFF]+
  1785    ---------------------------------------------
  1786 
  1787    As this is the internal character representation, the format is
  1788    usually not used externally (i.e. in a file or in a data sent to a
  1789    process).  But, it is possible to have a text externally in this
  1790    format (i.e. by encoding by the coding system `emacs-mule').
  1791 
  1792    In that case, a sequence of one-byte codes has a slightly different
  1793    form.
  1794 
  1795    At first, all characters in eight-bit-control are represented by
  1796    one-byte sequences which are their 8-bit code.
  1797 
  1798    Next, character composition data are represented by the byte
  1799    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
  1800    where,
  1801         METHOD is 0xF2 plus one of composition method (enum
  1802         composition_method),
  1803 
  1804         BYTES is 0xA0 plus a byte length of this composition data,
  1805 
  1806         CHARS is 0xA0 plus a number of characters composed by this
  1807         data,
  1808 
  1809         COMPONENTs are characters of multibyte form or composition
  1810         rules encoded by two-byte of ASCII codes.
  1811 
  1812    In addition, for backward compatibility, the following formats are
  1813    also recognized as composition data on decoding.
  1814 
  1815    0x80 MSEQ ...
  1816    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
  1817 
  1818    Here,
  1819         MSEQ is a multibyte form but in this special format:
  1820           ASCII: 0xA0 ASCII_CODE+0x80,
  1821           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
  1822         RULE is a one byte code of the range 0xA0..0xF0 that
  1823         represents a composition rule.
  1824   */
  1825 
  1826 char emacs_mule_bytes[256];   /* Indexed by a byte value.  The
                                        emacs-mule routines below read
                                        the entry for a leading byte as
                                        the total number of bytes (1
                                        through 4) of the sequence that
                                        starts with it.  The table is
                                        filled in elsewhere.  */
  1827 
  1828 
  1829 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
  1830    Return true if a text is encoded in 'emacs-mule'.
       
          CATEGORY_MASK_EMACS_MULE is always added to DETECT_INFO->checked,
          and the first CODING->head_ascii bytes of the source are skipped.
          The category is rejected (and 0 returned) when the scan meets
          ESC, SI or SO, a composition sequence of 4 bytes or fewer, or a
          leading byte that is not followed by the number of bytes >= 0xA0
          that emacs_mule_bytes calls for.  It is also rejected when the
          source ends in the middle of a sequence and this is the last
          block.  Otherwise 1 is returned, and the category is marked as
          found if at least one non-ASCII sequence was accepted.  */
  1831 
  1832 static bool
  1833 detect_coding_emacs_mule (struct coding_system *coding,
  1834                           struct coding_detection_info *detect_info)
  1835 {
  1836   const unsigned char *src = coding->source, *src_base;
  1837   const unsigned char *src_end = coding->source + coding->src_bytes;
  1838   bool multibytep = coding->src_multibyte;
  1839   ptrdiff_t consumed_chars = 0;
  1840   int c;
  1841   int found = 0;
  1842 
  1843   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
  1844   /* A coding system of this category is always ASCII compatible.  */
  1845   src += coding->head_ascii;
  1846 
  1847   while (1)
  1848     {
             /* SRC_BASE marks the start of the sequence being examined;
                it is compared with SRC at `no_more_source' to tell
                whether the source ended in mid-sequence.  */
  1849       src_base = src;
  1850       ONE_MORE_BYTE (c);
  1851       if (c < 0)
  1852         continue;
  1853       if (c == 0x80)
  1854         {
  1855           /* Perhaps the start of composite character.  We simply skip
  1856              it because analyzing it is too heavy for detecting.  But,
  1857              at least, we check that the composite character
  1858              consists of more than 4 bytes.  */
  1859           const unsigned char *src_start;
  1860 
  1861         repeat:
  1862           src_start = src;
                 /* Skip the run of bytes >= 0xA0 that follows 0x80; C
                    ends up holding the first byte below 0xA0.  */
  1863           do
  1864             {
  1865               ONE_MORE_BYTE (c);
  1866             }
  1867           while (c >= 0xA0);
  1868 
  1869           if (src - src_start <= 4)
  1870             break;
  1871           found = CATEGORY_MASK_EMACS_MULE;
                 /* Another composition starts right away.  */
  1872           if (c == 0x80)
  1873             goto repeat;
  1874         }
  1875 
  1876       if (c < 0x80)
  1877         {
                 /* ESC, SI and SO reject this category.  */
  1878           if (c < 0x20
  1879               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
  1880             break;
  1881         }
  1882       else
  1883         {
                 /* C is a leading byte; the table gives the length of the
                    whole sequence, so one less byte is still to come.  */
  1884           int more_bytes = emacs_mule_bytes[c] - 1;
  1885 
  1886           while (more_bytes > 0)
  1887             {
  1888               ONE_MORE_BYTE (c);
  1889               if (c < 0xA0)
  1890                 {
  1891                   src--;        /* Unread the last byte.  */
  1892                   break;
  1893                 }
  1894               more_bytes--;
  1895             }
                 /* A nonzero count means the sequence was cut short by a
                    byte below 0xA0.  */
  1896           if (more_bytes != 0)
  1897             break;
  1898           found = CATEGORY_MASK_EMACS_MULE;
  1899         }
  1900     }
         /* Every `break' out of the loop above is a rejection.  */
  1901   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
  1902   return 0;
  1903 
         /* NOTE(review): ONE_MORE_BYTE is defined outside this chunk;
            presumably it jumps here when the source is exhausted.  */
  1904  no_more_source:
  1905   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
  1906     {
  1907       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
  1908       return 0;
  1909     }
  1910   detect_info->found |= found;
  1911   return 1;
  1912 }
  1913 
  1914 
  1915 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
  1916    character.  If CMP_STATUS indicates that we must expect MSEQ or
  1917    RULE described above, decode it and return the negative value of
  1918    the decoded character or rule.  If an invalid byte is found, return
  1919    -1.  If SRC is too short, return -2.
       
          On success *NBYTES is set to the number of source bytes parsed
          and *NCHARS to the local CONSUMED_CHARS counter.  If ID is not
          NULL, *ID receives the ID of the charset the character was
          decoded with.  When a rule byte is returned (old-form
          composition that is not in the COMPOSING_CHAR state), *NBYTES and
          *NCHARS are set but *ID is left untouched.  Nothing is stored
          through the pointers on the -1 and -2 returns.  */
  1920 
  1921 static int
  1922 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
  1923                  int *nbytes, int *nchars, int *id,
  1924                  struct composition_status *cmp_status)
  1925 {
  1926   const unsigned char *src_end = coding->source + coding->src_bytes;
  1927   const unsigned char *src_base = src;
  1928   bool multibytep = coding->src_multibyte;
  1929   int charset_ID;
  1930   unsigned code;
  1931   int c;
  1932   ptrdiff_t consumed_chars = 0;
  1933   bool mseq_found = 0;
  1934 
  1935   ONE_MORE_BYTE (c);
  1936   if (c < 0)
  1937     {
             /* A negative value is returned with its sign flipped and is
                attributed to the charset in slot 0 of emacs_mule_charset.  */
  1938       c = -c;
  1939       charset_ID = emacs_mule_charset[0];
  1940     }
  1941   else
  1942     {
  1943       if (c >= 0xA0)
  1944         {
                 /* A first byte >= 0xA0 is acceptable only inside an
                    old-form composition.  */
  1945           if (cmp_status->state != COMPOSING_NO
  1946               && cmp_status->old_form)
  1947             {
  1948               if (cmp_status->state == COMPOSING_CHAR)
  1949                 {
                         /* MSEQ: 0xA0 introduces an ASCII character stored
                            with 0x80 added; any other byte is a leading
                            code stored with 0x20 added.  */
  1950                   if (c == 0xA0)
  1951                     {
  1952                       ONE_MORE_BYTE (c);
  1953                       c -= 0x80;
  1954                       if (c < 0)
  1955                         goto invalid_code;
  1956                     }
  1957                   else
  1958                     c -= 0x20;
  1959                   mseq_found = 1;
  1960                 }
  1961               else
  1962                 {
                         /* The byte is returned as a rule, negated and
                            undecoded.  */
  1963                   *nbytes = src - src_base;
  1964                   *nchars = consumed_chars;
  1965                   return -c;
  1966                 }
  1967             }
  1968           else
  1969             goto invalid_code;
  1970         }
  1971 
             /* Dispatch on the total length of the sequence that the
                leading byte C announces.  */
  1972       switch (emacs_mule_bytes[c])
  1973         {
  1974         case 2:
                 /* Leading code, then one position code.  */
  1975           if ((charset_ID = emacs_mule_charset[c]) < 0)
  1976             goto invalid_code;
  1977           ONE_MORE_BYTE (c);
  1978           if (c < 0xA0)
  1979             goto invalid_code;
  1980           code = c & 0x7F;
  1981           break;
  1982 
  1983         case 3:
  1984           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
  1985               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
  1986             {
                     /* Private leading code: the next byte selects the
                        charset, then one position code follows.  */
  1987               ONE_MORE_BYTE (c);
  1988               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
  1989                 goto invalid_code;
  1990               ONE_MORE_BYTE (c);
  1991               if (c < 0xA0)
  1992                 goto invalid_code;
  1993               code = c & 0x7F;
  1994             }
  1995           else
  1996             {
                     /* Leading code, then two position codes.  */
  1997               if ((charset_ID = emacs_mule_charset[c]) < 0)
  1998                 goto invalid_code;
  1999               ONE_MORE_BYTE (c);
  2000               if (c < 0xA0)
  2001                 goto invalid_code;
  2002               code = (c & 0x7F) << 8;
  2003               ONE_MORE_BYTE (c);
  2004               if (c < 0xA0)
  2005                 goto invalid_code;
  2006               code |= c & 0x7F;
  2007             }
  2008           break;
  2009 
  2010         case 4:
                 /* The second byte selects the charset, then two
                    position codes follow.  */
  2011           ONE_MORE_BYTE (c);
  2012           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
  2013             goto invalid_code;
  2014           ONE_MORE_BYTE (c);
  2015           if (c < 0xA0)
  2016             goto invalid_code;
  2017           code = (c & 0x7F) << 8;
  2018           ONE_MORE_BYTE (c);
  2019           if (c < 0xA0)
  2020             goto invalid_code;
  2021           code |= c & 0x7F;
  2022           break;
  2023 
  2024         case 1:
                 /* The byte stands for itself.  */
  2025           code = c;
  2026           charset_ID = ASCII_CHAR_P (code) ? charset_ascii : charset_eight_bit;
  2027           break;
  2028 
  2029         default:
  2030           emacs_abort ();
  2031         }
             /* Turn the charset and code point into a character; a
                negative result means the code is invalid.  */
  2032       CODING_DECODE_CHAR (coding, src, src_base, src_end,
  2033                           CHARSET_FROM_ID (charset_ID), code, c);
  2034       if (c < 0)
  2035         goto invalid_code;
  2036     }
  2037   *nbytes = src - src_base;
  2038   *nchars = consumed_chars;
  2039   if (id)
  2040     *id = charset_ID;
         /* A character parsed as MSEQ is reported negated.  */
  2041   return (mseq_found ? -c : c);
  2042 
  2043  no_more_source:
  2044   return -2;
  2045 
  2046  invalid_code:
  2047   return -1;
  2048 }
  2049 
  2050 
  2051 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
  2052 
  2053 /* Handle these composition sequences ('|': the end of header elements,
  2054    BYTES and CHARS >= 0xA0):
  2055 
  2056    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
  2057    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
  2058    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
  2059 
  2060    and these old form:
  2061 
  2062    (4) relative composition: 0x80 | MSEQ ... MSEQ
  2063    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
  2064 
  2065    When the starter 0x80 and the following header elements are found,
  2066    this annotation header is produced.
  2067 
  2068         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
  2069 
  2070    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
  2071    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
  2072 
  2073    Then, upon reading the following elements, these codes are produced
  2074    until the composition end is found:
  2075 
  2076    (1) CHAR ... CHAR
  2077    (2) ALT ... ALT CHAR ... CHAR
  2078    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
  2079    (4) CHAR ... CHAR
  2080    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
  2081 
  2082    When the composition end is found, LENGTH and NCHARS in the
  2083    annotation header are updated as below:
  2084 
  2085    (1) LENGTH: unchanged, NCHARS: unchanged
  2086    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
  2087    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
  2088    (4) LENGTH: unchanged,  NCHARS: number of CHARs
  2089    (5) LENGTH: unchanged,  NCHARS: number of CHARs
  2090 
  2091    If an error is found while composing, the annotation header is
  2092    changed to the original composition header (plus filler -1s) as
  2093    below:
  2094 
  2095    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
  2096    (5)          [ 0x80 0xFF -1 -1 -1 ]
  2097 
  2098    and the sequence [ -2 DECODED-RULE ] is changed to the original
  2099    byte sequence as below:
  2100         o the original byte sequence is B: [ B -1 ]
  2101         o the original byte sequence is B1 B2: [ B1 B2 ]
  2102 
  2103    Most of the routines are implemented by macros because many
  2104    variables and labels in the caller decode_coding_emacs_mule must be
  2105    accessible, and they are usually called just once (so they do not
  2106    increase the size of the compiled object).  */
  2107 
  2108 /* Decode a composition rule represented by C as a component of
  2109    composition sequence of Emacs 20 style.  Set RULE to the decoded
  2110    rule.
       
          C is modified: 0xA0 is subtracted from it.  If the result is not
          in the range 0..80, control jumps to the caller's `invalid_code'
          label.  The result is split into GREF = C / 9 and NREF = C % 9,
          a value of 4 in either is replaced by 10, and the pair is packed
          with COMPOSITION_ENCODE_RULE.  */
  2111 
  2112 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
  2113   do {                                                  \
  2114     int gref, nref;                                     \
  2115                                                         \
  2116     c -= 0xA0;                                          \
  2117     if (c < 0 || c >= 81)                               \
  2118       goto invalid_code;                                \
  2119     gref = c / 9, nref = c % 9;                         \
  2120     if (gref == 4) gref = 10;                           \
  2121     if (nref == 4) nref = 10;                           \
  2122     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  2123   } while (0)
  2124 
  2125 
  2126 /* Decode a composition rule represented by C and the following byte
  2127    at SRC as a component of composition sequence of Emacs 21 style.
  2128    Set RULE to the decoded rule.
       
          GREF is C - 0x20.  NREF is the next byte - 0x20; that byte is
          read with ONE_MORE_BYTE, which overwrites C.  If either value is
          not in the range 0..80, control jumps to the caller's
          `invalid_code' label.  The pair is packed with
          COMPOSITION_ENCODE_RULE.  */
  2129 
  2130 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
  2131   do {                                                  \
  2132     int gref, nref;                                     \
  2133                                                         \
  2134     gref = c - 0x20;                                    \
  2135     if (gref < 0 || gref >= 81)                         \
  2136       goto invalid_code;                                \
  2137     ONE_MORE_BYTE (c);                                  \
  2138     nref = c - 0x20;                                    \
  2139     if (nref < 0 || nref >= 81)                         \
  2140       goto invalid_code;                                \
  2141     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  2142   } while (0)
  2143 
  2144 
  2145 /* Start of Emacs 21 style format.  The first three bytes at SRC are
  2146    (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
  2147    byte length of this composition information, CHARS is the number of
  2148    characters composed by this composition.
       
          On entry the caller's variable C already holds the METHOD byte;
          the BYTES and CHARS bytes are read here with ONE_MORE_BYTE.
          Control jumps to the caller's `invalid_code' label if the BYTES
          byte is negative, if BYTES - 0xA0 is less than 3, if the method is
          COMPOSITION_RELATIVE and BYTES - 0xA0 is not 4, or if CHARS - 0xA0
          is not in the range 1..MAX_COMPOSITION_COMPONENTS - 1.
       
          Otherwise CMP_STATUS is initialized: the state becomes
          COMPOSING_CHAR for a relative composition and
          COMPOSING_COMPONENT_CHAR for the other methods, `length' is set to
          MAX_ANNOTATION_LENGTH, and `ncomps' to BYTES - 0xA0 - 4.  The
          annotation header is then written with ADD_COMPOSITION_DATA.  */
  2149 
  2150 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
  2151   do {                                                                  \
  2152     enum composition_method method = c - 0xF2;                          \
  2153     int nbytes, nchars;                                                 \
  2154                                                                         \
  2155     ONE_MORE_BYTE (c);                                                  \
  2156     if (c < 0)                                                          \
  2157       goto invalid_code;                                                \
  2158     nbytes = c - 0xA0;                                                  \
  2159     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
  2160       goto invalid_code;                                                \
  2161     ONE_MORE_BYTE (c);                                                  \
  2162     nchars = c - 0xA0;                                                  \
  2163     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
  2164       goto invalid_code;                                                \
  2165     cmp_status->old_form = 0;                                           \
  2166     cmp_status->method = method;                                        \
  2167     if (method == COMPOSITION_RELATIVE)                                 \
  2168       cmp_status->state = COMPOSING_CHAR;                               \
  2169     else                                                                \
  2170       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
  2171     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
  2172     cmp_status->nchars = nchars;                                        \
  2173     cmp_status->ncomps = nbytes - 4;                                    \
  2174     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
  2175   } while (0)
  2176 
  2177 
  2178 /* Start of Emacs 20 style format for relative composition.
       
          Marks CMP_STATUS as an old-form COMPOSITION_RELATIVE composition
          in the COMPOSING_CHAR state, with `length' set to
          MAX_ANNOTATION_LENGTH and both counters zeroed, then writes an
          annotation header whose NCHARS and NBYTES fields are 0 with
          ADD_COMPOSITION_DATA.  No source byte is read.  */
  2179 
  2180 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
  2181   do {                                                          \
  2182     cmp_status->old_form = 1;                                   \
  2183     cmp_status->method = COMPOSITION_RELATIVE;                  \
  2184     cmp_status->state = COMPOSING_CHAR;                         \
  2185     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
  2186     cmp_status->nchars = cmp_status->ncomps = 0;                \
  2187     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  2188   } while (0)
  2189 
  2190 
  2191 /* Start of Emacs 20 style format for rule-base composition.
       
          Identical to DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION except
          that the method recorded in CMP_STATUS, and in the annotation
          header, is COMPOSITION_WITH_RULE.  */
  2192 
  2193 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
  2194   do {                                                          \
  2195     cmp_status->old_form = 1;                                   \
  2196     cmp_status->method = COMPOSITION_WITH_RULE;                 \
  2197     cmp_status->state = COMPOSING_CHAR;                         \
  2198     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
  2199     cmp_status->nchars = cmp_status->ncomps = 0;                \
  2200     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  2201   } while (0)
  2202 
  2203 
       /* Start a composition.  The byte following the starter is read
          into the caller's variable C and selects the format:
       
            a negative value or a byte below 0xA0
                                 jump to `invalid_code';
            0xF2 + COMPOSITION_RELATIVE
              .. 0xF2 + COMPOSITION_WITH_RULE_ALTCHARS
                                 Emacs 21 style header;
            0xA0 .. 0xBF         Emacs 20 style relative composition; SRC
                                 is moved back so that the byte is read
                                 again as a composition component;
            0xFF                 Emacs 20 style rule-base composition;
            anything else        jump to `invalid_code'.  */
  2204 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
  2205   do {                                                  \
  2206     const unsigned char *current_src = src;             \
  2207                                                         \
  2208     ONE_MORE_BYTE (c);                                  \
  2209     if (c < 0)                                          \
  2210       goto invalid_code;                                \
  2211     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
  2212         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
  2213       DECODE_EMACS_MULE_21_COMPOSITION ();              \
  2214     else if (c < 0xA0)                                  \
  2215       goto invalid_code;                                \
  2216     else if (c < 0xC0)                                  \
  2217       {                                                 \
  2218         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
  2219         /* Re-read C as a composition component.  */    \
  2220         src = current_src;                              \
  2221       }                                                 \
  2222     else if (c == 0xFF)                                 \
  2223       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
  2224     else                                                \
  2225       goto invalid_code;                                \
  2226   } while (0)
  2227 
       /* Finish a composition that ended normally.  The annotation header
          starts CMP_STATUS->length elements before the caller's CHARBUF.
          For an old-form composition the number of characters seen is
          stored in header slot 2.  For a new-form composition whose method
          is above COMPOSITION_RELATIVE, header slot 0 is set to slot 2
          minus CMP_STATUS->length.  In every case the state is reset to
          COMPOSING_NO.  */
  2228 #define EMACS_MULE_COMPOSITION_END()                            \
  2229   do {                                                          \
  2230     int idx = - cmp_status->length;                             \
  2231                                                                 \
  2232     if (cmp_status->old_form)                                   \
  2233       charbuf[idx + 2] = cmp_status->nchars;                    \
  2234     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
  2235       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
  2236     cmp_status->state = COMPOSING_NO;                           \
  2237   } while (0)
  2238 
  2239 
  2240 static int
  2241 emacs_mule_finish_composition (int *charbuf,
  2242                                struct composition_status *cmp_status)
  2243 {
  2244   int idx = - cmp_status->length;
  2245   int new_chars;
  2246 
  2247   if (cmp_status->old_form && cmp_status->nchars > 0)
  2248     {
  2249       charbuf[idx + 2] = cmp_status->nchars;
  2250       new_chars = 0;
  2251       if (cmp_status->method == COMPOSITION_WITH_RULE
  2252           && cmp_status->state == COMPOSING_CHAR)
  2253         {
  2254           /* The last rule was invalid.  */
  2255           int rule = charbuf[-1] + 0xA0;
  2256 
  2257           charbuf[-2] = BYTE8_TO_CHAR (rule);
  2258           charbuf[-1] = -1;
  2259           new_chars = 1;
  2260         }
  2261     }
  2262   else
  2263     {
  2264       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
  2265 
  2266       if (cmp_status->method == COMPOSITION_WITH_RULE)
  2267         {
  2268           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
  2269           charbuf[idx++] = -3;
  2270           charbuf[idx++] = 0;
  2271           new_chars = 1;
  2272         }
  2273       else
  2274         {
  2275           int nchars = charbuf[idx + 1] + 0xA0;
  2276           int nbytes = charbuf[idx + 2] + 0xA0;
  2277 
  2278           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
  2279           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
  2280           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
  2281           charbuf[idx++] = -1;
  2282           new_chars = 4;
  2283         }
  2284     }
  2285   cmp_status->state = COMPOSING_NO;
  2286   return new_chars;
  2287 }
  2288 
  /* If a composition is in progress (the state is not COMPOSING_NO),
     finish it with emacs_mule_finish_composition and add its return
     value to `char_offset'.  Does nothing otherwise.

     The expansion site must provide `cmp_status', `charbuf' and
     `char_offset'.  */
  #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                           \
    do {                                                                  \
      if (cmp_status->state == COMPOSING_NO)                              \
        break;                                                            \
      char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
    } while (0)
  2294 
  2295 
  /* Decode the bytes of CODING->source, starting at offset
     CODING->consumed, and append the resulting characters and
     annotations to CODING->charbuf.

     Compositions are tracked in CODING->spec.emacs_mule.cmp_status.  A
     composition that is still open when decoding stops is either
     finished (when CODING_MODE_LAST_BLOCK is set) or moved into the
     `carryover' array, from which the next call restores it.

     On return CODING->consumed_char, CODING->consumed and
     CODING->charbuf_used are updated.  Only input up to SRC_BASE, the
     start of the unit being processed when the loop ended, is reported
     as consumed.  */
  2296 static void
  2297 decode_coding_emacs_mule (struct coding_system *coding)
  2298 {
  2299   const unsigned char *src = coding->source + coding->consumed;
  2300   const unsigned char *src_end = coding->source + coding->src_bytes;
  2301   const unsigned char *src_base;
  2302   int *charbuf = coding->charbuf + coding->charbuf_used;
  2303   /* We may produce two annotations (charset and composition) in one
  2304      loop and one more charset annotation at the end.  */
  2305   int *charbuf_end
  2306     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
  2307       /* We can produce up to 2 characters in a loop.  */
  2308       - 1;
  2309   ptrdiff_t consumed_chars = 0, consumed_chars_base;
  2310   bool multibytep = coding->src_multibyte;
  2311   ptrdiff_t char_offset = coding->produced_char;
  2312   ptrdiff_t last_offset = char_offset;
  2313   int last_id = charset_ascii;
  2314   bool eol_dos
  2315     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  2316   int byte_after_cr = -1;
  2317   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
  2318 
         /* A composition was left open earlier: put its saved elements
            back into charbuf before decoding further.  */
  2319   if (cmp_status->state != COMPOSING_NO)
  2320     {
  2321       int i;
  2322 
  2323       if (charbuf_end - charbuf < cmp_status->length)
  2324         emacs_abort ();
  2325       for (i = 0; i < cmp_status->length; i++)
  2326         *charbuf++ = cmp_status->carryover[i];
  2327       coding->annotated = 1;
  2328     }
  2329 
  2330   while (1)
  2331     {
  2332       int c;
  2333       int id UNINIT;
  2334 
             /* Remember where this unit starts, so that it can be re-read
                at invalid_code or left unconsumed when the loop ends.  */
  2335       src_base = src;
  2336       consumed_chars_base = consumed_chars;
  2337 
  2338       if (charbuf >= charbuf_end)
  2339         {
                 /* The output buffer is full.  If a byte was pre-read
                    after a CR, step back so that it is not reported as
                    consumed.  */
  2340           if (byte_after_cr >= 0)
  2341             src_base--;
  2342           break;
  2343         }
  2344 
  2345       if (byte_after_cr >= 0)
  2346         c = byte_after_cr, byte_after_cr = -1;
  2347       else
  2348         ONE_MORE_BYTE (c);
  2349 
             /* 0x80 introduces a composition sequence.  A negative C is
                stored as -C.  NOTE(review): presumably ONE_MORE_BYTE
                negates values it cannot deliver as a plain byte --
                confirm against the macro.  */
  2350       if (c < 0 || c == 0x80)
  2351         {
  2352           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
  2353           if (c < 0)
  2354             {
  2355               *charbuf++ = -c;
  2356               char_offset++;
  2357             }
  2358           else
  2359             DECODE_EMACS_MULE_COMPOSITION_START ();
  2360           continue;
  2361         }
  2362 
  2363       if (c < 0x80)
  2364         {
                 /* With DOS-style EOL, pre-read the byte after a CR; it
                    becomes C on the next iteration.  */
  2365           if (eol_dos && c == '\r')
  2366             ONE_MORE_BYTE (byte_after_cr);
  2367           id = charset_ascii;
  2368           if (cmp_status->state != COMPOSING_NO)
  2369             {
  2370               if (cmp_status->old_form)
  2371                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
  2372               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
  2373                 cmp_status->ncomps--;
  2374             }
  2375         }
  2376       else
  2377         {
  2378           int nchars UNINIT, nbytes UNINIT;
  2379           /* emacs_mule_char can load a charset map from a file, which
  2380              allocates a large structure and might cause buffer text
  2381              to be relocated as result.  Thus, we need to remember the
  2382              original pointer to buffer text, and fix up all related
  2383              pointers after the call.  */
  2384           const unsigned char *orig = coding->source;
  2385           ptrdiff_t offset;
  2386 
  2387           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
  2388                                cmp_status);
  2389           offset = coding->source - orig;
  2390           if (offset)
  2391             {
  2392               src += offset;
  2393               src_base += offset;
  2394               src_end += offset;
  2395             }
                 /* -1 is handled as an invalid sequence and -2 stops
                    decoding.  Other negative values fall through and are
                    handled by the composition states below.
                    NOTE(review): -2 presumably means more source bytes
                    are needed -- confirm in emacs_mule_char.  */
  2396           if (c < 0)
  2397             {
  2398               if (c == -1)
  2399                 goto invalid_code;
  2400               if (c == -2)
  2401                 break;
  2402             }
  2403           src = src_base + nbytes;
  2404           consumed_chars = consumed_chars_base + nchars;
  2405           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
  2406             cmp_status->ncomps -= nchars;
  2407         }
  2408 
  2409       /* Now if C >= 0, we found a normally encoded character, if C <
  2410          0, we found an old-style composition component character or
  2411          rule.  */
  2412 
  2413       if (cmp_status->state == COMPOSING_NO)
  2414         {
                 /* When the charset changes, emit a charset annotation
                    for the run that just ended; runs of ASCII get
                    none.  */
  2415           if (last_id != id)
  2416             {
  2417               if (last_id != charset_ascii)
  2418                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
  2419                                   last_id);
  2420               last_id = id;
  2421               last_offset = char_offset;
  2422             }
  2423           *charbuf++ = c;
  2424           char_offset++;
  2425         }
  2426       else if (cmp_status->state == COMPOSING_CHAR)
  2427         {
  2428           if (cmp_status->old_form)
  2429             {
  2430               if (c >= 0)
  2431                 {
  2432                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
  2433                   *charbuf++ = c;
  2434                   char_offset++;
  2435                 }
  2436               else
  2437                 {
  2438                   *charbuf++ = -c;
  2439                   cmp_status->nchars++;
  2440                   cmp_status->length++;
  2441                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
  2442                     EMACS_MULE_COMPOSITION_END ();
  2443                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
  2444                     cmp_status->state = COMPOSING_RULE;
  2445                 }
  2446             }
  2447           else
  2448             {
                     /* New form: nchars counts down the characters still
                        expected; the composition ends at zero.  */
  2449               *charbuf++ = c;
  2450               cmp_status->length++;
  2451               cmp_status->nchars--;
  2452               if (cmp_status->nchars == 0)
  2453                 EMACS_MULE_COMPOSITION_END ();
  2454             }
  2455         }
  2456       else if (cmp_status->state == COMPOSING_RULE)
  2457         {
  2458           int rule;
  2459 
  2460           if (c >= 0)
  2461             {
  2462               EMACS_MULE_COMPOSITION_END ();
  2463               *charbuf++ = c;
  2464               char_offset++;
  2465             }
  2466           else
  2467             {
  2468               c = -c;
  2469               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
  2470               if (rule < 0)
  2471                 goto invalid_code;
                     /* A rule is stored as the pair -2, RULE.  */
  2472               *charbuf++ = -2;
  2473               *charbuf++ = rule;
  2474               cmp_status->length += 2;
  2475               cmp_status->state = COMPOSING_CHAR;
  2476             }
  2477         }
  2478       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
  2479         {
  2480           *charbuf++ = c;
  2481           cmp_status->length++;
  2482           if (cmp_status->ncomps == 0)
  2483             cmp_status->state = COMPOSING_CHAR;
  2484           else if (cmp_status->ncomps > 0)
  2485             {
  2486               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
  2487                 cmp_status->state = COMPOSING_COMPONENT_RULE;
  2488             }
  2489           else
  2490             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
  2491         }
  2492       else                      /* COMPOSING_COMPONENT_RULE */
  2493         {
  2494           int rule;
  2495 
  2496           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
  2497           if (rule < 0)
  2498             goto invalid_code;
  2499           *charbuf++ = -2;
  2500           *charbuf++ = rule;
  2501           cmp_status->length += 2;
  2502           cmp_status->ncomps--;
  2503           if (cmp_status->ncomps > 0)
  2504             cmp_status->state = COMPOSING_COMPONENT_CHAR;
  2505           else
  2506             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
  2507         }
  2508       continue;
  2509 
  2510     invalid_code:
             /* Close any open composition, rewind to the start of the
                unit and pass its first byte through: unchanged when it is
                ASCII, otherwise converted with BYTE8_TO_CHAR.  */
  2511       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
  2512       src = src_base;
  2513       consumed_chars = consumed_chars_base;
  2514       ONE_MORE_BYTE (c);
  2515       *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
  2516       char_offset++;
  2517     }
  2518 
         /* NOTE(review): no jump to this label is visible in this function;
            presumably ONE_MORE_BYTE jumps here when the source is
            exhausted -- confirm against the macro.  */
  2519  no_more_source:
  2520   if (cmp_status->state != COMPOSING_NO)
  2521     {
  2522       if (coding->mode & CODING_MODE_LAST_BLOCK)
  2523         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
  2524       else
  2525         {
                 /* This is not the last block: move the partial
                    composition out of charbuf into the carryover
                    area.  */
  2526           int i;
  2527 
  2528           charbuf -= cmp_status->length;
  2529           for (i = 0; i < cmp_status->length; i++)
  2530             cmp_status->carryover[i] = charbuf[i];
  2531         }
  2532     }
  2533   if (last_id != charset_ascii)
  2534     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  2535   coding->consumed_char += consumed_chars_base;
  2536   coding->consumed = src_base - coding->source;
  2537   coding->charbuf_used = charbuf - coding->charbuf;
  2538 }
  2539 
  2540 
  /* Store in CODES, an array of at least two elements, the leading
     code(s) for the emacs-mule charset identifier ID:

       ID < 0xA0          CODES[0] = ID,   CODES[1] = 0
       0xA0 <= ID < 0xE0  CODES[0] = 0x9A, CODES[1] = ID
       0xE0 <= ID < 0xF0  CODES[0] = 0x9B, CODES[1] = ID
       0xF0 <= ID < 0xF5  CODES[0] = 0x9C, CODES[1] = ID
       otherwise          CODES[0] = 0x9D, CODES[1] = ID

     Both arguments are parenthesized in the expansion, so expressions
     such as `base | 1' are handled correctly, but they are still
     evaluated more than once and must not have side effects.

     The expansion deliberately has no trailing semicolon: the caller
     supplies it, which keeps the macro usable as the body of an `if'
     that has an `else'.  */
  #define EMACS_MULE_LEADING_CODES(id, codes)     \
    do {                                          \
      if ((id) < 0xA0)                            \
        (codes)[0] = (id), (codes)[1] = 0;        \
      else if ((id) < 0xE0)                       \
        (codes)[0] = 0x9A, (codes)[1] = (id);     \
      else if ((id) < 0xF0)                       \
        (codes)[0] = 0x9B, (codes)[1] = (id);     \
      else if ((id) < 0xF5)                       \
        (codes)[0] = 0x9C, (codes)[1] = (id);     \
      else                                        \
        (codes)[0] = 0x9D, (codes)[1] = (id);     \
    } while (0)
  2554 
  2555 
  /* Encode the CODING->charbuf_used elements of CODING->charbuf and
     store the resulting bytes at CODING->destination + CODING->produced.

     Negative elements introduce annotations.  ASCII characters, and
     characters for which CHAR_BYTE8_P holds (after CHAR_TO_BYTE8), are
     emitted as one byte.  Any other character is emitted as the leading
     code(s) of its charset followed by one or two code bytes with the
     high bit set.

     At the end CODING_RESULT_SUCCESS is recorded, CODING->produced_char
     and CODING->produced are updated, and 0 is returned.  */
  2556 static bool
  2557 encode_coding_emacs_mule (struct coding_system *coding)
  2558 {
         /* NOTE(review): MULTIBYTEP is not referenced directly below, and
            PRODUCED_CHARS is never incremented directly; presumably the
            ASSURE_DESTINATION / EMIT_* macros use them -- confirm.  */
  2559   bool multibytep = coding->dst_multibyte;
  2560   int *charbuf = coding->charbuf;
  2561   int *charbuf_end = charbuf + coding->charbuf_used;
  2562   unsigned char *dst = coding->destination + coding->produced;
  2563   unsigned char *dst_end = coding->destination + coding->dst_bytes;
  2564   int safe_room = 8;
  2565   ptrdiff_t produced_chars = 0;
  2566   Lisp_Object attrs, charset_list;
  2567   int c;
  2568   int preferred_charset_id = -1;
  2569 
  2570   CODING_GET_INFO (coding, attrs, charset_list);
         /* Always encode against Vemacs_mule_charset_list, and store that
            list in ATTRS if a different one was there.  */
  2571   if (! EQ (charset_list, Vemacs_mule_charset_list))
  2572     {
  2573       charset_list = Vemacs_mule_charset_list;
  2574       ASET (attrs, coding_attr_charset_list, charset_list);
  2575     }
  2576 
  2577   while (charbuf < charbuf_end)
  2578     {
  2579       ASSURE_DESTINATION (safe_room);
  2580       c = *charbuf++;
  2581 
  2582       if (c < 0)
  2583         {
  2584           /* Handle an annotation.  */
                 /* CHARBUF now points at the element after C, which holds
                    the annotation type.  */
  2585           switch (*charbuf)
  2586             {
  2587             case CODING_ANNOTATE_COMPOSITION_MASK:
  2588               /* Not yet implemented.  */
  2589               break;
  2590             case CODING_ANNOTATE_CHARSET_MASK:
                     /* NOTE(review): index 3 is relative to the type slot.
                        Verify against the layout written by
                        ADD_CHARSET_DATA that this is the charset id
                        slot.  */
  2591               preferred_charset_id = charbuf[3];
  2592               if (preferred_charset_id >= 0
  2593                   && NILP (Fmemq (make_fixnum (preferred_charset_id),
  2594                                   charset_list)))
  2595                 preferred_charset_id = -1;
  2596               break;
  2597             default:
  2598               emacs_abort ();
  2599             }
                 /* Skip the remaining -C - 1 elements of the annotation.  */
  2600           charbuf += -c - 1;
  2601           continue;
  2602         }
  2603 
  2604       if (ASCII_CHAR_P (c))
  2605         EMIT_ONE_ASCII_BYTE (c);
  2606       else if (CHAR_BYTE8_P (c))
  2607         {
  2608           c = CHAR_TO_BYTE8 (c);
  2609           EMIT_ONE_BYTE (c);
  2610         }
  2611       else
  2612         {
  2613           struct charset *charset;
  2614           unsigned code;
  2615           int dimension;
  2616           int emacs_mule_id;
  2617           unsigned char leading_codes[2];
  2618 
                 /* Try the charset requested by the last charset
                    annotation first; fall back to searching the whole
                    list when it cannot encode C.  */
  2619           if (preferred_charset_id >= 0)
  2620             {
  2621               bool result;
  2622 
  2623               charset = CHARSET_FROM_ID (preferred_charset_id);
  2624               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
  2625               if (result)
  2626                 code = ENCODE_CHAR (charset, c);
  2627               else
  2628                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
  2629                                      &code, charset);
  2630             }
  2631           else
  2632             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
  2633                                  &code, charset);
                 /* No charset was found for C: substitute
                    CODING->default_char.  NOTE(review): CHARSET is used
                    below without a null check after this second lookup --
                    confirm the default character is always encodable.  */
  2634           if (! charset)
  2635             {
  2636               c = coding->default_char;
  2637               if (ASCII_CHAR_P (c))
  2638                 {
  2639                   EMIT_ONE_ASCII_BYTE (c);
  2640                   continue;
  2641                 }
  2642               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
  2643                                    &code, charset);
  2644             }
  2645           dimension = CHARSET_DIMENSION (charset);
  2646           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
  2647           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
  2648           EMIT_ONE_BYTE (leading_codes[0]);
                 /* A zero second leading code means there is only one.  */
  2649           if (leading_codes[1])
  2650             EMIT_ONE_BYTE (leading_codes[1]);
                 /* One code byte for dimension 1, otherwise two; each has
                    its high bit set.  */
  2651           if (dimension == 1)
  2652             EMIT_ONE_BYTE (code | 0x80);
  2653           else
  2654             {
  2655               code |= 0x8080;
  2656               EMIT_ONE_BYTE (code >> 8);
  2657               EMIT_ONE_BYTE (code & 0xFF);
  2658             }
  2659         }
  2660     }
  2661   record_conversion_result (coding, CODING_RESULT_SUCCESS);
  2662   coding->produced_char += produced_chars;
  2663   coding->produced = dst - coding->destination;
  2664   return 0;
  2665 }
  2666 
  2667 
  2668 /*** 7. ISO2022 handlers ***/
  2669 
  2670 /* The following note describes the coding system ISO2022 briefly.
  2671    Since the intention of this note is to help understand the
  2672    functions in this file, some parts are NOT ACCURATE or are OVERLY
  2673    SIMPLIFIED.  For thorough understanding, please refer to the
  2674    original document of ISO2022.  This is equivalent to the standard
  2675    ECMA-35, obtainable from <URL:https://www.ecma.ch/> (*).
  2676 
  2677    ISO2022 provides many mechanisms to encode several character sets
  2678    in 7-bit and 8-bit environments.  For 7-bit environments, all text
  2679    is encoded using bytes less than 128.  This may make the encoded
  2680    text a little bit longer, but the text passes more easily through
  2681    several types of gateway, some of which strip off the MSB (Most
  2682    Significant Bit).
  2683 
  2684    There are two kinds of character sets: control character sets and
  2685    graphic character sets.  The former contain control characters such
  2686    as `newline' and `escape' to provide control functions (control
  2687    functions are also provided by escape sequences).  The latter
  2688    contain graphic characters such as 'A' and '-'.  Emacs recognizes
  2689    two control character sets and many graphic character sets.
  2690 
  2691    Graphic character sets are classified into one of the following
  2692    four classes, according to the number of bytes (DIMENSION) and
  2693    number of characters in one dimension (CHARS) of the set:
  2694    - DIMENSION1_CHARS94
  2695    - DIMENSION1_CHARS96
  2696    - DIMENSION2_CHARS94
  2697    - DIMENSION2_CHARS96
  2698 
  2699    In addition, each character set is assigned an identification tag,
  2700    unique for each set, called the "final character" (denoted as <F>
  2701    hereafter).  The <F> of each character set is decided by ECMA(*)
  2702    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
  2703    (0x30..0x3F are for private use only).
  2704 
  2705    Note (*): ECMA = European Computer Manufacturers Association
  2706 
  2707    Here are examples of graphic character sets [NAME(<F>)]:
  2708         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
  2709         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
  2710         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
  2711         o DIMENSION2_CHARS96 -- none for the moment
  2712 
  2713    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
  2714         C0 [0x00..0x1F] -- control character plane 0
  2715         GL [0x20..0x7F] -- graphic character plane 0
  2716         C1 [0x80..0x9F] -- control character plane 1
  2717         GR [0xA0..0xFF] -- graphic character plane 1
  2718 
  2719    A control character set is directly designated and invoked to C0 or
  2720    C1 by an escape sequence.  The most common case is that:
  2721    - ISO646's  control character set is designated/invoked to C0, and
  2722    - ISO6429's control character set is designated/invoked to C1,
  2723    and usually these designations/invocations are omitted in encoded
  2724    text.  In a 7-bit environment, only C0 can be used, and a control
  2725    character for C1 is encoded by an appropriate escape sequence to
  2726    fit into the environment.  All control characters for C1 are
  2727    defined to have corresponding escape sequences.
  2728 
  2729    A graphic character set is at first designated to one of four
  2730    graphic registers (G0 through G3), then these graphic registers are
  2731    invoked to GL or GR.  These designations and invocations can be
  2732    done independently.  The most common case is that G0 is invoked to
  2733    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
  2734    these invocations and designations are omitted in encoded text.
  2735    In a 7-bit environment, only GL can be used.
  2736 
  2737    When a graphic character set of CHARS94 is invoked to GL, codes
  2738    0x20 and 0x7F of the GL area work as control characters SPACE and
  2739    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
  2740    be used.
  2741 
  2742    There are two ways of invocation: locking-shift and single-shift.
  2743    With locking-shift, the invocation lasts until the next different
  2744    invocation, whereas with single-shift, the invocation affects the
  2745    following character only and doesn't affect the locking-shift
  2746    state.  Invocations are done by the following control characters or
  2747    escape sequences:
  2748 
  2749    ----------------------------------------------------------------------
  2750    abbrev  function                  cntrl escape seq   description
  2751    ----------------------------------------------------------------------
  2752    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
  2753    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
  2754    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
  2755    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
  2756    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
  2757    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
  2758    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
  2759    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
  2760    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
  2761    ----------------------------------------------------------------------
  2762    (*) These are not used by any known coding system.
  2763 
  2764    Control characters for these functions are defined by macros
  2765    ISO_CODE_XXX in `coding.h'.
  2766 
  2767    Designations are done by the following escape sequences:
  2768    ----------------------------------------------------------------------
  2769    escape sequence      description
  2770    ----------------------------------------------------------------------
  2771    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
  2772    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
  2773    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
  2774    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
  2775    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
  2776    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
  2777    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
  2778    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
  2779    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
  2780    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
  2781    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
  2782    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
  2783    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
  2784    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
  2785    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
  2786    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
  2787    ----------------------------------------------------------------------
  2788 
  2789    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
  2790    of dimension 1, chars 94, and final character <F>, etc...
  2791 
  2792    Note (*): Although these designations are not allowed in ISO2022,
  2793    Emacs accepts them on decoding, and produces them on encoding
  2794    CHARS96 character sets in a coding system which is characterized as
  2795    7-bit environment, non-locking-shift, and non-single-shift.
  2796 
  2797    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
  2798    '(' must be omitted.  We refer to this as "short-form" hereafter.
  2799 
  2800    Now you may notice that there are a lot of ways of encoding the
  2801    same multilingual text in ISO2022.  Actually, there exist many
  2802    coding systems such as Compound Text (used in X11's inter-client
  2803    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
  2804    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
  2805    localized platforms), and all of these are variants of ISO2022.
  2806 
  2807    In addition to the above, Emacs handles two more kinds of escape
  2808    sequences: ISO6429's direction specification and Emacs' private
  2809    sequence for specifying character composition.
  2810 
  2811    ISO6429's direction specification takes the following form:
  2812         o CSI ']'      -- end of the current direction
  2813         o CSI '0' ']'  -- end of the current direction
  2814         o CSI '1' ']'  -- start of left-to-right text
  2815         o CSI '2' ']'  -- start of right-to-left text
  2816    The control character CSI (0x9B: control sequence introducer) is
  2817    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
  2818 
  2819    Character composition specification takes the following form:
  2820         o ESC '0' -- start relative composition
  2821         o ESC '1' -- end composition
  2822         o ESC '2' -- start rule-base composition (*)
  2823         o ESC '3' -- start relative composition with alternate chars  (**)
  2824         o ESC '4' -- start rule-base composition with alternate chars  (**)
  2825   Since these are not standard escape sequences of any ISO standard,
  2826   the use of them with these meanings is restricted to Emacs only.
  2827 
  2828   (*) This form is used only in Emacs 20.7 and older versions,
  2829   but newer versions can safely decode it.
  2830   (**) This form is used only in Emacs 21.1 and newer versions,
  2831   and older versions can't decode it.
  2832 
  2833   Here's a list of example usages of these composition escape
  2834   sequences (categorized by `enum composition_method').
  2835 
  2836   COMPOSITION_RELATIVE:
  2837         ESC 0 CHAR [ CHAR ] ESC 1
  2838   COMPOSITION_WITH_RULE:
  2839         ESC 2 CHAR [ RULE CHAR ] ESC 1
  2840   COMPOSITION_WITH_ALTCHARS:
  2841         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
  2842   COMPOSITION_WITH_RULE_ALTCHARS:
  2843         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
  2844 
  /* File-local table of 256 `enum iso_code_class_type' entries.
     NOTE(review): presumably indexed by byte value and giving the
     ISO-2022 class of each byte; it is filled in elsewhere -- confirm.  */
  2845 static enum iso_code_class_type iso_code_class[256];
  2846 
  /* Evaluate to 1 if charset ID is within the range covered by
     CODING's `safe_charsets' table (at most `max_charset_id') and its
     entry there is not 255, else to 0.  The table is not read when ID
     is out of range.  Both arguments are evaluated more than once.  */
  #define SAFE_CHARSET_P(coding, id)              \
    (! ((id) > (coding)->max_charset_id           \
        || (coding)->safe_charsets[id] == 255))
  2850 
  2851 static void
  2852 setup_iso_safe_charsets (Lisp_Object attrs)
  2853 {
  2854   Lisp_Object charset_list, safe_charsets;
  2855   Lisp_Object request;
  2856   Lisp_Object reg_usage;
  2857   Lisp_Object tail;
  2858   EMACS_INT reg94, reg96;
  2859   int flags = XFIXNUM (AREF (attrs, coding_attr_iso_flags));
  2860   int max_charset_id;
  2861 
  2862   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
  2863   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
  2864       && ! EQ (charset_list, Viso_2022_charset_list))
  2865     {
  2866       charset_list = Viso_2022_charset_list;
  2867       ASET (attrs, coding_attr_charset_list, charset_list);
  2868       ASET (attrs, coding_attr_safe_charsets, Qnil);
  2869     }
  2870 
  2871   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
  2872     return;
  2873 
  2874   max_charset_id = 0;
  2875   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
  2876     {
  2877       int id = XFIXNUM (XCAR (tail));
  2878       if (max_charset_id < id)
  2879         max_charset_id = id;
  2880     }
  2881 
  2882   safe_charsets = make_uninit_string (max_charset_id + 1);
  2883   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
  2884   request = AREF (attrs, coding_attr_iso_request);
  2885   reg_usage = AREF (attrs, coding_attr_iso_usage);
  2886   reg94 = XFIXNUM (XCAR (reg_usage));
  2887   reg96 = XFIXNUM (XCDR (reg_usage));
  2888 
  2889   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
  2890     {
  2891       Lisp_Object id;
  2892       Lisp_Object reg;
  2893       struct charset *charset;
  2894 
  2895       id = XCAR (tail);
  2896       charset = CHARSET_FROM_ID (XFIXNUM (id));
  2897       reg = Fcdr (Fassq (id, request));
  2898       if (! NILP (reg))
  2899         SSET (safe_charsets, XFIXNUM (id), XFIXNUM (reg));
  2900       else if (charset->iso_chars_96)
  2901         {
  2902           if (reg96 < 4)
  2903             SSET (safe_charsets, XFIXNUM (id), reg96);
  2904         }
  2905       else
  2906         {
  2907           if (reg94 < 4)
  2908             SSET (safe_charsets, XFIXNUM (id), reg94);
  2909         }
  2910     }
  2911   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
  2912 }
  2913 
  2914 
  2915 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
  2916    Return true if a text is encoded in one of ISO-2022 based coding
  2917    systems.  */
  2918 
  2919 static bool
  2920 detect_coding_iso_2022 (struct coding_system *coding,
  2921                         struct coding_detection_info *detect_info)
  2922 {
  2923   const unsigned char *src = coding->source, *src_base = src;
  2924   const unsigned char *src_end = coding->source + coding->src_bytes;
  2925   bool multibytep = coding->src_multibyte;
  2926   bool single_shifting = 0;
  2927   int id;
  2928   int c, c1;
  2929   ptrdiff_t consumed_chars = 0;
  2930   int i;
  2931   int rejected = 0;
  2932   int found = 0;
  2933   int composition_count = -1;
  2934 
  2935   detect_info->checked |= CATEGORY_MASK_ISO;
  2936 
  2937   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
  2938     {
  2939       struct coding_system *this = &(coding_categories[i]);
  2940       Lisp_Object attrs, val;
  2941 
  2942       if (this->id < 0)
  2943         continue;
  2944       attrs = CODING_ID_ATTRS (this->id);
  2945       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
  2946           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
  2947         setup_iso_safe_charsets (attrs);
  2948       val = CODING_ATTR_SAFE_CHARSETS (attrs);
  2949       this->max_charset_id = SCHARS (val) - 1;
  2950       this->safe_charsets = SDATA (val);
  2951     }
  2952 
  2953   /* A coding system of this category is always ASCII compatible.  */
  2954   src += coding->head_ascii;
  2955 
  2956   while (rejected != CATEGORY_MASK_ISO)
  2957     {
  2958       src_base = src;
  2959       ONE_MORE_BYTE (c);
  2960       switch (c)
  2961         {
  2962         case ISO_CODE_ESC:
  2963           if (inhibit_iso_escape_detection)
  2964             break;
  2965           single_shifting = 0;
  2966           ONE_MORE_BYTE (c);
  2967           if (c == 'N' || c == 'O')
  2968             {
  2969               /* ESC <Fe> for SS2 or SS3.  */
  2970               single_shifting = 1;
  2971               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
  2972             }
  2973           else if (c == '1')
  2974             {
  2975               /* End of composition.  */
  2976               if (composition_count < 0
  2977                   || composition_count > MAX_COMPOSITION_COMPONENTS)
  2978                 /* Invalid */
  2979                 break;
  2980               composition_count = -1;
  2981               found |= CATEGORY_MASK_ISO;
  2982             }
  2983           else if (c >= '0' && c <= '4')
  2984             {
  2985               /* ESC <Fp> for start/end composition.  */
  2986               composition_count = 0;
  2987             }
  2988           else
  2989             {
  2990               if (c >= '(' && c <= '/')
  2991                 {
  2992                   /* Designation sequence for a charset of dimension 1.  */
  2993                   ONE_MORE_BYTE (c1);
  2994                   if (c1 < ' ' || c1 >= 0x80
  2995                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
  2996                     {
  2997                       /* Invalid designation sequence.  Just ignore.  */
  2998                       if (c1 >= 0x80)
  2999                         rejected |= (CATEGORY_MASK_ISO_7BIT
  3000                                      | CATEGORY_MASK_ISO_7_ELSE);
  3001                       break;
  3002                     }
  3003                 }
  3004               else if (c == '$')
  3005                 {
  3006                   /* Designation sequence for a charset of dimension 2.  */
  3007                   ONE_MORE_BYTE (c);
  3008                   if (c >= '@' && c <= 'B')
  3009                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
  3010                     id = iso_charset_table[1][0][c];
  3011                   else if (c >= '(' && c <= '/')
  3012                     {
  3013                       ONE_MORE_BYTE (c1);
  3014                       if (c1 < ' ' || c1 >= 0x80
  3015                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
  3016                         {
  3017                           /* Invalid designation sequence.  Just ignore.  */
  3018                           if (c1 >= 0x80)
  3019                             rejected |= (CATEGORY_MASK_ISO_7BIT
  3020                                          | CATEGORY_MASK_ISO_7_ELSE);
  3021                           break;
  3022                         }
  3023                     }
  3024                   else
  3025                     {
  3026                       /* Invalid designation sequence.  Just ignore it.  */
  3027                       if (c >= 0x80)
  3028                         rejected |= (CATEGORY_MASK_ISO_7BIT
  3029                                      | CATEGORY_MASK_ISO_7_ELSE);
  3030                       break;
  3031                     }
  3032                 }
  3033               else
  3034                 {
  3035                   /* Invalid escape sequence.  Just ignore it.  */
  3036                   if (c >= 0x80)
  3037                     rejected |= (CATEGORY_MASK_ISO_7BIT
  3038                                  | CATEGORY_MASK_ISO_7_ELSE);
  3039                   break;
  3040                 }
  3041 
  3042               /* We found a valid designation sequence for CHARSET.  */
  3043               rejected |= CATEGORY_MASK_ISO_8BIT;
  3044               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
  3045                                   id))
  3046                 found |= CATEGORY_MASK_ISO_7;
  3047               else
  3048                 rejected |= CATEGORY_MASK_ISO_7;
  3049               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
  3050                                   id))
  3051                 found |= CATEGORY_MASK_ISO_7_TIGHT;
  3052               else
  3053                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
  3054               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
  3055                                   id))
  3056                 found |= CATEGORY_MASK_ISO_7_ELSE;
  3057               else
  3058                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
  3059               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
  3060                                   id))
  3061                 found |= CATEGORY_MASK_ISO_8_ELSE;
  3062               else
  3063                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
  3064             }
  3065           break;
  3066 
  3067         case ISO_CODE_SO:
  3068         case ISO_CODE_SI:
  3069           /* Locking shift out/in.  */
  3070           if (inhibit_iso_escape_detection)
  3071             break;
  3072           single_shifting = 0;
  3073           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
  3074           break;
  3075 
  3076         case ISO_CODE_CSI:
  3077           /* Control sequence introducer.  */
  3078           single_shifting = 0;
  3079           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
  3080           found |= CATEGORY_MASK_ISO_8_ELSE;
  3081           goto check_extra_latin;
  3082 
  3083         case ISO_CODE_SS2:
  3084         case ISO_CODE_SS3:
  3085           /* Single shift.   */
  3086           if (inhibit_iso_escape_detection)
  3087             break;
  3088           single_shifting = 0;
  3089           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
  3090           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
  3091               & CODING_ISO_FLAG_SINGLE_SHIFT)
  3092             {
  3093               found |= CATEGORY_MASK_ISO_8_1;
  3094               single_shifting = 1;
  3095             }
  3096           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
  3097               & CODING_ISO_FLAG_SINGLE_SHIFT)
  3098             {
  3099               found |= CATEGORY_MASK_ISO_8_2;
  3100               single_shifting = 1;
  3101             }
  3102           if (single_shifting)
  3103             break;
  3104           goto check_extra_latin;
  3105 
  3106         default:
  3107           if (c < 0)
  3108             continue;
  3109           if (c < 0x80)
  3110             {
  3111               if (composition_count >= 0)
  3112                 composition_count++;
  3113               single_shifting = 0;
  3114               break;
  3115             }
  3116           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
  3117           if (c >= 0xA0)
  3118             {
  3119               found |= CATEGORY_MASK_ISO_8_1;
  3120               /* Check the length of succeeding codes of the range
  3121                  0xA0..0xFF.  If the byte length is even, we include
  3122                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
  3123                  only when we are not single shifting.  */
  3124               if (! single_shifting
  3125                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
  3126                 {
  3127                   ptrdiff_t len = 1;
  3128                   while (src < src_end)
  3129                     {
  3130                       src_base = src;
  3131                       ONE_MORE_BYTE (c);
  3132                       if (c < 0xA0)
  3133                         {
  3134                           src = src_base;
  3135                           break;
  3136                         }
  3137                       len++;
  3138                     }
  3139 
  3140                   if (len & 1 && src < src_end)
  3141                     {
  3142                       rejected |= CATEGORY_MASK_ISO_8_2;
  3143                       if (composition_count >= 0)
  3144                         composition_count += len;
  3145                     }
  3146                   else
  3147                     {
  3148                       found |= CATEGORY_MASK_ISO_8_2;
  3149                       if (composition_count >= 0)
  3150                         composition_count += len / 2;
  3151                     }
  3152                 }
  3153               break;
  3154             }
  3155         check_extra_latin:
  3156           if (! VECTORP (Vlatin_extra_code_table)
  3157               || NILP (AREF (Vlatin_extra_code_table, c)))
  3158             {
  3159               rejected = CATEGORY_MASK_ISO;
  3160               break;
  3161             }
  3162           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
  3163               & CODING_ISO_FLAG_LATIN_EXTRA)
  3164             found |= CATEGORY_MASK_ISO_8_1;
  3165           else
  3166             rejected |= CATEGORY_MASK_ISO_8_1;
  3167           rejected |= CATEGORY_MASK_ISO_8_2;
  3168           break;
  3169         }
  3170     }
  3171   detect_info->rejected |= CATEGORY_MASK_ISO;
  3172   return 0;
  3173 
  3174  no_more_source:
  3175   detect_info->rejected |= rejected;
  3176   detect_info->found |= (found & ~rejected);
  3177   return 1;
  3178 }
  3179 
  3180 
  3181 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
  3182    escape sequence should be kept.

          The charset ID is looked up with ISO_CHARSET_TABLE (DIM,
          CHARS_96, FINAL).  If FINAL is outside the range '0'..127, if
          the lookup yields a negative ID, or if the charset does not
          satisfy SAFE_CHARSET_P for CODING, the designation of REG is
          recorded as -2 (invalid), CHARS_96 is set to -1, and nothing
          else is done.

          Otherwise the charset ID is stored as the designation of REG,
          after replacing charset_jisx0201_roman by charset_ascii when
          CODING_ISO_FLAG_USE_ROMAN is set, and charset_jisx0208_1978 by
          charset_jisx0208 when CODING_ISO_FLAG_USE_OLDJIS is set.

          The macro refers to the variable `coding' of the expansion
          site, and CHARS_96 must be an assignable variable because the
          macro stores into it.  */
  3183 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
  3184   do {                                                                  \
  3185     int id, prev;                                                       \
  3186                                                                         \
  3187     if (final < '0' || final >= 128                                     \
  3188         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
  3189         || !SAFE_CHARSET_P (coding, id))                                \
  3190       {                                                                 \
  3191         CODING_ISO_DESIGNATION (coding, reg) = -2;  /* invalid */       \
  3192         chars_96 = -1;                                                  \
  3193         break;  /* leave the do-while; nothing more to do */            \
  3194       }                                                                 \
  3195     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
  3196     if (id == charset_jisx0201_roman)                                   \
  3197       {                                                                 \
  3198         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
  3199           id = charset_ascii;                                           \
  3200       }                                                                 \
  3201     else if (id == charset_jisx0208_1978)                               \
  3202       {                                                                 \
  3203         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
  3204           id = charset_jisx0208;                                        \
  3205       }                                                                 \
  3206     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
  3207     /* If there was an invalid designation to REG previously, and this  \
  3208        designation is ASCII to REG, we should keep this designation     \
  3209        sequence.  */                                                    \
  3210     if (prev == -2 && id == charset_ascii)                              \
  3211       chars_96 = -1;                                                    \
  3212   } while (0)
  3213 
  3214 
  3215 /* Handle these composition sequences (ALT: alternate char):
  3216 
  3217    (1) relative composition: ESC 0 CHAR ... ESC 1
  3218    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
  3219    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
  3220    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
  3221 
  3222    When the start sequence (ESC 0/2/3/4) is found, this annotation
  3223    header is produced.
  3224 
  3225         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
  3226 
  3227    Then, upon reading CHAR or RULE (one or two bytes), these codes are
  3228    produced until the end sequence (ESC 1) is found:
  3229 
  3230    (1) CHAR ... CHAR
  3231    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
  3232    (3) ALT ... ALT -1 -1 CHAR ... CHAR
  3233    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
  3234 
  3235    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
  3236    annotation header are updated as below:
  3237 
  3238    (1) LENGTH: unchanged,  NCHARS: number of CHARs
  3239    (2) LENGTH: unchanged,  NCHARS: number of CHARs
  3240    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
  3241    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
  3242 
  3243    If an error is found while composing, the annotation header is
  3244    changed to:
  3245 
  3246         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
  3247 
  3248    and the sequence [ -2 DECODED-RULE ] is changed to the original
  3249    byte sequence as below:
  3250         o the original byte sequence is B: [ B -1 ]
  3251         o the original byte sequence is B1 B2: [ B1 B2 ]
  3252    and the sequence [ -1 -1 ] is changed to the original byte
  3253    sequence:
  3254         [ ESC '0' ]
  3255 */
  3256 
  3257 /* Decode a composition rule C1 and maybe one more byte from the
  3258    source, and set RULE to the encoded composition rule.  If the rule
  3259    is invalid, goto invalid_code.

          C1 is not a parameter: the macro reads the variable `c1' of
          the expansion site, and `invalid_code' must be a label there.
          RULE must be an assignable integer variable.

          With R = C1 - 32:
            o R < 0: the rule is invalid.
            o R < 81 (old format): GREF = R / 9 and NREF = R % 9, where
              a value of 4 is replaced by 10, and RULE is set to
              COMPOSITION_ENCODE_RULE (GREF, NREF).
            o otherwise (new format): one more byte B is fetched with
              ONE_MORE_BYTE, the pair (R - 81, B - 32) is checked with
              COMPOSITION_ENCODE_RULE_VALID, and RULE is set to
              COMPOSITION_ENCODE_RULE (R - 81, B - 32) + 0x100.  */
  3260 
  3261 #define DECODE_COMPOSITION_RULE(rule)                                   \
  3262   do {                                                                  \
  3263     rule = c1 - 32;                                                     \
  3264     if (rule < 0)                                                       \
  3265       goto invalid_code;                                                \
  3266     if (rule < 81)              /* old format (before ver.21) */        \
  3267       {                                                                 \
  3268         int gref = (rule) / 9;                                          \
  3269         int nref = (rule) % 9;                                          \
  3270         if (gref == 4) gref = 10;                                       \
  3271         if (nref == 4) nref = 10;                                       \
  3272         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
  3273       }                                                                 \
  3274     else                        /* new format (after ver.21) */         \
  3275       {                                                                 \
  3276         int b;                                                          \
  3277                                                                         \
  3278         ONE_MORE_BYTE (b);                                              \
  3279         if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32))        \
  3280           goto invalid_code;                                            \
  3281         rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32);             \
  3282         rule += 0x100;   /* Distinguish it from the old format.  */     \
  3283       }                                                                 \
  3284   } while (0)
  3285 
       /* Turn the decoded composition rule RULE back into its original
          byte sequence, stored in charbuf[idx] and charbuf[idx + 1].

          The macro uses the variables `charbuf', `idx' and `new_chars'
          of the expansion site.  RULE is evaluated several times, but
          always before charbuf[idx + 1] is written, so RULE may be
          charbuf[idx + 1] itself.

          GREF and NREF are (RULE % 0x100) / 12 and (RULE % 0x100) % 12.
            o RULE < 0x100 (old format): a value of 10 is replaced by 4,
              the single byte 32 + GREF * 9 + NREF is stored followed by
              -1, and new_chars is incremented by 1.
            o otherwise (new format): the two bytes 32 + 81 + GREF and
              32 + NREF are stored, and new_chars is incremented by 2.  */
  3286 #define ENCODE_COMPOSITION_RULE(rule)                           \
  3287   do {                                                          \
  3288     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
  3289                                                                 \
  3290     if (rule < 0x100)           /* old format */                \
  3291       {                                                         \
  3292         if (gref == 10) gref = 4;                               \
  3293         if (nref == 10) nref = 4;                               \
  3294         charbuf[idx] = 32 + gref * 9 + nref;                    \
  3295         charbuf[idx + 1] = -1;                                  \
  3296         new_chars++;                                            \
  3297       }                                                         \
  3298     else                                /* new format */        \
  3299       {                                                         \
  3300         charbuf[idx] = 32 + 81 + gref;                          \
  3301         charbuf[idx + 1] = 32 + nref;                           \
  3302         new_chars += 2;                                         \
  3303       }                                                         \
  3304   } while (0)
  3305 
  3306 /* Finish the current composition as invalid.

          The data of the composition occupies
          CHARBUF[-CMP_STATUS->length] ... CHARBUF[-1], starting with
          the annotation header.  The first five slots are overwritten
          by ESC, the digit of the start sequence ('0', '2', '3' or '4',
          chosen from CMP_STATUS->method), -2, 0 and -1.  When the
          method is COMPOSITION_WITH_RULE or greater, the remaining
          slots are scanned: each [ -2 DECODED-RULE ] pair is turned
          back into the original rule bytes by ENCODE_COMPOSITION_RULE,
          and each [ -1 -1 ] pair into [ ESC '0' ].

          Set CMP_STATUS->state to COMPOSING_NO.  Return
          CMP_STATUS->nchars plus the counts added for the recovered
          rule bytes (1 or 2 per rule) and [ ESC '0' ] pairs (2 each).  */
  3307 
  3308 static int
  3309 finish_composition (int *charbuf, struct composition_status *cmp_status)
  3310 {
         /* Negative index of the annotation header relative to CHARBUF.  */
  3311   int idx = - cmp_status->length;
  3312   int new_chars;
  3313 
  3314   /* Recover the original ESC sequence */
  3315   charbuf[idx++] = ISO_CODE_ESC;
  3316   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
  3317                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
  3318                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
  3319                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
  3320                     : '4');
         /* Fill the three remaining slots of the former header.  */
  3321   charbuf[idx++] = -2;
  3322   charbuf[idx++] = 0;
  3323   charbuf[idx++] = -1;
  3324   new_chars = cmp_status->nchars;
         /* Presumably every method except COMPOSITION_RELATIVE; only those
            store [ -2 RULE ] or [ -1 -1 ] markers (enum order not visible
            here -- confirm).  */
  3325   if (cmp_status->method >= COMPOSITION_WITH_RULE)
  3326     for (; idx < 0; idx++)
  3327       {
  3328         int elt = charbuf[idx];
  3329 
  3330         if (elt == -2)
  3331           {
                   /* The macro rewrites charbuf[idx] and charbuf[idx + 1]
                      and updates new_chars; the extra idx++ skips the
                      second slot.  */
  3332             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
  3333             idx++;
  3334           }
  3335         else if (elt == -1)
  3336           {
                   /* [ -1 -1 ] becomes [ ESC '0' ]; the loop's own idx++
                      then steps past the '0'.  */
  3337             charbuf[idx++] = ISO_CODE_ESC;
  3338             charbuf[idx] = '0';
  3339             new_chars += 2;
  3340           }
  3341       }
  3342   cmp_status->state = COMPOSING_NO;
  3343   return new_chars;
  3344 }
  3345 
  3346 /* If characters are under composition, finish the composition.

          That is, when cmp_status->state is not COMPOSING_NO, call
          finish_composition and add its return value to char_offset.
          The macro uses the variables `cmp_status', `charbuf' and
          `char_offset' of the expansion site.  */
  3347 #define MAYBE_FINISH_COMPOSITION()                              \
  3348   do {                                                          \
  3349     if (cmp_status->state != COMPOSING_NO)                      \
  3350       char_offset += finish_composition (charbuf, cmp_status);  \
  3351   } while (0)
  3352 
  3353 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
  3354 
  3355    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
  3356    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
  3357    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
  3358    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
  3359 
  3360    Produce this annotation sequence now:
  3361 
  3362    [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]

          C1 is the byte following ESC.  If C1 is '0' while the ALT part
          of an altchar composition is being read (method
          COMPOSITION_WITH_ALTCHARS in state COMPOSING_COMPONENT_CHAR, or
          method COMPOSITION_WITH_RULE_ALTCHARS in state
          COMPOSING_COMPONENT_RULE), no new composition is started;
          instead [ -1 -1 ] is stored, the state becomes COMPOSING_CHAR
          and cmp_status->length grows by 2.

          Otherwise any composition in progress is first finished as
          invalid, the method is chosen from C1, the state is set to
          COMPOSING_CHAR for '0' and '2' and to COMPOSING_COMPONENT_CHAR
          for '3' and '4', the header above is produced with
          ADD_COMPOSITION_DATA, cmp_status->length is set to
          MAX_ANNOTATION_LENGTH, and nchars and ncomps are reset to 0.

          The macro uses the variables `charbuf', `cmp_status',
          `char_offset' and `coding' of the expansion site.
  3363 */
  3364 
  3365 #define DECODE_COMPOSITION_START(c1)                                       \
  3366   do {                                                                     \
  3367     if (c1 == '0'                                                          \
  3368         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
  3369              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
  3370             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
  3371                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
  3372       {                                                                    \
  3373         *charbuf++ = -1;                                                   \
  3374         *charbuf++= -1;                                                    \
  3375         cmp_status->state = COMPOSING_CHAR;                                \
  3376         cmp_status->length += 2;                                           \
  3377       }                                                                    \
  3378     else                                                                   \
  3379       {                                                                    \
  3380         MAYBE_FINISH_COMPOSITION ();                                       \
  3381         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
  3382                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
  3383                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
  3384                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
  3385         cmp_status->state                                                  \
  3386           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
  3387         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
  3388         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
  3389         cmp_status->nchars = cmp_status->ncomps = 0;                       \
  3390         coding->annotated = 1;                                             \
  3391       }                                                                    \
  3392   } while (0)
  3393 
  3394 
  3395 /* Handle composition end sequence ESC 1.

          The end sequence is rejected when no CHAR has been stored
          (nchars == 0), when a COMPOSITION_WITH_RULE composition ends in
          state COMPOSING_CHAR, or when a composition of any other method
          ends in a state other than COMPOSING_CHAR.  In that case the
          composition is finished as invalid and control jumps to the
          label `invalid_code' of the expansion site.

          Otherwise the annotation header at charbuf[-cmp_status->length]
          is completed: the first slot, which holds -LENGTH, is decreased
          by ncomps + 2 for COMPOSITION_WITH_ALTCHARS and by ncomps * 3
          for COMPOSITION_WITH_RULE_ALTCHARS, and the NCHARS slot (the
          third one) receives cmp_status->nchars.  Then char_offset is
          advanced by nchars and the state is reset to COMPOSING_NO.

          The macro uses the variables `charbuf', `cmp_status' and
          `char_offset' of the expansion site.  */
  3396 
  3397 #define DECODE_COMPOSITION_END()                                        \
  3398   do {                                                                  \
  3399     if (cmp_status->nchars == 0                                         \
  3400         || ((cmp_status->state == COMPOSING_CHAR)                       \
  3401             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
  3402       {                                                                 \
  3403         MAYBE_FINISH_COMPOSITION ();                                    \
  3404         goto invalid_code;                                              \
  3405       }                                                                 \
  3406     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
  3407       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
  3408     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
  3409       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
  3410     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
  3411     char_offset += cmp_status->nchars;                                  \
  3412     cmp_status->state = COMPOSING_NO;                                   \
  3413   } while (0)
  3414 
  3415 /* Store a composition rule RULE in charbuf, and update cmp_status.

          The rule is stored as the pair [ -2 RULE ], so
          cmp_status->length grows by 2.  The state is decremented, which
          undoes the increment that STORE_COMPOSITION_CHAR applies after
          a char of a rule-based composition (presumably moving from a
          RULE state back to the matching CHAR state; the enum order is
          not visible here).  The macro uses the variables `charbuf' and
          `cmp_status' of the expansion site.  */
  3416 
  3417 #define STORE_COMPOSITION_RULE(rule)    \
  3418   do {                                  \
  3419     *charbuf++ = -2;                    \
  3420     *charbuf++ = rule;                  \
  3421     cmp_status->length += 2;            \
  3422     cmp_status->state--;                \
  3423   } while (0)
  3424 
  3425 /* Store a composed char or a component char C in charbuf, and update
  3426    cmp_status.

          cmp_status->length grows by 1.  In state COMPOSING_CHAR the
          char is counted in nchars, in any other state in ncomps.  The
          state is then incremented when a rule is to follow, i.e. for
          method COMPOSITION_WITH_RULE, and for method
          COMPOSITION_WITH_RULE_ALTCHARS while in state
          COMPOSING_COMPONENT_CHAR.  The macro uses the variables
          `charbuf' and `cmp_status' of the expansion site.  */
  3427 
  3428 #define STORE_COMPOSITION_CHAR(c)                                       \
  3429   do {                                                                  \
  3430     *charbuf++ = (c);                                                   \
  3431     cmp_status->length++;                                               \
  3432     if (cmp_status->state == COMPOSING_CHAR)                            \
  3433       cmp_status->nchars++;                                             \
  3434     else                                                                \
  3435       cmp_status->ncomps++;                                             \
  3436     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
  3437         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
  3438             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
  3439       cmp_status->state++;                                              \
  3440   } while (0)
  3441 
  3442 
  3443 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
  3444 
  3445 static void
  3446 decode_coding_iso_2022 (struct coding_system *coding)
  3447 {
  3448   const unsigned char *src = coding->source + coding->consumed;
  3449   const unsigned char *src_end = coding->source + coding->src_bytes;
  3450   const unsigned char *src_base;
  3451   int *charbuf = coding->charbuf + coding->charbuf_used;
  3452   /* We may produce two annotations (charset and composition) in one
  3453      loop and one more charset annotation at the end.  */
  3454   int *charbuf_end
  3455     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
  3456   ptrdiff_t consumed_chars = 0, consumed_chars_base;
  3457   bool multibytep = coding->src_multibyte;
  3458   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
  3459   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
  3460   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
  3461   int charset_id_2, charset_id_3;
  3462   struct charset *charset;
  3463   int c;
  3464   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
  3465   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
  3466   ptrdiff_t char_offset = coding->produced_char;
  3467   ptrdiff_t last_offset = char_offset;
  3468   int last_id = charset_ascii;
  3469   bool eol_dos
  3470     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  3471   int byte_after_cr = -1;
  3472   int i;
  3473 
  3474   setup_iso_safe_charsets (attrs);
  3475   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
  3476 
  3477   if (cmp_status->state != COMPOSING_NO)
  3478     {
  3479       if (charbuf_end - charbuf < cmp_status->length)
  3480         emacs_abort ();
  3481       for (i = 0; i < cmp_status->length; i++)
  3482         *charbuf++ = cmp_status->carryover[i];
  3483       coding->annotated = 1;
  3484     }
  3485 
  3486   while (1)
  3487     {
  3488       int c1, c2, c3;
  3489 
  3490       src_base = src;
  3491       consumed_chars_base = consumed_chars;
  3492 
  3493       if (charbuf >= charbuf_end)
  3494         {
  3495           if (byte_after_cr >= 0)
  3496             src_base--;
  3497           break;
  3498         }
  3499 
  3500       if (byte_after_cr >= 0)
  3501         c1 = byte_after_cr, byte_after_cr = -1;
  3502       else
  3503         ONE_MORE_BYTE (c1);
  3504       if (c1 < 0)
  3505         goto invalid_code;
  3506 
  3507       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
  3508         {
  3509           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
  3510           char_offset++;
  3511           CODING_ISO_EXTSEGMENT_LEN (coding)--;
  3512           continue;
  3513         }
  3514 
  3515       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
  3516         {
  3517           if (c1 == ISO_CODE_ESC)
  3518             {
  3519               if (src + 1 >= src_end)
  3520                 goto no_more_source;
  3521               *charbuf++ = ISO_CODE_ESC;
  3522               char_offset++;
  3523               if (src[0] == '%' && src[1] == '@')
  3524                 {
  3525                   src += 2;
  3526                   consumed_chars += 2;
  3527                   char_offset += 2;
  3528                   /* We are sure charbuf can contain two more chars. */
  3529                   *charbuf++ = '%';
  3530                   *charbuf++ = '@';
  3531                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
  3532                 }
  3533             }
  3534           else
  3535             {
  3536               *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
  3537               char_offset++;
  3538             }
  3539           continue;
  3540         }
  3541 
  3542       if ((cmp_status->state == COMPOSING_RULE
  3543            || cmp_status->state == COMPOSING_COMPONENT_RULE)
  3544           && c1 != ISO_CODE_ESC)
  3545         {
  3546           int rule;
  3547 
  3548           DECODE_COMPOSITION_RULE (rule);
  3549           STORE_COMPOSITION_RULE (rule);
  3550           continue;
  3551         }
  3552 
  3553       /* We produce at most one character.  */
  3554       switch (iso_code_class [c1])
  3555         {
  3556         case ISO_0x20_or_0x7F:
  3557           if (charset_id_0 < 0
  3558               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
  3559             /* This is SPACE or DEL.  */
  3560             charset = CHARSET_FROM_ID (charset_ascii);
  3561           else
  3562             charset = CHARSET_FROM_ID (charset_id_0);
  3563           break;
  3564 
  3565         case ISO_graphic_plane_0:
  3566           if (charset_id_0 < 0)
  3567             charset = CHARSET_FROM_ID (charset_ascii);
  3568           else
  3569             charset = CHARSET_FROM_ID (charset_id_0);
  3570           break;
  3571 
  3572         case ISO_0xA0_or_0xFF:
  3573           if (charset_id_1 < 0
  3574               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
  3575               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
  3576             goto invalid_code;
  3577           /* This is a graphic character, we fall down ... */
  3578           FALLTHROUGH;
  3579         case ISO_graphic_plane_1:
  3580           if (charset_id_1 < 0)
  3581             goto invalid_code;
  3582           charset = CHARSET_FROM_ID (charset_id_1);
  3583           break;
  3584 
  3585         case ISO_control_0:
  3586           if (eol_dos && c1 == '\r')
  3587             ONE_MORE_BYTE (byte_after_cr);
  3588           MAYBE_FINISH_COMPOSITION ();
  3589           charset = CHARSET_FROM_ID (charset_ascii);
  3590           break;
  3591 
  3592         case ISO_control_1:
  3593           goto invalid_code;
  3594 
  3595         case ISO_shift_out:
  3596           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
  3597               || CODING_ISO_DESIGNATION (coding, 1) < 0)
  3598             goto invalid_code;
  3599           CODING_ISO_INVOCATION (coding, 0) = 1;
  3600           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
  3601           continue;
  3602 
  3603         case ISO_shift_in:
  3604           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
  3605             goto invalid_code;
  3606           CODING_ISO_INVOCATION (coding, 0) = 0;
  3607           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
  3608           continue;
  3609 
  3610         case ISO_single_shift_2_7:
  3611           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
  3612             goto invalid_code;
  3613           FALLTHROUGH;
  3614         case ISO_single_shift_2:
  3615           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
  3616             goto invalid_code;
  3617           /* SS2 is handled as an escape sequence of ESC 'N' */
  3618           c1 = 'N';
  3619           goto label_escape_sequence;
  3620 
  3621         case ISO_single_shift_3:
  3622           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
  3623             goto invalid_code;
  3624           /* SS3 is handled as an escape sequence of ESC 'O' */
  3625           c1 = 'O';
  3626           goto label_escape_sequence;
  3627 
  3628         case ISO_control_sequence_introducer:
  3629           /* CSI is handled as an escape sequence of ESC '[' ...  */
  3630           c1 = '[';
  3631           goto label_escape_sequence;
  3632 
  3633         case ISO_escape:
  3634           ONE_MORE_BYTE (c1);
  3635         label_escape_sequence:
  3636           /* Escape sequences handled here are invocation,
  3637              designation, direction specification, and character
  3638              composition specification.  */
  3639           switch (c1)
  3640             {
  3641             case '&':           /* revision of following character set */
  3642               ONE_MORE_BYTE (c1);
  3643               if (!(c1 >= '@' && c1 <= '~'))
  3644                 goto invalid_code;
  3645               ONE_MORE_BYTE (c1);
  3646               if (c1 != ISO_CODE_ESC)
  3647                 goto invalid_code;
  3648               ONE_MORE_BYTE (c1);
  3649               goto label_escape_sequence;
  3650 
  3651             case '$':           /* designation of 2-byte character set */
  3652               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
  3653                 goto invalid_code;
  3654               {
  3655                 int reg, chars96;
  3656 
  3657                 ONE_MORE_BYTE (c1);
  3658                 if (c1 >= '@' && c1 <= 'B')
  3659                   {     /* designation of JISX0208.1978, GB2312.1980,
  3660                            or JISX0208.1980 */
  3661                     reg = 0, chars96 = 0;
  3662                   }
  3663                 else if (c1 >= 0x28 && c1 <= 0x2B)
  3664                   { /* designation of DIMENSION2_CHARS94 character set */
  3665                     reg = c1 - 0x28, chars96 = 0;
  3666                     ONE_MORE_BYTE (c1);
  3667                   }
  3668                 else if (c1 >= 0x2C && c1 <= 0x2F)
  3669                   { /* designation of DIMENSION2_CHARS96 character set */
  3670                     reg = c1 - 0x2C, chars96 = 1;
  3671                     ONE_MORE_BYTE (c1);
  3672                   }
  3673                 else
  3674                   goto invalid_code;
  3675                 DECODE_DESIGNATION (reg, 2, chars96, c1);
  3676                 /* We must update these variables now.  */
  3677                 if (reg == 0)
  3678                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
  3679                 else if (reg == 1)
  3680                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
  3681                 if (chars96 < 0)
  3682                   goto invalid_code;
  3683               }
  3684               continue;
  3685 
  3686             case 'n':           /* invocation of locking-shift-2 */
  3687               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
  3688                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
  3689                 goto invalid_code;
  3690               CODING_ISO_INVOCATION (coding, 0) = 2;
  3691               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
  3692               continue;
  3693 
  3694             case 'o':           /* invocation of locking-shift-3 */
  3695               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
  3696                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
  3697                 goto invalid_code;
  3698               CODING_ISO_INVOCATION (coding, 0) = 3;
  3699               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
  3700               continue;
  3701 
  3702             case 'N':           /* invocation of single-shift-2 */
  3703               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
  3704                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
  3705                 goto invalid_code;
  3706               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
  3707               if (charset_id_2 < 0)
  3708                 charset = CHARSET_FROM_ID (charset_ascii);
  3709               else
  3710                 charset = CHARSET_FROM_ID (charset_id_2);
  3711               ONE_MORE_BYTE (c1);
  3712               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
  3713                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
  3714                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
  3715                           ? c1 >= 0x80 : c1 < 0x80)))
  3716                 goto invalid_code;
  3717               break;
  3718 
  3719             case 'O':           /* invocation of single-shift-3 */
  3720               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
  3721                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
  3722                 goto invalid_code;
  3723               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
  3724               if (charset_id_3 < 0)
  3725                 charset = CHARSET_FROM_ID (charset_ascii);
  3726               else
  3727                 charset = CHARSET_FROM_ID (charset_id_3);
  3728               ONE_MORE_BYTE (c1);
  3729               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
  3730                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
  3731                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
  3732                           ? c1 >= 0x80 : c1 < 0x80)))
  3733                 goto invalid_code;
  3734               break;
  3735 
  3736             case '0': case '2': case '3': case '4': /* start composition */
  3737               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
  3738                 goto invalid_code;
  3739               if (last_id != charset_ascii)
  3740                 {
  3741                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
  3742                   last_id = charset_ascii;
  3743                   last_offset = char_offset;
  3744                 }
  3745               DECODE_COMPOSITION_START (c1);
  3746               continue;
  3747 
  3748             case '1':           /* end composition */
  3749               if (cmp_status->state == COMPOSING_NO)
  3750                 goto invalid_code;
  3751               DECODE_COMPOSITION_END ();
  3752               continue;
  3753 
  3754             case '[':           /* specification of direction */
  3755               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
  3756                 goto invalid_code;
  3757               /* For the moment, nested direction is not supported.
  3758                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
  3759                  left-to-right, and nonzero means right-to-left.  */
  3760               ONE_MORE_BYTE (c1);
  3761               switch (c1)
  3762                 {
  3763                 case ']':       /* end of the current direction */
  3764                   coding->mode &= ~CODING_MODE_DIRECTION;
  3765                   break;
  3766 
  3767                 case '0':       /* end of the current direction */
  3768                 case '1':       /* start of left-to-right direction */
  3769                   ONE_MORE_BYTE (c1);
  3770                   if (c1 == ']')
  3771                     coding->mode &= ~CODING_MODE_DIRECTION;
  3772                   else
  3773                     goto invalid_code;
  3774                   break;
  3775 
  3776                 case '2':       /* start of right-to-left direction */
  3777                   ONE_MORE_BYTE (c1);
  3778                   if (c1 == ']')
  3779                     coding->mode |= CODING_MODE_DIRECTION;
  3780                   else
  3781                     goto invalid_code;
  3782                   break;
  3783 
  3784                 default:
  3785                   goto invalid_code;
  3786                 }
  3787               continue;
  3788 
  3789             case '%':
  3790               ONE_MORE_BYTE (c1);
  3791               if (c1 == '/')
  3792                 {
  3793                   /* CTEXT extended segment:
  3794                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
  3795                      We keep these bytes as is for the moment.
  3796                      They may be decoded by post-read-conversion.  */
  3797                   int dim, M, L;
  3798                   int size;
  3799 
  3800                   ONE_MORE_BYTE (dim);
  3801                   if (dim < '0' || dim > '4')
  3802                     goto invalid_code;
  3803                   ONE_MORE_BYTE (M);
  3804                   if (M < 128)
  3805                     goto invalid_code;
  3806                   ONE_MORE_BYTE (L);
  3807                   if (L < 128)
  3808                     goto invalid_code;
  3809                   size = ((M - 128) * 128) + (L - 128);
  3810                   if (charbuf + 6 > charbuf_end)
  3811                     goto break_loop;
  3812                   *charbuf++ = ISO_CODE_ESC;
  3813                   *charbuf++ = '%';
  3814                   *charbuf++ = '/';
  3815                   *charbuf++ = dim;
  3816                   *charbuf++ = BYTE8_TO_CHAR (M);
  3817                   *charbuf++ = BYTE8_TO_CHAR (L);
  3818                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
  3819                 }
  3820               else if (c1 == 'G')
  3821                 {
  3822                   /* XFree86 extension for embedding UTF-8 in CTEXT:
  3823                      ESC % G --UTF-8-BYTES-- ESC % @
  3824                      We keep these bytes as is for the moment.
  3825                      They may be decoded by post-read-conversion.  */
  3826                   if (charbuf + 3 > charbuf_end)
  3827                     goto break_loop;
  3828                   *charbuf++ = ISO_CODE_ESC;
  3829                   *charbuf++ = '%';
  3830                   *charbuf++ = 'G';
  3831                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
  3832                 }
  3833               else
  3834                 goto invalid_code;
  3835               continue;
  3836               break;
  3837 
  3838             default:
  3839               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
  3840                 goto invalid_code;
  3841               {
  3842                 int reg, chars96;
  3843 
  3844                 if (c1 >= 0x28 && c1 <= 0x2B)
  3845                   { /* designation of DIMENSION1_CHARS94 character set */
  3846                     reg = c1 - 0x28, chars96 = 0;
  3847                     ONE_MORE_BYTE (c1);
  3848                   }
  3849                 else if (c1 >= 0x2C && c1 <= 0x2F)
  3850                   { /* designation of DIMENSION1_CHARS96 character set */
  3851                     reg = c1 - 0x2C, chars96 = 1;
  3852                     ONE_MORE_BYTE (c1);
  3853                   }
  3854                 else
  3855                   goto invalid_code;
  3856                 DECODE_DESIGNATION (reg, 1, chars96, c1);
  3857                 /* We must update these variables now.  */
  3858                 if (reg == 0)
  3859                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
  3860                 else if (reg == 1)
  3861                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
  3862                 if (chars96 < 0)
  3863                   goto invalid_code;
  3864               }
  3865               continue;
  3866             }
  3867           break;
  3868 
  3869         default:
  3870           emacs_abort ();
  3871         }
  3872 
  3873       if (cmp_status->state == COMPOSING_NO
  3874           && charset->id != charset_ascii
  3875           && last_id != charset->id)
  3876         {
  3877           if (last_id != charset_ascii)
  3878             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  3879           last_id = charset->id;
  3880           last_offset = char_offset;
  3881         }
  3882 
  3883       /* Now we know CHARSET and 1st position code C1 of a character.
  3884          Produce a decoded character while getting 2nd and 3rd
  3885          position codes C2, C3 if necessary.  */
  3886       if (CHARSET_DIMENSION (charset) > 1)
  3887         {
  3888           ONE_MORE_BYTE (c2);
  3889           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
  3890               || ((c1 & 0x80) != (c2 & 0x80)))
  3891             /* C2 is not in a valid range.  */
  3892             goto invalid_code;
  3893           if (CHARSET_DIMENSION (charset) == 2)
  3894             c1 = (c1 << 8) | c2;
  3895           else
  3896             {
  3897               ONE_MORE_BYTE (c3);
  3898               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
  3899                   || ((c1 & 0x80) != (c3 & 0x80)))
  3900                 /* C3 is not in a valid range.  */
  3901                 goto invalid_code;
  3902               c1 = (c1 << 16) | (c2 << 8) | c2;
  3903             }
  3904         }
  3905       c1 &= 0x7F7F7F;
  3906       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
  3907       if (c < 0)
  3908         {
  3909           MAYBE_FINISH_COMPOSITION ();
  3910           for (; src_base < src; src_base++, char_offset++)
  3911             {
  3912               if (ASCII_CHAR_P (*src_base))
  3913                 *charbuf++ = *src_base;
  3914               else
  3915                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
  3916             }
  3917         }
  3918       else if (cmp_status->state == COMPOSING_NO)
  3919         {
  3920           *charbuf++ = c;
  3921           char_offset++;
  3922         }
  3923       else if ((cmp_status->state == COMPOSING_CHAR
  3924                 ? cmp_status->nchars
  3925                 : cmp_status->ncomps)
  3926                >= MAX_COMPOSITION_COMPONENTS)
  3927         {
  3928           /* Too long composition.  */
  3929           MAYBE_FINISH_COMPOSITION ();
  3930           *charbuf++ = c;
  3931           char_offset++;
  3932         }
  3933       else
  3934         STORE_COMPOSITION_CHAR (c);
  3935       continue;
  3936 
  3937     invalid_code:
  3938       MAYBE_FINISH_COMPOSITION ();
  3939       src = src_base;
  3940       consumed_chars = consumed_chars_base;
  3941       ONE_MORE_BYTE (c);
  3942       *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
  3943       char_offset++;
  3944       /* Reset the invocation and designation status to the safest
  3945          one; i.e. designate ASCII to the graphic register 0, and
  3946          invoke that register to the graphic plane 0.  This typically
  3947          helps the case that a designation sequence for ASCII "ESC (
  3948          B" is somehow broken (e.g. broken by a newline).  */
  3949       CODING_ISO_INVOCATION (coding, 0) = 0;
  3950       CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
  3951       charset_id_0 = charset_ascii;
  3952       continue;
  3953 
  3954     break_loop:
  3955       break;
  3956     }
  3957 
  3958  no_more_source:
  3959   if (cmp_status->state != COMPOSING_NO)
  3960     {
  3961       if (coding->mode & CODING_MODE_LAST_BLOCK)
  3962         MAYBE_FINISH_COMPOSITION ();
  3963       else
  3964         {
  3965           charbuf -= cmp_status->length;
  3966           for (i = 0; i < cmp_status->length; i++)
  3967             cmp_status->carryover[i] = charbuf[i];
  3968         }
  3969     }
  3970   else if (last_id != charset_ascii)
  3971     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  3972   coding->consumed_char += consumed_chars_base;
  3973   coding->consumed = src_base - coding->source;
  3974   coding->charbuf_used = charbuf - coding->charbuf;
  3975 }
  3976 
  3977 
  3978 /* ISO2022 encoding stuff.  */
  3979 
  3980 /*
  3981    It is not enough to say just "ISO2022" on encoding, we have to
  3982    specify more details.  In Emacs, each coding system of ISO2022
  3983    variant has the following specifications:
  3984         1. Initial designation to G0 thru G3.
  3985         2. Allows short-form designation?
  3986         3. ASCII should be designated to G0 before control characters?
  3987         4. ASCII should be designated to G0 at end of line?
  3988         5. 7-bit environment or 8-bit environment?
  3989         6. Use locking-shift?
  3990         7. Use Single-shift?
  3991    And the following two are only for Japanese:
  3992         8. Use ASCII in place of JIS0201-1976-Roman?
  3993         9. Use JISX0208-1983 in place of JISX0208-1978?
  3994    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
  3995    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
  3996    details.
  3997 */
  3998 
  3999 /* Produce codes (escape sequence) for designating CHARSET to graphic
  4000    register REG at DST, and increment DST.  If <final-char> of CHARSET is
  4001    '@', 'A', or 'B' and the coding system CODING allows, produce
  4002    designation sequence of short-form.  */
  4003 
  4004 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
  4005   do {                                                                  \
  4006     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
  4007     const char *intermediate_char_94 = "()*+";                          \
  4008     const char *intermediate_char_96 = ",-./";                          \
  4009     int revision = -1;                                                  \
  4010                                                                         \
  4011     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
  4012       revision = CHARSET_ISO_REVISION (charset);                        \
  4013                                                                         \
  4014     if (revision >= 0)                                                  \
  4015       {                                                                 \
  4016         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
  4017         EMIT_ONE_BYTE ('@' + revision);                                 \
  4018       }                                                                 \
  4019     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
  4020     if (CHARSET_DIMENSION (charset) == 1)                               \
  4021       {                                                                 \
  4022         int b;                                                          \
  4023         if (! CHARSET_ISO_CHARS_96 (charset))                           \
  4024           b = intermediate_char_94[reg];                                \
  4025         else                                                            \
  4026           b = intermediate_char_96[reg];                                \
  4027         EMIT_ONE_ASCII_BYTE (b);                                        \
  4028       }                                                                 \
  4029     else                                                                \
  4030       {                                                                 \
  4031         EMIT_ONE_ASCII_BYTE ('$');                                      \
  4032         if (! CHARSET_ISO_CHARS_96 (charset))                           \
  4033           {                                                             \
  4034             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
  4035                 || reg != 0                                             \
  4036                 || final_char < '@' || final_char > 'B')                \
  4037               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
  4038           }                                                             \
  4039         else                                                            \
  4040           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
  4041       }                                                                 \
  4042     EMIT_ONE_ASCII_BYTE (final_char);                                   \
  4043                                                                         \
  4044     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  4045   } while (0)
  4046 
  4047 
  4048 /* The following two macros produce codes (control character or escape
  4049    sequence) for ISO2022 single-shift functions (single-shift-2 and
  4050    single-shift-3).  */
  4051 
  4052 #define ENCODE_SINGLE_SHIFT_2                                           \
  4053   do {                                                                  \
  4054     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
  4055       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
  4056     else                                                                \
  4057       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
  4058     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  4059   } while (0)
  4060 
  4061 
  4062 #define ENCODE_SINGLE_SHIFT_3                                           \
  4063   do {                                                                  \
  4064     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
  4065       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
  4066     else                                                                \
  4067       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
  4068     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  4069   } while (0)
  4070 
  4071 
  4072 /* The following four macros produce codes (control character or
  4073    escape sequence) for ISO2022 locking-shift functions (shift-in,
  4074    shift-out, locking-shift-2, and locking-shift-3).  */
  4075 
  4076 #define ENCODE_SHIFT_IN                                 \
  4077   do {                                                  \
  4078     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
  4079     CODING_ISO_INVOCATION (coding, 0) = 0;              \
  4080   } while (0)
  4081 
  4082 
  4083 #define ENCODE_SHIFT_OUT                                \
  4084   do {                                                  \
  4085     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
  4086     CODING_ISO_INVOCATION (coding, 0) = 1;              \
  4087   } while (0)
  4088 
  4089 
  4090 #define ENCODE_LOCKING_SHIFT_2                          \
  4091   do {                                                  \
  4092     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
  4093     CODING_ISO_INVOCATION (coding, 0) = 2;              \
  4094   } while (0)
  4095 
  4096 
  4097 #define ENCODE_LOCKING_SHIFT_3                          \
  4098   do {                                                  \
  4099     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
  4100     CODING_ISO_INVOCATION (coding, 0) = 3;              \
  4101   } while (0)
  4102 
  4103 
  4104 /* Produce codes for a DIMENSION1 character whose character set is
  4105    CHARSET and whose position-code is C1.  Designation and invocation
  4106    sequences are also produced in advance if necessary.  */
  4107 
  4108 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  4109   do {                                                                  \
  4110     int id = CHARSET_ID (charset);                                      \
  4111                                                                         \
  4112     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
  4113         && id == charset_ascii)                                         \
  4114       {                                                                 \
  4115         id = charset_jisx0201_roman;                                    \
  4116         charset = CHARSET_FROM_ID (id);                                 \
  4117       }                                                                 \
  4118                                                                         \
  4119     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
  4120       {                                                                 \
  4121         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
  4122           EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                              \
  4123         else                                                            \
  4124           EMIT_ONE_BYTE (c1 | 0x80);                                    \
  4125         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
  4126         break;                                                          \
  4127       }                                                                 \
  4128     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
  4129       {                                                                 \
  4130         EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                                \
  4131         break;                                                          \
  4132       }                                                                 \
  4133     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
  4134       {                                                                 \
  4135         EMIT_ONE_BYTE (c1 | 0x80);                                      \
  4136         break;                                                          \
  4137       }                                                                 \
  4138     else                                                                \
  4139       /* Since CHARSET is not yet invoked to any graphic planes, we     \
  4140          must invoke it, or, at first, designate it to some graphic     \
  4141          register.  Then repeat the loop to actually produce the        \
  4142          character.  */                                                 \
  4143       dst = encode_invocation_designation (charset, coding, dst,        \
  4144                                            &produced_chars);            \
  4145   } while (1)
  4146 
  4147 
  4148 /* Produce codes for a DIMENSION2 character whose character set is
  4149    CHARSET and whose position-codes are C1 and C2.  Designation and
  4150    invocation codes are also produced in advance if necessary.  */
  4151 
  4152 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  4153   do {                                                                  \
  4154     int id = CHARSET_ID (charset);                                      \
  4155                                                                         \
  4156     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
  4157         && id == charset_jisx0208)                                      \
  4158       {                                                                 \
  4159         id = charset_jisx0208_1978;                                     \
  4160         charset = CHARSET_FROM_ID (id);                                 \
  4161       }                                                                 \
  4162                                                                         \
  4163     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
  4164       {                                                                 \
  4165         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
  4166           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
  4167         else                                                            \
  4168           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
  4169         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
  4170         break;                                                          \
  4171       }                                                                 \
  4172     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
  4173       {                                                                 \
  4174         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
  4175         break;                                                          \
  4176       }                                                                 \
  4177     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
  4178       {                                                                 \
  4179         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
  4180         break;                                                          \
  4181       }                                                                 \
  4182     else                                                                \
  4183       /* Since CHARSET is not yet invoked to any graphic planes, we     \
  4184          must invoke it, or, at first, designate it to some graphic     \
  4185          register.  Then repeat the loop to actually produce the        \
  4186          character.  */                                                 \
  4187       dst = encode_invocation_designation (charset, coding, dst,        \
  4188                                            &produced_chars);            \
  4189   } while (1)
  4190 
  4191 
  4192 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
  4193   do {                                                                     \
  4194     unsigned code;                                                         \
  4195     CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);       \
  4196                                                                            \
  4197     if (CHARSET_DIMENSION (charset) == 1)                                  \
  4198       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
  4199     else                                                                   \
  4200       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
  4201   } while (0)
  4202 
  4203 
  4204 /* Produce designation and invocation codes at a place pointed by DST
  4205    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
  4206    Return new DST.  */
       /* *P_NCHARS is copied into the local PRODUCED_CHARS on entry and
          PRODUCED_CHARS is stored back into *P_NCHARS on exit.  The local
          MULTIBYTEP is not referenced directly below; presumably the
          ENCODE_* macros (defined elsewhere) use it together with DST and
          PRODUCED_CHARS -- confirm against the macro definitions.  */
  4207 
  4208 static unsigned char *
  4209 encode_invocation_designation (struct charset *charset,
  4210                                struct coding_system *coding,
  4211                                unsigned char *dst, ptrdiff_t *p_nchars)
  4212 {
  4213   bool multibytep = coding->dst_multibyte;
  4214   ptrdiff_t produced_chars = *p_nchars;
  4215   int reg;                      /* graphic register number */
  4216   int id = CHARSET_ID (charset);
  4217 
  4218   /* At first, check designations.  */
         /* REG ends up as the index of the first register whose current
            designation equals ID, or as 4 if there is none.  */
  4219   for (reg = 0; reg < 4; reg++)
  4220     if (id == CODING_ISO_DESIGNATION (coding, reg))
  4221       break;
  4222 
  4223   if (reg >= 4)
  4224     {
  4225       /* CHARSET is not yet designated to any graphic registers.  */
  4226       /* At first check the requested designation.  */
  4227       reg = CODING_ISO_REQUEST (coding, id);
  4228       if (reg < 0)
  4229         /* Since CHARSET requests no special designation, designate it
  4230            to graphic register 0.  */
  4231         reg = 0;
  4232 
  4233       ENCODE_DESIGNATION (charset, reg, coding);
  4234     }
  4235 
         /* Nothing more is produced when REG is already invoked to graphic
            plane 0 or to graphic plane 1.  */
  4236   if (CODING_ISO_INVOCATION (coding, 0) != reg
  4237       && CODING_ISO_INVOCATION (coding, 1) != reg)
  4238     {
  4239       /* Since the graphic register REG is not invoked to any graphic
  4240          planes, invoke it to graphic plane 0.  */
  4241       switch (reg)
  4242         {
  4243         case 0:                 /* graphic register 0 */
  4244           ENCODE_SHIFT_IN;
  4245           break;
  4246 
  4247         case 1:                 /* graphic register 1 */
  4248           ENCODE_SHIFT_OUT;
  4249           break;
  4250 
  4251         case 2:                 /* graphic register 2 */
                 /* A single shift is used when the coding system has the
                    single-shift flag, a locking shift otherwise.  */
  4252           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
  4253             ENCODE_SINGLE_SHIFT_2;
  4254           else
  4255             ENCODE_LOCKING_SHIFT_2;
  4256           break;
  4257 
  4258         case 3:                 /* graphic register 3 */
  4259           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
  4260             ENCODE_SINGLE_SHIFT_3;
  4261           else
  4262             ENCODE_LOCKING_SHIFT_3;
  4263           break;
  4264 
  4265         default:
                 /* Taken only when REG is outside 0..3; nothing is
                    produced in that case.  */
  4266           break;
  4267         }
  4268     }
  4269 
  4270   *p_nchars = produced_chars;
  4271   return dst;
  4272 }
  4273 
  4274 
  4275 /* Produce codes for designation and invocation to reset the graphic
  4276    planes and registers to initial state.  */
       /* In detail: first produce a shift-in unless graphic plane 0
          already invokes register 0.  Then, for each of the four graphic
          registers that has an initial charset (CODING_ISO_INITIAL >= 0)
          different from its current designation, produce a designation of
          that initial charset to the register.

          The expansion refers to a variable named `coding' that must be
          visible at the point of use, and declares its own block-local
          `reg' and `charset'.  Whatever else ENCODE_SHIFT_IN and
          ENCODE_DESIGNATION need from the enclosing scope is needed here
          as well.  */
  4277 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  4278   do {                                                                  \
  4279     int reg;                                                            \
  4280     struct charset *charset;                                            \
  4281                                                                         \
  4282     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
  4283       ENCODE_SHIFT_IN;                                                  \
  4284     for (reg = 0; reg < 4; reg++)                                       \
  4285       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
  4286           && (CODING_ISO_DESIGNATION (coding, reg)                      \
  4287               != CODING_ISO_INITIAL (coding, reg)))                     \
  4288         {                                                               \
  4289           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
  4290           ENCODE_DESIGNATION (charset, reg, coding);                    \
  4291         }                                                               \
  4292   } while (0)
  4293 
  4294 
  4295 /* Produce designation sequences of charsets in the line started from
  4296    CHARBUF to a place pointed by DST, and return the number of
  4297    produced bytes.  DST should not point directly into a buffer text
  4298    area, which may be relocated by the char_charset call.
  4299 
  4300    If the current block ends before any end-of-line, we may fail to
  4301    find all the necessary designations.  */
       /* CHARBUF_END is the end of the character buffer; scanning stops
          there, at the first newline, or once all four graphic registers
          have a candidate charset, whichever comes first.  */
  4302 
  4303 static ptrdiff_t
  4304 encode_designation_at_bol (struct coding_system *coding,
  4305                            int *charbuf, int *charbuf_end,
  4306                            unsigned char *dst)
  4307 {
  4308   unsigned char *orig = dst;
  4309   struct charset *charset;
  4310   /* Table of charsets to be designated to each graphic register.  */
  4311   int r[4];
  4312   int c, found = 0, reg;
         /* PRODUCED_CHARS and MULTIBYTEP are not referenced directly below;
            presumably ENCODE_DESIGNATION uses them -- confirm against the
            macro definition.  */
  4313   ptrdiff_t produced_chars = 0;
  4314   bool multibytep = coding->dst_multibyte;
  4315   Lisp_Object attrs;
  4316   Lisp_Object charset_list;
  4317 
  4318   attrs = CODING_ID_ATTRS (coding->id);
  4319   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
         /* The symbol `iso-2022' stands for the global ISO-2022 charset
            list.  */
  4320   if (EQ (charset_list, Qiso_2022))
  4321     charset_list = Viso_2022_charset_list;
  4322 
         /* -1 means no charset has been found for the register yet.  */
  4323   for (reg = 0; reg < 4; reg++)
  4324     r[reg] = -1;
  4325 
  4326   while (charbuf < charbuf_end && found < 4)
  4327     {
  4328       int id;
  4329 
  4330       c = *charbuf++;
  4331       if (c == '\n')
  4332         break;
  4333       charset = char_charset (c, charset_list, NULL);
  4334       id = CHARSET_ID (charset);
  4335       reg = CODING_ISO_REQUEST (coding, id);
             /* Only the first charset on the line that requests a given
                register is recorded for that register.  */
  4336       if (reg >= 0 && r[reg] < 0)
  4337         {
  4338           found++;
  4339           r[reg] = id;
  4340         }
  4341     }
  4342 
  4343   if (found)
  4344     {
             /* Designate only where the register does not already hold the
                recorded charset.  */
  4345       for (reg = 0; reg < 4; reg++)
  4346         if (r[reg] >= 0
  4347             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
  4348           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
  4349     }
  4350 
  4351   return dst - orig;
  4352 }
  4353 
  4354 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
       /* Encode the CODING->charbuf_used elements of CODING->charbuf and
          write the result to CODING->destination, starting at offset
          CODING->produced.  On return CODING->produced,
          CODING->produced_char and CODING_ISO_BOL (CODING) have been
          updated and the conversion result has been recorded as
          CODING_RESULT_SUCCESS.  The value returned at the end of the
          function is always 0.  */
  4355 
  4356 static bool
  4357 encode_coding_iso_2022 (struct coding_system *coding)
  4358 {
  4359   bool multibytep = coding->dst_multibyte;
  4360   int *charbuf = coding->charbuf;
  4361   int *charbuf_end = charbuf + coding->charbuf_used;
  4362   unsigned char *dst = coding->destination + coding->produced;
  4363   unsigned char *dst_end = coding->destination + coding->dst_bytes;
         /* Passed to ASSURE_DESTINATION before each element is handled.  */
  4364   int safe_room = 16;
         /* True when designations must be produced at the beginning of the
            next line: the coding system has the designate-at-bol flag and
            the saved beginning-of-line state is set.  */
  4365   bool bol_designation
  4366     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
  4367        && CODING_ISO_BOL (coding));
  4368   ptrdiff_t produced_chars = 0;
  4369   Lisp_Object attrs, eol_type, charset_list;
  4370   bool ascii_compatible;
  4371   int c;
         /* Charset ID taken from the most recent charset annotation, or -1
            if there is none or it is not in CHARSET_LIST.  */
  4372   int preferred_charset_id = -1;
  4373 
  4374   CODING_GET_INFO (coding, attrs, charset_list);
  4375   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
         /* A vector EOL type is treated as Unix here.  */
  4376   if (VECTORP (eol_type))
  4377     eol_type = Qunix;
  4378 
  4379   setup_iso_safe_charsets (attrs);
  4380   /* Charset list may have been changed.  */
  4381   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
  4382   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
  4383 
         /* ASCII may be emitted as is only if the coding system is marked
            ASCII compatible and uses neither designation nor locking
            shift.  */
  4384   ascii_compatible
  4385     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
  4386        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
  4387                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
  4388 
  4389   while (charbuf < charbuf_end)
  4390     {
  4391       ASSURE_DESTINATION (safe_room);
  4392 
  4393       if (bol_designation)
  4394         {
  4395           /* We have to produce designation sequences if any now.  */
  4396           unsigned char desig_buf[16];
  4397           ptrdiff_t nbytes;
  4398           ptrdiff_t offset;
  4399 
                 /* The sequences are built in DESIG_BUF rather than at DST.
                    If a charset map was loaded during the call and
                    coding_change_destination reports a nonzero offset, DST
                    and DST_END are shifted by it before the copy.  */
  4400           charset_map_loaded = 0;
  4401           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
  4402                                               desig_buf);
  4403           if (charset_map_loaded
  4404               && (offset = coding_change_destination (coding)))
  4405             {
  4406               dst += offset;
  4407               dst_end += offset;
  4408             }
  4409           memcpy (dst, desig_buf, nbytes);
  4410           dst += nbytes;
  4411           /* We are sure that designation sequences are all ASCII bytes.  */
  4412           produced_chars += nbytes;
  4413           bol_designation = 0;
  4414           ASSURE_DESTINATION (safe_room);
  4415         }
  4416 
  4417       c = *charbuf++;
  4418 
  4419       if (c < 0)
  4420         {
  4421           /* Handle an annotation.  */
                 /* CHARBUF now points just past C; the element there
                    selects the annotation type.  */
  4422           switch (*charbuf)
  4423             {
  4424             case CODING_ANNOTATE_COMPOSITION_MASK:
  4425               /* Not yet implemented.  */
  4426               break;
  4427             case CODING_ANNOTATE_CHARSET_MASK:
  4428               preferred_charset_id = charbuf[2];
                     /* Ignore a preferred charset that this coding system
                        does not list.  */
  4429               if (preferred_charset_id >= 0
  4430                   && NILP (Fmemq (make_fixnum (preferred_charset_id),
  4431                                   charset_list)))
  4432                 preferred_charset_id = -1;
  4433               break;
  4434             default:
  4435               emacs_abort ();
  4436             }
                 /* Skip -C - 1 further elements; together with C itself,
                    already consumed above, that is -C elements, presumably
                    the whole annotation.  */
  4437           charbuf += -c - 1;
  4438           continue;
  4439         }
  4440 
  4441       /* Now encode the character C.  */
  4442       if (c < 0x20 || c == 0x7F)
  4443         {
                 /* Control character.  A newline, or a carriage return when
                    the EOL type is Mac, is treated as end of line.  */
  4444           if (c == '\n'
  4445               || (c == '\r' && EQ (eol_type, Qmac)))
  4446             {
  4447               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
  4448                 ENCODE_RESET_PLANE_AND_REGISTER ();
  4449               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
  4450                 {
  4451                   int i;
  4452 
                         /* Only the recorded designations are reset here;
                            no bytes are produced by this loop.  */
  4453                   for (i = 0; i < 4; i++)
  4454                     CODING_ISO_DESIGNATION (coding, i)
  4455                       = CODING_ISO_INITIAL (coding, i);
  4456                 }
  4457               bol_designation = ((CODING_ISO_FLAGS (coding)
  4458                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
  4459                                  != 0);
  4460             }
  4461           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
  4462             ENCODE_RESET_PLANE_AND_REGISTER ();
  4463           EMIT_ONE_ASCII_BYTE (c);
  4464         }
  4465       else if (ASCII_CHAR_P (c))
  4466         {
  4467           if (ascii_compatible)
  4468             EMIT_ONE_ASCII_BYTE (c);
  4469           else
  4470             {
  4471               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
  4472               ENCODE_ISO_CHARACTER (charset, c);
  4473             }
  4474         }
  4475       else if (CHAR_BYTE8_P (c))
  4476         {
                 /* A raw 8-bit byte is emitted as the byte itself.  */
  4477           c = CHAR_TO_BYTE8 (c);
  4478           EMIT_ONE_BYTE (c);
  4479         }
  4480       else
  4481         {
  4482           struct charset *charset;
  4483 
                 /* Try the preferred charset first; if it does not contain
                    C, fall back to searching CHARSET_LIST.  */
  4484           if (preferred_charset_id >= 0)
  4485             {
  4486               bool result;
  4487 
  4488               charset = CHARSET_FROM_ID (preferred_charset_id);
  4489               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
  4490               if (! result)
  4491                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
  4492                                      NULL, charset);
  4493             }
  4494           else
  4495             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
  4496                                  NULL, charset);
  4497           if (!charset)
  4498             {
                     /* No charset was found for C.  In safe-encoding mode
                        substitute CODING_INHIBIT_CHARACTER_SUBSTITUTION in
                        the ASCII charset; otherwise encode the coding
                        system's default character instead.  */
  4499               if (coding->mode & CODING_MODE_SAFE_ENCODING)
  4500                 {
  4501                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
  4502                   charset = CHARSET_FROM_ID (charset_ascii);
  4503                 }
  4504               else
  4505                 {
  4506                   c = coding->default_char;
  4507                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
  4508                                        charset_list, NULL, charset);
  4509                 }
  4510             }
  4511           ENCODE_ISO_CHARACTER (charset, c);
  4512         }
  4513     }
  4514 
         /* At the end of the last block, reset planes and registers if the
            coding system resets them at end of line.  */
  4515   if (coding->mode & CODING_MODE_LAST_BLOCK
  4516       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
  4517     {
  4518       ASSURE_DESTINATION (safe_room);
  4519       ENCODE_RESET_PLANE_AND_REGISTER ();
  4520     }
  4521   record_conversion_result (coding, CODING_RESULT_SUCCESS);
         /* Save the beginning-of-line state for the next call.  */
  4522   CODING_ISO_BOL (coding) = bol_designation;
  4523   coding->produced_char += produced_chars;
  4524   coding->produced = dst - coding->destination;
  4525   return 0;
  4526 }
  4527 
  4528 
  4529 /*** 8,9. SJIS and BIG5 handlers ***/
  4530 
  4531 /* Although SJIS and BIG5 are not ISO coding systems, they are used
  4532    quite widely.  So, for the moment, Emacs supports them in the bare
  4533    C code.  But, in the future, they may be supported only by CCL.  */
  4534 
  4535 /* SJIS is a coding system encoding three character sets: ASCII, right
  4536    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
  4537    as is.  A character of charset katakana-jisx0201 is encoded by
  4538    "position-code + 0x80".  A character of charset japanese-jisx0208
  4539    is encoded in two bytes, but the two position-codes are divided and
  4540    shifted so that they fit in the ranges below.
  4541 
  4542    --- CODE RANGE of SJIS ---
  4543    (character set)      (range)
  4544    ASCII                0x00 .. 0x7F
  4545    KATAKANA-JISX0201    0xA0 .. 0xDF
  4546    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
  4547             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
  4548    -------------------------------
  4549 
  4550 */
  4551 
  4552 /* BIG5 is a coding system encoding two character sets: ASCII and
  4553    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
  4554    character set, and each of its characters is encoded in two bytes.
  4555 
  4556    --- CODE RANGE of BIG5 ---
  4557    (character set)      (range)
  4558    ASCII                0x00 .. 0x7F
  4559    Big5 (1st byte)      0xA1 .. 0xFE
  4560         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
  4561    --------------------------
  4562 
  4563   */
  4564 
  4565 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
  4566    Return true if a text is encoded in SJIS.  */
       /* More precisely: DETECT_INFO->checked always gets
          CATEGORY_MASK_SJIS.  If a byte sequence that is not valid SJIS is
          seen, or the last block ends in the middle of a character,
          CATEGORY_MASK_SJIS is added to DETECT_INFO->rejected and 0 is
          returned.  Otherwise 1 is returned, and CATEGORY_MASK_SJIS is
          added to DETECT_INFO->found only if at least one non-ASCII SJIS
          code was seen.  */
  4567 
  4568 static bool
  4569 detect_coding_sjis (struct coding_system *coding,
  4570                     struct coding_detection_info *detect_info)
  4571 {
  4572   const unsigned char *src = coding->source, *src_base;
  4573   const unsigned char *src_end = coding->source + coding->src_bytes;
         /* SRC_END, MULTIBYTEP and CONSUMED_CHARS are not referenced
            directly below; presumably ONE_MORE_BYTE uses them -- confirm
            against the macro definition.  */
  4574   bool multibytep = coding->src_multibyte;
  4575   ptrdiff_t consumed_chars = 0;
  4576   int found = 0;
  4577   int c;
  4578   Lisp_Object attrs, charset_list;
  4579   int max_first_byte_of_2_byte_code;
  4580 
  4581   CODING_GET_INFO (coding, attrs, charset_list);
         /* With at most three charsets in the list, two-byte codes may
            start with bytes up to 0xEF; with more, up to 0xFC.  */
  4582   max_first_byte_of_2_byte_code = list_length (charset_list) <= 3 ? 0xEF : 0xFC;
  4583 
  4584   detect_info->checked |= CATEGORY_MASK_SJIS;
  4585   /* A coding system of this category is always ASCII compatible.  */
  4586   src += coding->head_ascii;
  4587 
  4588   while (1)
  4589     {
             /* Remember where the current character starts, so that a
                truncated character can be recognized at no_more_source.  */
  4590       src_base = src;
  4591       ONE_MORE_BYTE (c);
  4592       if (c < 0x80)
  4593         continue;
  4594       if ((c >= 0x81 && c <= 0x9F)
  4595           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
  4596         {
                 /* First byte of a two-byte code; check the second.  */
  4597           ONE_MORE_BYTE (c);
  4598           if (c < 0x40 || c == 0x7F || c > 0xFC)
  4599             break;
  4600           found = CATEGORY_MASK_SJIS;
  4601         }
  4602       else if (c >= 0xA0 && c < 0xE0)
               /* One-byte code in the range 0xA0..0xDF.  */
  4603         found = CATEGORY_MASK_SJIS;
  4604       else
  4605         break;
  4606     }
         /* Reached only by `break', i.e. on an invalid byte.  */
  4607   detect_info->rejected |= CATEGORY_MASK_SJIS;
  4608   return 0;
  4609 
         /* No statement in this function jumps here explicitly; presumably
            ONE_MORE_BYTE does so when the source is exhausted.  */
  4610  no_more_source:
         /* SRC_BASE < SRC means the source ended after the first byte of
            the current character was read; in the last block that is a
            truncated character.  */
  4611   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
  4612     {
  4613       detect_info->rejected |= CATEGORY_MASK_SJIS;
  4614       return 0;
  4615     }
  4616   detect_info->found |= found;
  4617   return 1;
  4618 }
  4619 
  4620 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
  4621    Return true if a text is encoded in BIG5.  */
       /* More precisely: DETECT_INFO->checked always gets
          CATEGORY_MASK_BIG5.  If an invalid first or second byte is seen,
          or the last block ends in the middle of a character,
          CATEGORY_MASK_BIG5 is added to DETECT_INFO->rejected and 0 is
          returned.  Otherwise 1 is returned, and CATEGORY_MASK_BIG5 is
          added to DETECT_INFO->found only if at least one two-byte code
          was seen.  */
  4622 
  4623 static bool
  4624 detect_coding_big5 (struct coding_system *coding,
  4625                     struct coding_detection_info *detect_info)
  4626 {
  4627   const unsigned char *src = coding->source, *src_base;
  4628   const unsigned char *src_end = coding->source + coding->src_bytes;
         /* SRC_END, MULTIBYTEP and CONSUMED_CHARS are not referenced
            directly below; presumably ONE_MORE_BYTE uses them -- confirm
            against the macro definition.  */
  4629   bool multibytep = coding->src_multibyte;
  4630   ptrdiff_t consumed_chars = 0;
  4631   int found = 0;
  4632   int c;
  4633 
  4634   detect_info->checked |= CATEGORY_MASK_BIG5;
  4635   /* A coding system of this category is always ASCII compatible.  */
  4636   src += coding->head_ascii;
  4637 
  4638   while (1)
  4639     {
             /* Remember where the current character starts, so that a
                truncated character can be recognized at no_more_source.  */
  4640       src_base = src;
  4641       ONE_MORE_BYTE (c);
  4642       if (c < 0x80)
  4643         continue;
  4644       if (c >= 0xA1)
  4645         {
  4646           ONE_MORE_BYTE (c);
                 /* An invalid second byte must reject the category just as
                    an invalid first byte does, so leave the loop and share
                    the rejection code below instead of returning without
                    recording the rejection.  */
  4647           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
  4648             break;
  4649           found = CATEGORY_MASK_BIG5;
  4650         }
  4651       else
  4652         break;
  4653     }
         /* Reached only by `break', i.e. on an invalid byte.  */
  4654   detect_info->rejected |= CATEGORY_MASK_BIG5;
  4655   return 0;
  4656 
         /* No statement in this function jumps here explicitly; presumably
            ONE_MORE_BYTE does so when the source is exhausted.  */
  4657  no_more_source:
         /* SRC_BASE < SRC means the source ended after the first byte of
            the current character was read; in the last block that is a
            truncated character.  */
  4658   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
  4659     {
  4660       detect_info->rejected |= CATEGORY_MASK_BIG5;
  4661       return 0;
  4662     }
  4663   detect_info->found |= found;
  4664   return 1;
  4665 }
  4666 
  4667 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
       /* Decode SJIS bytes from CODING->source, starting at offset
          CODING->consumed, into character codes appended to
          CODING->charbuf.  A charset annotation is added whenever a run
          of characters of one non-ASCII charset ends.  On return
          CODING->consumed, CODING->consumed_char and CODING->charbuf_used
          are updated.  */
  4668 
  4669 static void
  4670 decode_coding_sjis (struct coding_system *coding)
  4671 {
  4672   const unsigned char *src = coding->source + coding->consumed;
  4673   const unsigned char *src_end = coding->source + coding->src_bytes;
  4674   const unsigned char *src_base;
  4675   int *charbuf = coding->charbuf + coding->charbuf_used;
  4676   /* We may produce one charset annotation in one loop and one more at
  4677      the end.  */
  4678   int *charbuf_end
  4679     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  4680   ptrdiff_t consumed_chars = 0, consumed_chars_base;
  4681   bool multibytep = coding->src_multibyte;
  4682   struct charset *charset_roman, *charset_kanji, *charset_kana;
  4683   struct charset *charset_kanji2;
  4684   Lisp_Object attrs, charset_list, val;
         /* CHAR_OFFSET counts produced characters.  LAST_ID is the charset
            of the current run and LAST_OFFSET is the CHAR_OFFSET at which
            that run started.  */
  4685   ptrdiff_t char_offset = coding->produced_char;
  4686   ptrdiff_t last_offset = char_offset;
  4687   int last_id = charset_ascii;
  4688   bool eol_dos
  4689     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
         /* Byte read ahead after a CR when EOL_DOS is set, or -1.  */
  4690   int byte_after_cr = -1;
  4691 
  4692   CODING_GET_INFO (coding, attrs, charset_list);
  4693 
         /* The charset list is read as: roman, kana, kanji, and optionally
            a second kanji charset.  */
  4694   val = charset_list;
  4695   charset_roman = CHARSET_FROM_ID (XFIXNUM (XCAR (val))), val = XCDR (val);
  4696   charset_kana = CHARSET_FROM_ID (XFIXNUM (XCAR (val))), val = XCDR (val);
  4697   charset_kanji = CHARSET_FROM_ID (XFIXNUM (XCAR (val))), val = XCDR (val);
  4698   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XFIXNUM (XCAR (val)));
  4699 
  4700   while (1)
  4701     {
  4702       int c, c1;
  4703       struct charset *charset;
  4704 
             /* Record the state at the start of this character; the values
                stored into CODING at the end come from these.  */
  4705       src_base = src;
  4706       consumed_chars_base = consumed_chars;
  4707 
  4708       if (charbuf >= charbuf_end)
  4709         {
                 /* Output buffer is full.  If a byte was read ahead after
                    a CR, back up SRC_BASE by one so that byte is not
                    counted as consumed.  */
  4710           if (byte_after_cr >= 0)
  4711             src_base--;
  4712           break;
  4713         }
  4714 
  4715       if (byte_after_cr >= 0)
  4716         c = byte_after_cr, byte_after_cr = -1;
  4717       else
  4718         ONE_MORE_BYTE (c);
             /* NOTE(review): the meaning of a negative value from
                ONE_MORE_BYTE is not visible here; it is handled at
                invalid_code by storing -C.  */
  4719       if (c < 0)
  4720         goto invalid_code;
  4721       if (c < 0x80)
  4722         {
  4723           if (eol_dos && c == '\r')
  4724             ONE_MORE_BYTE (byte_after_cr);
  4725           charset = charset_roman;
  4726         }
  4727       else if (c == 0x80 || c == 0xA0)
  4728         goto invalid_code;
  4729       else if (c >= 0xA1 && c <= 0xDF)
  4730         {
  4731           /* SJIS -> JISX0201-Kana */
  4732           c &= 0x7F;
  4733           charset = charset_kana;
  4734         }
  4735       else if (c <= 0xEF)
  4736         {
  4737           /* SJIS -> JISX0208 */
                 /* Here C is in 0x81..0x9F or 0xE0..0xEF.  */
  4738           ONE_MORE_BYTE (c1);
  4739           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
  4740             goto invalid_code;
  4741           c = (c << 8) | c1;
  4742           SJIS_TO_JIS (c);
  4743           charset = charset_kanji;
  4744         }
  4745       else if (c <= 0xFC && charset_kanji2)
  4746         {
  4747           /* SJIS -> JISX0213-2 */
                 /* Here C is in 0xF0..0xFC; accepted only when the charset
                    list has a fourth charset.  */
  4748           ONE_MORE_BYTE (c1);
  4749           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
  4750             goto invalid_code;
  4751           c = (c << 8) | c1;
  4752           SJIS_TO_JIS2 (c);
  4753           charset = charset_kanji2;
  4754         }
  4755       else
  4756         goto invalid_code;
             /* On a switch to a different non-ASCII charset, close the
                previous non-ASCII run with an annotation and start a new
                run.  */
  4757       if (charset->id != charset_ascii
  4758           && last_id != charset->id)
  4759         {
  4760           if (last_id != charset_ascii)
  4761             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  4762           last_id = charset->id;
  4763           last_offset = char_offset;
  4764         }
  4765       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
  4766       *charbuf++ = c;
  4767       char_offset++;
  4768       continue;
  4769 
  4770     invalid_code:
             /* Rewind to the start of the offending sequence, re-read just
                its first byte, and store that as a raw 8-bit character (or
                as -C when the value read is negative).  */
  4771       src = src_base;
  4772       consumed_chars = consumed_chars_base;
  4773       ONE_MORE_BYTE (c);
  4774       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
  4775       char_offset++;
  4776     }
  4777 
         /* Reached by `break' above and by falling out of the loop;
            presumably also the target ONE_MORE_BYTE jumps to when the
            source is exhausted.  */
  4778  no_more_source:
         /* Close a still-open non-ASCII run.  */
  4779   if (last_id != charset_ascii)
  4780     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  4781   coding->consumed_char += consumed_chars_base;
  4782   coding->consumed = src_base - coding->source;
  4783   coding->charbuf_used = charbuf - coding->charbuf;
  4784 }
  4785 
       /* Decode BIG5 bytes from CODING->source, starting at offset
          CODING->consumed, into character codes appended to
          CODING->charbuf.  A charset annotation is added whenever a run
          of characters of one non-ASCII charset ends.  On return
          CODING->consumed, CODING->consumed_char and CODING->charbuf_used
          are updated.  */
  4786 static void
  4787 decode_coding_big5 (struct coding_system *coding)
  4788 {
  4789   const unsigned char *src = coding->source + coding->consumed;
  4790   const unsigned char *src_end = coding->source + coding->src_bytes;
  4791   const unsigned char *src_base;
  4792   int *charbuf = coding->charbuf + coding->charbuf_used;
  4793   /* We may produce one charset annotation in one loop and one more at
  4794      the end.  */
  4795   int *charbuf_end
  4796     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  4797   ptrdiff_t consumed_chars = 0, consumed_chars_base;
  4798   bool multibytep = coding->src_multibyte;
  4799   struct charset *charset_roman, *charset_big5;
  4800   Lisp_Object attrs, charset_list, val;
         /* CHAR_OFFSET counts produced characters.  LAST_ID is the charset
            of the current run and LAST_OFFSET is the CHAR_OFFSET at which
            that run started.  */
  4801   ptrdiff_t char_offset = coding->produced_char;
  4802   ptrdiff_t last_offset = char_offset;
  4803   int last_id = charset_ascii;
  4804   bool eol_dos
  4805     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
         /* Byte read ahead after a CR when EOL_DOS is set, or -1.  */
  4806   int byte_after_cr = -1;
  4807 
  4808   CODING_GET_INFO (coding, attrs, charset_list);
         /* The charset list is read as: roman, then Big5.  */
  4809   val = charset_list;
  4810   charset_roman = CHARSET_FROM_ID (XFIXNUM (XCAR (val))), val = XCDR (val);
  4811   charset_big5 = CHARSET_FROM_ID (XFIXNUM (XCAR (val)));
  4812 
  4813   while (1)
  4814     {
  4815       int c, c1;
  4816       struct charset *charset;
  4817 
             /* Record the state at the start of this character; the values
                stored into CODING at the end come from these.  */
  4818       src_base = src;
  4819       consumed_chars_base = consumed_chars;
  4820 
  4821       if (charbuf >= charbuf_end)
  4822         {
                 /* Output buffer is full.  If a byte was read ahead after
                    a CR, back up SRC_BASE by one so that byte is not
                    counted as consumed.  */
  4823           if (byte_after_cr >= 0)
  4824             src_base--;
  4825           break;
  4826         }
  4827 
  4828       if (byte_after_cr >= 0)
  4829         c = byte_after_cr, byte_after_cr = -1;
  4830       else
  4831         ONE_MORE_BYTE (c);
  4832 
             /* NOTE(review): the meaning of a negative value from
                ONE_MORE_BYTE is not visible here; it is handled at
                invalid_code by storing -C.  */
  4833       if (c < 0)
  4834         goto invalid_code;
  4835       if (c < 0x80)
  4836         {
  4837           if (eol_dos && c == '\r')
  4838             ONE_MORE_BYTE (byte_after_cr);
  4839           charset = charset_roman;
  4840         }
  4841       else
  4842         {
  4843           /* BIG5 -> Big5 */
                 /* First byte must be in 0xA1..0xFE; second byte must be
                    in 0x40..0x7E or 0xA1..0xFE.  */
  4844           if (c < 0xA1 || c > 0xFE)
  4845             goto invalid_code;
  4846           ONE_MORE_BYTE (c1);
  4847           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
  4848             goto invalid_code;
  4849           c = c << 8 | c1;
  4850           charset = charset_big5;
  4851         }
             /* On a switch to a different non-ASCII charset, close the
                previous non-ASCII run with an annotation and start a new
                run.  */
  4852       if (charset->id != charset_ascii
  4853           && last_id != charset->id)
  4854         {
  4855           if (last_id != charset_ascii)
  4856             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  4857           last_id = charset->id;
  4858           last_offset = char_offset;
  4859         }
  4860       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
  4861       *charbuf++ = c;
  4862       char_offset++;
  4863       continue;
  4864 
  4865     invalid_code:
             /* Rewind to the start of the offending sequence, re-read just
                its first byte, and store that as a raw 8-bit character (or
                as -C when the value read is negative).  */
  4866       src = src_base;
  4867       consumed_chars = consumed_chars_base;
  4868       ONE_MORE_BYTE (c);
  4869       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
  4870       char_offset++;
  4871     }
  4872 
         /* Reached by `break' above and by falling out of the loop;
            presumably also the target ONE_MORE_BYTE jumps to when the
            source is exhausted.  */
  4873  no_more_source:
         /* Close a still-open non-ASCII run.  */
  4874   if (last_id != charset_ascii)
  4875     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  4876   coding->consumed_char += consumed_chars_base;
  4877   coding->consumed = src_base - coding->source;
  4878   coding->charbuf_used = charbuf - coding->charbuf;
  4879 }
  4880 
  4881 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
  4882    Encode the characters in CODING->charbuf as SJIS and write the
  4883    result to CODING->destination.  The second, third and (optional)
  4884    fourth elements of the coding system's charset list are used as
  4885    the kana, kanji and second kanji charsets.  The value returned at
  4886    the end of the function is always 0.  */
  4887 
  4888 static bool
  4889 encode_coding_sjis (struct coding_system *coding)
  4890 {
  4891   bool multibytep = coding->dst_multibyte;
  4892   int *charbuf = coding->charbuf;
  4893   int *charbuf_end = charbuf + coding->charbuf_used;
  4894   unsigned char *dst = coding->destination + coding->produced;
  4895   unsigned char *dst_end = coding->destination + coding->dst_bytes;
         /* Passed to ASSURE_DESTINATION before each character.  */
  4896   int safe_room = 4;
  4897   ptrdiff_t produced_chars = 0;
  4898   Lisp_Object attrs, charset_list, val;
  4899   bool ascii_compatible;
  4900   struct charset *charset_kanji, *charset_kana;
  4901   struct charset *charset_kanji2;
  4902   int c;
  4903 
  4904   CODING_GET_INFO (coding, attrs, charset_list);
         /* Skip the first element of the charset list; the next ones are
            the kana and kanji charsets and, optionally, a second kanji
            charset.  */
  4905   val = XCDR (charset_list);
  4906   charset_kana = CHARSET_FROM_ID (XFIXNUM (XCAR (val))), val = XCDR (val);
  4907   charset_kanji = CHARSET_FROM_ID (XFIXNUM (XCAR (val))), val = XCDR (val);
  4908   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XFIXNUM (XCAR (val)));
  4909 
  4910   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
  4911 
  4912   while (charbuf < charbuf_end)
  4913     {
  4914       ASSURE_DESTINATION (safe_room);
  4915       c = *charbuf++;
  4916       /* Now encode the character C.  */
  4917       if (ASCII_CHAR_P (c) && ascii_compatible)
  4918         EMIT_ONE_ASCII_BYTE (c);
  4919       else if (CHAR_BYTE8_P (c))
  4920         {
                 /* A raw 8-bit byte is emitted as the byte itself.  */
  4921           c = CHAR_TO_BYTE8 (c);
  4922           EMIT_ONE_BYTE (c);
  4923         }
  4924       else
  4925         {
  4926           unsigned code;
  4927           struct charset *charset;
  4928           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
  4929                                &code, charset);
  4930 
  4931           if (!charset)
  4932             {
                     /* No charset was found for C.  In safe-encoding mode
                        use CODING_INHIBIT_CHARACTER_SUBSTITUTION as the
                        code in the ASCII charset; otherwise encode the
                        coding system's default character instead.  */
  4933               if (coding->mode & CODING_MODE_SAFE_ENCODING)
  4934                 {
  4935                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
  4936                   charset = CHARSET_FROM_ID (charset_ascii);
  4937                 }
  4938               else
  4939                 {
  4940                   c = coding->default_char;
  4941                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
  4942                                        charset_list, &code, charset);
  4943                 }
  4944             }
  4945           if (code == CHARSET_INVALID_CODE (charset))
  4946             emacs_abort ();
  4947           if (charset == charset_kanji)
  4948             {
  4949               int c1, c2;
                     /* Convert the code in place, then emit its high and
                        low bytes.  */
  4950               JIS_TO_SJIS (code);
  4951               c1 = code >> 8, c2 = code & 0xFF;
  4952               EMIT_TWO_BYTES (c1, c2);
  4953             }
  4954           else if (charset == charset_kana)
  4955             EMIT_ONE_BYTE (code | 0x80);
  4956           else if (charset_kanji2 && charset == charset_kanji2)
  4957             {
  4958               int c1, c2;
  4959 
                     /* Only codes whose high byte is one of the values
                        tested below are converted and emitted as two
                        bytes; any other code is emitted as a single byte
                        masked to 7 bits.  */
  4960               c1 = code >> 8;
  4961               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
  4962                   || c1 == 0x28
  4963                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
  4964                 {
  4965                   JIS_TO_SJIS2 (code);
  4966                   c1 = code >> 8, c2 = code & 0xFF;
  4967                   EMIT_TWO_BYTES (c1, c2);
  4968                 }
  4969               else
  4970                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
  4971             }
  4972           else
                   /* Any other charset: emit the low 7 bits of the code.  */
  4973             EMIT_ONE_ASCII_BYTE (code & 0x7F);
  4974         }
  4975     }
  4976   record_conversion_result (coding, CODING_RESULT_SUCCESS);
  4977   coding->produced_char += produced_chars;
  4978   coding->produced = dst - coding->destination;
  4979   return 0;
  4980 }
  4981 
       /* Encode the characters in CODING->charbuf as BIG5 and write the
          result to CODING->destination, starting at offset
          CODING->produced.  The second element of the coding system's
          charset list is used as the Big5 charset.  CODING->produced and
          CODING->produced_char are updated.  The value returned at the
          end of the function is always 0.  */
  4982 static bool
  4983 encode_coding_big5 (struct coding_system *coding)
  4984 {
  4985   bool multibytep = coding->dst_multibyte;
  4986   int *charbuf = coding->charbuf;
  4987   int *charbuf_end = charbuf + coding->charbuf_used;
  4988   unsigned char *dst = coding->destination + coding->produced;
  4989   unsigned char *dst_end = coding->destination + coding->dst_bytes;
         /* Passed to ASSURE_DESTINATION before each character.  */
  4990   int safe_room = 4;
  4991   ptrdiff_t produced_chars = 0;
  4992   Lisp_Object attrs, charset_list, val;
  4993   bool ascii_compatible;
  4994   struct charset *charset_big5;
  4995   int c;
  4996 
  4997   CODING_GET_INFO (coding, attrs, charset_list);
         /* Skip the first element of the charset list; the next one is the
            Big5 charset.  */
  4998   val = XCDR (charset_list);
  4999   charset_big5 = CHARSET_FROM_ID (XFIXNUM (XCAR (val)));
  5000   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
  5001 
  5002   while (charbuf < charbuf_end)
  5003     {
  5004       ASSURE_DESTINATION (safe_room);
  5005       c = *charbuf++;
  5006       /* Now encode the character C.  */
  5007       if (ASCII_CHAR_P (c) && ascii_compatible)
  5008         EMIT_ONE_ASCII_BYTE (c);
  5009       else if (CHAR_BYTE8_P (c))
  5010         {
                 /* A raw 8-bit byte is emitted as the byte itself.  */
  5011           c = CHAR_TO_BYTE8 (c);
  5012           EMIT_ONE_BYTE (c);
  5013         }
  5014       else
  5015         {
  5016           unsigned code;
  5017           struct charset *charset;
  5018           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
  5019                                &code, charset);
  5020 
  5021           if (! charset)
  5022             {
                     /* No charset was found for C.  In safe-encoding mode
                        use CODING_INHIBIT_CHARACTER_SUBSTITUTION as the
                        code in the ASCII charset; otherwise encode the
                        coding system's default character instead.  */
  5023               if (coding->mode & CODING_MODE_SAFE_ENCODING)
  5024                 {
  5025                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
  5026                   charset = CHARSET_FROM_ID (charset_ascii);
  5027                 }
  5028               else
  5029                 {
  5030                   c = coding->default_char;
  5031                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
  5032                                        charset_list, &code, charset);
  5033                 }
  5034             }
  5035           if (code == CHARSET_INVALID_CODE (charset))
  5036             emacs_abort ();
  5037           if (charset == charset_big5)
  5038             {
  5039               int c1, c2;
  5040 
                     /* Emit the high byte and then the low byte of the
                        code.  */
  5041               c1 = code >> 8, c2 = code & 0xFF;
  5042               EMIT_TWO_BYTES (c1, c2);
  5043             }
  5044           else
                   /* Any other charset: emit the low 7 bits of the code.  */
  5045             EMIT_ONE_ASCII_BYTE (code & 0x7F);
  5046         }
  5047     }
  5048   record_conversion_result (coding, CODING_RESULT_SUCCESS);
  5049   coding->produced_char += produced_chars;
  5050   coding->produced = dst - coding->destination;
  5051   return 0;
  5052 }
  5053 
  5054 
  5055 /*** 10. CCL handlers ***/
  5056 
  5057 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
  5058    Return true if a text is encoded in a coding system of which
  5059    encoder/decoder are written in CCL program.  */
       /* More precisely: DETECT_INFO->checked always gets
          CATEGORY_MASK_CCL.  Each source byte is looked up in the table of
          valid bytes of the coding system registered for the CCL category.
          A byte whose entry is zero (or a negative value read from the
          source) adds CATEGORY_MASK_CCL to DETECT_INFO->rejected and makes
          the function return 0.  If the source is exhausted first, 1 is
          returned, and CATEGORY_MASK_CCL is added to DETECT_INFO->found
          only if some byte had an entry greater than 1.  */
  5060 
  5061 static bool
  5062 detect_coding_ccl (struct coding_system *coding,
  5063                    struct coding_detection_info *detect_info)
  5064 {
         /* The source range, multibyteness and HEAD_ASCII are taken from
            the CODING passed in, before CODING is redirected below.  */
  5065   const unsigned char *src = coding->source, *src_base;
  5066   const unsigned char *src_end = coding->source + coding->src_bytes;
  5067   bool multibytep = coding->src_multibyte;
  5068   ptrdiff_t consumed_chars = 0;
  5069   int found = 0;
  5070   unsigned char *valids;
  5071   ptrdiff_t head_ascii = coding->head_ascii;
  5072   Lisp_Object attrs;
  5073 
  5074   detect_info->checked |= CATEGORY_MASK_CCL;
  5075 
         /* From here on CODING refers to the coding system of the CCL
            category, not to the argument.  */
  5076   coding = &coding_categories[coding_category_ccl];
  5077   valids = CODING_CCL_VALIDS (coding);
  5078   attrs = CODING_ID_ATTRS (coding->id);
         /* The leading ASCII part is skipped only for an ASCII-compatible
            coding system.  */
  5079   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
  5080     src += head_ascii;
  5081 
  5082   while (1)
  5083     {
  5084       int c;
  5085 
             /* SRC_BASE is assigned but not read in this function;
                presumably ONE_MORE_BYTE needs it -- confirm against the
                macro definition.  */
  5086       src_base = src;
  5087       ONE_MORE_BYTE (c);
  5088       if (c < 0 || ! valids[c])
  5089         break;
  5090       if ((valids[c] > 1))
  5091         found = CATEGORY_MASK_CCL;
  5092     }
         /* Reached only by `break', i.e. on an invalid byte.  */
  5093   detect_info->rejected |= CATEGORY_MASK_CCL;
  5094   return 0;
  5095 
         /* No statement in this function jumps here explicitly; presumably
            ONE_MORE_BYTE does so when the source is exhausted.  */
  5096  no_more_source:
  5097   detect_info->found |= found;
  5098   return 1;
  5099 }
  5100 
  5101 static void
  5102 decode_coding_ccl (struct coding_system *coding)
  5103 {
  5104   const unsigned char *src = coding->source + coding->consumed;
  5105   const unsigned char *src_end = coding->source + coding->src_bytes;
  5106   int *charbuf = coding->charbuf + coding->charbuf_used;
  5107   int *charbuf_end = coding->charbuf + coding->charbuf_size;
  5108   ptrdiff_t consumed_chars = 0;
  5109   bool multibytep = coding->src_multibyte;
  5110   struct ccl_program *ccl = &coding->spec.ccl->ccl;
  5111   int source_charbuf[1024];
  5112   int source_byteidx[1025];
  5113   Lisp_Object attrs, charset_list;
  5114 
  5115   CODING_GET_INFO (coding, attrs, charset_list);
  5116 
  5117   while (1)
  5118     {
  5119       const unsigned char *p = src;
  5120       ptrdiff_t offset;
  5121       int i = 0;
  5122 
  5123       if (multibytep)
  5124         {
  5125           while (i < 1024 && p < src_end)
  5126             {
  5127               source_byteidx[i] = p - src;
  5128               source_charbuf[i++] = string_char_advance (&p);
  5129             }
  5130           source_byteidx[i] = p - src;
  5131         }
  5132       else
  5133         while (i < 1024 && p < src_end)
  5134           source_charbuf[i++] = *p++;
  5135 
  5136       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
  5137         ccl->last_block = true;
  5138       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
  5139       charset_map_loaded = 0;
  5140       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
  5141                   charset_list);
  5142       if (charset_map_loaded
  5143           && (offset = coding_change_source (coding)))
  5144         {
  5145           p += offset;
  5146           src += offset;
  5147           src_end += offset;
  5148         }
  5149       charbuf += ccl->produced;
  5150       if (multibytep)
  5151         src += source_byteidx[ccl->consumed];
  5152       else
  5153         src += ccl->consumed;
  5154       consumed_chars += ccl->consumed;
  5155       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
  5156         break;
  5157     }
  5158 
  5159   switch (ccl->status)
  5160     {
  5161     case CCL_STAT_SUSPEND_BY_SRC:
  5162       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
  5163       break;
  5164     case CCL_STAT_SUSPEND_BY_DST:
  5165       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
  5166       break;
  5167     case CCL_STAT_QUIT:
  5168     case CCL_STAT_INVALID_CMD:
  5169       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
  5170       break;
  5171     default:
  5172       record_conversion_result (coding, CODING_RESULT_SUCCESS);
  5173       break;
  5174     }
  5175   coding->consumed_char += consumed_chars;
  5176   coding->consumed = src - coding->source;
  5177   coding->charbuf_used = charbuf - coding->charbuf;
  5178 }
  5179 
  5180 static bool
  5181 encode_coding_ccl (struct coding_system *coding)
  5182 {
  5183   struct ccl_program *ccl = &coding->spec.ccl->ccl;
  5184   bool multibytep = coding->dst_multibyte;
  5185   int *charbuf = coding->charbuf;
  5186   int *charbuf_end = charbuf + coding->charbuf_used;
  5187   unsigned char *dst = coding->destination + coding->produced;
  5188   unsigned char *dst_end = coding->destination + coding->dst_bytes;
  5189   int destination_charbuf[1024];
  5190   ptrdiff_t produced_chars = 0;
  5191   int i;
  5192   Lisp_Object attrs, charset_list;
  5193 
  5194   CODING_GET_INFO (coding, attrs, charset_list);
  5195   if (coding->consumed_char == coding->src_chars
  5196       && coding->mode & CODING_MODE_LAST_BLOCK)
  5197     ccl->last_block = true;
  5198 
  5199   do
  5200     {
  5201       ptrdiff_t offset;
  5202 
  5203       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
  5204       charset_map_loaded = 0;
  5205       ccl_driver (ccl, charbuf, destination_charbuf,
  5206                   charbuf_end - charbuf, 1024, charset_list);
  5207       if (charset_map_loaded
  5208           && (offset = coding_change_destination (coding)))
  5209         dst += offset;
  5210       if (multibytep)
  5211         {
  5212           ASSURE_DESTINATION (ccl->produced * 2);
  5213           for (i = 0; i < ccl->produced; i++)
  5214             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
  5215         }
  5216       else
  5217         {
  5218           ASSURE_DESTINATION (ccl->produced);
  5219           for (i = 0; i < ccl->produced; i++)
  5220             *dst++ = destination_charbuf[i] & 0xFF;
  5221           produced_chars += ccl->produced;
  5222         }
  5223       charbuf += ccl->consumed;
  5224       if (ccl->status == CCL_STAT_QUIT
  5225           || ccl->status == CCL_STAT_INVALID_CMD)
  5226         break;
  5227     }
  5228   while (charbuf < charbuf_end);
  5229 
  5230   switch (ccl->status)
  5231     {
  5232     case CCL_STAT_SUSPEND_BY_SRC:
  5233       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
  5234       break;
  5235     case CCL_STAT_SUSPEND_BY_DST:
  5236       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
  5237       break;
  5238     case CCL_STAT_QUIT:
  5239     case CCL_STAT_INVALID_CMD:
  5240       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
  5241       break;
  5242     default:
  5243       record_conversion_result (coding, CODING_RESULT_SUCCESS);
  5244       break;
  5245     }
  5246 
  5247   coding->produced_char += produced_chars;
  5248   coding->produced = dst - coding->destination;
  5249   return 0;
  5250 }
  5251 
  5252 
  5253 /*** 10, 11. no-conversion handlers ***/
  5254 
  5255 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
  5256 
  5257 static void
  5258 decode_coding_raw_text (struct coding_system *coding)
  5259 {
  5260   bool eol_dos
  5261     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  5262 
  5263   coding->chars_at_source = 1;
  5264   coding->consumed_char = coding->src_chars;
  5265   coding->consumed = coding->src_bytes;
  5266   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
  5267     {
  5268       coding->consumed_char--;
  5269       coding->consumed--;
  5270       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
  5271     }
  5272   else
  5273     record_conversion_result (coding, CODING_RESULT_SUCCESS);
  5274 }
  5275 
  5276 static bool
  5277 encode_coding_raw_text (struct coding_system *coding)
  5278 {
  5279   bool multibytep = coding->dst_multibyte;
  5280   int *charbuf = coding->charbuf;
  5281   int *charbuf_end = coding->charbuf + coding->charbuf_used;
  5282   unsigned char *dst = coding->destination + coding->produced;
  5283   unsigned char *dst_end = coding->destination + coding->dst_bytes;
  5284   ptrdiff_t produced_chars = 0;
  5285   int c;
  5286 
  5287   if (multibytep)
  5288     {
  5289       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
  5290 
  5291       if (coding->src_multibyte)
  5292         while (charbuf < charbuf_end)
  5293           {
  5294             ASSURE_DESTINATION (safe_room);
  5295             c = *charbuf++;
  5296             if (ASCII_CHAR_P (c))
  5297               EMIT_ONE_ASCII_BYTE (c);
  5298             else if (CHAR_BYTE8_P (c))
  5299               {
  5300                 c = CHAR_TO_BYTE8 (c);
  5301                 EMIT_ONE_BYTE (c);
  5302               }
  5303             else
  5304               {
  5305                 unsigned char str[MAX_MULTIBYTE_LENGTH];
  5306                 int len = CHAR_STRING (c, str);
  5307                 for (int i = 0; i < len; i++)
  5308                   EMIT_ONE_BYTE (str[i]);
  5309               }
  5310           }
  5311       else
  5312         while (charbuf < charbuf_end)
  5313           {
  5314             ASSURE_DESTINATION (safe_room);
  5315             c = *charbuf++;
  5316             EMIT_ONE_BYTE (c);
  5317           }
  5318     }
  5319   else
  5320     {
  5321       if (coding->src_multibyte)
  5322         {
  5323           int safe_room = MAX_MULTIBYTE_LENGTH;
  5324 
  5325           while (charbuf < charbuf_end)
  5326             {
  5327               ASSURE_DESTINATION (safe_room);
  5328               c = *charbuf++;
  5329               if (ASCII_CHAR_P (c))
  5330                 *dst++ = c;
  5331               else if (CHAR_BYTE8_P (c))
  5332                 *dst++ = CHAR_TO_BYTE8 (c);
  5333               else
  5334                 dst += CHAR_STRING (c, dst);
  5335             }
  5336         }
  5337       else
  5338         {
  5339           ASSURE_DESTINATION (charbuf_end - charbuf);
  5340           while (charbuf < charbuf_end && dst < dst_end)
  5341             *dst++ = *charbuf++;
  5342         }
  5343       produced_chars = dst - (coding->destination + coding->produced);
  5344     }
  5345   record_conversion_result (coding, CODING_RESULT_SUCCESS);
  5346   coding->produced_char += produced_chars;
  5347   coding->produced = dst - coding->destination;
  5348   return 0;
  5349 }
  5350 
  5351 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
  5352    Return true if a text is encoded in a charset-based coding system.  */
       /* The bytes of CODING's source are validated against the
          coding_attr_charset_valids table of the coding system registered
          for coding_category_charset; the CODING passed in is only used
          for its source text, its multibyte flag and its head_ascii count.

          If a byte is not allowed by that table or by a charset's code
          space, or if the source ends in the middle of a multi-byte code,
          CATEGORY_MASK_CHARSET is added to DETECT_INFO->rejected and false
          is returned.  Control reaches the `no_more_source' label only
          from ONE_MORE_BYTE (defined elsewhere; presumably when the source
          is exhausted); there DETECT_INFO->found is updated and true is
          returned.  */
  5353 
  5354 static bool
  5355 detect_coding_charset (struct coding_system *coding,
  5356                        struct coding_detection_info *detect_info)
  5357 {
         /* NOTE(review): src, src_base, src_end, multibytep and
            consumed_chars appear to be used implicitly by ONE_MORE_BYTE.  */
  5358   const unsigned char *src = coding->source, *src_base;
  5359   const unsigned char *src_end = coding->source + coding->src_bytes;
  5360   bool multibytep = coding->src_multibyte;
  5361   ptrdiff_t consumed_chars = 0;
  5362   Lisp_Object attrs, valids, name;
  5363   int found = 0;
  5364   ptrdiff_t head_ascii = coding->head_ascii;
  5365   bool check_latin_extra = 0;
  5366 
  5367   detect_info->checked |= CATEGORY_MASK_CHARSET;
  5368 
         /* From here on CODING is the charset category's coding system.  */
  5369   coding = &coding_categories[coding_category_charset];
  5370   attrs = CODING_ID_ATTRS (coding->id);
  5371   valids = AREF (attrs, coding_attr_charset_valids);
  5372   name = CODING_ID_NAME (coding->id);
         /* Coding systems named iso-8859-* or iso-latin-* get the extra
            check on bytes 0x80..0x9F in the loop below.  */
  5373   if (strncmp (SSDATA (SYMBOL_NAME (name)),
  5374                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
  5375       || strncmp (SSDATA (SYMBOL_NAME (name)),
  5376                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
  5377     check_latin_extra = 1;
  5378 
         /* For an ASCII-compatible coding system, skip the leading run
            counted by head_ascii.  */
  5379   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
  5380     src += head_ascii;
  5381 
  5382   while (1)
  5383     {
  5384       int c;
  5385       Lisp_Object val;
  5386       struct charset *charset;
  5387       int dim, idx;
  5388 
  5389       src_base = src;
  5390       ONE_MORE_BYTE (c);
             /* A negative C is skipped without consulting VALIDS.
                NOTE(review): what a negative value means is decided by
                ONE_MORE_BYTE, which is not visible here.  */
  5391       if (c < 0)
  5392         continue;
  5393       val = AREF (valids, c);
             /* No entry for this leading byte: reject.  */
  5394       if (NILP (val))
  5395         break;
  5396       if (c >= 0x80)
  5397         {
                 /* With check_latin_extra set, a byte in 0x80..0x9F is
                    accepted only if Vlatin_extra_code_table is a vector
                    with a non-nil entry for it.  */
  5398           if (c < 0xA0
  5399               && check_latin_extra
  5400               && (!VECTORP (Vlatin_extra_code_table)
  5401                   || NILP (AREF (Vlatin_extra_code_table, c))))
  5402             break;
                 /* An accepted byte >= 0x80 is what marks the category as
                    found.  */
  5403           found = CATEGORY_MASK_CHARSET;
  5404         }
  5405       if (FIXNUMP (val))
  5406         {
                 /* VAL is a single charset ID.  Each of the remaining
                    DIM - 1 bytes must lie within the matching range of the
                    charset's code_space.  */
  5407           charset = CHARSET_FROM_ID (XFIXNAT (val));
  5408           dim = CHARSET_DIMENSION (charset);
  5409           for (idx = 1; idx < dim; idx++)
  5410             {
  5411               if (src == src_end)
  5412                 goto too_short;
  5413               ONE_MORE_BYTE (c);
  5414               if (c < charset->code_space[(dim - 1 - idx) * 4]
  5415                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
  5416                 break;
  5417             }
                 /* The inner loop stopped early on an out-of-range byte.  */
  5418           if (idx < dim)
  5419             break;
  5420         }
  5421       else
  5422         {
                 /* Otherwise VAL is walked as a list of charset IDs.  IDX
                    is not reset between candidates, so bytes already
                    accepted for one candidate are not read again for the
                    next.  */
  5423           idx = 1;
  5424           for (; CONSP (val); val = XCDR (val))
  5425             {
  5426               charset = CHARSET_FROM_ID (XFIXNAT (XCAR (val)));
  5427               dim = CHARSET_DIMENSION (charset);
  5428               while (idx < dim)
  5429                 {
  5430                   if (src == src_end)
  5431                     goto too_short;
  5432                   ONE_MORE_BYTE (c);
  5433                   if (c < charset->code_space[(dim - 1 - idx) * 4]
  5434                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
  5435                     break;
  5436                   idx++;
  5437                 }
                     /* All bytes of this candidate matched; setting VAL to
                        nil tells the test after the loop that a charset
                        was found.  */
  5438               if (idx == dim)
  5439                 {
  5440                   val = Qnil;
  5441                   break;
  5442                 }
  5443             }
                 /* VAL is still a cons only if the candidate loop was left
                    without a match.  */
  5444           if (CONSP (val))
  5445             break;
  5446         }
  5447     }
         /* Reached by any `break' out of the main loop as well as by
            `goto too_short'.  */
  5448  too_short:
  5449   detect_info->rejected |= CATEGORY_MASK_CHARSET;
  5450   return 0;
  5451 
  5452  no_more_source:
  5453   detect_info->found |= found;
  5454   return 1;
  5455 }
  5456 
       /* Decode the source bytes of CODING, starting at CODING->consumed,
          according to its coding_attr_charset_valids table, and append the
          resulting characters to CODING->charbuf.

          Each leading byte selects, through that table, either a single
          charset ID or a list of candidate charset IDs; further bytes are
          read until the charset's dimension is reached and the code is
          handed to CODING_DECODE_CHAR.  A byte that cannot be decoded is
          re-read on its own at the `invalid_code' label and stored
          individually.  A charset annotation is added through
          ADD_CHARSET_DATA whenever the run of a non-ASCII charset ends.
          On exit CODING->consumed_char, CODING->consumed and
          CODING->charbuf_used are updated.  */
  5457 static void
  5458 decode_coding_charset (struct coding_system *coding)
  5459 {
         /* NOTE(review): src, src_base, src_end, multibytep and
            consumed_chars appear to be used implicitly by ONE_MORE_BYTE
            (defined elsewhere in this file).  */
  5460   const unsigned char *src = coding->source + coding->consumed;
  5461   const unsigned char *src_end = coding->source + coding->src_bytes;
  5462   const unsigned char *src_base;
  5463   int *charbuf = coding->charbuf + coding->charbuf_used;
  5464   /* We may produce one charset annotation in one loop and one more at
  5465      the end.  */
  5466   int *charbuf_end
  5467     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  5468   ptrdiff_t consumed_chars = 0, consumed_chars_base;
  5469   bool multibytep = coding->src_multibyte;
  5470   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
  5471   Lisp_Object valids;
         /* CHAR_OFFSET counts the characters produced so far, LAST_OFFSET
            is its value where the current charset run began, and LAST_ID
            is the charset of that run.  */
  5472   ptrdiff_t char_offset = coding->produced_char;
  5473   ptrdiff_t last_offset = char_offset;
  5474   int last_id = charset_ascii;
  5475   bool eol_dos
  5476     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
         /* The byte read right after a '\r' when EOL_DOS is set, kept for
            the next iteration; -1 when no such byte is pending.  */
  5477   int byte_after_cr = -1;
  5478 
  5479   valids = AREF (attrs, coding_attr_charset_valids);
  5480 
  5481   while (1)
  5482     {
  5483       int c;
  5484       Lisp_Object val;
  5485       struct charset *charset;
  5486       int dim;
  5487       int len = 1;
  5488       unsigned code;
  5489 
             /* Remember where this unit starts, so that it can be re-read
                at `invalid_code' and so that the consumed counts at
                `no_more_source' exclude a unit that was not completed.  */
  5490       src_base = src;
  5491       consumed_chars_base = consumed_chars;
  5492 
  5493       if (charbuf >= charbuf_end)
  5494         {
                 /* Step SRC_BASE back one byte so that the pending
                    look-ahead byte is not reported as consumed.  */
  5495           if (byte_after_cr >= 0)
  5496             src_base--;
  5497           break;
  5498         }
  5499 
  5500       if (byte_after_cr >= 0)
  5501         {
                 /* Use the byte fetched ahead on the previous iteration.  */
  5502           c = byte_after_cr;
  5503           byte_after_cr = -1;
  5504         }
  5505       else
  5506         {
  5507           ONE_MORE_BYTE (c);
  5508           if (eol_dos && c == '\r')
  5509             ONE_MORE_BYTE (byte_after_cr);
  5510         }
  5511       if (c < 0)
  5512         goto invalid_code;
  5513       code = c;
  5514 
  5515       val = AREF (valids, c);
             /* Neither a charset ID nor a list of them: the byte cannot
                start a code in this coding system.  */
  5516       if (! FIXNUMP (val) && ! CONSP (val))
  5517         goto invalid_code;
  5518       if (FIXNUMP (val))
  5519         {
  5520           charset = CHARSET_FROM_ID (XFIXNAT (val));
  5521           dim = CHARSET_DIMENSION (charset);
                 /* Accumulate the remaining bytes of the code, most
                    significant byte first.  */
  5522           while (len < dim)
  5523             {
  5524               ONE_MORE_BYTE (c);
  5525               code = (code << 8) | c;
  5526               len++;
  5527             }
  5528           CODING_DECODE_CHAR (coding, src, src_base, src_end,
  5529                               charset, code, c);
  5530         }
  5531       else
  5532         {
  5533           /* VAL is a list of charset IDs.  It is assured that the
  5534              list is sorted by charset dimensions (smaller one
  5535              comes first).  */
  5536           while (CONSP (val))
  5537             {
  5538               charset = CHARSET_FROM_ID (XFIXNAT (XCAR (val)));
  5539               dim = CHARSET_DIMENSION (charset);
  5540               while (len < dim)
  5541                 {
  5542                   ONE_MORE_BYTE (c);
  5543                   code = (code << 8) | c;
  5544                   len++;
  5545                 }
  5546               CODING_DECODE_CHAR (coding, src, src_base,
  5547                                   src_end, charset, code, c);
                     /* A non-negative C ends the search; otherwise the
                        next candidate charset is tried.  */
  5548               if (c >= 0)
  5549                 break;
  5550               val = XCDR (val);
  5551             }
  5552         }
             /* A negative C at this point is treated as a failed decode.  */
  5553       if (c < 0)
  5554         goto invalid_code;
             /* On a switch to a different non-ASCII charset, close the
                previous non-ASCII run with an annotation and start a new
                run here.  */
  5555       if (charset->id != charset_ascii
  5556           && last_id != charset->id)
  5557         {
  5558           if (last_id != charset_ascii)
  5559             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  5560           last_id = charset->id;
  5561           last_offset = char_offset;
  5562         }
  5563 
  5564       *charbuf++ = c;
  5565       char_offset++;
  5566       continue;
  5567 
  5568     invalid_code:
             /* Rewind to the start of the unit, read a single byte again
                and store it: negated if negative, unchanged if ASCII,
                otherwise converted with BYTE8_TO_CHAR.  */
  5569       src = src_base;
  5570       consumed_chars = consumed_chars_base;
  5571       ONE_MORE_BYTE (c);
  5572       *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
  5573       char_offset++;
  5574     }
  5575 
         /* Also reached by the `break' taken when CHARBUF is full.  */
  5576  no_more_source:
         /* Close the charset run that is still open, if any.  */
  5577   if (last_id != charset_ascii)
  5578     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  5579   coding->consumed_char += consumed_chars_base;
  5580   coding->consumed = src_base - coding->source;
  5581   coding->charbuf_used = charbuf - coding->charbuf;
  5582 }
  5583 
  5584 static bool
  5585 encode_coding_charset (struct coding_system *coding)
  5586 {
  5587   bool multibytep = coding->dst_multibyte;
  5588   int *charbuf = coding->charbuf;
  5589   int *charbuf_end = charbuf + coding->charbuf_used;
  5590   unsigned char *dst = coding->destination + coding->produced;
  5591   unsigned char *dst_end = coding->destination + coding->dst_bytes;
  5592   int safe_room = MAX_MULTIBYTE_LENGTH;
  5593   ptrdiff_t produced_chars = 0;
  5594   Lisp_Object attrs, charset_list;
  5595   bool ascii_compatible;
  5596   int c;
  5597 
  5598   CODING_GET_INFO (coding, attrs, charset_list);
  5599   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
  5600 
  5601   while (charbuf < charbuf_end)
  5602     {
  5603       struct charset *charset;
  5604       unsigned code;
  5605 
  5606       ASSURE_DESTINATION (safe_room);
  5607       c = *charbuf++;
  5608       if (ascii_compatible && ASCII_CHAR_P (c))
  5609         EMIT_ONE_ASCII_BYTE (c);
  5610       else if (CHAR_BYTE8_P (c))
  5611         {
  5612           c = CHAR_TO_BYTE8 (c);
  5613           EMIT_ONE_BYTE (c);
  5614         }
  5615       else
  5616         {
  5617           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
  5618                                &code, charset);
  5619 
  5620           if (charset)
  5621             {
  5622               if (CHARSET_DIMENSION (charset) == 1)
  5623                 EMIT_ONE_BYTE (code);
  5624               else if (CHARSET_DIMENSION (charset) == 2)
  5625                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
  5626               else if (CHARSET_DIMENSION (charset) == 3)
  5627                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
  5628               else
  5629                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
  5630                                  (code >> 8) & 0xFF, code & 0xFF);
  5631             }
  5632           else
  5633             {
  5634               if (coding->mode & CODING_MODE_SAFE_ENCODING)
  5635                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
  5636               else
  5637                 c = coding->default_char;
  5638               EMIT_ONE_BYTE (c);
  5639             }
  5640         }
  5641     }
  5642 
  5643   record_conversion_result (coding, CODING_RESULT_SUCCESS);
  5644   coding->produced_char += produced_chars;
  5645   coding->produced = dst - coding->destination;
  5646   return 0;
  5647 }
  5648 
  5649 
  5650 /*** 7. C library functions ***/
  5651 
  5652 /* Setup coding context CODING from information about CODING_SYSTEM.
  5653    If CODING_SYSTEM is nil, `undecided' is assumed.  If
  5654    CODING_SYSTEM is invalid, signal an error.  */
       /* This fills in CODING->id, mode, common_flags, the safe-charset
          data, default_char, carryover_bytes and raw_destination, and then
          selects the detector, decoder and encoder functions (plus any
          type-specific state in CODING->spec) from the coding system's
          type attribute.  A type matched by none of the explicit branches
          is handled by the final, raw-text, branch.  */
  5655 
  5656 void
  5657 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
  5658 {
  5659   Lisp_Object attrs;
  5660   Lisp_Object eol_type;
  5661   Lisp_Object coding_type;
  5662   Lisp_Object val;
  5663 
  5664   if (NILP (coding_system))
  5665     coding_system = Qundecided;
  5666 
  5667   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
  5668 
  5669   attrs = CODING_ID_ATTRS (coding->id);
         /* With inhibit_eol_conversion set, the EOL type is treated as
            unix whatever the coding system says.  */
  5670   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
  5671 
  5672   coding->mode = 0;
         /* Initial common_flags from the EOL type: a vector asks for
            decoding and detection, a value other than `unix' asks for
            decoding and encoding, and `unix' asks for nothing.  */
  5673   if (VECTORP (eol_type))
  5674     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
  5675                             | CODING_REQUIRE_DETECTION_MASK);
  5676   else if (! EQ (eol_type, Qunix))
  5677     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
  5678                             | CODING_REQUIRE_ENCODING_MASK);
  5679   else
  5680     coding->common_flags = 0;
         /* A post-read attribute forces decoding and a pre-write attribute
            forces encoding.  */
  5681   if (! NILP (CODING_ATTR_POST_READ (attrs)))
  5682     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
  5683   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
  5684     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
  5685   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
  5686     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
  5687 
         /* The safe-charsets attribute is a string; CODING keeps a pointer
            to its data and the index of its last element.  */
  5688   val = CODING_ATTR_SAFE_CHARSETS (attrs);
  5689   coding->max_charset_id = SCHARS (val) - 1;
  5690   coding->safe_charsets = SDATA (val);
  5691   coding->default_char = XFIXNUM (CODING_ATTR_DEFAULT_CHAR (attrs));
  5692   coding->carryover_bytes = 0;
  5693   coding->raw_destination = 0;
  5694 
  5695   coding_type = CODING_ATTR_TYPE (attrs);
  5696   if (EQ (coding_type, Qundecided))
  5697     {
             /* `undecided' has no detector of its own and starts out with
                the raw-text decoder and encoder; detection is requested
                through common_flags.  */
  5698       coding->detector = NULL;
  5699       coding->decoder = decode_coding_raw_text;
  5700       coding->encoder = encode_coding_raw_text;
  5701       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
  5702       coding->spec.undecided.inhibit_nbd
  5703         = (encode_inhibit_flag
  5704            (AREF (attrs, coding_attr_undecided_inhibit_null_byte_detection)));
  5705       coding->spec.undecided.inhibit_ied
  5706         = (encode_inhibit_flag
  5707            (AREF (attrs, coding_attr_undecided_inhibit_iso_escape_detection)));
  5708       coding->spec.undecided.prefer_utf_8
  5709         = ! NILP (AREF (attrs, coding_attr_undecided_prefer_utf_8));
  5710     }
  5711   else if (EQ (coding_type, Qiso_2022))
  5712     {
  5713       int i;
  5714       int flags = XFIXNUM (AREF (attrs, coding_attr_iso_flags));
  5715 
  5716       /* Invoke graphic register 0 to plane 0.  */
  5717       CODING_ISO_INVOCATION (coding, 0) = 0;
  5718       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
  5719       CODING_ISO_INVOCATION (coding, 1)
  5720         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
  5721       /* Setup the initial status of designation.  */
  5722       for (i = 0; i < 4; i++)
  5723         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
  5724       /* Not single shifting initially.  */
  5725       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
  5726       /* Beginning of buffer should also be regarded as bol. */
  5727       CODING_ISO_BOL (coding) = 1;
  5728       coding->detector = detect_coding_iso_2022;
  5729       coding->decoder = decode_coding_iso_2022;
  5730       coding->encoder = encode_coding_iso_2022;
  5731       if (flags & CODING_ISO_FLAG_SAFE)
  5732         coding->mode |= CODING_MODE_SAFE_ENCODING;
  5733       coding->common_flags
  5734         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
  5735             | CODING_REQUIRE_FLUSHING_MASK);
  5736       if (flags & CODING_ISO_FLAG_COMPOSITION)
  5737         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
  5738       if (flags & CODING_ISO_FLAG_DESIGNATION)
  5739         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
  5740       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
  5741         {
                 /* Fetch the safe-charsets attribute again after
                    setup_iso_safe_charsets has run (NOTE(review):
                    presumably because that call replaces the attribute;
                    confirm against its definition).  */
  5742           setup_iso_safe_charsets (attrs);
  5743           val = CODING_ATTR_SAFE_CHARSETS (attrs);
  5744           coding->max_charset_id = SCHARS (val) - 1;
  5745           coding->safe_charsets = SDATA (val);
  5746         }
  5747       CODING_ISO_FLAGS (coding) = flags;
  5748       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
  5749       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
  5750       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
  5751       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
  5752     }
  5753   else if (EQ (coding_type, Qcharset))
  5754     {
  5755       coding->detector = detect_coding_charset;
  5756       coding->decoder = decode_coding_charset;
  5757       coding->encoder = encode_coding_charset;
  5758       coding->common_flags
  5759         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
  5760     }
  5761   else if (EQ (coding_type, Qutf_8))
  5762     {
             /* BOM attribute: a cons selects BOM detection, t selects
                "with BOM", and anything else selects "without BOM".  */
  5763       val = AREF (attrs, coding_attr_utf_bom);
  5764       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
  5765                                    : EQ (val, Qt) ? utf_with_bom
  5766                                    : utf_without_bom);
  5767       coding->detector = detect_coding_utf_8;
  5768       coding->decoder = decode_coding_utf_8;
  5769       coding->encoder = encode_coding_utf_8;
  5770       coding->common_flags
  5771         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
  5772       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
  5773         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
  5774     }
  5775   else if (EQ (coding_type, Qutf_16))
  5776     {
             /* The BOM attribute is interpreted as for utf-8 above.  Any
                endian attribute other than `big' selects little endian.  */
  5777       val = AREF (attrs, coding_attr_utf_bom);
  5778       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
  5779                                     : EQ (val, Qt) ? utf_with_bom
  5780                                     : utf_without_bom);
  5781       val = AREF (attrs, coding_attr_utf_16_endian);
  5782       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
  5783                                        : utf_16_little_endian);
  5784       CODING_UTF_16_SURROGATE (coding) = 0;
  5785       coding->detector = detect_coding_utf_16;
  5786       coding->decoder = decode_coding_utf_16;
  5787       coding->encoder = encode_coding_utf_16;
  5788       coding->common_flags
  5789         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
  5790       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
  5791         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
  5792     }
  5793   else if (EQ (coding_type, Qccl))
  5794     {
  5795       coding->detector = detect_coding_ccl;
  5796       coding->decoder = decode_coding_ccl;
  5797       coding->encoder = encode_coding_ccl;
  5798       coding->common_flags
  5799         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
  5800             | CODING_REQUIRE_FLUSHING_MASK);
  5801     }
  5802   else if (EQ (coding_type, Qemacs_mule))
  5803     {
  5804       coding->detector = detect_coding_emacs_mule;
  5805       coding->decoder = decode_coding_emacs_mule;
  5806       coding->encoder = encode_coding_emacs_mule;
  5807       coding->common_flags
  5808         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
  5809       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
  5810           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
  5811         {
                 /* Build a fresh safe-charsets string sized by the largest
                    ID in Vemacs_mule_charset_list: every byte is 255
                    except those indexed by the listed charset IDs, which
                    are 0.  */
  5812           Lisp_Object tail, safe_charsets;
  5813           int max_charset_id = 0;
  5814 
  5815           for (tail = Vemacs_mule_charset_list; CONSP (tail);
  5816                tail = XCDR (tail))
  5817             if (max_charset_id < XFIXNAT (XCAR (tail)))
  5818               max_charset_id = XFIXNAT (XCAR (tail));
  5819           safe_charsets = make_uninit_string (max_charset_id + 1);
  5820           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
  5821           for (tail = Vemacs_mule_charset_list; CONSP (tail);
  5822                tail = XCDR (tail))
  5823             SSET (safe_charsets, XFIXNAT (XCAR (tail)), 0);
  5824           coding->max_charset_id = max_charset_id;
  5825           coding->safe_charsets = SDATA (safe_charsets);
  5826         }
  5827       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
  5828       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
  5829     }
  5830   else if (EQ (coding_type, Qshift_jis))
  5831     {
  5832       coding->detector = detect_coding_sjis;
  5833       coding->decoder = decode_coding_sjis;
  5834       coding->encoder = encode_coding_sjis;
  5835       coding->common_flags
  5836         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
  5837     }
  5838   else if (EQ (coding_type, Qbig5))
  5839     {
  5840       coding->detector = detect_coding_big5;
  5841       coding->decoder = decode_coding_big5;
  5842       coding->encoder = encode_coding_big5;
  5843       coding->common_flags
  5844         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
  5845     }
  5846   else                          /* EQ (coding_type, Qraw_text) */
  5847     {
  5848       coding->detector = NULL;
  5849       coding->decoder = decode_coding_raw_text;
  5850       coding->encoder = encode_coding_raw_text;
             /* Raw text needs conversion only for a non-unix EOL type:
                decoding in every such case, encoding only when the EOL
                type is not a vector.  */
  5851       if (! EQ (eol_type, Qunix))
  5852         {
  5853           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
  5854           if (! VECTORP (eol_type))
  5855             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
  5856         }
  5857 
  5858     }
  5859 
  5860   return;
  5861 }
  5862 
  5863 /* Return a list of charsets supported by CODING.  */
  5864 
  5865 Lisp_Object
  5866 coding_charset_list (struct coding_system *coding)
  5867 {
  5868   Lisp_Object attrs, charset_list;
  5869 
  5870   CODING_GET_INFO (coding, attrs, charset_list);
  5871   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
  5872     {
  5873       int flags = XFIXNUM (AREF (attrs, coding_attr_iso_flags));
  5874 
  5875       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
  5876         charset_list = Viso_2022_charset_list;
  5877     }
  5878   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
  5879     {
  5880       charset_list = Vemacs_mule_charset_list;
  5881     }
  5882   return charset_list;
  5883 }
  5884 
  5885 
  5886 /* Return a list of charsets supported by CODING-SYSTEM.  */
  5887 
  5888 Lisp_Object
  5889 coding_system_charset_list (Lisp_Object coding_system)
  5890 {
  5891   ptrdiff_t id;
  5892   Lisp_Object attrs, charset_list;
  5893 
  5894   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
  5895   attrs = CODING_ID_ATTRS (id);
  5896 
  5897   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
  5898     {
  5899       int flags = XFIXNUM (AREF (attrs, coding_attr_iso_flags));
  5900 
  5901       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
  5902         charset_list = Viso_2022_charset_list;
  5903       else
  5904         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
  5905     }
  5906   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
  5907     {
  5908       charset_list = Vemacs_mule_charset_list;
  5909     }
  5910   else
  5911     {
  5912       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
  5913     }
  5914   return charset_list;
  5915 }
  5916 
  5917 
  5918 /* Return raw-text or one of its subsidiaries that has the same
  5919    eol_type as CODING-SYSTEM.  */
  5920 
  5921 Lisp_Object
  5922 raw_text_coding_system (Lisp_Object coding_system)
  5923 {
  5924   Lisp_Object spec, attrs;
  5925   Lisp_Object eol_type, raw_text_eol_type;
  5926 
  5927   if (NILP (coding_system))
  5928     return Qraw_text;
  5929   spec = CODING_SYSTEM_SPEC (coding_system);
  5930   attrs = AREF (spec, 0);
  5931 
  5932   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
  5933     return coding_system;
  5934 
  5935   eol_type = AREF (spec, 2);
  5936   if (VECTORP (eol_type))
  5937     return Qraw_text;
  5938   spec = CODING_SYSTEM_SPEC (Qraw_text);
  5939   raw_text_eol_type = AREF (spec, 2);
  5940   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
  5941           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
  5942           : AREF (raw_text_eol_type, 2));
  5943 }
  5944 
  5945 /* Return true if CODING corresponds to raw-text coding-system.  */
  5946 
  5947 bool
  5948 raw_text_coding_system_p (struct coding_system *coding)
  5949 {
  5950   return (coding->decoder == decode_coding_raw_text
  5951           && coding->encoder == encode_coding_raw_text) ? true : false;
  5952 }
  5953 
  5954 
  5955 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
  5956    the subsidiary that has the same eol-spec as PARENT (if it is not
  5957    nil and specifies end-of-line format) or the system's setting.  */
  5958 
  5959 Lisp_Object
  5960 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
  5961 {
  5962   Lisp_Object spec, eol_type;
  5963 
  5964   if (NILP (coding_system))
  5965     coding_system = Qraw_text;
  5966   else
  5967     CHECK_CODING_SYSTEM (coding_system);
  5968   spec = CODING_SYSTEM_SPEC (coding_system);
  5969   eol_type = AREF (spec, 2);
  5970   if (VECTORP (eol_type))
  5971     {
  5972       /* Format of end-of-line decided by system.
  5973          This is Qunix on Unix and Mac, Qdos on DOS/Windows.
  5974          This has an effect only for external encoding (i.e., for output to
  5975          file and process), not for in-buffer or Lisp string encoding.  */
  5976       Lisp_Object system_eol_type = Qunix;
  5977       #ifdef DOS_NT
  5978        system_eol_type = Qdos;
  5979       #endif
  5980 
  5981       Lisp_Object parent_eol_type = system_eol_type;
  5982       if (! NILP (parent))
  5983         {
  5984           CHECK_CODING_SYSTEM (parent);
  5985           Lisp_Object parent_spec = CODING_SYSTEM_SPEC (parent);
  5986           Lisp_Object pspec_type = AREF (parent_spec, 2);
  5987           if (!VECTORP (pspec_type))
  5988             parent_eol_type = pspec_type;
  5989         }
  5990       if (EQ (parent_eol_type, Qunix))
  5991         coding_system = AREF (eol_type, 0);
  5992       else if (EQ (parent_eol_type, Qdos))
  5993         coding_system = AREF (eol_type, 1);
  5994       else if (EQ (parent_eol_type, Qmac))
  5995         coding_system = AREF (eol_type, 2);
  5996     }
  5997   return coding_system;
  5998 }
  5999 
  6000 
  6001 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
  6002    decided for writing to a process.  If not, complement them, and
  6003    return a new coding system.  */
  6004 
  6005 Lisp_Object
  6006 complement_process_encoding_system (Lisp_Object coding_system)
  6007 {
  6008   Lisp_Object coding_base = Qnil, eol_base = Qnil;
  6009   Lisp_Object spec, attrs;
  6010   int i;
  6011 
  6012   for (i = 0; i < 3; i++)
  6013     {
  6014       if (i == 1)
  6015         coding_system = CDR_SAFE (Vdefault_process_coding_system);
  6016       else if (i == 2)
  6017         coding_system = preferred_coding_system ();
  6018       spec = CODING_SYSTEM_SPEC (coding_system);
  6019       if (NILP (spec))
  6020         continue;
  6021       attrs = AREF (spec, 0);
  6022       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
  6023         coding_base = CODING_ATTR_BASE_NAME (attrs);
  6024       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
  6025         eol_base = coding_system;
  6026       if (! NILP (coding_base) && ! NILP (eol_base))
  6027         break;
  6028     }
  6029 
  6030   if (i > 0)
  6031     /* The original CODING_SYSTEM didn't specify text-conversion or
  6032        eol-conversion.  Be sure that we return a fully complemented
  6033        coding system.  */
  6034     coding_system = coding_inherit_eol_type (coding_base, eol_base);
  6035   return coding_system;
  6036 }
  6037 
  6038 
/* Emacs has a mechanism to automatically detect a coding system if it
   is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  However,
   it is impossible to distinguish some coding systems accurately
   because they use the same range of codes.  So coding systems are
   first grouped into the following categories:
  6044 
  6045    o coding-category-emacs-mule
  6046 
  6047         The category for a coding system which has the same code range
  6048         as Emacs' internal format.  Assigned the coding-system (Lisp
  6049         symbol) `emacs-mule' by default.
  6050 
  6051    o coding-category-sjis
  6052 
  6053         The category for a coding system which has the same code range
  6054         as SJIS.  Assigned the coding-system (Lisp
  6055         symbol) `japanese-shift-jis' by default.
  6056 
  6057    o coding-category-iso-7
  6058 
  6059         The category for a coding system which has the same code range
  6060         as ISO2022 of 7-bit environment.  This doesn't use any locking
  6061         shift and single shift functions.  This can encode/decode all
  6062         charsets.  Assigned the coding-system (Lisp symbol)
  6063         `iso-2022-7bit' by default.
  6064 
  6065    o coding-category-iso-7-tight
  6066 
  6067         Same as coding-category-iso-7 except that this can
  6068         encode/decode only the specified charsets.
  6069 
  6070    o coding-category-iso-8-1
  6071 
  6072         The category for a coding system which has the same code range
  6073         as ISO2022 of 8-bit environment and graphic plane 1 used only
  6074         for DIMENSION1 charset.  This doesn't use any locking shift
  6075         and single shift functions.  Assigned the coding-system (Lisp
  6076         symbol) `iso-latin-1' by default.
  6077 
  6078    o coding-category-iso-8-2
  6079 
  6080         The category for a coding system which has the same code range
  6081         as ISO2022 of 8-bit environment and graphic plane 1 used only
  6082         for DIMENSION2 charset.  This doesn't use any locking shift
  6083         and single shift functions.  Assigned the coding-system (Lisp
  6084         symbol) `japanese-iso-8bit' by default.
  6085 
  6086    o coding-category-iso-7-else
  6087 
  6088         The category for a coding system which has the same code range
  6089         as ISO2022 of 7-bit environment but uses locking shift or
  6090         single shift functions.  Assigned the coding-system (Lisp
  6091         symbol) `iso-2022-7bit-lock' by default.
  6092 
  6093    o coding-category-iso-8-else
  6094 
  6095         The category for a coding system which has the same code range
  6096         as ISO2022 of 8-bit environment but uses locking shift or
  6097         single shift functions.  Assigned the coding-system (Lisp
  6098         symbol) `iso-2022-8bit-ss2' by default.
  6099 
  6100    o coding-category-big5
  6101 
  6102         The category for a coding system which has the same code range
  6103         as BIG5.  Assigned the coding-system (Lisp symbol)
  6104         `cn-big5' by default.
  6105 
  6106    o coding-category-utf-8
  6107 
  6108         The category for a coding system which has the same code range
  6109         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
  6110         symbol) `utf-8' by default.
  6111 
  6112    o coding-category-utf-16-be
  6113 
        The category for a coding system in which a text has a
        Unicode signature (cf. Unicode Standard) in big-endian byte
        order at the head.  Assigned the coding-system (Lisp symbol)
        `utf-16-be' by default.
  6118 
  6119    o coding-category-utf-16-le
  6120 
        The category for a coding system in which a text has a
        Unicode signature (cf. Unicode Standard) in little-endian
        byte order at the head.  Assigned the coding-system (Lisp
        symbol) `utf-16-le' by default.
  6125 
  6126    o coding-category-ccl
  6127 
  6128         The category for a coding system of which encoder/decoder is
  6129         written in CCL programs.  The default value is nil, i.e., no
  6130         coding system is assigned.
  6131 
  6132    o coding-category-binary
  6133 
  6134         The category for a coding system not categorized in any of the
  6135         above.  Assigned the coding-system (Lisp symbol)
  6136         `no-conversion' by default.
  6137 
   Each of them is a Lisp symbol whose value is an actual
   `coding-system' (also a Lisp symbol) assigned by a user.  What
   Emacs actually does is detect the category of a coding system and
   then use the `coding-system' assigned to that category.  If Emacs
   cannot narrow the choice down to a single category, it selects the
   candidate category of the highest priority.  Priorities of
   categories are also specified by a user in the Lisp variable
   `coding-category-list'.
  6145 
  6146 */
  6147 
  6148 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
  6149                                            int eol_seen);
  6150 
  6151 
  6152 /* Return the number of ASCII characters at the head of the source.
  6153    By side effects, set coding->head_ascii and update
  6154    coding->eol_seen.  The value of coding->eol_seen is "logical or" of
  6155    EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
  6156    reliable only when all the source bytes are ASCII.  */
  6157 
  6158 static ptrdiff_t
  6159 check_ascii (struct coding_system *coding)
  6160 {
  6161   const unsigned char *src, *end;
  6162   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
  6163   int eol_seen = coding->eol_seen;
  6164 
  6165   coding_set_source (coding);
  6166   src = coding->source;
  6167   end = src + coding->src_bytes;
  6168 
  6169   if (inhibit_eol_conversion
  6170       || SYMBOLP (eol_type))
  6171     {
  6172       /* We don't have to check EOL format.  */
  6173       while (src < end && !( *src & 0x80))
  6174         {
  6175           if (*src++ == '\n')
  6176             eol_seen |= EOL_SEEN_LF;
  6177         }
  6178     }
  6179   else
  6180     {
  6181       end--;                /* We look ahead one byte for "CR LF".  */
  6182       while (src < end)
  6183         {
  6184           int c = *src;
  6185 
  6186           if (c & 0x80)
  6187             break;
  6188           src++;
  6189           if (c == '\r')
  6190             {
  6191               if (*src == '\n')
  6192                 {
  6193                   eol_seen |= EOL_SEEN_CRLF;
  6194                   src++;
  6195                 }
  6196               else
  6197                 eol_seen |= EOL_SEEN_CR;
  6198             }
  6199           else if (c == '\n')
  6200             eol_seen |= EOL_SEEN_LF;
  6201         }
  6202       if (src == end)
  6203         {
  6204           int c = *src;
  6205 
  6206           /* All bytes but the last one C are ASCII.  */
  6207           if (! (c & 0x80))
  6208             {
  6209               if (c == '\r')
  6210                 eol_seen |= EOL_SEEN_CR;
  6211               else if (c  == '\n')
  6212                 eol_seen |= EOL_SEEN_LF;
  6213               src++;
  6214             }
  6215         }
  6216     }
  6217   coding->head_ascii = src - coding->source;
  6218   coding->eol_seen = eol_seen;
  6219   return (coding->head_ascii);
  6220 }
  6221 
  6222 
  6223 /* Return the number of characters at the source if all the bytes are
  6224    valid UTF-8 (of Unicode range).  Otherwise, return -1.  By side
  6225    effects, update coding->eol_seen.  The value of coding->eol_seen is
  6226    "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
  6227    the value is reliable only when all the source bytes are valid
  6228    UTF-8.  */
  6229 
  6230 static ptrdiff_t
  6231 check_utf_8 (struct coding_system *coding)
  6232 {
  6233   const unsigned char *src, *end;
  6234   int eol_seen;
  6235   ptrdiff_t nchars = coding->head_ascii;
  6236 
  6237   if (coding->head_ascii < 0)
  6238     check_ascii (coding);
  6239   else
  6240     coding_set_source (coding);
  6241   src = coding->source + coding->head_ascii;
  6242   /* We look ahead one byte for CR LF.  */
  6243   end = coding->source + coding->src_bytes - 1;
  6244   eol_seen = coding->eol_seen;
  6245   while (src < end)
  6246     {
  6247       int c = *src;
  6248 
  6249       if (UTF_8_1_OCTET_P (*src))
  6250         {
  6251           src++;
  6252           if (c < 0x20)
  6253             {
  6254               if (c == '\r')
  6255                 {
  6256                   if (*src == '\n')
  6257                     {
  6258                       eol_seen |= EOL_SEEN_CRLF;
  6259                       src++;
  6260                       nchars++;
  6261                     }
  6262                   else
  6263                     eol_seen |= EOL_SEEN_CR;
  6264                 }
  6265               else if (c == '\n')
  6266                 eol_seen |= EOL_SEEN_LF;
  6267             }
  6268         }
  6269       else if (UTF_8_2_OCTET_LEADING_P (c))
  6270         {
  6271           if (c < 0xC2          /* overlong sequence */
  6272               || src + 1 >= end
  6273               || ! UTF_8_EXTRA_OCTET_P (src[1]))
  6274             return -1;
  6275           src += 2;
  6276         }
  6277       else if (UTF_8_3_OCTET_LEADING_P (c))
  6278         {
  6279           if (src + 2 >= end
  6280               || ! (UTF_8_EXTRA_OCTET_P (src[1])
  6281                     && UTF_8_EXTRA_OCTET_P (src[2])))
  6282             return -1;
  6283           c = (((c & 0xF) << 12)
  6284                | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
  6285           if (c < 0x800                       /* overlong sequence */
  6286               || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
  6287             return -1;
  6288           src += 3;
  6289         }
  6290       else if (UTF_8_4_OCTET_LEADING_P (c))
  6291         {
  6292           if (src + 3 >= end
  6293               || ! (UTF_8_EXTRA_OCTET_P (src[1])
  6294                     && UTF_8_EXTRA_OCTET_P (src[2])
  6295                     && UTF_8_EXTRA_OCTET_P (src[3])))
  6296             return -1;
  6297           c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
  6298                | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
  6299           if (c < 0x10000       /* overlong sequence */
  6300               || c >= 0x110000) /* non-Unicode character  */
  6301             return -1;
  6302           src += 4;
  6303         }
  6304       else
  6305         return -1;
  6306       nchars++;
  6307     }
  6308 
  6309   if (src == end)
  6310     {
  6311       if (! UTF_8_1_OCTET_P (*src))
  6312         return -1;
  6313       nchars++;
  6314       if (*src == '\r')
  6315         eol_seen |= EOL_SEEN_CR;
  6316       else if (*src  == '\n')
  6317         eol_seen |= EOL_SEEN_LF;
  6318     }
  6319   coding->eol_seen = eol_seen;
  6320   return nchars;
  6321 }
  6322 
  6323 
  6324 /* Return whether STRING is a valid UTF-8 string.  STRING must be a
  6325    unibyte string.  */
  6326 
  6327 bool
  6328 utf8_string_p (Lisp_Object string)
  6329 {
  6330   eassert (!STRING_MULTIBYTE (string));
  6331   struct coding_system coding;
  6332   setup_coding_system (Qutf_8_unix, &coding);
  6333   /* We initialize only the fields that check_utf_8 accesses.  */
  6334   coding.head_ascii = -1;
  6335   coding.src_pos = 0;
  6336   coding.src_pos_byte = 0;
  6337   coding.src_chars = SCHARS (string);
  6338   coding.src_bytes = SBYTES (string);
  6339   coding.src_object = string;
  6340   coding.eol_seen = EOL_SEEN_NONE;
  6341   return check_utf_8 (&coding) != -1;
  6342 }
  6343 
  6344 /* Like make_string, but always returns a multibyte Lisp string, and
  6345    avoids decoding if TEXT is encoded in UTF-8.  */
  6346 Lisp_Object
  6347 make_string_from_utf8 (const char *text, ptrdiff_t nbytes)
  6348 {
  6349 #if 0
  6350   /* This method is on average 2 times slower than if we use
  6351      decode_string_utf_8.  However, please leave the slower
  6352      implementation in the code for now, in case it needs to be reused
  6353      in some situations.  */
  6354   ptrdiff_t chars, bytes;
  6355   parse_str_as_multibyte ((const unsigned char *) text, nbytes,
  6356                           &chars, &bytes);
  6357   /* If TEXT is a valid UTF-8 string, we can convert it to a Lisp
  6358      string directly.  Otherwise, we need to decode it.  */
  6359   if (chars == nbytes || bytes == nbytes)
  6360     return make_specified_string (text, chars, nbytes, true);
  6361   else
  6362     {
  6363       struct coding_system coding;
  6364       setup_coding_system (Qutf_8_unix, &coding);
  6365       coding.mode |= CODING_MODE_LAST_BLOCK;
  6366       coding.source = (const unsigned char *) text;
  6367       decode_coding_object (&coding, Qnil, 0, 0, nbytes, nbytes, Qt);
  6368       return coding.dst_object;
  6369     }
  6370 #else
  6371   return decode_string_utf_8 (Qnil, text, nbytes, Qnil, false, Qt, Qt);
  6372 #endif
  6373 }
  6374 
  6375 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
  6376    SOURCE is encoded.  If CATEGORY is one of
  6377    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
  6378    two-byte, else they are encoded by one-byte.
  6379 
  6380    Return one of EOL_SEEN_XXX.  */
  6381 
  6382 #define MAX_EOL_CHECK_COUNT 3
  6383 
  6384 static int
  6385 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
  6386             enum coding_category category)
  6387 {
  6388   const unsigned char *src = source, *src_end = src + src_bytes;
  6389   unsigned char c;
  6390   int total  = 0;
  6391   int eol_seen = EOL_SEEN_NONE;
  6392 
  6393   if ((1 << category) & CATEGORY_MASK_UTF_16)
  6394     {
  6395       bool msb = category == (coding_category_utf_16_le
  6396                               | coding_category_utf_16_le_nosig);
  6397       bool lsb = !msb;
  6398 
  6399       while (src + 1 < src_end)
  6400         {
  6401           c = src[lsb];
  6402           if (src[msb] == 0 && (c == '\n' || c == '\r'))
  6403             {
  6404               int this_eol;
  6405 
  6406               if (c == '\n')
  6407                 this_eol = EOL_SEEN_LF;
  6408               else if (src + 3 >= src_end
  6409                        || src[msb + 2] != 0
  6410                        || src[lsb + 2] != '\n')
  6411                 this_eol = EOL_SEEN_CR;
  6412               else
  6413                 {
  6414                   this_eol = EOL_SEEN_CRLF;
  6415                   src += 2;
  6416                 }
  6417 
  6418               if (eol_seen == EOL_SEEN_NONE)
  6419                 /* This is the first end-of-line.  */
  6420                 eol_seen = this_eol;
  6421               else if (eol_seen != this_eol)
  6422                 {
  6423                   /* The found type is different from what found before.
  6424                      Allow for stray ^M characters in DOS EOL files.  */
  6425                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
  6426                       || (eol_seen == EOL_SEEN_CRLF
  6427                           && this_eol == EOL_SEEN_CR))
  6428                     eol_seen = EOL_SEEN_CRLF;
  6429                   else
  6430                     {
  6431                       eol_seen = EOL_SEEN_LF;
  6432                       break;
  6433                     }
  6434                 }
  6435               if (++total == MAX_EOL_CHECK_COUNT)
  6436                 break;
  6437             }
  6438           src += 2;
  6439         }
  6440     }
  6441   else
  6442     while (src < src_end)
  6443       {
  6444         c = *src++;
  6445         if (c == '\n' || c == '\r')
  6446           {
  6447             int this_eol;
  6448 
  6449             if (c == '\n')
  6450               this_eol = EOL_SEEN_LF;
  6451             else if (src >= src_end || *src != '\n')
  6452               this_eol = EOL_SEEN_CR;
  6453             else
  6454               this_eol = EOL_SEEN_CRLF, src++;
  6455 
  6456             if (eol_seen == EOL_SEEN_NONE)
  6457               /* This is the first end-of-line.  */
  6458               eol_seen = this_eol;
  6459             else if (eol_seen != this_eol)
  6460               {
  6461                 /* The found type is different from what found before.
  6462                    Allow for stray ^M characters in DOS EOL files.  */
  6463                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
  6464                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
  6465                   eol_seen = EOL_SEEN_CRLF;
  6466                 else
  6467                   {
  6468                     eol_seen = EOL_SEEN_LF;
  6469                     break;
  6470                   }
  6471               }
  6472             if (++total == MAX_EOL_CHECK_COUNT)
  6473               break;
  6474           }
  6475       }
  6476   return eol_seen;
  6477 }
  6478 
  6479 
  6480 static Lisp_Object
  6481 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
  6482 {
  6483   Lisp_Object eol_type;
  6484 
  6485   eol_type = CODING_ID_EOL_TYPE (coding->id);
  6486   if (! VECTORP (eol_type))
  6487     /* Already adjusted.  */
  6488     return eol_type;
  6489   if (eol_seen & EOL_SEEN_LF)
  6490     {
  6491       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
  6492       eol_type = Qunix;
  6493     }
  6494   else if (eol_seen & EOL_SEEN_CRLF)
  6495     {
  6496       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
  6497       eol_type = Qdos;
  6498     }
  6499   else if (eol_seen & EOL_SEEN_CR)
  6500     {
  6501       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
  6502       eol_type = Qmac;
  6503     }
  6504   return eol_type;
  6505 }
  6506 
  6507 /* Detect how a text specified in CODING is encoded.  If a coding
  6508    system is detected, update fields of CODING by the detected coding
  6509    system.  */
  6510 
  6511 static void
  6512 detect_coding (struct coding_system *coding)
  6513 {
  6514   const unsigned char *src, *src_end;
  6515   unsigned int saved_mode = coding->mode;
  6516   Lisp_Object found = Qnil;
  6517   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
  6518 
  6519   coding->consumed = coding->consumed_char = 0;
  6520   coding->produced = coding->produced_char = 0;
  6521   coding_set_source (coding);
  6522 
  6523   src_end = coding->source + coding->src_bytes;
  6524 
  6525   coding->eol_seen = EOL_SEEN_NONE;
  6526   /* If we have not yet decided the text encoding type, detect it
  6527      now.  */
  6528   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
  6529     {
  6530       int c, i;
  6531       struct coding_detection_info detect_info = {0};
  6532       bool null_byte_found = 0, eight_bit_found = 0;
  6533       bool inhibit_nbd = inhibit_flag (coding->spec.undecided.inhibit_nbd,
  6534                                        inhibit_null_byte_detection);
  6535       bool inhibit_ied = inhibit_flag (coding->spec.undecided.inhibit_ied,
  6536                                        inhibit_iso_escape_detection);
  6537       bool prefer_utf_8 = coding->spec.undecided.prefer_utf_8;
  6538 
  6539       coding->head_ascii = 0;
  6540       for (src = coding->source; src < src_end; src++)
  6541         {
  6542           c = *src;
  6543           if (c & 0x80)
  6544             {
  6545               eight_bit_found = 1;
  6546               if (null_byte_found)
  6547                 break;
  6548             }
  6549           else if (c < 0x20)
  6550             {
  6551               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
  6552                   && ! inhibit_ied
  6553                   && ! detect_info.checked)
  6554                 {
  6555                   if (detect_coding_iso_2022 (coding, &detect_info))
  6556                     {
  6557                       /* We have scanned the whole data.  */
  6558                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
  6559                         {
  6560                           /* We didn't find an 8-bit code.  We may
  6561                              have found a null-byte, but it's very
  6562                              rare that a binary file conforms to
  6563                              ISO-2022.  */
  6564                           src = src_end;
  6565                           coding->head_ascii = src - coding->source;
  6566                         }
  6567                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
  6568                       break;
  6569                     }
  6570                 }
  6571               else if (! c && !inhibit_nbd)
  6572                 {
  6573                   null_byte_found = 1;
  6574                   if (eight_bit_found)
  6575                     break;
  6576                 }
  6577               else if (! disable_ascii_optimization
  6578                        && ! inhibit_eol_conversion)
  6579                 {
  6580                   if (c == '\r')
  6581                     {
  6582                       if (src < src_end && src[1] == '\n')
  6583                         {
  6584                           coding->eol_seen |= EOL_SEEN_CRLF;
  6585                           src++;
  6586                           if (! eight_bit_found)
  6587                             coding->head_ascii++;
  6588                         }
  6589                       else
  6590                         coding->eol_seen |= EOL_SEEN_CR;
  6591                     }
  6592                   else if (c == '\n')
  6593                     {
  6594                       coding->eol_seen |= EOL_SEEN_LF;
  6595                     }
  6596                 }
  6597 
  6598               if (! eight_bit_found)
  6599                 coding->head_ascii++;
  6600             }
  6601           else if (! eight_bit_found)
  6602             coding->head_ascii++;
  6603         }
  6604 
  6605       if (null_byte_found || eight_bit_found
  6606           || coding->head_ascii < coding->src_bytes
  6607           || detect_info.found)
  6608         {
  6609           enum coding_category category;
  6610           struct coding_system *this;
  6611 
  6612           if (coding->head_ascii == coding->src_bytes)
  6613             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
  6614             for (i = 0; i < coding_category_raw_text; i++)
  6615               {
  6616                 category = coding_priorities[i];
  6617                 this = coding_categories + category;
  6618                 if (detect_info.found & (1 << category))
  6619                   break;
  6620               }
  6621           else
  6622             {
  6623               if (null_byte_found)
  6624                 {
  6625                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
  6626                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
  6627                 }
  6628               else if (prefer_utf_8
  6629                        && detect_coding_utf_8 (coding, &detect_info))
  6630                 {
  6631                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
  6632                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
  6633                 }
  6634               for (i = 0; i < coding_category_raw_text; i++)
  6635                 {
  6636                   category = coding_priorities[i];
  6637                   this = coding_categories + category;
  6638                   /* Some of this->detector (e.g. detect_coding_sjis)
  6639                      require this information.  */
  6640                   coding->id = this->id;
  6641                   if (this->id < 0)
  6642                     {
  6643                       /* No coding system of this category is defined.  */
  6644                       detect_info.rejected |= (1 << category);
  6645                     }
  6646                   else if (category >= coding_category_raw_text)
  6647                     continue;
  6648                   else if (detect_info.checked & (1 << category))
  6649                     {
  6650                       if (detect_info.found & (1 << category))
  6651                         break;
  6652                     }
  6653                   else if ((*(this->detector)) (coding, &detect_info)
  6654                            && detect_info.found & (1 << category))
  6655                     break;
  6656                 }
  6657             }
  6658 
  6659           if (i < coding_category_raw_text)
  6660             {
  6661               if (category == coding_category_utf_8_auto)
  6662                 {
  6663                   Lisp_Object coding_systems;
  6664 
  6665                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
  6666                                          coding_attr_utf_bom);
  6667                   if (CONSP (coding_systems))
  6668                     {
  6669                       if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
  6670                         found = XCAR (coding_systems);
  6671                       else
  6672                         found = XCDR (coding_systems);
  6673                     }
  6674                   else
  6675                     found = CODING_ID_NAME (this->id);
  6676                 }
  6677               else if (category == coding_category_utf_16_auto)
  6678                 {
  6679                   Lisp_Object coding_systems;
  6680 
  6681                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
  6682                                          coding_attr_utf_bom);
  6683                   if (CONSP (coding_systems))
  6684                     {
  6685                       if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
  6686                         found = XCAR (coding_systems);
  6687                       else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
  6688                         found = XCDR (coding_systems);
  6689                     }
  6690                   else
  6691                     found = CODING_ID_NAME (this->id);
  6692                 }
  6693               else
  6694                 found = CODING_ID_NAME (this->id);
  6695             }
  6696           else if (null_byte_found)
  6697             found = Qno_conversion;
  6698           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
  6699                    == CATEGORY_MASK_ANY)
  6700             found = Qraw_text;
  6701           else if (detect_info.rejected)
  6702             for (i = 0; i < coding_category_raw_text; i++)
  6703               if (! (detect_info.rejected & (1 << coding_priorities[i])))
  6704                 {
  6705                   this = coding_categories + coding_priorities[i];
  6706                   found = CODING_ID_NAME (this->id);
  6707                   break;
  6708                 }
  6709         }
  6710     }
  6711   else if (XFIXNUM (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
  6712            == coding_category_utf_8_auto)
  6713     {
  6714       Lisp_Object coding_systems
  6715         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
  6716       if (check_ascii (coding) == coding->src_bytes)
  6717         {
  6718           if (CONSP (coding_systems))
  6719             found = XCDR (coding_systems);
  6720         }
  6721       else
  6722         {
  6723           struct coding_detection_info detect_info = {0};
  6724           if (CONSP (coding_systems)
  6725               && detect_coding_utf_8 (coding, &detect_info))
  6726             {
  6727               if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
  6728                 found = XCAR (coding_systems);
  6729               else
  6730                 found = XCDR (coding_systems);
  6731             }
  6732         }
  6733     }
  6734   else if (XFIXNUM (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
  6735            == coding_category_utf_16_auto)
  6736     {
  6737       Lisp_Object coding_systems
  6738         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
  6739       coding->head_ascii = 0;
  6740       if (CONSP (coding_systems))
  6741         {
  6742           struct coding_detection_info detect_info = {0};
  6743           if (detect_coding_utf_16 (coding, &detect_info))
  6744             {
  6745               if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
  6746                 found = XCAR (coding_systems);
  6747               else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
  6748                 found = XCDR (coding_systems);
  6749             }
  6750         }
  6751     }
  6752 
  6753   if (! NILP (found))
  6754     {
  6755       int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
  6756                            : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
  6757                            : EQ (eol_type, Qmac) ? EOL_SEEN_CR
  6758                            : EOL_SEEN_LF);
  6759 
  6760       setup_coding_system (found, coding);
  6761       if (specified_eol != EOL_SEEN_NONE)
  6762         adjust_coding_eol_type (coding, specified_eol);
  6763     }
  6764 
  6765   coding->mode = saved_mode;
  6766 }
  6767 
  6768 /* Convert the end-of-line markers in the text CODING has produced
          to LF.  Return at once if the eol-type of CODING is Qunix or
          inhibit_eol_conversion is set.  If the eol-type is still a
          vector, first scan the produced bytes for LF, CR and CRLF and
          settle the eol-type with adjust_coding_eol_type.  */
  6769 static void
  6770 decode_eol (struct coding_system *coding)
  6771 {
  6772   Lisp_Object eol_type;
  6773   unsigned char *p, *pbeg, *pend;
  6774 
  6775   eol_type = CODING_ID_EOL_TYPE (coding->id);
  6776   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
  6777     return;
  6778 
         /* The produced bytes start either at the plain destination area
            or, when there is a destination object, at the address of
            byte position dst_pos_byte.  */
  6779   if (NILP (coding->dst_object))
  6780     pbeg = coding->destination;
  6781   else
  6782     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
  6783   pend = pbeg + coding->produced;
  6784 
  6785   if (VECTORP (eol_type))
  6786     {
  6787       int eol_seen = EOL_SEEN_NONE;
  6788 
             /* Collect the set of EOL formats present.  A CR directly
                followed by LF is counted once, as CRLF.  */
  6789       for (p = pbeg; p < pend; p++)
  6790         {
  6791           if (*p == '\n')
  6792             eol_seen |= EOL_SEEN_LF;
  6793           else if (*p == '\r')
  6794             {
  6795               if (p + 1 < pend && *(p + 1) == '\n')
  6796                 {
  6797                   eol_seen |= EOL_SEEN_CRLF;
  6798                   p++;
  6799                 }
  6800               else
  6801                 eol_seen |= EOL_SEEN_CR;
  6802             }
  6803         }
  6804       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
  6805       if ((eol_seen & EOL_SEEN_CRLF) != 0
  6806           && (eol_seen & EOL_SEEN_CR) != 0
  6807           && (eol_seen & EOL_SEEN_LF) == 0)
  6808         eol_seen = EOL_SEEN_CRLF;
             /* Any other mixture of formats falls back to LF.  */
  6809       else if (eol_seen != EOL_SEEN_NONE
  6810           && eol_seen != EOL_SEEN_LF
  6811           && eol_seen != EOL_SEEN_CRLF
  6812           && eol_seen != EOL_SEEN_CR)
  6813         eol_seen = EOL_SEEN_LF;
  6814       if (eol_seen != EOL_SEEN_NONE)
  6815         eol_type = adjust_coding_eol_type (coding, eol_seen);
  6816     }
  6817 
  6818   if (EQ (eol_type, Qmac))
  6819     {
             /* CR -> LF is a one-for-one replacement, done in place.  */
  6820       for (p = pbeg; p < pend; p++)
  6821         if (*p == '\r')
  6822           *p = '\n';
  6823     }
  6824   else if (EQ (eol_type, Qdos))
  6825     {
             /* N counts the CRs deleted.  POS_END is the byte position of
                the last produced byte, so P[1] in the loop below never
                reads past the produced text.  */
  6826       ptrdiff_t n = 0;
  6827       ptrdiff_t pos = coding->dst_pos;
  6828       ptrdiff_t pos_byte = coding->dst_pos_byte;
  6829       ptrdiff_t pos_end = pos_byte + coding->produced - 1;
  6830 
  6831       /* This assertion is here instead of code, now deleted, that
  6832          handled the NILP case, which no longer happens with the
  6833          current codebase.  */
  6834       eassert (!NILP (coding->dst_object));
  6835 
  6836       while (pos_byte < pos_end)
  6837         {
  6838           int incr;
  6839 
  6840           p = BYTE_POS_ADDR (pos_byte);
  6841           if (coding->dst_multibyte)
  6842             incr = BYTES_BY_CHAR_HEAD (*p);
  6843           else
  6844             incr = 1;
  6845 
  6846           if (*p == '\r' && p[1] == '\n')
  6847             {
                     /* Delete the CR.  The LF then sits at POS, and the
                        increments below step over it; the text is one
                        byte shorter, hence the POS_END adjustment.  */
  6848               del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
  6849               n++;
  6850               pos_end--;
  6851             }
  6852           pos++;
  6853           pos_byte += incr;
  6854         }
  6855       coding->produced -= n;
  6856       coding->produced_char -= n;
  6857     }
  6858 }
  6859 
  6860 
  6861 /* MAX_LOOKUP's maximum value.  MAX_LOOKUP is an int and so cannot
  6862    exceed INT_MAX.  Also, MAX_LOOKUP is multiplied by sizeof (int) for
  6863    alloca, so it cannot exceed MAX_ALLOCA / sizeof (int).
          get_translation_table caps the value it stores through its
          MAX_LOOKUP argument at this limit.  */
  6864 enum { MAX_LOOKUP_MAX = min (INT_MAX, MAX_ALLOCA / sizeof (int)) };
  6865 
  6866 /* Return a translation table (or list of them) from coding system
  6867    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
  6868    not ENCODEP).  Return Qnil if Venable_character_translation is nil.
          If ATTRS specifies no table, return the standard table for the
          direction.  Otherwise a symbol is replaced by its
          translation-table property, a list is copied with its symbol
          elements replaced the same way, and the standard table, when it
          is a char-table, is added at the end of the resulting list.

          If MAX_LOOKUP is non-null, store there 0 when translation is
          disabled, else the largest fixnat found in extra slot 1 of the
          char-tables returned, at least 1 and at most MAX_LOOKUP_MAX.  */
  6869 
  6870 static Lisp_Object
  6871 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
  6872 {
  6873   Lisp_Object standard, translation_table;
  6874   Lisp_Object val;
  6875 
  6876   if (NILP (Venable_character_translation))
  6877     {
  6878       if (max_lookup)
  6879         *max_lookup = 0;
  6880       return Qnil;
  6881     }
  6882   if (encodep)
  6883     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
  6884       standard = Vstandard_translation_table_for_encode;
  6885   else
  6886     translation_table = CODING_ATTR_DECODE_TBL (attrs),
  6887       standard = Vstandard_translation_table_for_decode;
  6888   if (NILP (translation_table))
  6889     translation_table = standard;
  6890   else
  6891     {
  6892       if (SYMBOLP (translation_table))
  6893         translation_table = Fget (translation_table, Qtranslation_table);
  6894       else if (CONSP (translation_table))
  6895         {
                 /* Work on a copy so that resolving symbols does not
                    modify the list stored in ATTRS.  */
  6896           translation_table = Fcopy_sequence (translation_table);
  6897           for (val = translation_table; CONSP (val); val = XCDR (val))
  6898             if (SYMBOLP (XCAR (val)))
  6899               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
  6900         }
             /* Put the standard table last in the list.  */
  6901       if (CHAR_TABLE_P (standard))
  6902         {
  6903           if (CONSP (translation_table))
  6904             translation_table = nconc2 (translation_table, list1 (standard));
  6905           else
  6906             translation_table = list2 (translation_table, standard);
  6907         }
  6908     }
  6909 
  6910   if (max_lookup)
  6911     {
  6912       *max_lookup = 1;
  6913       if (CHAR_TABLE_P (translation_table)
  6914           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
  6915         {
  6916           val = XCHAR_TABLE (translation_table)->extras[1];
  6917           if (FIXNATP (val) && *max_lookup < XFIXNAT (val))
  6918             *max_lookup = min (XFIXNAT (val), MAX_LOOKUP_MAX);
  6919         }
  6920       else if (CONSP (translation_table))
  6921         {
  6922           Lisp_Object tail;
  6923 
                 /* Keep the largest value over all char-tables in the
                    list.  */
  6924           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
  6925             if (CHAR_TABLE_P (XCAR (tail))
  6926                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
  6927               {
  6928                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
  6929                 if (FIXNATP (tailval) && *max_lookup < XFIXNAT (tailval))
  6930                   *max_lookup = min (XFIXNAT (tailval), MAX_LOOKUP_MAX);
  6931               }
  6932         }
  6933     }
  6934   return translation_table;
  6935 }
  6936 /* Look up character C in TABLE, which is a char-table or a list
          whose char-table elements are consulted in order (other list
          elements are skipped).  When an entry is itself a character, C
          is replaced by it and TRANS is left nil; in a list the lookup
          then continues in the next table with the new C.  Any other
          non-nil entry is stored in TRANS and, in a list, stops the
          search.  If TABLE is neither, TRANS is nil and C is unchanged.
          C and TRANS must be lvalues, and the arguments are evaluated
          more than once.  */
  6937 #define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  6938   do {                                                          \
  6939     trans = Qnil;                                               \
  6940     if (CHAR_TABLE_P (table))                                   \
  6941       {                                                         \
  6942         trans = CHAR_TABLE_REF (table, c);                      \
  6943         if (CHARACTERP (trans))                                 \
  6944           c = XFIXNAT (trans), trans = Qnil;                    \
  6945       }                                                         \
  6946     else if (CONSP (table))                                     \
  6947       {                                                         \
  6948         Lisp_Object tail;                                       \
  6949                                                                 \
  6950         for (tail = table; CONSP (tail); tail = XCDR (tail))    \
  6951           if (CHAR_TABLE_P (XCAR (tail)))                       \
  6952             {                                                   \
  6953               trans = CHAR_TABLE_REF (XCAR (tail), c);          \
  6954               if (CHARACTERP (trans))                           \
  6955                 c = XFIXNAT (trans), trans = Qnil;              \
  6956               else if (! NILP (trans))                          \
  6957                 break;                                          \
  6958             }                                                   \
  6959       }                                                         \
  6960   } while (0)
  6961 
  6962 
  6963 /* Return a translation of character(s) at BUF according to TRANS.
  6964    TRANS is TO-CHAR, [TO-CHAR ...], or ((FROM .  TO) ...) where FROM =
  6965    [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].  The return value
  6966    is TO-CHAR or [TO-CHAR ...] if a translation is found, Qnil if not
  6967    found, or Qt if BUF is too short to lookup characters in FROM.  As
  6968    a side effect, if a translation is found, *NCHARS is set to the
  6969    number of characters being translated.  BUF_END points just past
          the last character available at BUF.  *NCHARS is 1 when TRANS is
          a fixnum or a vector, and the length of the matching FROM when
          TRANS is a list; it is left untouched when Qnil or Qt is
          returned.  */
  6970 
  6971 static Lisp_Object
  6972 get_translation (Lisp_Object trans, int *buf, int *buf_end, ptrdiff_t *nchars)
  6973 {
         /* A direct translation replaces exactly one character.  */
  6974   if (FIXNUMP (trans) || VECTORP (trans))
  6975     {
  6976       *nchars = 1;
  6977       return trans;
  6978     }
         /* Otherwise try each (FROM . TO) pair in order and use the first
            whose FROM sequence matches the characters at BUF.  */
  6979   for (; CONSP (trans); trans = XCDR (trans))
  6980     {
  6981       Lisp_Object val = XCAR (trans);
  6982       Lisp_Object from = XCAR (val);
  6983       ptrdiff_t len = ASIZE (from);
  6984       ptrdiff_t i;
  6985 
  6986       for (i = 0; i < len; i++)
  6987         {
                 /* FROM matched so far but the input ran out: the caller
                    has to supply more characters to decide.  */
  6988           if (buf + i == buf_end)
  6989             return Qt;
  6990           if (XFIXNUM (AREF (from, i)) != buf[i])
  6991             break;
  6992         }
  6993       if (i == len)
  6994         {
  6995           *nchars = len;
  6996           return XCDR (val);
  6997         }
  6998     }
  6999   return Qnil;
  7000 }
  7001 
  7002 /* Write decoded text to the destination of CODING, starting
          CODING->produced bytes into CODING->destination.  If
          CODING->chars_at_source is zero, the characters come from
          CODING->charbuf and are translated through TRANSLATION_TABLE;
          otherwise the bytes at CODING->source are copied, converting
          between unibyte and multibyte form if the two sides differ.
          When the destination is a buffer and characters were produced,
          they are registered with insert_from_gap.  CODING->produced and
          CODING->produced_char are advanced.

          Return the number of charbuf elements left unprocessed because
          a multi-character translation needs more input; this happens
          only if LAST_BLOCK is false.  */
  7003 static int
  7004 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
  7005                bool last_block)
  7006 {
  7007   unsigned char *dst = coding->destination + coding->produced;
  7008   unsigned char *dst_end = coding->destination + coding->dst_bytes;
  7009   ptrdiff_t produced;
  7010   ptrdiff_t produced_chars = 0;
  7011   int carryover = 0;
  7012 
  7013   if (! coding->chars_at_source)
  7014     {
  7015       /* Source characters are in coding->charbuf.  */
  7016       int *buf = coding->charbuf;
  7017       int *buf_end = buf + coding->charbuf_used;
  7018 
             /* Source and destination are the same object: the output may
                only extend up to the end of the source bytes already
                consumed.  */
  7019       if (EQ (coding->src_object, coding->dst_object)
  7020           && ! NILP (coding->dst_object))
  7021         {
  7022           eassert (growable_destination (coding));
  7023           coding_set_source (coding);
  7024           dst_end = ((unsigned char *) coding->source) + coding->consumed;
  7025         }
  7026 
  7027       while (buf < buf_end)
  7028         {
  7029           int c = *buf;
  7030           ptrdiff_t i;
  7031 
  7032           if (c >= 0)
  7033             {
                     /* FROM_NCHARS source characters are replaced by
                        TO_NCHARS output characters.  */
  7034               ptrdiff_t from_nchars = 1, to_nchars = 1;
  7035               Lisp_Object trans = Qnil;
  7036 
  7037               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
  7038               if (! NILP (trans))
  7039                 {
  7040                   trans = get_translation (trans, buf, buf_end, &from_nchars);
  7041                   if (FIXNUMP (trans))
  7042                     c = XFIXNUM (trans);
  7043                   else if (VECTORP (trans))
  7044                     {
  7045                       to_nchars = ASIZE (trans);
  7046                       c = XFIXNUM (AREF (trans, 0));
  7047                     }
                         /* Not enough characters to match a sequence yet:
                            stop and hand the rest back as carryover,
                            unless no more input will follow.  */
  7048                   else if (EQ (trans, Qt) && ! last_block)
  7049                     break;
  7050                 }
  7051 
                     /* Ensure room for TO_NCHARS characters of maximum
                        multibyte length.  */
  7052               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
  7053                 {
  7054                   eassert (growable_destination (coding));
  7055                   ptrdiff_t dst_size;
                         /* Ask for that room plus the number of charbuf
                            elements still pending, checking the
                            arithmetic for overflow.  */
  7056                   if (INT_MULTIPLY_WRAPV (to_nchars, MAX_MULTIBYTE_LENGTH,
  7057                                           &dst_size)
  7058                       || INT_ADD_WRAPV (buf_end - buf, dst_size, &dst_size))
  7059                     memory_full (SIZE_MAX);
  7060                   dst = alloc_destination (coding, dst_size, dst);
                         /* Recompute the output limit after the
                            reallocation.  */
  7061                   if (EQ (coding->src_object, coding->dst_object))
  7062                     {
  7063                       coding_set_source (coding);
  7064                       dst_end = (((unsigned char *) coding->source)
  7065                                  + coding->consumed);
  7066                     }
  7067                   else
  7068                     dst_end = coding->destination + coding->dst_bytes;
  7069                 }
  7070 
  7071               for (i = 0; i < to_nchars; i++)
  7072                 {
                         /* The first output character is already in C; the
                            others come from the vector TRANS.  */
  7073                   if (i > 0)
  7074                     c = XFIXNUM (AREF (trans, i));
  7075                   if (coding->dst_multibyte
  7076                       || ! CHAR_BYTE8_P (c))
  7077                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
  7078                   else
  7079                     *dst++ = CHAR_TO_BYTE8 (c);
  7080                 }
  7081               produced_chars += to_nchars;
  7082               buf += from_nchars;
  7083             }
  7084           else
  7085             /* This is an annotation datum.  (-C) is the length.  */
  7086             buf += -c;
  7087         }
  7088       carryover = buf_end - buf;
  7089     }
  7090   else
  7091     {
  7092       /* Source characters are at coding->source.  */
  7093       const unsigned char *src = coding->source;
  7094       const unsigned char *src_end = src + coding->consumed;
  7095 
  7096       if (EQ (coding->dst_object, coding->src_object))
  7097         {
  7098           eassert (growable_destination (coding));
  7099           dst_end = (unsigned char *) src;
  7100         }
  7101       if (coding->src_multibyte != coding->dst_multibyte)
  7102         {
  7103           if (coding->src_multibyte)
  7104             {
                     /* Multibyte source, unibyte destination.
                        NOTE(review): ONE_MORE_BYTE is defined elsewhere;
                        presumably it reads MULTIBYTEP and CONSUMED_CHARS
                        and jumps to no_more_source when the source is
                        exhausted -- confirm.  */
  7105               bool multibytep = 1;
  7106               ptrdiff_t consumed_chars = 0;
  7107 
  7108               while (1)
  7109                 {
  7110                   const unsigned char *src_base = src;
  7111                   int c;
  7112 
  7113                   ONE_MORE_BYTE (c);
  7114                   if (dst == dst_end)
  7115                     {
  7116                       eassert (growable_destination (coding));
                             /* When converting in place, the source bytes
                                read so far free up room first.  */
  7117                       if (EQ (coding->src_object, coding->dst_object))
  7118                         dst_end = (unsigned char *) src;
  7119                       if (dst == dst_end)
  7120                         {
                                 /* Remember SRC as an offset and rebuild
                                    the pointers after the reallocation.  */
  7121                           ptrdiff_t offset = src - coding->source;
  7122 
  7123                           dst = alloc_destination (coding, src_end - src + 1,
  7124                                                    dst);
  7125                           dst_end = coding->destination + coding->dst_bytes;
  7126                           coding_set_source (coding);
  7127                           src = coding->source + offset;
  7128                           src_end = coding->source + coding->consumed;
  7129                           if (EQ (coding->src_object, coding->dst_object))
  7130                             dst_end = (unsigned char *) src;
  7131                         }
  7132                     }
  7133                   *dst++ = c;
  7134                   produced_chars++;
  7135                 }
  7136             no_more_source:
  7137               ;
  7138             }
  7139           else
                   /* Unibyte source, multibyte destination.
                      NOTE(review): EMIT_ONE_BYTE is defined elsewhere;
                      presumably it stores C at DST, may use two bytes
                      (hence the dst_end - 1 checks) and counts the
                      character in PRODUCED_CHARS -- confirm.  */
  7140             while (src < src_end)
  7141               {
  7142                 bool multibytep = 1;
  7143                 int c = *src++;
  7144 
  7145                 if (dst >= dst_end - 1)
  7146                   {
  7147                     eassert (growable_destination (coding));
  7148                     if (EQ (coding->src_object, coding->dst_object))
  7149                       dst_end = (unsigned char *) src;
  7150                     if (dst >= dst_end - 1)
  7151                       {
  7152                         ptrdiff_t offset = src - coding->source;
  7153                         ptrdiff_t more_bytes;
  7154 
  7155                         if (EQ (coding->src_object, coding->dst_object))
  7156                           more_bytes = ((src_end - src) / 2) + 2;
  7157                         else
  7158                           more_bytes = src_end - src + 2;
  7159                         dst = alloc_destination (coding, more_bytes, dst);
  7160                         dst_end = coding->destination + coding->dst_bytes;
  7161                         coding_set_source (coding);
  7162                         src = coding->source + offset;
  7163                         src_end = coding->source + coding->consumed;
  7164                         if (EQ (coding->src_object, coding->dst_object))
  7165                           dst_end = (unsigned char *) src;
  7166                       }
  7167                   }
  7168                 EMIT_ONE_BYTE (c);
  7169               }
  7170         }
  7171       else
  7172         {
                 /* Same representation on both sides: copy the bytes
                    unchanged.  For distinct objects, first grow the
                    destination if it is smaller than the source.  */
  7173           if (!EQ (coding->src_object, coding->dst_object))
  7174             {
  7175               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
  7176 
  7177               if (require > 0)
  7178                 {
  7179                   ptrdiff_t offset = src - coding->source;
  7180 
  7181                   dst = alloc_destination (coding, require, dst);
  7182                   coding_set_source (coding);
  7183                   src = coding->source + offset;
  7184                   src_end = coding->source + coding->consumed;
  7185                 }
  7186             }
  7187           produced_chars = coding->consumed_char;
  7188           while (src < src_end)
  7189             *dst++ = *src++;
  7190         }
  7191     }
  7192 
         /* Bytes written by this call.  */
  7193   produced = dst - (coding->destination + coding->produced);
  7194   if (BUFFERP (coding->dst_object) && produced_chars > 0)
  7195     insert_from_gap (produced_chars, produced, 0);
  7196   coding->produced += produced;
  7197   coding->produced_char += produced_chars;
  7198   return carryover;
  7199 }
  7200 
  7201 /* Compose text in CODING->object according to the annotation data at
  7202    CHARBUF.  CHARBUF is an array:
  7203      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
          The composition is put on the NCHARS characters starting at POS
          in CODING->dst_object.  For COMPOSITION_RELATIVE no components
          are passed to compose_text; otherwise they are passed as a
          string when all of them are characters, or as a vector when
          rules are present.
  7204  */
  7205 
  7206 static void
  7207 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
  7208 {
  7209   int len;
  7210   ptrdiff_t to;
  7211   enum composition_method method;
  7212   Lisp_Object components;
  7213 
         /* LEN is the number of elements of the datum that lie beyond its
            first MAX_ANNOTATION_LENGTH elements.  */
  7214   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
  7215   to = pos + charbuf[2];
  7216   method = (enum composition_method) (charbuf[4]);
  7217 
  7218   if (method == COMPOSITION_RELATIVE)
  7219     components = Qnil;
  7220   else
  7221     {
  7222       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
  7223       int i, j;
  7224 
             /* With rules, each character after the first is preceded by
                a marker and a rule: three elements per character, minus
                the two the first character lacks.  */
  7225       if (method == COMPOSITION_WITH_RULE)
  7226         len = charbuf[2] * 3 - 2;
  7227       charbuf += MAX_ANNOTATION_LENGTH;
  7228       /* charbuf = [ CHAR ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
             /* I indexes CHARBUF and J indexes ARGS.  The scan stops
                after LEN elements or at an element equal to -1.  */
  7229       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
  7230         {
  7231           if (charbuf[i] >= 0)
  7232             args[j] = make_fixnum (charbuf[i]);
  7233           else
  7234             {
                     /* Skip the negative marker; the element after it is
                        the rule, reduced modulo 0x100.  */
  7235               i++;
  7236               args[j] = make_fixnum (charbuf[i] % 0x100);
  7237             }
  7238         }
             /* I == J means no marker was skipped, so every component is
                a character.  */
  7239       components = (i == j ? Fstring (j, args) : Fvector (j, args));
  7240     }
  7241   compose_text (pos, to, components, Qnil, coding->dst_object);
  7242 }
  7243 
  7244 
  7245 /* Put `charset' property on text in CODING->object according to
  7246    the annotation data at CHARBUF.  CHARBUF is an array:
  7247      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
          The property value is the name of the charset whose ID is
          CHARSET-ID.  It is put on the NCHARS characters that end at POS
          in CODING->dst_object.
  7248  */
  7249 
  7250 static void
  7251 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
  7252 {
         /* The annotated text precedes POS.  */
  7253   ptrdiff_t from = pos - charbuf[2];
  7254   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
  7255 
  7256   Fput_text_property (make_fixnum (from), make_fixnum (pos),
  7257                       Qcharset, CHARSET_NAME (charset),
  7258                       coding->dst_object);
  7259 }
  7260 
  7261 #define MAX_CHARBUF_SIZE 0x4000 /* Cap, in ints, on the charbuf work area.  */
  7262 /* How many units decoding functions expect in coding->charbuf at
  7263    most.  Currently, decode_coding_emacs_mule expects the following
  7264    size, and that is the largest value.  */
  7265 #define MAX_CHARBUF_EXTRA_SIZE ((MAX_ANNOTATION_LENGTH * 3) + 1)
  7266 /* Give CODING a charbuf of SIZE + MAX_CHARBUF_EXTRA_SIZE ints, but no
          more than MAX_CHARBUF_SIZE, and record that count in
          CODING->charbuf_size.  The memory comes from SAFE_ALLOCA, so
          the calling function needs USE_SAFE_ALLOCA and SAFE_FREE, as
          decode_coding has.  CODING must be a plain identifier: it is
          not parenthesized in the expansion.  */
  7267 #define ALLOC_CONVERSION_WORK_AREA(coding, size)                \
  7268   do {                                                          \
  7269     ptrdiff_t units = min ((size) + MAX_CHARBUF_EXTRA_SIZE,     \
  7270                            MAX_CHARBUF_SIZE);                   \
  7271     coding->charbuf = SAFE_ALLOCA (units * sizeof (int));       \
  7272     coding->charbuf_size = units;                               \
  7273   } while (0)
  7274 /* Apply the annotation data found in CODING->charbuf to
          CODING->dst_object.  POS is the position of the first character
          of charbuf in that object.  Do nothing if CODING->dst_object is
          nil.  */
  7275 static void
  7276 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
  7277 {
  7278   int *charbuf = coding->charbuf;
  7279   int *charbuf_end = charbuf + coding->charbuf_used;
  7280 
  7281   if (NILP (coding->dst_object))
  7282     return;
  7283 
  7284   while (charbuf < charbuf_end)
  7285     {
             /* A non-negative element is a character and advances POS.  */
  7286       if (*charbuf >= 0)
  7287         pos++, charbuf++;
  7288       else
  7289         {
                 /* A negative element starts an annotation datum that is
                    LEN elements long; the next element is its type.  */
  7290           int len = -*charbuf;
  7291 
  7292           if (len > 2)
  7293             switch (charbuf[1])
  7294               {
  7295               case CODING_ANNOTATE_COMPOSITION_MASK:
  7296                 produce_composition (coding, charbuf, pos);
  7297                 break;
  7298               case CODING_ANNOTATE_CHARSET_MASK:
  7299                 produce_charset (coding, charbuf, pos);
  7300                 break;
  7301               default:
  7302                 break;
  7303               }
                 /* Skip the whole datum; POS does not move.  */
  7304           charbuf += len;
  7305         }
  7306     }
  7307 }
  7308 
  7309 /* Decode the data at CODING->src_object into CODING->dst_object.
  7310    CODING->src_object is a buffer, a string, or nil.
  7311    CODING->dst_object is a buffer.
  7312 
  7313    If CODING->src_object is a buffer, it must be the current buffer.
  7314    In this case, if CODING->src_pos is positive, it is a position of
  7315    the source text in the buffer, otherwise, the source text is in the
  7316    gap area of the buffer, and CODING->src_pos specifies the offset of
  7317    the text from the end of the gap (and GPT must be equal to PT).
  7318 
  7319    When the text is taken from the gap, it can't be at the beginning
  7320    of the gap because the new decoded text is progressively accumulated
  7321    at the beginning of the gap before it gets inserted at PT (this way,
  7322    as the output grows, the input shrinks, so we only need to allocate
  7323    enough space for `max(IN, OUT)` instead of `IN + OUT`).
  7324 
  7325    If CODING->src_object is a string, CODING->src_pos is an index to
  7326    that string.
  7327 
  7328    If CODING->src_object is nil, CODING->source must already point to
  7329    the non-relocatable memory area.  In this case, CODING->src_pos is
  7330    an offset from CODING->source.
  7331 
  7332    The decoded data is inserted at the current point of the buffer
  7333    CODING->dst_object.
  7334 */
  7335 
  7336 static void
  7337 decode_coding (struct coding_system *coding)
  7338 {
  7339   Lisp_Object attrs;
  7340   Lisp_Object undo_list;
  7341   Lisp_Object translation_table;
  7342   struct ccl_spec cclspec;
  7343   int carryover;
  7344   int i;
  7345 
  7346   USE_SAFE_ALLOCA;
  7347 
         /* If the source text in the buffer straddles the gap, move the
            gap to the start of the source.  */
  7348   if (BUFFERP (coding->src_object)
  7349       && coding->src_pos > 0
  7350       && coding->src_pos < GPT
  7351       && coding->src_pos + coding->src_chars > GPT)
  7352     move_gap_both (coding->src_pos, coding->src_pos_byte);
  7353 
  7354   undo_list = Qt;
  7355   if (BUFFERP (coding->dst_object))
  7356     {
  7357       set_buffer_internal (XBUFFER (coding->dst_object));
  7358       if (GPT != PT)
  7359         move_gap_both (PT, PT_BYTE);
  7360 
  7361       /* We must disable undo_list in order to record the whole insert
  7362          transaction via record_insert at the end.  But doing so also
  7363          disables the recording of the first change to the undo_list.
  7364          Therefore we check for first change here and record it via
  7365          record_first_change if needed.  */
  7366       if (MODIFF <= SAVE_MODIFF)
  7367         record_first_change ();
  7368 
  7369       undo_list = BVAR (current_buffer, undo_list);
  7370       bset_undo_list (current_buffer, Qt);
  7371     }
  7372 
  7373   coding->consumed = coding->consumed_char = 0;
  7374   coding->produced = coding->produced_char = 0;
  7375   coding->chars_at_source = 0;
  7376   record_conversion_result (coding, CODING_RESULT_SUCCESS);
  7377 
  7378   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_bytes);
  7379 
  7380   attrs = CODING_ID_ATTRS (coding->id);
  7381   translation_table = get_translation_table (attrs, 0, NULL);
  7382 
  7383   carryover = 0;
  7384   if (coding->decoder == decode_coding_ccl)
  7385     {
  7386       coding->spec.ccl = &cclspec;
  7387       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
  7388     }
         /* Each round calls the decoder and then lets produce_chars write
            the result.  Characters produce_chars could not translate yet
            are moved to the front of charbuf for the next round.  */
  7389   do
  7390     {
             /* Position at which the text produced in this round starts;
                the annotations are applied relative to it.  */
  7391       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
  7392 
  7393       coding_set_source (coding);
  7394       coding->annotated = 0;
  7395       coding->charbuf_used = carryover;
  7396       (*(coding->decoder)) (coding);
  7397       coding_set_destination (coding);
  7398       carryover = produce_chars (coding, translation_table, 0);
  7399       if (coding->annotated)
  7400         produce_annotation (coding, pos);
  7401       for (i = 0; i < carryover; i++)
  7402         coding->charbuf[i]
  7403           = coding->charbuf[coding->charbuf_used - carryover + i];
  7404     }
  7405   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
  7406          || (coding->consumed < coding->src_bytes
  7407              && (coding->result == CODING_RESULT_SUCCESS
  7408                  || coding->result == CODING_RESULT_INVALID_SRC)));
  7409 
         /* Flush the characters still carried over; with LAST_BLOCK set,
            produce_chars no longer waits for more input.  */
  7410   if (carryover > 0)
  7411     {
  7412       coding_set_destination (coding);
  7413       coding->charbuf_used = carryover;
  7414       produce_chars (coding, translation_table, 1);
  7415     }
  7416 
         /* Deal with source bytes the decoder did not consume.  */
  7417   coding->carryover_bytes = 0;
  7418   if (coding->consumed < coding->src_bytes)
  7419     {
  7420       ptrdiff_t nbytes = coding->src_bytes - coding->consumed;
  7421       const unsigned char *src;
  7422 
  7423       coding_set_source (coding);
  7424       coding_set_destination (coding);
  7425       src = coding->source + coding->consumed;
  7426 
  7427       if (coding->mode & CODING_MODE_LAST_BLOCK)
  7428         {
  7429           /* Flush out unprocessed data as binary chars.  We are sure
  7430              that the number of data is less than the size of
  7431              coding->charbuf.  */
  7432           coding->charbuf_used = 0;
  7433           coding->chars_at_source = 0;
  7434 
  7435           while (nbytes-- > 0)
  7436             {
  7437               int c;
  7438 
  7439               /* Copy raw bytes in their 2-byte forms from multibyte
  7440                  text as single characters.  */
  7441               if (coding->src_multibyte
  7442                   && CHAR_BYTE8_HEAD_P (*src) && nbytes > 0)
  7443                 {
  7444                   c = string_char_advance (&src);
                         /* Account for the second byte just read.  */
  7445                   nbytes--;
  7446                 }
  7447               else
  7448                 {
  7449                   c = *src++;
  7450 
  7451                   if (c & 0x80)
  7452                     c = BYTE8_TO_CHAR (c);
  7453                 }
  7454               coding->charbuf[coding->charbuf_used++] = c;
  7455             }
                 /* These are raw bytes, so no translation table applies.  */
  7456           produce_chars (coding, Qnil, 1);
  7457         }
  7458       else
  7459         {
  7460           /* Record unprocessed bytes in coding->carryover.  We are
  7461              sure that the number of data is less than the size of
  7462              coding->carryover.  */
  7463           unsigned char *p = coding->carryover;
  7464 
  7465           if (nbytes > sizeof coding->carryover)
  7466             nbytes = sizeof coding->carryover;
  7467           coding->carryover_bytes = nbytes;
  7468           while (nbytes-- > 0)
  7469             *p++ = *src++;
  7470         }
  7471       coding->consumed = coding->src_bytes;
  7472     }
  7473 
  7474   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
  7475       && !inhibit_eol_conversion)
  7476     decode_eol (coding);
  7477   if (BUFFERP (coding->dst_object))
  7478     {
             /* Restore the undo list saved above and record the whole
                insertion as a single entry.  */
  7479       bset_undo_list (current_buffer, undo_list);
  7480       record_insert (coding->dst_pos, coding->produced_char);
  7481     }
  7482 
  7483   SAFE_FREE ();
  7484 }
  7485 
  7486 
  7487 /* Extract an annotation datum from a composition starting at POS and
  7488    ending before LIMIT of CODING->src_object (buffer or string), store
  7489    the data in BUF, set *STOP to a starting position of the next
  7490    composition (if any) or to LIMIT, and return the address of the
  7491    next element of BUF.
  7492 
  7493    If such an annotation is not found, set *STOP to a starting
  7494    position of a composition after POS (if any) or to LIMIT, and
  7495    return BUF.  */
  7496 
  7497 static int *
  7498 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
  7499                                struct coding_system *coding, int *buf,
  7500                                ptrdiff_t *stop)
  7501 {
  7502   ptrdiff_t start, end;
  7503   Lisp_Object prop;
  7504 
  7505   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
  7506       || end > limit)
  7507     *stop = limit;
  7508   else if (start > pos)
  7509     *stop = start;
  7510   else
  7511     {
  7512       if (start == pos)
  7513         {
  7514           /* We found a composition.  Store the corresponding
  7515              annotation data in BUF.  */
  7516           int *head = buf;
  7517           enum composition_method method = composition_method (prop);
  7518           int nchars = COMPOSITION_LENGTH (prop);
  7519 
  7520           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
  7521           if (method != COMPOSITION_RELATIVE)
  7522             {
  7523               Lisp_Object components;
  7524               ptrdiff_t i, len, i_byte;
  7525 
  7526               components = COMPOSITION_COMPONENTS (prop);
  7527               if (VECTORP (components))
  7528                 {
  7529                   len = ASIZE (components);
  7530                   for (i = 0; i < len; i++)
  7531                     *buf++ = XFIXNUM (AREF (components, i));
  7532                 }
  7533               else if (STRINGP (components))
  7534                 {
  7535                   len = SCHARS (components);
  7536                   i = i_byte = 0;
  7537                   while (i < len)
  7538                     *buf++ = fetch_string_char_advance (components,
  7539                                                         &i, &i_byte);
  7540                 }
  7541               else if (FIXNUMP (components))
  7542                 {
  7543                   len = 1;
  7544                   *buf++ = XFIXNUM (components);
  7545                 }
  7546               else if (CONSP (components))
  7547                 {
  7548                   for (len = 0; CONSP (components);
  7549                        len++, components = XCDR (components))
  7550                     *buf++ = XFIXNUM (XCAR (components));
  7551                 }
  7552               else
  7553                 emacs_abort ();
  7554               *head -= len;
  7555             }
  7556         }
  7557 
  7558       if (find_composition (end, limit, &start, &end, &prop,
  7559                             coding->src_object)
  7560           && end <= limit)
  7561         *stop = start;
  7562       else
  7563         *stop = limit;
  7564     }
  7565   return buf;
  7566 }
  7567 
  7568 
  7569 /* Extract an annotation datum from a text property `charset' at POS of
  7570    CODING->src_object (buffer or string), store the data in BUF, set
  7571    *STOP to the position where the value of `charset' property changes
  7572    (limiting by LIMIT), and return the address of the next element of
  7573    BUF.
  7574 
  7575    If the property value is nil, set *STOP to the position where the
  7576    property value is non-nil (limiting by LIMIT), and return BUF.  */
  7577 
  7578 static int *
  7579 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
  7580                            struct coding_system *coding, int *buf,
  7581                            ptrdiff_t *stop)
  7582 {
  7583   Lisp_Object val, next;
  7584   int id;
  7585 
  7586   val = Fget_text_property (make_fixnum (pos), Qcharset, coding->src_object);
  7587   if (! NILP (val) && CHARSETP (val))
  7588     id = XFIXNUM (CHARSET_SYMBOL_ID (val));
  7589   else
  7590     id = -1;
  7591   ADD_CHARSET_DATA (buf, 0, id);
  7592   next = Fnext_single_property_change (make_fixnum (pos), Qcharset,
  7593                                        coding->src_object,
  7594                                        make_fixnum (limit));
  7595   *stop = XFIXNUM (next);
  7596   return buf;
  7597 }
  7598 
  7599 
  7600 static void
  7601 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
  7602                int max_lookup)
  7603 {
  7604   int *buf = coding->charbuf;
  7605   int *buf_end = coding->charbuf + coding->charbuf_size;
  7606   const unsigned char *src = coding->source + coding->consumed;
  7607   const unsigned char *src_end = coding->source + coding->src_bytes;
  7608   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
  7609   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
  7610   bool multibytep = coding->src_multibyte;
  7611   Lisp_Object eol_type;
  7612   int c;
  7613   ptrdiff_t stop, stop_composition, stop_charset;
  7614   int *lookup_buf = NULL;
  7615 
  7616   if (! NILP (translation_table))
  7617     lookup_buf = alloca (sizeof (int) * max_lookup);
  7618 
  7619   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
  7620   if (VECTORP (eol_type))
  7621     eol_type = Qunix;
  7622 
  7623   /* Note: composition handling is not yet implemented.  */
  7624   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
  7625 
  7626   if (NILP (coding->src_object))
  7627     stop = stop_composition = stop_charset = end_pos;
  7628   else
  7629     {
  7630       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
  7631         stop = stop_composition = pos;
  7632       else
  7633         stop = stop_composition = end_pos;
  7634       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
  7635         stop = stop_charset = pos;
  7636       else
  7637         stop_charset = end_pos;
  7638     }
  7639 
  7640   /* Compensate for CRLF and conversion.  */
  7641   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
  7642   while (buf < buf_end)
  7643     {
  7644       Lisp_Object trans;
  7645 
  7646       if (pos == stop)
  7647         {
  7648           if (pos == end_pos)
  7649             break;
  7650           if (pos == stop_composition)
  7651             buf = handle_composition_annotation (pos, end_pos, coding,
  7652                                                  buf, &stop_composition);
  7653           if (pos == stop_charset)
  7654             buf = handle_charset_annotation (pos, end_pos, coding,
  7655                                              buf, &stop_charset);
  7656           stop = (stop_composition < stop_charset
  7657                   ? stop_composition : stop_charset);
  7658         }
  7659 
  7660       if (! multibytep)
  7661         {
  7662           if (coding->encoder == encode_coding_raw_text
  7663               || coding->encoder == encode_coding_ccl)
  7664             c = *src++, pos++;
  7665           else
  7666             {
  7667               int bytes = multibyte_length (src, src_end, true, true);
  7668               if (0 < bytes)
  7669                 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
  7670               else
  7671                 c = BYTE8_TO_CHAR (*src), src++, pos++;
  7672             }
  7673         }
  7674       else
  7675         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
  7676       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
  7677         c = '\n';
  7678       if (! EQ (eol_type, Qunix))
  7679         {
  7680           if (c == '\n')
  7681             {
  7682               if (EQ (eol_type, Qdos))
  7683                 *buf++ = '\r';
  7684               else
  7685                 c = '\r';
  7686             }
  7687         }
  7688 
  7689       trans = Qnil;
  7690       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
  7691       if (NILP (trans))
  7692         *buf++ = c;
  7693       else
  7694         {
  7695           ptrdiff_t from_nchars = 1, to_nchars = 1;
  7696           int *lookup_buf_end;
  7697           const unsigned char *p = src;
  7698           int i;
  7699 
  7700           lookup_buf[0] = c;
  7701           for (i = 1; i < max_lookup && p < src_end; i++)
  7702             lookup_buf[i] = string_char_advance (&p);
  7703           lookup_buf_end = lookup_buf + i;
  7704           trans = get_translation (trans, lookup_buf, lookup_buf_end,
  7705                                    &from_nchars);
  7706           if (FIXNUMP (trans))
  7707             c = XFIXNUM (trans);
  7708           else if (VECTORP (trans))
  7709             {
  7710               to_nchars = ASIZE (trans);
  7711               if (buf_end - buf < to_nchars)
  7712                 break;
  7713               c = XFIXNUM (AREF (trans, 0));
  7714             }
  7715           else
  7716             break;
  7717           *buf++ = c;
  7718           for (i = 1; i < to_nchars; i++)
  7719             *buf++ = XFIXNUM (AREF (trans, i));
  7720           for (i = 1; i < from_nchars; i++, pos++)
  7721             src += multibyte_length (src, NULL, false, true);
  7722         }
  7723     }
  7724 
  7725   coding->consumed = src - coding->source;
  7726   coding->consumed_char = pos - coding->src_pos;
  7727   coding->charbuf_used = buf - coding->charbuf;
  7728   coding->chars_at_source = 0;
  7729 }
  7730 
  7731 
  7732 /* Encode the text at CODING->src_object into CODING->dst_object.
  7733    CODING->src_object is a buffer or a string.
  7734    CODING->dst_object is a buffer or nil.
  7735 
  7736    If CODING->src_object is a buffer, it must be the current buffer.
  7737    In this case, if CODING->src_pos is positive, it is a position of
  7738    the source text in the buffer, otherwise, the source text is in the
  7739    gap area of the buffer, and coding->src_pos specifies the offset of
  7740    the text from GPT (which must be the same as PT).  If this is the
  7741    same buffer as CODING->dst_object, CODING->src_pos must be
  7742    negative and CODING should not have `pre-write-conversion'.
  7743 
  7744    If CODING->src_object is a string, CODING should not have
  7745    `pre-write-conversion'.
  7746 
  7747    If CODING->dst_object is a buffer, the encoded data is inserted at
  7748    the current point of that buffer.
  7749 
  7750    If CODING->dst_object is nil, the encoded data is placed at the
  7751    memory area specified by CODING->destination.  */
  7752 
  7753 static void
  7754 encode_coding (struct coding_system *coding)
  7755 {
  7756   Lisp_Object attrs;
  7757   Lisp_Object translation_table;
  7758   int max_lookup;
  7759   struct ccl_spec cclspec;
  7760 
  7761   USE_SAFE_ALLOCA;
  7762 
  7763   attrs = CODING_ID_ATTRS (coding->id);
  7764   if (coding->encoder == encode_coding_raw_text)
  7765     translation_table = Qnil, max_lookup = 0;
  7766   else
  7767     translation_table = get_translation_table (attrs, 1, &max_lookup);
  7768 
  7769   if (BUFFERP (coding->dst_object))
  7770     {
  7771       set_buffer_internal (XBUFFER (coding->dst_object));
  7772       coding->dst_multibyte
  7773         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
  7774     }
  7775 
  7776   coding->consumed = coding->consumed_char = 0;
  7777   coding->produced = coding->produced_char = 0;
  7778   record_conversion_result (coding, CODING_RESULT_SUCCESS);
  7779 
  7780   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_chars);
  7781 
  7782   if (coding->encoder == encode_coding_ccl)
  7783     {
  7784       coding->spec.ccl = &cclspec;
  7785       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
  7786     }
  7787   do {
  7788     coding_set_source (coding);
  7789     consume_chars (coding, translation_table, max_lookup);
  7790     coding_set_destination (coding);
  7791     /* The CODING_MODE_LAST_BLOCK flag should be set only for the last
  7792        iteration of the encoding.  */
  7793     unsigned saved_mode = coding->mode;
  7794     if (coding->consumed_char < coding->src_chars)
  7795       coding->mode &= ~CODING_MODE_LAST_BLOCK;
  7796     (*(coding->encoder)) (coding);
  7797     coding->mode = saved_mode;
  7798   } while (coding->consumed_char < coding->src_chars);
  7799 
  7800   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
  7801     insert_from_gap (coding->produced_char, coding->produced, 0);
  7802 
  7803   SAFE_FREE ();
  7804 }
  7805 
  7806 /* Code-conversion operations use internal buffers.  There's a single
  7807    reusable buffer, which is created the first time it is needed, and
  7808    then never killed.  When this reusable buffer is being used, the
  7809    reused_workbuf_in_use flag is set.  If we need another conversion
  7810    buffer while the reusable one is in use (e.g., if code-conversion
  7811    is reentered when another code-conversion is in progress), we
  7812    create temporary buffers using the name of the reusable buffer as
  7813    the base name, see code_conversion_save below.  These temporary
  7814    buffers are killed when the code-conversion operations that use
  7815    them return, see code_conversion_restore below.  */
  7816 
  7817 /* A string that serves as name of the reusable work buffer, and as base
  7818    name of temporary work buffers used for code-conversion operations.  */
  7819 static Lisp_Object Vcode_conversion_workbuf_name;
  7820 
  7821 /* The reusable working buffer, created once and never killed.  */
  7822 static Lisp_Object Vcode_conversion_reused_workbuf;
  7823 
  7824 /* True iff Vcode_conversion_reused_workbuf is already in use.  */
  7825 static bool reused_workbuf_in_use;
  7826 
  7827 static void
  7828 code_conversion_restore (Lisp_Object arg)
  7829 {
  7830   Lisp_Object current, workbuf;
  7831 
  7832   current = XCAR (arg);
  7833   workbuf = XCDR (arg);
  7834   if (! NILP (workbuf))
  7835     {
  7836       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
  7837         reused_workbuf_in_use = false;
  7838       else
  7839         Fkill_buffer (workbuf);
  7840     }
  7841   set_buffer_internal (XBUFFER (current));
  7842 }
  7843 
  7844 Lisp_Object
  7845 code_conversion_save (bool with_work_buf, bool multibyte)
  7846 {
  7847   Lisp_Object workbuf = Qnil;
  7848 
  7849   if (with_work_buf)
  7850     {
  7851       if (reused_workbuf_in_use)
  7852         {
  7853           Lisp_Object name
  7854             = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
  7855           workbuf = Fget_buffer_create (name, Qt);
  7856         }
  7857       else
  7858         {
  7859           if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
  7860             Vcode_conversion_reused_workbuf
  7861               = Fget_buffer_create (Vcode_conversion_workbuf_name, Qt);
  7862           workbuf = Vcode_conversion_reused_workbuf;
  7863         }
  7864     }
  7865   record_unwind_protect (code_conversion_restore,
  7866                          Fcons (Fcurrent_buffer (), workbuf));
  7867   if (!NILP (workbuf))
  7868     {
  7869       struct buffer *current = current_buffer;
  7870       set_buffer_internal (XBUFFER (workbuf));
  7871       /* We can't allow modification hooks to run in the work buffer.  For
  7872          instance, directory_files_internal assumes that file decoding
  7873          doesn't compile new regexps.  */
  7874       Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
  7875       Ferase_buffer ();
  7876       bset_undo_list (current_buffer, Qt);
  7877       bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
  7878       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
  7879         reused_workbuf_in_use = true;
  7880       set_buffer_internal (current);
  7881     }
  7882 
  7883   return workbuf;
  7884 }
  7885 
  7886 static void
  7887 coding_restore_undo_list (Lisp_Object arg)
  7888 {
  7889   Lisp_Object undo_list = XCAR (arg);
  7890   struct buffer *buf = XBUFFER (XCDR (arg));
  7891 
  7892   bset_undo_list (buf, undo_list);
  7893 }
  7894 
  7895 /* Decode the *last* BYTES of the gap and insert them at point.  */
  7896 void
  7897 decode_coding_gap (struct coding_system *coding, ptrdiff_t bytes)
  7898 {
  7899   specpdl_ref count = SPECPDL_INDEX ();
  7900   Lisp_Object attrs;
  7901 
  7902   eassert (GPT_BYTE == PT_BYTE);
  7903 
  7904   coding->src_object = Fcurrent_buffer ();
  7905   coding->src_chars = bytes;
  7906   coding->src_bytes = bytes;
  7907   coding->src_pos = -bytes;
  7908   coding->src_pos_byte = -bytes;
  7909   coding->src_multibyte = false;
  7910   coding->dst_object = coding->src_object;
  7911   coding->dst_pos = PT;
  7912   coding->dst_pos_byte = PT_BYTE;
  7913   eassert (coding->dst_multibyte
  7914            == !NILP (BVAR (current_buffer, enable_multibyte_characters)));
  7915 
  7916   coding->head_ascii = -1;
  7917   coding->detected_utf8_bytes = coding->detected_utf8_chars = -1;
  7918   coding->eol_seen = EOL_SEEN_NONE;
  7919   if (CODING_REQUIRE_DETECTION (coding))
  7920     detect_coding (coding);
  7921   attrs = CODING_ID_ATTRS (coding->id);
  7922   if (! disable_ascii_optimization
  7923       && ! coding->src_multibyte
  7924       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
  7925       && NILP (CODING_ATTR_POST_READ (attrs))
  7926       && NILP (get_translation_table (attrs, 0, NULL)))
  7927     {
  7928       ptrdiff_t chars = coding->head_ascii;
  7929       if (chars < 0)
  7930         chars = check_ascii (coding);
  7931       if (chars != bytes)
  7932         {
  7933           /* There exists a non-ASCII byte.  */
  7934           if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8)
  7935               && coding->detected_utf8_bytes == coding->src_bytes)
  7936             {
  7937               if (coding->detected_utf8_chars >= 0)
  7938                 chars = coding->detected_utf8_chars;
  7939               else
  7940                 chars = check_utf_8 (coding);
  7941               if (CODING_UTF_8_BOM (coding) != utf_without_bom
  7942                   && coding->head_ascii == 0
  7943                   && coding->source[0] == UTF_8_BOM_1
  7944                   && coding->source[1] == UTF_8_BOM_2
  7945                   && coding->source[2] == UTF_8_BOM_3)
  7946                 {
  7947                   chars--;
  7948                   bytes -= 3;
  7949                   coding->src_bytes -= 3;
  7950                 }
  7951             }
  7952           else
  7953             chars = -1;
  7954         }
  7955       if (chars >= 0)
  7956         {
  7957           Lisp_Object eol_type;
  7958 
  7959           eol_type = CODING_ID_EOL_TYPE (coding->id);
  7960           if (VECTORP (eol_type))
  7961             {
  7962               if (coding->eol_seen != EOL_SEEN_NONE)
  7963                 eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
  7964             }
  7965           if (EQ (eol_type, Qmac))
  7966             {
  7967               unsigned char *src_end = GAP_END_ADDR;
  7968               unsigned char *src = src_end - coding->src_bytes;
  7969 
  7970               while (src < src_end)
  7971                 {
  7972                   if (*src++ == '\r')
  7973                     src[-1] = '\n';
  7974                 }
  7975             }
  7976           else if (EQ (eol_type, Qdos))
  7977             {
  7978               unsigned char *src = GAP_END_ADDR;
  7979               unsigned char *src_beg = src - coding->src_bytes;
  7980               unsigned char *dst = src;
  7981               ptrdiff_t diff;
  7982 
  7983               while (src_beg < src)
  7984                 {
  7985                   *--dst = *--src;
  7986                   if (*src == '\n' && src > src_beg && src[-1] == '\r')
  7987                     src--;
  7988                 }
  7989               diff = dst - src;
  7990               bytes -= diff;
  7991               chars -= diff;
  7992             }
  7993           coding->produced = bytes;
  7994           coding->produced_char = chars;
  7995           insert_from_gap (chars, bytes, 1);
  7996           return;
  7997         }
  7998     }
  7999   code_conversion_save (0, 0);
  8000 
  8001   coding->mode |= CODING_MODE_LAST_BLOCK;
  8002   current_buffer->text->inhibit_shrinking = 1;
  8003   decode_coding (coding);
  8004   current_buffer->text->inhibit_shrinking = 0;
  8005 
  8006   if (! NILP (CODING_ATTR_POST_READ (attrs)))
  8007     {
  8008       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
  8009       Lisp_Object val;
  8010       Lisp_Object undo_list = BVAR (current_buffer, undo_list);
  8011 
  8012       record_unwind_protect (coding_restore_undo_list,
  8013                              Fcons (undo_list, Fcurrent_buffer ()));
  8014       bset_undo_list (current_buffer, Qt);
  8015       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
  8016       val = call1 (CODING_ATTR_POST_READ (attrs),
  8017                    make_fixnum (coding->produced_char));
  8018       CHECK_FIXNAT (val);
  8019       coding->produced_char += Z - prev_Z;
  8020       coding->produced += Z_BYTE - prev_Z_BYTE;
  8021     }
  8022 
  8023   unbind_to (count, Qnil);
  8024 }
  8025 
  8026 
  8027 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
  8028    SRC_OBJECT into DST_OBJECT by coding context CODING.
  8029 
  8030    SRC_OBJECT is a buffer, a string, or Qnil.
  8031 
  8032    If it is a buffer, the text is at point of the buffer.  FROM and TO
  8033    are positions in the buffer.
  8034 
  8035    If it is a string, the text is at the beginning of the string.
  8036    FROM and TO are indices to the string.
  8037 
  8038    If it is nil, the text is at coding->source.  FROM and TO are
  8039    indices to coding->source.
  8040 
  8041    DST_OBJECT is a buffer, Qt, or Qnil.
  8042 
  8043    If it is a buffer, the decoded text is inserted at point of the
  8044    buffer.  If the buffer is the same as SRC_OBJECT, the source text
  8045    is deleted.
  8046 
  8047    If it is Qt, a string is made from the decoded text, and
  8048    set in CODING->dst_object.
  8049 
  8050    If it is Qnil, the decoded text is stored at CODING->destination.
  8051    The caller must allocate CODING->dst_bytes bytes at
  8052    CODING->destination by xmalloc.  If the decoded text is longer than
  8053    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
  8054  */
  8055 
  8056 void
  8057 decode_coding_object (struct coding_system *coding,
  8058                       Lisp_Object src_object,
  8059                       ptrdiff_t from, ptrdiff_t from_byte,
  8060                       ptrdiff_t to, ptrdiff_t to_byte,
  8061                       Lisp_Object dst_object)
  8062 {
  8063   specpdl_ref count = SPECPDL_INDEX ();
  8064   unsigned char *destination UNINIT;
  8065   ptrdiff_t dst_bytes UNINIT;
  8066   ptrdiff_t chars = to - from;
  8067   ptrdiff_t bytes = to_byte - from_byte;
  8068   Lisp_Object attrs;
  8069   ptrdiff_t saved_pt = -1, saved_pt_byte UNINIT;
  8070   bool need_marker_adjustment = 0;
  8071   Lisp_Object old_deactivate_mark;
  8072 
  8073   old_deactivate_mark = Vdeactivate_mark;
  8074 
  8075   if (NILP (dst_object))
  8076     {
  8077       destination = coding->destination;
  8078       dst_bytes = coding->dst_bytes;
  8079     }
  8080 
  8081   coding->src_object = src_object;
  8082   coding->src_chars = chars;
  8083   coding->src_bytes = bytes;
  8084   coding->src_multibyte = chars < bytes;
  8085 
  8086   if (STRINGP (src_object))
  8087     {
  8088       coding->src_pos = from;
  8089       coding->src_pos_byte = from_byte;
  8090     }
  8091   else if (BUFFERP (src_object))
  8092     {
  8093       set_buffer_internal (XBUFFER (src_object));
  8094       if (from != GPT)
  8095         move_gap_both (from, from_byte);
  8096       if (EQ (src_object, dst_object))
  8097         {
  8098           struct Lisp_Marker *tail;
  8099 
  8100           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
  8101             {
  8102               tail->need_adjustment
  8103                 = tail->charpos == (tail->insertion_type ? from : to);
  8104               need_marker_adjustment |= tail->need_adjustment;
  8105             }
  8106           saved_pt = PT, saved_pt_byte = PT_BYTE;
  8107           TEMP_SET_PT_BOTH (from, from_byte);
  8108           current_buffer->text->inhibit_shrinking = 1;
  8109           del_range_both (from, from_byte, to, to_byte, 1);
  8110           coding->src_pos = -chars;
  8111           coding->src_pos_byte = -bytes;
  8112         }
  8113       else
  8114         {
  8115           coding->src_pos = from;
  8116           coding->src_pos_byte = from_byte;
  8117         }
  8118     }
  8119 
  8120   if (CODING_REQUIRE_DETECTION (coding))
  8121     detect_coding (coding);
  8122   attrs = CODING_ID_ATTRS (coding->id);
  8123 
  8124   if (EQ (dst_object, Qt)
  8125       || (! NILP (CODING_ATTR_POST_READ (attrs))
  8126           && NILP (dst_object)))
  8127     {
  8128       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
  8129       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
  8130       coding->dst_pos = BEG;
  8131       coding->dst_pos_byte = BEG_BYTE;
  8132     }
  8133   else if (BUFFERP (dst_object))
  8134     {
  8135       code_conversion_save (0, 0);
  8136       coding->dst_object = dst_object;
  8137       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
  8138       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
  8139       coding->dst_multibyte
  8140         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
  8141     }
  8142   else
  8143     {
  8144       code_conversion_save (0, 0);
  8145       coding->dst_object = Qnil;
  8146       /* Most callers presume this will return a multibyte result, and they
  8147          won't use `binary' or `raw-text' anyway, so let's not worry about
  8148          CODING_FOR_UNIBYTE.  */
  8149       coding->dst_multibyte = 1;
  8150     }
  8151 
  8152   decode_coding (coding);
  8153 
  8154   if (BUFFERP (coding->dst_object))
  8155     set_buffer_internal (XBUFFER (coding->dst_object));
  8156 
  8157   if (! NILP (CODING_ATTR_POST_READ (attrs)))
  8158     {
  8159       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
  8160       Lisp_Object val;
  8161       Lisp_Object undo_list = BVAR (current_buffer, undo_list);
  8162       specpdl_ref count1 = SPECPDL_INDEX ();
  8163 
  8164       record_unwind_protect (coding_restore_undo_list,
  8165                              Fcons (undo_list, Fcurrent_buffer ()));
  8166       bset_undo_list (current_buffer, Qt);
  8167       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
  8168       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
  8169                         make_fixnum (coding->produced_char));
  8170       CHECK_FIXNAT (val);
  8171       coding->produced_char += Z - prev_Z;
  8172       coding->produced += Z_BYTE - prev_Z_BYTE;
  8173       unbind_to (count1, Qnil);
  8174     }
  8175 
  8176   if (EQ (dst_object, Qt))
  8177     {
  8178       coding->dst_object = Fbuffer_string ();
  8179     }
  8180   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
  8181     {
  8182       set_buffer_internal (XBUFFER (coding->dst_object));
  8183       if (dst_bytes < coding->produced)
  8184         {
  8185           eassert (coding->produced > 0);
  8186           destination = xrealloc (destination, coding->produced);
  8187           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
  8188             move_gap_both (BEGV, BEGV_BYTE);
  8189           memcpy (destination, BEGV_ADDR, coding->produced);
  8190           coding->destination = destination;
  8191         }
  8192     }
  8193 
  8194   if (saved_pt >= 0)
  8195     {
  8196       /* This is the case of:
  8197          (BUFFERP (src_object) && BASE_EQ (src_object, dst_object))
  8198          As we have moved PT while replacing the original buffer
  8199          contents, we must recover it now.  */
  8200       set_buffer_internal (XBUFFER (src_object));
  8201       current_buffer->text->inhibit_shrinking = 0;
  8202       if (saved_pt < from)
  8203         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
  8204       else if (saved_pt < from + chars)
  8205         TEMP_SET_PT_BOTH (from, from_byte);
  8206       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
  8207         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
  8208                           saved_pt_byte + (coding->produced - bytes));
  8209       else
  8210         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
  8211                           saved_pt_byte + (coding->produced - bytes));
  8212 
  8213       if (need_marker_adjustment)
  8214         {
  8215           struct Lisp_Marker *tail;
  8216 
  8217           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
  8218             if (tail->need_adjustment)
  8219               {
  8220                 tail->need_adjustment = 0;
  8221                 if (tail->insertion_type)
  8222                   {
  8223                     tail->bytepos = from_byte;
  8224                     tail->charpos = from;
  8225                   }
  8226                 else
  8227                   {
  8228                     tail->bytepos = from_byte + coding->produced;
  8229                     tail->charpos
  8230                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
  8231                          ? tail->bytepos : from + coding->produced_char);
  8232                   }
  8233               }
  8234         }
  8235     }
  8236 
  8237   Vdeactivate_mark = old_deactivate_mark;
  8238   unbind_to (count, coding->dst_object);
  8239 }
  8240 
  8241 
/* Encode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
   SRC_OBJECT into DST_OBJECT by coding context CODING.

   SRC_OBJECT is a buffer, a string, or Qnil.

   If it is a buffer, the text is at point of the buffer.  FROM and TO
   are positions in the buffer.

   If it is a string, the text is at the beginning of the string.
   FROM and TO are indices into the string.

   If it is nil, the text is at coding->source.  FROM and TO are
   indices into coding->source.

   DST_OBJECT is a buffer, Qt, or Qnil.

   If it is a buffer, the encoded text is inserted at point of the
   buffer.  If the buffer is the same as SRC_OBJECT, the source text
   is replaced with the encoded text.

   If it is Qt, a string is made from the encoded text, and set in
   CODING->dst_object.  However, if CODING->raw_destination is non-zero,
   the encoded text is instead returned in CODING->destination as a C string,
   and the caller is responsible for freeing CODING->destination.  This
   feature is meant to be used when the caller doesn't need the result as
   a Lisp string, and wants to avoid unnecessary consing of large strings.

   If it is Qnil, the encoded text is stored at CODING->destination.
   The caller must allocate CODING->dst_bytes bytes at
   CODING->destination by xmalloc.  If the encoded text is longer than
   CODING->dst_bytes, CODING->destination is reallocated by xrealloc
   (and CODING->dst_bytes is enlarged accordingly).

   When SRC_OBJECT and DST_OBJECT are the same buffer, point and the
   markers that sat exactly on the edge of the region are repositioned
   after the replacement.  Vdeactivate_mark is restored to the value it
   had on entry.  */

void
encode_coding_object (struct coding_system *coding,
                      Lisp_Object src_object,
                      ptrdiff_t from, ptrdiff_t from_byte,
                      ptrdiff_t to, ptrdiff_t to_byte,
                      Lisp_Object dst_object)
{
  specpdl_ref count = SPECPDL_INDEX ();
  ptrdiff_t chars = to - from;
  ptrdiff_t bytes = to_byte - from_byte;
  Lisp_Object attrs;
  /* SAVED_PT stays negative unless point was saved below; the
     recovery code at the end keys off that.  */
  ptrdiff_t saved_pt = -1, saved_pt_byte UNINIT;
  bool need_marker_adjustment = 0;
  /* Set when the pre-write function left us in a buffer other than
     the work buffer; that buffer is killed before returning.  */
  bool kill_src_buffer = 0;
  Lisp_Object old_deactivate_mark;

  old_deactivate_mark = Vdeactivate_mark;

  coding->src_object = src_object;
  coding->src_chars = chars;
  coding->src_bytes = bytes;
  /* More bytes than characters means the source is treated as
     multibyte.  */
  coding->src_multibyte = chars < bytes;

  attrs = CODING_ID_ATTRS (coding->id);

  bool same_buffer = false;
  if (BASE_EQ (src_object, dst_object) && BUFFERP (src_object))
    {
      struct Lisp_Marker *tail;

      same_buffer = true;

      /* Flag the markers sitting exactly on the relevant edge of the
         region (FROM for markers with insertion_type set, TO for the
         others) so that they can be put back after the text has been
         replaced.  */
      for (tail = BUF_MARKERS (XBUFFER (src_object)); tail; tail = tail->next)
        {
          tail->need_adjustment
            = tail->charpos == (tail->insertion_type ? from : to);
          need_marker_adjustment |= tail->need_adjustment;
        }
    }

  if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
    {
      /* The coding system has a pre-write function: copy the source
         text into the buffer returned by code_conversion_save, run the
         function there, and encode whatever buffer is current
         afterwards.  */
      coding->src_object = code_conversion_save (1, coding->src_multibyte);
      set_buffer_internal (XBUFFER (coding->src_object));
      if (STRINGP (src_object))
        insert_from_string (src_object, from, from_byte, chars, bytes, 0);
      else if (BUFFERP (src_object))
        insert_from_buffer (XBUFFER (src_object), from, chars, 0);
      else
        insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);

      if (same_buffer)
        {
          /* The text has been copied out; remember point and delete
             the original region so the encoded text can take its
             place.  */
          set_buffer_internal (XBUFFER (src_object));
          saved_pt = PT, saved_pt_byte = PT_BYTE;
          del_range_both (from, from_byte, to, to_byte, 1);
          set_buffer_internal (XBUFFER (coding->src_object));
        }

      safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
                  make_fixnum (BEG), make_fixnum (Z));
      /* The pre-write function may have switched buffers.  */
      if (XBUFFER (coding->src_object) != current_buffer)
        kill_src_buffer = 1;
      coding->src_object = Fcurrent_buffer ();
      if (BEG != GPT)
        move_gap_both (BEG, BEG_BYTE);
      /* The whole of the (possibly different) current buffer is now
         the source.  */
      coding->src_chars = Z - BEG;
      coding->src_bytes = Z_BYTE - BEG_BYTE;
      coding->src_pos = BEG;
      coding->src_pos_byte = BEG_BYTE;
      coding->src_multibyte = Z < Z_BYTE;
    }
  else if (STRINGP (src_object))
    {
      code_conversion_save (0, 0);
      coding->src_pos = from;
      coding->src_pos_byte = from_byte;
    }
  else if (BUFFERP (src_object))
    {
      code_conversion_save (0, 0);
      set_buffer_internal (XBUFFER (src_object));
      if (same_buffer)
        {
          saved_pt = PT, saved_pt_byte = PT_BYTE;
          /* The value returned by del_range_1 becomes the source, read
             from offset 0 (presumably the deleted text as a string --
             confirm against del_range_1).  */
          coding->src_object = del_range_1 (from, to, 1, 1);
          coding->src_pos = 0;
          coding->src_pos_byte = 0;
        }
      else
        {
          /* The gap lies inside the source range; move it to FROM so
             that the range is not split by it.  */
          if (from < GPT && to >= GPT)
            move_gap_both (from, from_byte);
          coding->src_pos = from;
          coding->src_pos_byte = from_byte;
        }
    }
  else
    {
      code_conversion_save (0, 0);
      coding->src_pos = from;
      coding->src_pos_byte = from_byte;
    }

  if (BUFFERP (dst_object))
    {
      coding->dst_object = dst_object;
      if (BASE_EQ (src_object, dst_object))
        {
          /* Replacing in place: the output goes where the source
             used to start.  */
          coding->dst_pos = from;
          coding->dst_pos_byte = from_byte;
        }
      else
        {
          struct buffer *current = current_buffer;

          /* Output goes at point of the destination buffer; move that
             buffer's gap there as well.  */
          set_buffer_temp (XBUFFER (dst_object));
          coding->dst_pos = PT;
          coding->dst_pos_byte = PT_BYTE;
          move_gap_both (coding->dst_pos, coding->dst_pos_byte);
          set_buffer_temp (current);
        }
      coding->dst_multibyte
        = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
    }
  else if (EQ (dst_object, Qt))
    {
      /* Start with one byte per source character (at least one byte);
         this is only the initial size of the destination.  */
      ptrdiff_t dst_bytes = max (1, coding->src_chars);
      coding->dst_object = Qnil;
      coding->destination = xmalloc (dst_bytes);
      coding->dst_bytes = dst_bytes;
      coding->dst_multibyte = 0;
    }
  else
    {
      coding->dst_object = Qnil;
      coding->dst_multibyte = 0;
    }

  encode_coding (coding);

  if (EQ (dst_object, Qt))
    {
      /* NOTE(review): CODING->dst_object was set to Qnil above, so it
         can only be a buffer here if encode_coding changed it --
         confirm when that happens.  */
      if (BUFFERP (coding->dst_object))
        coding->dst_object = Fbuffer_string ();
      else if (coding->raw_destination)
        /* This is used to avoid creating huge Lisp string.
           NOTE: caller who sets `raw_destination' is also
           responsible for freeing `destination' buffer.  */
        coding->dst_object = Qnil;
      else
        {
          coding->dst_object
            = make_unibyte_string ((char *) coding->destination,
                                   coding->produced);
          xfree (coding->destination);
        }
    }

  if (saved_pt >= 0)
    {
      /* This is the case of:
         (BUFFERP (src_object) && BASE_EQ (src_object, dst_object))
         As we have moved PT while replacing the original buffer
         contents, we must recover it now.  */
      set_buffer_internal (XBUFFER (src_object));
      if (saved_pt < from)
        /* Point was before the region: unaffected.  */
        TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
      else if (saved_pt < from + chars)
        /* Point was inside the region: put it at the region start.  */
        TEMP_SET_PT_BOTH (from, from_byte);
      else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
        /* Point was after the region: shift it by the change in
           length, counted in characters and bytes separately.  */
        TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
                          saved_pt_byte + (coding->produced - bytes));
      else
        /* Unibyte buffer: the byte delta is used for the character
           position too.  */
        TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
                          saved_pt_byte + (coding->produced - bytes));

      if (need_marker_adjustment)
        {
          struct Lisp_Marker *tail;

          /* Put the markers flagged at entry back on the edges of the
             replaced text: FROM for insertion_type markers, the end of
             the produced text for the others.  */
          for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
            if (tail->need_adjustment)
              {
                tail->need_adjustment = 0;
                if (tail->insertion_type)
                  {
                    tail->bytepos = from_byte;
                    tail->charpos = from;
                  }
                else
                  {
                    tail->bytepos = from_byte + coding->produced;
                    tail->charpos
                      = (NILP (BVAR (current_buffer, enable_multibyte_characters))
                         ? tail->bytepos : from + coding->produced_char);
                  }
              }
        }
    }

  if (kill_src_buffer)
    Fkill_buffer (coding->src_object);

  Vdeactivate_mark = old_deactivate_mark;
  /* Unwind to the specpdl depth recorded on entry.  */
  unbind_to (count, Qnil);
}
  8482 
  8483 
  8484 Lisp_Object
  8485 preferred_coding_system (void)
  8486 {
  8487   int id = coding_categories[coding_priorities[0]].id;
  8488 
  8489   return CODING_ID_NAME (id);
  8490 }
  8491 
  8492 #if defined (WINDOWSNT) || defined (CYGWIN)
  8493 
  8494 Lisp_Object
  8495 from_unicode (Lisp_Object str)
  8496 {
  8497   CHECK_STRING (str);
  8498   if (!STRING_MULTIBYTE (str) &&
  8499       SBYTES (str) & 1)
  8500     {
  8501       str = Fsubstring (str, make_fixnum (0), make_fixnum (-1));
  8502     }
  8503 
  8504   return code_convert_string_norecord (str, Qutf_16le, 0);
  8505 }
  8506 
  8507 Lisp_Object
  8508 from_unicode_buffer (const wchar_t *wstr)
  8509 {
  8510   /* We get one of the two final null bytes for free.  */
  8511   ptrdiff_t len = 1 + sizeof (wchar_t) * wcslen (wstr);
  8512   AUTO_STRING_WITH_LEN (str, (char *) wstr, len);
  8513   return from_unicode (str);
  8514 }
  8515 
  8516 wchar_t *
  8517 to_unicode (Lisp_Object str, Lisp_Object *buf)
  8518 {
  8519   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
  8520   /* We need to make another copy (in addition to the one made by
  8521      code_convert_string_norecord) to ensure that the final string is
  8522      _doubly_ zero terminated --- that is, that the string is
  8523      terminated by two zero bytes and one utf-16le null character.
  8524      Because strings are already terminated with a single zero byte,
  8525      we just add one additional zero. */
  8526   str = make_uninit_string (SBYTES (*buf) + 1);
  8527   memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
  8528   SDATA (str) [SBYTES (*buf)] = '\0';
  8529   *buf = str;
  8530   return WCSDATA (*buf);
  8531 }
  8532 
  8533 #endif /* WINDOWSNT || CYGWIN */
  8534 
  8535 
  8536 /*** 8. Emacs Lisp library functions ***/
  8537 
  8538 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
  8539        doc: /* Return t if OBJECT is nil or a coding-system.
  8540 See the documentation of `define-coding-system' for information
  8541 about coding-system objects.  */)
  8542   (Lisp_Object object)
  8543 {
  8544   if (NILP (object)
  8545       || CODING_SYSTEM_ID (object) >= 0)
  8546     return Qt;
  8547   if (! SYMBOLP (object)
  8548       || NILP (Fget (object, Qcoding_system_define_form)))
  8549     return Qnil;
  8550   return Qt;
  8551 }
  8552 
  8553 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
  8554        Sread_non_nil_coding_system, 1, 1, 0,
  8555        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
  8556   (Lisp_Object prompt)
  8557 {
  8558   Lisp_Object val;
  8559   do
  8560     {
  8561       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
  8562                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
  8563     }
  8564   while (SCHARS (val) == 0);
  8565   return (Fintern (val, Qnil));
  8566 }
  8567 
  8568 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
  8569        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
  8570 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
  8571 Ignores case when completing coding systems (all Emacs coding systems
  8572 are lower-case).  */)
  8573   (Lisp_Object prompt, Lisp_Object default_coding_system)
  8574 {
  8575   Lisp_Object val;
  8576   specpdl_ref count = SPECPDL_INDEX ();
  8577 
  8578   if (SYMBOLP (default_coding_system))
  8579     default_coding_system = SYMBOL_NAME (default_coding_system);
  8580   specbind (Qcompletion_ignore_case, Qt);
  8581   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
  8582                           Qt, Qnil, Qcoding_system_history,
  8583                           default_coding_system, Qnil);
  8584   val = unbind_to (count, val);
  8585   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
  8586 }
  8587 
  8588 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
  8589        1, 1, 0,
  8590        doc: /* Check validity of CODING-SYSTEM.
  8591 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
  8592 It is valid if it is nil or a symbol defined as a coding system by the
  8593 function `define-coding-system'.  */)
  8594   (Lisp_Object coding_system)
  8595 {
  8596   Lisp_Object define_form;
  8597 
  8598   define_form = Fget (coding_system, Qcoding_system_define_form);
  8599   if (! NILP (define_form))
  8600     {
  8601       Fput (coding_system, Qcoding_system_define_form, Qnil);
  8602       safe_eval (define_form);
  8603     }
  8604   if (!NILP (Fcoding_system_p (coding_system)))
  8605     return coding_system;
  8606   xsignal1 (Qcoding_system_error, coding_system);
  8607 }
  8608 
  8609 
/* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
   HIGHEST, return the coding system of the highest
   priority among the detected coding systems.  Otherwise return a
   list of detected coding systems sorted by their priorities.  If
   MULTIBYTEP, it is assumed that the bytes are in correct
   multibyte form but contain only ASCII and eight-bit chars.
   Otherwise, the bytes are raw bytes.

   SRC_CHARS is recorded in the coding context as the number of
   source characters.

   CODING-SYSTEM controls the detection as below:

   If it is nil, detect both text-format and eol-format.  If the
   text-format part of CODING-SYSTEM is already specified
   (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
   part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
   detect only text-format.

   The value is made of coding system names; with HIGHEST it is a
   single name, or nil if nothing was detected.  */

Lisp_Object
detect_coding_system (const unsigned char *src,
                      ptrdiff_t src_chars, ptrdiff_t src_bytes,
                      bool highest, bool multibytep,
                      Lisp_Object coding_system)
{
  const unsigned char *src_end = src + src_bytes;
  Lisp_Object attrs, eol_type;
  /* VAL is first built as a list of coding system ids; the eol pass
     at the end replaces each id with a coding system name.  */
  Lisp_Object val = Qnil;
  struct coding_system coding;
  ptrdiff_t id;
  struct coding_detection_info detect_info = {0};
  enum coding_category base_category;
  bool null_byte_found = 0, eight_bit_found = 0;

  if (NILP (coding_system))
    coding_system = Qundecided;
  setup_coding_system (coding_system, &coding);
  attrs = CODING_ID_ATTRS (coding.id);
  eol_type = CODING_ID_EOL_TYPE (coding.id);
  coding_system = CODING_ATTR_BASE_NAME (attrs);

  coding.source = src;
  coding.src_chars = src_chars;
  coding.src_bytes = src_bytes;
  coding.src_multibyte = multibytep;
  coding.consumed = 0;
  coding.mode |= CODING_MODE_LAST_BLOCK;
  /* Counts the 7-bit bytes seen before the first 8-bit byte.  */
  coding.head_ascii = 0;

  /* At first, detect text-format if necessary.  */
  base_category = XFIXNUM (CODING_ATTR_CATEGORY (attrs));
  if (base_category == coding_category_undecided)
    {
      enum coding_category category UNINIT;
      struct coding_system *this UNINIT;
      int c, i;
      bool inhibit_nbd = inhibit_flag (coding.spec.undecided.inhibit_nbd,
                                       inhibit_null_byte_detection);
      bool inhibit_ied = inhibit_flag (coding.spec.undecided.inhibit_ied,
                                       inhibit_iso_escape_detection);
      bool prefer_utf_8 = coding.spec.undecided.prefer_utf_8;

      /* Skip all ASCII bytes except for a few ISO2022 controls.  */
      for (; src < src_end; src++)
        {
          c = *src;
          if (c & 0x80)
            {
              eight_bit_found = 1;
              /* Both a null byte and an 8-bit byte seen: no need to
                 scan further.  */
              if (null_byte_found)
                break;
            }
          else if (c < 0x20)
            {
              if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
                  && ! inhibit_ied
                  && ! detect_info.checked)
                {
                  if (detect_coding_iso_2022 (&coding, &detect_info))
                    {
                      /* We have scanned the whole data.  */
                      if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
                        {
                          /* We didn't find an 8-bit code.  We may
                             have found a null-byte, but it's very
                             rare that a binary file conforms to
                             ISO-2022.  */
                          src = src_end;
                          coding.head_ascii = src - coding.source;
                        }
                      /* Reject every category outside the ISO escape
                         mask.  */
                      detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
                      break;
                    }
                }
              else if (! c && !inhibit_nbd)
                {
                  null_byte_found = 1;
                  if (eight_bit_found)
                    break;
                }
              if (! eight_bit_found)
                coding.head_ascii++;
            }
          else if (! eight_bit_found)
            coding.head_ascii++;
        }

      /* Run the per-category detectors only if the scan above saw
         something other than plain 7-bit text, or the ISO-2022
         detector already found a match.  */
      if (null_byte_found || eight_bit_found
          || coding.head_ascii < coding.src_bytes
          || detect_info.found)
        {
          if (coding.head_ascii == coding.src_bytes)
            /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
            for (i = 0; i < coding_category_raw_text; i++)
              {
                category = coding_priorities[i];
                this = coding_categories + category;
                if (detect_info.found & (1 << category))
                  break;
              }
          else
            {
              if (null_byte_found)
                {
                  /* With a null byte present only the UTF-16
                     categories remain candidates.  */
                  detect_info.checked |= ~CATEGORY_MASK_UTF_16;
                  detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
                }
              else if (prefer_utf_8
                       && detect_coding_utf_8 (&coding, &detect_info))
                {
                  /* UTF-8 is preferred and it matched: rule out every
                     other category.  */
                  detect_info.checked |= ~CATEGORY_MASK_UTF_8;
                  detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
                }
              /* Try the categories in priority order.  With HIGHEST,
                 stop at the first one that is found; CATEGORY and THIS
                 then describe it.  */
              for (i = 0; i < coding_category_raw_text; i++)
                {
                  category = coding_priorities[i];
                  this = coding_categories + category;

                  if (this->id < 0)
                    {
                      /* No coding system of this category is defined.  */
                      detect_info.rejected |= (1 << category);
                    }
                  else if (category >= coding_category_raw_text)
                    continue;
                  else if (detect_info.checked & (1 << category))
                    {
                      /* Already checked by an earlier detector call;
                         just look at the recorded result.  */
                      if (highest
                          && (detect_info.found & (1 << category)))
                        break;
                    }
                  else if ((*(this->detector)) (&coding, &detect_info)
                           && highest
                           && (detect_info.found & (1 << category)))
                    {
                      /* For the auto UTF-16 category, report the
                         concrete byte order that was found.  */
                      if (category == coding_category_utf_16_auto)
                        {
                          if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
                            category = coding_category_utf_16_le;
                          else
                            category = coding_category_utf_16_be;
                        }
                      break;
                    }
                }
            }
        }

      if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
          || null_byte_found)
        {
          /* Everything was rejected, or a null byte was seen: report
             `no-conversion'.  */
          detect_info.found = CATEGORY_MASK_RAW_TEXT;
          id = CODING_SYSTEM_ID (Qno_conversion);
          val = list1i (id);
        }
      else if (! detect_info.rejected && ! detect_info.found)
        {
          /* Nothing rejected and nothing found: report the coding
             system of the undecided category.  */
          detect_info.found = CATEGORY_MASK_ANY;
          id = coding_categories[coding_category_undecided].id;
          val = list1i (id);
        }
      else if (highest)
        {
          if (detect_info.found)
            {
              /* CATEGORY and THIS were left by the loops above at the
                 category on which they stopped.  */
              detect_info.found = 1 << category;
              val = list1i (this->id);
            }
          else
            /* Nothing positively found: take the highest-priority
               category that was not rejected.  */
            for (i = 0; i < coding_category_raw_text; i++)
              if (! (detect_info.rejected & (1 << coding_priorities[i])))
                {
                  detect_info.found = 1 << coding_priorities[i];
                  id = coding_categories[coding_priorities[i]].id;
                  val = list1i (id);
                  break;
                }
        }
      else
        {
          int mask = detect_info.rejected | detect_info.found;
          int found = 0;

          /* First pass, from lowest to highest priority, over the
             categories that were neither rejected nor found.
             NOTE(review): each hit replaces VAL with a fresh
             one-element list, so only the last hit (the
             highest-priority such category with a defined id)
             survives in VAL, although every hit is recorded in
             FOUND -- confirm this is intended rather than consing
             onto VAL.  */
          for (i = coding_category_raw_text - 1; i >= 0; i--)
            {
              category = coding_priorities[i];
              if (! (mask & (1 << category)))
                {
                  found |= 1 << category;
                  id = coding_categories[category].id;
                  if (id >= 0)
                    val = list1i (id);
                }
            }
          /* Second pass: push the positively found categories in
             front, again lowest priority first so that the highest
             ends up at the head of the list.  */
          for (i = coding_category_raw_text - 1; i >= 0; i--)
            {
              category = coding_priorities[i];
              if (detect_info.found & (1 << category))
                {
                  id = coding_categories[category].id;
                  val = Fcons (make_fixnum (id), val);
                }
            }
          detect_info.found |= found;
        }
    }
  else if (base_category == coding_category_utf_8_auto)
    {
      /* Only decide between UTF-8 with and without signature.  */
      if (detect_coding_utf_8 (&coding, &detect_info))
        {
          struct coding_system *this;

          if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
            this = coding_categories + coding_category_utf_8_sig;
          else
            this = coding_categories + coding_category_utf_8_nosig;
          val = list1i (this->id);
        }
    }
  else if (base_category == coding_category_utf_16_auto)
    {
      /* Only decide among the UTF-16 variants.  */
      if (detect_coding_utf_16 (&coding, &detect_info))
        {
          struct coding_system *this;

          if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
            this = coding_categories + coding_category_utf_16_le;
          else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
            this = coding_categories + coding_category_utf_16_be;
          else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
            this = coding_categories + coding_category_utf_16_be_nosig;
          else
            this = coding_categories + coding_category_utf_16_le_nosig;
          val = list1i (this->id);
        }
    }
  else
    {
      /* The text format is already specified; report it as is.  */
      detect_info.found = 1 << XFIXNUM (CODING_ATTR_CATEGORY (attrs));
      val = list1i (coding.id);
    }

  /* Then, detect eol-format if necessary.  */
  {
    /* -1 means that no eol format was determined for that family.  */
    int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
    Lisp_Object tail;

    if (VECTORP (eol_type))
      {
        /* The eol type is a vector, so the eol format has to be
           detected, separately for the non-UTF-16 and the two UTF-16
           byte orders, but only for the families that were found.  */
        if (detect_info.found & ~CATEGORY_MASK_UTF_16)
          {
            if (null_byte_found)
              normal_eol = EOL_SEEN_LF;
            else
              normal_eol = detect_eol (coding.source, src_bytes,
                                       coding_category_raw_text);
          }
        if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
                                 | CATEGORY_MASK_UTF_16_BE_NOSIG))
          utf_16_be_eol = detect_eol (coding.source, src_bytes,
                                      coding_category_utf_16_be);
        if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
                                 | CATEGORY_MASK_UTF_16_LE_NOSIG))
          utf_16_le_eol = detect_eol (coding.source, src_bytes,
                                      coding_category_utf_16_le);
      }
    else
      {
        /* The eol format is given by the coding system itself.  */
        if (EQ (eol_type, Qunix))
          normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
        else if (EQ (eol_type, Qdos))
          normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
        else
          normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
      }

    /* Replace each coding system id in VAL with a name: the element
       of that coding system's eol vector matching the eol format seen
       (index 0 for LF, 1 for CRLF, 2 for CR), or else the coding
       system's own name.  */
    for (tail = val; CONSP (tail); tail = XCDR (tail))
      {
        enum coding_category category;
        int this_eol;

        id = XFIXNUM (XCAR (tail));
        attrs = CODING_ID_ATTRS (id);
        category = XFIXNUM (CODING_ATTR_CATEGORY (attrs));
        eol_type = CODING_ID_EOL_TYPE (id);
        if (VECTORP (eol_type))
          {
            if (category == coding_category_utf_16_be
                || category == coding_category_utf_16_be_nosig)
              this_eol = utf_16_be_eol;
            else if (category == coding_category_utf_16_le
                     || category == coding_category_utf_16_le_nosig)
              this_eol = utf_16_le_eol;
            else
              this_eol = normal_eol;

            if (this_eol == EOL_SEEN_LF)
              XSETCAR (tail, AREF (eol_type, 0));
            else if (this_eol == EOL_SEEN_CRLF)
              XSETCAR (tail, AREF (eol_type, 1));
            else if (this_eol == EOL_SEEN_CR)
              XSETCAR (tail, AREF (eol_type, 2));
            else
              XSETCAR (tail, CODING_ID_NAME (id));
          }
        else
          XSETCAR (tail, CODING_ID_NAME (id));
      }
  }

  return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
}
  8939 
  8940 
  8941 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
  8942        2, 3, 0,
  8943        doc: /* Detect coding system of the text in the region between START and END.
  8944 Return a list of possible coding systems ordered by priority.
  8945 The coding systems to try and their priorities follows what
  8946 the function `coding-system-priority-list' (which see) returns.
  8947 
  8948 If only ASCII characters are found (except for such ISO-2022 control
  8949 characters as ESC), it returns a list of single element `undecided'
  8950 or its subsidiary coding system according to a detected end-of-line
  8951 format.
  8952 
  8953 If optional argument HIGHEST is non-nil, return the coding system of
  8954 highest priority.  */)
  8955   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
  8956 {
  8957   ptrdiff_t from, to;
  8958   ptrdiff_t from_byte, to_byte;
  8959 
  8960   validate_region (&start, &end);
  8961   from = XFIXNUM (start), to = XFIXNUM (end);
  8962   from_byte = CHAR_TO_BYTE (from);
  8963   to_byte = CHAR_TO_BYTE (to);
  8964 
  8965   if (from < GPT && to >= GPT)
  8966     move_gap_both (to, to_byte);
  8967 
  8968   return detect_coding_system (BYTE_POS_ADDR (from_byte),
  8969                                to - from, to_byte - from_byte,
  8970                                !NILP (highest),
  8971                                !NILP (BVAR (current_buffer
  8972                                       , enable_multibyte_characters)),
  8973                                Qnil);
  8974 }
  8975 
  8976 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
  8977        1, 2, 0,
  8978        doc: /* Detect coding system of the text in STRING.
  8979 Return a list of possible coding systems ordered by priority.
  8980 The coding systems to try and their priorities follows what
  8981 the function `coding-system-priority-list' (which see) returns.
  8982 
  8983 If only ASCII characters are found (except for such ISO-2022 control
  8984 characters as ESC), it returns a list of single element `undecided'
  8985 or its subsidiary coding system according to a detected end-of-line
  8986 format.
  8987 
  8988 If optional argument HIGHEST is non-nil, return the coding system of
  8989 highest priority.  */)
  8990   (Lisp_Object string, Lisp_Object highest)
  8991 {
  8992   CHECK_STRING (string);
  8993 
  8994   return detect_coding_system (SDATA (string),
  8995                                SCHARS (string), SBYTES (string),
  8996                                !NILP (highest), STRING_MULTIBYTE (string),
  8997                                Qnil);
  8998 }
  8999 
  9000 
  9001 static bool
  9002 char_encodable_p (int c, Lisp_Object attrs)
  9003 {
  9004   Lisp_Object tail;
  9005   struct charset *charset;
  9006   Lisp_Object translation_table;
  9007 
  9008   translation_table = CODING_ATTR_TRANS_TBL (attrs);
  9009   if (! NILP (translation_table))
  9010     c = translate_char (translation_table, c);
  9011   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
  9012        CONSP (tail); tail = XCDR (tail))
  9013     {
  9014       charset = CHARSET_FROM_ID (XFIXNUM (XCAR (tail)));
  9015       if (CHAR_CHARSET_P (c, charset))
  9016         break;
  9017     }
  9018   return (! NILP (tail));
  9019 }
  9020 
  9021 
  9022 /* Return a list of coding systems that safely encode the text between
  9023    START and END.  If EXCLUDE is non-nil, it is a list of coding
  9024    systems not to check.  The returned list doesn't contain any such
  9025    coding systems.  In any case, if the text contains only ASCII or is
  9026    unibyte, return t.  */
  9027 
  9028 DEFUN ("find-coding-systems-region-internal",
  9029        Ffind_coding_systems_region_internal,
  9030        Sfind_coding_systems_region_internal, 2, 3, 0,
  9031        doc: /* Internal use only.  */)
  9032   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
  9033 {
  9034   Lisp_Object coding_attrs_list, safe_codings;
  9035   ptrdiff_t start_byte, end_byte;
  9036   const unsigned char *p, *pbeg, *pend;
  9037   int c;
  9038   Lisp_Object tail, elt, work_table;
  9039 
  9040   if (STRINGP (start))
  9041     {
  9042       if (!STRING_MULTIBYTE (start)
  9043           || SCHARS (start) == SBYTES (start))
  9044         return Qt;
  9045       start_byte = 0;
  9046       end_byte = SBYTES (start);
  9047     }
  9048   else
  9049     {
  9050       EMACS_INT s = fix_position (start);
  9051       EMACS_INT e = fix_position (end);
  9052       if (! (BEG <= s && s <= e && e <= Z))
  9053         args_out_of_range (start, end);
  9054       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
  9055         return Qt;
  9056       start_byte = CHAR_TO_BYTE (s);
  9057       end_byte = CHAR_TO_BYTE (e);
  9058       if (e - s == end_byte - start_byte)
  9059         return Qt;
  9060 
  9061       if (s < GPT && GPT < e)
  9062         {
  9063           if (GPT - s < e - GPT)
  9064             move_gap_both (s, start_byte);
  9065           else
  9066             move_gap_both (e, end_byte);
  9067         }
  9068     }
  9069 
  9070   coding_attrs_list = Qnil;
  9071   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
  9072     if (NILP (exclude)
  9073         || NILP (Fmemq (XCAR (tail), exclude)))
  9074       {
  9075         Lisp_Object attrs;
  9076 
  9077         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
  9078         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs)))
  9079           {
  9080             ASET (attrs, coding_attr_trans_tbl,
  9081                   get_translation_table (attrs, 1, NULL));
  9082             coding_attrs_list = Fcons (attrs, coding_attrs_list);
  9083           }
  9084       }
  9085 
  9086   if (STRINGP (start))
  9087     p = pbeg = SDATA (start);
  9088   else
  9089     p = pbeg = BYTE_POS_ADDR (start_byte);
  9090   pend = p + (end_byte - start_byte);
  9091 
  9092   while (p < pend && ASCII_CHAR_P (*p)) p++;
  9093   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
  9094 
  9095   work_table = Fmake_char_table (Qnil, Qnil);
  9096   while (p < pend)
  9097     {
  9098       if (ASCII_CHAR_P (*p))
  9099         p++;
  9100       else
  9101         {
  9102           c = string_char_advance (&p);
  9103           if (!NILP (char_table_ref (work_table, c)))
  9104             /* This character was already checked.  Ignore it.  */
  9105             continue;
  9106 
  9107           charset_map_loaded = 0;
  9108           for (tail = coding_attrs_list; CONSP (tail);)
  9109             {
  9110               elt = XCAR (tail);
  9111               if (NILP (elt))
  9112                 tail = XCDR (tail);
  9113               else if (char_encodable_p (c, elt))
  9114                 tail = XCDR (tail);
  9115               else if (CONSP (XCDR (tail)))
  9116                 {
  9117                   XSETCAR (tail, XCAR (XCDR (tail)));
  9118                   XSETCDR (tail, XCDR (XCDR (tail)));
  9119                 }
  9120               else
  9121                 {
  9122                   XSETCAR (tail, Qnil);
  9123                   tail = XCDR (tail);
  9124                 }
  9125             }
  9126           if (charset_map_loaded)
  9127             {
  9128               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
  9129 
  9130               if (STRINGP (start))
  9131                 pbeg = SDATA (start);
  9132               else
  9133                 pbeg = BYTE_POS_ADDR (start_byte);
  9134               p = pbeg + p_offset;
  9135               pend = pbeg + pend_offset;
  9136             }
  9137           char_table_set (work_table, c, Qt);
  9138         }
  9139     }
  9140 
  9141   safe_codings = list2 (Qraw_text, Qno_conversion);
  9142   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
  9143     if (! NILP (XCAR (tail)))
  9144       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
  9145 
  9146   return safe_codings;
  9147 }
  9148 
  9149 
  9150 DEFUN ("unencodable-char-position", Funencodable_char_position,
  9151        Sunencodable_char_position, 3, 5, 0,
  9152        doc: /* Return position of first un-encodable character in a region.
  9153 START and END specify the region and CODING-SYSTEM specifies the
  9154 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
  9155 
  9156 If optional 4th argument COUNT is non-nil, it specifies at most how
  9157 many un-encodable characters to search.  In this case, the value is a
  9158 list of positions.
  9159 
  9160 If optional 5th argument STRING is non-nil, it is a string to search
  9161 for un-encodable characters.  In that case, START and END are indexes
  9162 to the string and treated as in `substring'.  */)
  9163   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system,
  9164    Lisp_Object count, Lisp_Object string)
  9165 {
  9166   EMACS_INT n;
  9167   struct coding_system coding;
  9168   Lisp_Object attrs, charset_list, translation_table;
  9169   Lisp_Object positions;
  9170   ptrdiff_t from, to;
  9171   const unsigned char *p, *stop, *pend;
  9172   bool ascii_compatible;
  9173 
  9174   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
  9175   attrs = CODING_ID_ATTRS (coding.id);
  9176   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
  9177     return Qnil;
  9178   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
  9179   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
  9180   translation_table = get_translation_table (attrs, 1, NULL);
  9181 
  9182   if (NILP (string))
  9183     {
  9184       validate_region (&start, &end);
  9185       from = XFIXNUM (start);
  9186       to = XFIXNUM (end);
  9187       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
  9188           || (ascii_compatible
  9189               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
  9190         return Qnil;
  9191       p = CHAR_POS_ADDR (from);
  9192       pend = CHAR_POS_ADDR (to);
  9193       if (from < GPT && to >= GPT)
  9194         stop = GPT_ADDR;
  9195       else
  9196         stop = pend;
  9197     }
  9198   else
  9199     {
  9200       CHECK_STRING (string);
  9201       validate_subarray (string, start, end, SCHARS (string), &from, &to);
  9202       if (! STRING_MULTIBYTE (string))
  9203         return Qnil;
  9204       p = SDATA (string) + string_char_to_byte (string, from);
  9205       stop = pend = SDATA (string) + string_char_to_byte (string, to);
  9206       if (ascii_compatible && (to - from) == (pend - p))
  9207         return Qnil;
  9208     }
  9209 
  9210   if (NILP (count))
  9211     n = 1;
  9212   else
  9213     {
  9214       CHECK_FIXNAT (count);
  9215       n = XFIXNUM (count);
  9216     }
  9217 
  9218   positions = Qnil;
  9219   charset_map_loaded = 0;
  9220   while (1)
  9221     {
  9222       int c;
  9223 
  9224       if (ascii_compatible)
  9225         while (p < stop && ASCII_CHAR_P (*p))
  9226           p++, from++;
  9227       if (p >= stop)
  9228         {
  9229           if (p >= pend)
  9230             break;
  9231           stop = pend;
  9232           p = GAP_END_ADDR;
  9233         }
  9234 
  9235       c = string_char_advance (&p);
  9236       if (! (ASCII_CHAR_P (c) && ascii_compatible)
  9237           && ! char_charset (translate_char (translation_table, c),
  9238                              charset_list, NULL))
  9239         {
  9240           positions = Fcons (make_fixnum (from), positions);
  9241           n--;
  9242           if (n == 0)
  9243             break;
  9244         }
  9245 
  9246       from++;
  9247       if (charset_map_loaded && NILP (string))
  9248         {
  9249           p = CHAR_POS_ADDR (from);
  9250           pend = CHAR_POS_ADDR (to);
  9251           if (from < GPT && to >= GPT)
  9252             stop = GPT_ADDR;
  9253           else
  9254             stop = pend;
  9255           charset_map_loaded = 0;
  9256         }
  9257     }
  9258 
  9259   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
  9260 }
  9261 
  9262 
  9263 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
  9264        Scheck_coding_systems_region, 3, 3, 0,
  9265        doc: /* Check if text between START and END is encodable by CODING-SYSTEM-LIST.
  9266 
  9267 START and END are buffer positions specifying the region.
  9268 CODING-SYSTEM-LIST is a list of coding systems to check.
  9269 
  9270 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
  9271 function returns nil.
  9272 
  9273 If some of the coding systems cannot encode the whole region, value is
  9274 an alist, each element of which has the form (CODING-SYSTEM POS1 POS2 ...),
  9275 which means that CODING-SYSTEM cannot encode the text at buffer positions
  9276 POS1, POS2, ...
  9277 
  9278 START may be a string.  In that case, check if the string is
  9279 encodable, and the value contains character indices into the string
  9280 instead of buffer positions.  END is ignored in this case.
  9281 
  9282 If the current buffer (or START if it is a string) is unibyte, the value
  9283 is nil.  */)
  9284   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
  9285 {
  9286   Lisp_Object list;
  9287   ptrdiff_t start_byte, end_byte;
  9288   ptrdiff_t pos;
  9289   const unsigned char *p, *pbeg, *pend;
  9290   int c;
  9291   Lisp_Object tail, elt, attrs;
  9292 
  9293   if (STRINGP (start))
  9294     {
  9295       if (!STRING_MULTIBYTE (start)
  9296           || SCHARS (start) == SBYTES (start))
  9297         return Qnil;
  9298       start_byte = 0;
  9299       end_byte = SBYTES (start);
  9300       pos = 0;
  9301     }
  9302   else
  9303     {
  9304       EMACS_INT s = fix_position (start);
  9305       EMACS_INT e = fix_position (end);
  9306       if (! (BEG <= s && s <= e && e <= Z))
  9307         args_out_of_range (start, end);
  9308       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
  9309         return Qnil;
  9310       start_byte = CHAR_TO_BYTE (s);
  9311       end_byte = CHAR_TO_BYTE (e);
  9312       if (e - s == end_byte - start_byte)
  9313         return Qnil;
  9314 
  9315       if (s < GPT && GPT < e)
  9316         {
  9317           if (GPT - s < e - GPT)
  9318             move_gap_both (s, start_byte);
  9319           else
  9320             move_gap_both (e, end_byte);
  9321         }
  9322       pos = s;
  9323     }
  9324 
  9325   list = Qnil;
  9326   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
  9327     {
  9328       elt = XCAR (tail);
  9329       Lisp_Object spec = CODING_SYSTEM_SPEC (elt);
  9330       if (!VECTORP (spec))
  9331         xsignal1 (Qcoding_system_error, elt);
  9332       attrs = AREF (spec, 0);
  9333       ASET (attrs, coding_attr_trans_tbl,
  9334             get_translation_table (attrs, 1, NULL));
  9335       list = Fcons (list2 (elt, attrs), list);
  9336     }
  9337 
  9338   if (STRINGP (start))
  9339     p = pbeg = SDATA (start);
  9340   else
  9341     p = pbeg = BYTE_POS_ADDR (start_byte);
  9342   pend = p + (end_byte - start_byte);
  9343 
  9344   while (p < pend && ASCII_CHAR_P (*p)) p++, pos++;
  9345   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
  9346 
  9347   while (p < pend)
  9348     {
  9349       if (ASCII_CHAR_P (*p))
  9350         p++;
  9351       else
  9352         {
  9353           c = string_char_advance (&p);
  9354 
  9355           charset_map_loaded = 0;
  9356           for (tail = list; CONSP (tail); tail = XCDR (tail))
  9357             {
  9358               elt = XCDR (XCAR (tail));
  9359               if (! char_encodable_p (c, XCAR (elt)))
  9360                 XSETCDR (elt, Fcons (make_fixnum (pos), XCDR (elt)));
  9361             }
  9362           if (charset_map_loaded)
  9363             {
  9364               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
  9365 
  9366               if (STRINGP (start))
  9367                 pbeg = SDATA (start);
  9368               else
  9369                 pbeg = BYTE_POS_ADDR (start_byte);
  9370               p = pbeg + p_offset;
  9371               pend = pbeg + pend_offset;
  9372             }
  9373         }
  9374       pos++;
  9375     }
  9376 
  9377   tail = list;
  9378   list = Qnil;
  9379   for (; CONSP (tail); tail = XCDR (tail))
  9380     {
  9381       elt = XCAR (tail);
  9382       if (CONSP (XCDR (XCDR (elt))))
  9383         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
  9384                       list);
  9385     }
  9386 
  9387   return list;
  9388 }
  9389 
  9390 
  9391 static Lisp_Object
  9392 code_convert_region (Lisp_Object start, Lisp_Object end,
  9393                      Lisp_Object coding_system, Lisp_Object dst_object,
  9394                      bool encodep, bool norecord)
  9395 {
  9396   struct coding_system coding;
  9397   ptrdiff_t from, from_byte, to, to_byte;
  9398   Lisp_Object src_object;
  9399 
  9400   if (NILP (coding_system))
  9401     coding_system = Qno_conversion;
  9402   else
  9403     CHECK_CODING_SYSTEM (coding_system);
  9404   src_object = Fcurrent_buffer ();
  9405   if (NILP (dst_object))
  9406     dst_object = src_object;
  9407   else if (! EQ (dst_object, Qt))
  9408     CHECK_BUFFER (dst_object);
  9409 
  9410   validate_region (&start, &end);
  9411   from = XFIXNAT (start);
  9412   from_byte = CHAR_TO_BYTE (from);
  9413   to = XFIXNAT (end);
  9414   to_byte = CHAR_TO_BYTE (to);
  9415 
  9416   setup_coding_system (coding_system, &coding);
  9417   coding.mode |= CODING_MODE_LAST_BLOCK;
  9418 
  9419   if (BUFFERP (dst_object) && !BASE_EQ (dst_object, src_object))
  9420     {
  9421       struct buffer *buf = XBUFFER (dst_object);
  9422       ptrdiff_t buf_pt = BUF_PT (buf);
  9423 
  9424       invalidate_buffer_caches (buf, buf_pt, buf_pt);
  9425     }
  9426 
  9427   if (encodep)
  9428     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
  9429                           dst_object);
  9430   else
  9431     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
  9432                           dst_object);
  9433   if (! norecord)
  9434     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
  9435 
  9436   return (BUFFERP (dst_object)
  9437           ? make_fixnum (coding.produced_char)
  9438           : coding.dst_object);
  9439 }
  9440 
  9441 
  9442 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
  9443        3, 4, "r\nzCoding system: ",
  9444        doc: /* Decode the current region using the specified coding system.
  9445 Interactively, prompt for the coding system to decode the region, and
  9446 replace the region with the decoded text.
  9447 
  9448 \"Decoding\" means transforming bytes into readable text (characters).
  9449 If, for instance, you have a region that contains data that represents
  9450 the two bytes #xc2 #xa9, after calling this function with the utf-8
  9451 coding system, the region will contain the single
  9452 character ?\\N{COPYRIGHT SIGN}.
  9453 
  9454 When called from a program, takes four arguments:
  9455         START, END, CODING-SYSTEM, and DESTINATION.
  9456 START and END are buffer positions.
  9457 
  9458 Optional 4th arguments DESTINATION specifies where the decoded text goes.
  9459 If nil, the region between START and END is replaced by the decoded text.
  9460 If buffer, the decoded text is inserted in that buffer after point (point
  9461 does not move).  If that buffer is unibyte, it receives the individual
  9462 bytes of the internal representation of the decoded text.
  9463 In those cases, the length of the decoded text is returned.
  9464 If DESTINATION is t, the decoded text is returned.
  9465 
  9466 This function sets `last-coding-system-used' to the precise coding system
  9467 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
  9468 not fully specified.)  */)
  9469   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
  9470 {
  9471   return code_convert_region (start, end, coding_system, destination, 0, 0);
  9472 }
  9473 
  9474 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
  9475        3, 4, "r\nzCoding system: ",
  9476        doc: /* Encode the current region using th specified coding system.
  9477 Interactively, prompt for the coding system to encode the region, and
  9478 replace the region with the bytes that are the result of the encoding.
  9479 
  9480 What's meant by \"encoding\" is transforming textual data (characters)
  9481 into bytes.  If, for instance, you have a region that contains the
  9482 single character ?\\N{COPYRIGHT SIGN}, after calling this function with
  9483 the utf-8 coding system, the data in the region will represent the two
  9484 bytes #xc2 #xa9.
  9485 
  9486 When called from a program, takes four arguments:
  9487         START, END, CODING-SYSTEM and DESTINATION.
  9488 START and END are buffer positions.
  9489 
  9490 Optional 4th argument DESTINATION specifies where the encoded text goes.
  9491 If nil, the region between START and END is replaced by the encoded text.
  9492 If buffer, the encoded text is inserted in that buffer after point (point
  9493 does not move).
  9494 In those cases, the length of the encoded text is returned.
  9495 If DESTINATION is t, the encoded text is returned.
  9496 
  9497 This function sets `last-coding-system-used' to the precise coding system
  9498 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
  9499 not fully specified.)  */)
  9500   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
  9501 {
  9502   return code_convert_region (start, end, coding_system, destination, 1, 0);
  9503 }
  9504 
  9505 /* Whether STRING only contains chars in the 0..127 range.  */
  9506 bool
  9507 string_ascii_p (Lisp_Object string)
  9508 {
  9509   ptrdiff_t nbytes = SBYTES (string);
  9510   for (ptrdiff_t i = 0; i < nbytes; i++)
  9511     if (SREF (string, i) > 127)
  9512       return false;
  9513   return true;
  9514 }
  9515 
  9516 Lisp_Object
  9517 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
  9518                      Lisp_Object dst_object, bool encodep, bool nocopy,
  9519                      bool norecord)
  9520 {
  9521   struct coding_system coding;
  9522   ptrdiff_t chars, bytes;
  9523 
  9524   CHECK_STRING (string);
  9525   if (NILP (coding_system))
  9526     {
  9527       if (! norecord)
  9528         Vlast_coding_system_used = Qno_conversion;
  9529       if (NILP (dst_object))
  9530         return nocopy ? string : Fcopy_sequence (string);
  9531     }
  9532 
  9533   if (NILP (coding_system))
  9534     coding_system = Qno_conversion;
  9535   else
  9536     CHECK_CODING_SYSTEM (coding_system);
  9537   if (NILP (dst_object))
  9538     dst_object = Qt;
  9539   else if (! EQ (dst_object, Qt))
  9540     CHECK_BUFFER (dst_object);
  9541 
  9542   setup_coding_system (coding_system, &coding);
  9543   coding.mode |= CODING_MODE_LAST_BLOCK;
  9544   chars = SCHARS (string);
  9545   bytes = SBYTES (string);
  9546 
  9547   if (EQ (dst_object, Qt))
  9548     {
  9549       /* Fast path for ASCII-only input and an ASCII-compatible coding:
  9550          act as identity if no EOL conversion is needed.  */
  9551       Lisp_Object attrs = CODING_ID_ATTRS (coding.id);
  9552       if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
  9553           && (STRING_MULTIBYTE (string)
  9554               ? (chars == bytes) : string_ascii_p (string))
  9555           && (EQ (CODING_ID_EOL_TYPE (coding.id), Qunix)
  9556               || inhibit_eol_conversion
  9557               || ! memchr (SDATA (string), encodep ? '\n' : '\r', bytes)))
  9558         {
  9559           if (! norecord)
  9560             Vlast_coding_system_used = coding_system;
  9561           return (nocopy
  9562                   ? string
  9563                   : (encodep
  9564                      ? make_unibyte_string (SSDATA (string), bytes)
  9565                      : make_multibyte_string (SSDATA (string), bytes, bytes)));
  9566         }
  9567     }
  9568   else if (BUFFERP (dst_object))
  9569     {
  9570       struct buffer *buf = XBUFFER (dst_object);
  9571       ptrdiff_t buf_pt = BUF_PT (buf);
  9572 
  9573       invalidate_buffer_caches (buf, buf_pt, buf_pt);
  9574     }
  9575 
  9576   if (encodep)
  9577     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
  9578   else
  9579     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
  9580   if (! norecord)
  9581     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
  9582 
  9583   return (BUFFERP (dst_object)
  9584           ? make_fixnum (coding.produced_char)
  9585           : coding.dst_object);
  9586 }
  9587 
  9588 
  9589 /* Encode or decode STRING according to CODING_SYSTEM.
  9590    Do not set Vlast_coding_system_used.  */
  9591 
  9592 Lisp_Object
  9593 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
  9594                               bool encodep)
  9595 {
  9596   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
  9597 }
  9598 
  9599 
  9600 /* Return the gap address of BUFFER.  If the gap size is less than
  9601    NBYTES, enlarge the gap in advance.  */
  9602 
  9603 static unsigned char *
  9604 get_buffer_gap_address (Lisp_Object buffer, ptrdiff_t nbytes)
  9605 {
  9606   struct buffer *buf = XBUFFER (buffer);
  9607 
  9608   if (BUF_GPT (buf) != BUF_PT (buf))
  9609     {
  9610       struct buffer *oldb = current_buffer;
  9611 
  9612       current_buffer = buf;
  9613       move_gap_both (PT, PT_BYTE);
  9614       current_buffer = oldb;
  9615     }
  9616   if (BUF_GAP_SIZE (buf) < nbytes)
  9617     make_gap_1 (buf, nbytes);
  9618   return BUF_GPT_ADDR (buf);
  9619 }
  9620 
  9621 /* Return a pointer to the byte sequence for C, and its byte length in
  9622    LEN.  This function is used to get a byte sequence for HANDLE_8_BIT
  9623    and HANDLE_OVER_UNI arguments of encode_string_utf_8 and
  9624    decode_string_utf_8 when those arguments are given by
  9625    characters.  */
  9626 
  9627 static unsigned char *
  9628 get_char_bytes (int c, int *len)
  9629 {
  9630   /* Use two caches, since encode/decode_string_utf_8 are called
  9631      repeatedly with the same values for HANDLE_8_BIT and
  9632      HANDLE_OVER_UNI arguments.  */
  9633   static int chars[2];
  9634   static unsigned char bytes[2][6];
  9635   static int nbytes[2];
  9636   static int last_index;
  9637 
  9638   if (chars[last_index] == c)
  9639     {
  9640       *len = nbytes[last_index];
  9641       return bytes[last_index];
  9642     }
  9643   if (chars[1 - last_index] == c)
  9644     {
  9645       *len = nbytes[1 - last_index];
  9646       return bytes[1 - last_index];
  9647     }
  9648   last_index = 1 - last_index;
  9649   chars[last_index] = c;
  9650   *len = nbytes[last_index] = CHAR_STRING (c, bytes[last_index]);
  9651   return bytes[last_index];
  9652 }
  9653 
  9654 /* Encode STRING by the coding system utf-8-unix.
  9655 
  9656    This function is optimized for speed when the input string is
  9657    already a valid sequence of Unicode codepoints in the internal
  9658    representation, i.e. there are neither 8-bit raw bytes nor
  9659    characters beyond the Unicode range in the string's contents.
  9660 
  9661    Ignore any :pre-write-conversion and :encode-translation-table
  9662    properties.
  9663 
  9664    Assume that arguments have values as described below.
  9665    The validity must be enforced and ensured by the caller.
  9666 
  9667    STRING is a multibyte string or an ASCII-only unibyte string.
  9668 
  9669    BUFFER is a unibyte buffer or Qnil.
  9670 
  9671    If BUFFER is a unibyte buffer, insert the encoded result
  9672    after point of the buffer, and return the number of
  9673    inserted characters.  The caller should have made BUFFER ready for
  9674    modifying in advance (e.g., by calling invalidate_buffer_caches).
  9675 
  9676    If BUFFER is nil, return a unibyte string from the encoded result.
  9677 
  9678    If NOCOPY is non-zero, and if STRING contains only Unicode
  9679    characters (i.e., the encoding does not change the byte sequence),
  9680    return STRING even if it is multibyte.  WARNING: This will return a
  9681    _multibyte_ string, something that callers might not expect, especially
  9682    if STRING is not pure-ASCII; only use NOCOPY non-zero if the caller
  9683    will only use the byte sequence of the encoded result accessed by
  9684    SDATA or SSDATA, and the original STRING will _not_ be modified after
  9685    the encoding.  When in doubt, always pass NOCOPY as zero.  You _have_
  9686    been warned!
  9687 
  9688    HANDLE-8-BIT and HANDLE-OVER-UNI specify how to handle a non-Unicode
  9689    character in STRING.  The former is for an eight-bit character (represented
  9690    by a 2-byte overlong sequence in a multibyte STRING).  The latter is
  9691    for a codepoint beyond the end of the Unicode range (a character whose
  9692    code is greater than the maximum Unicode character 0x10FFFF, represented
  9693    by a 4 or 5-byte sequence in a multibyte STRING).
  9694 
  9695    If these two arguments are unibyte strings (typically
  9696    "\357\277\275", the UTF-8 sequence for the Unicode REPLACEMENT
  9697    CHARACTER #xFFFD), encode a non-Unicode character into that
  9698    unibyte sequence.
  9699 
  9700    If the two arguments are characters, encode a non-Unicode
  9701    character as the respective argument characters.
  9702 
  9703    If they are Qignored, skip a non-Unicode character.
  9704 
  9705    If HANDLE-8-BIT is Qt, encode eight-bit characters into single bytes
  9706    of the same value, like the usual Emacs encoding does.
  9707 
  9708    If HANDLE-OVER-UNI is Qt, encode characters beyond the Unicode
  9709    range into the same 4 or 5-byte sequence as used by Emacs
  9710    internally, like the usual Emacs encoding does.
  9711 
  9712    If the two arguments are Qnil, return Qnil if STRING has a
  9713    non-Unicode character.  This allows the caller to signal an error
  9714    if such input strings are not allowed.  */
  9715 
  9716 Lisp_Object
  9717 encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
  9718                      bool nocopy, Lisp_Object handle_8_bit,
  9719                      Lisp_Object handle_over_uni)
  9720 {
  9721   ptrdiff_t nchars = SCHARS (string), nbytes = SBYTES (string);
  9722   if (NILP (buffer) && nchars == nbytes && nocopy)
  9723     /* STRING contains only ASCII characters.  */
  9724     return string;
  9725 
  9726   ptrdiff_t num_8_bit = 0;   /* number of eight-bit chars in STRING */
  9727   /* The following two vars are counted only if handle_over_uni is not Qt.  */
  9728   ptrdiff_t num_over_4 = 0; /* number of 4-byte non-Unicode chars in STRING */
  9729   ptrdiff_t num_over_5 = 0; /* number of 5-byte non-Unicode chars in STRING */
  9730   ptrdiff_t outbytes;        /* number of bytes of decoding result */
  9731   unsigned char *p = SDATA (string);
  9732   unsigned char *pend = p + nbytes;
  9733   unsigned char *src = NULL, *dst = NULL;
  9734   unsigned char *replace_8_bit = NULL, *replace_over_uni = NULL;
  9735   int replace_8_bit_len = 0, replace_over_uni_len = 0;
  9736   Lisp_Object val;              /* the return value */
  9737 
  9738   /* Scan bytes in STRING twice.  The first scan is to count non-Unicode
  9739      characters, and the second scan is to encode STRING.  If the
  9740      encoding is trivial (no need of changing the byte sequence),
  9741      the second scan is avoided.  */
  9742   for (int scan_count = 0; scan_count < 2; scan_count++)
  9743     {
  9744       while (p < pend)
  9745         {
  9746           if (nchars == pend - p)
  9747             /* There is no multibyte character remaining.  */
  9748             break;
  9749 
  9750           int c = *p;
  9751           int len = BYTES_BY_CHAR_HEAD (c);
  9752 
  9753           nchars--;
  9754           if (len == 1
  9755               || len == 3
  9756               || (len == 2 ? ! CHAR_BYTE8_HEAD_P (c)
  9757                   : (EQ (handle_over_uni, Qt)
  9758                      || (len == 4
  9759                          && STRING_CHAR (p) <= MAX_UNICODE_CHAR))))
  9760             {
  9761               p += len;
  9762               continue;
  9763             }
  9764 
  9765           /* A character to change the byte sequence on encoding was
  9766              found.  A rare case.  */
  9767           if (len == 2)
  9768             {
  9769               /* Handle an eight-bit character by handle_8_bit.  */
  9770               if (scan_count == 0)
  9771                 {
  9772                   if (NILP (handle_8_bit))
  9773                     return Qnil;
  9774                   num_8_bit++;
  9775                 }
  9776               else
  9777                 {
  9778                   if (src < p)
  9779                     {
  9780                       memcpy (dst, src, p - src);
  9781                       dst += p - src;
  9782                     }
  9783                   if (replace_8_bit_len > 0)
  9784                     {
  9785                       memcpy (dst, replace_8_bit, replace_8_bit_len);
  9786                       dst += replace_8_bit_len;
  9787                     }
  9788                   else if (EQ (handle_8_bit, Qt))
  9789                     {
  9790                       int char8 = STRING_CHAR (p);
  9791                       *dst++ = CHAR_TO_BYTE8 (char8);
  9792                     }
  9793                 }
  9794             }
  9795           else                  /* len == 4 or 5 */
  9796             {
  9797               /* Handle an over-unicode character by handle_over_uni.  */
  9798               if (scan_count == 0)
  9799                 {
  9800                   if (NILP (handle_over_uni))
  9801                     return Qnil;
  9802                   if (len == 4)
  9803                     num_over_4++;
  9804                   else
  9805                     num_over_5++;
  9806                 }
  9807               else
  9808                 {
  9809                   if (src < p)
  9810                     {
  9811                       memcpy (dst, src, p - src);
  9812                       dst += p - src;
  9813                     }
  9814                   if (replace_over_uni_len > 0)
  9815                     {
  9816                       memcpy (dst, replace_over_uni, replace_over_uni_len);
  9817                       dst += replace_over_uni_len;
  9818                     }
  9819                 }
  9820             }
  9821           p += len;
  9822           src = p;
  9823         }
  9824 
  9825       if (scan_count == 0)
  9826         {
  9827           /* End of the first scan.  */
  9828           outbytes = nbytes;
  9829           if (num_8_bit == 0
  9830               && (num_over_4 + num_over_5 == 0 || EQ (handle_over_uni, Qt)))
  9831             {
  9832               /* We can break the loop because there is no need of
  9833                  changing the byte sequence.  This is the typical
  9834                  case.  */
  9835               scan_count = 1;
  9836             }
  9837           else
  9838             {
  9839               /* Prepare for handling non-Unicode characters during
  9840                  the next scan.  */
  9841               if (num_8_bit > 0)
  9842                 {
  9843                   if (CHARACTERP (handle_8_bit))
  9844                     replace_8_bit = get_char_bytes (XFIXNUM (handle_8_bit),
  9845                                                     &replace_8_bit_len);
  9846                   else if (STRINGP (handle_8_bit))
  9847                     {
  9848                       replace_8_bit = SDATA (handle_8_bit);
  9849                       replace_8_bit_len = SBYTES (handle_8_bit);
  9850                     }
  9851                   if (replace_8_bit)
  9852                     outbytes += (replace_8_bit_len - 2) * num_8_bit;
  9853                   else if (EQ (handle_8_bit, Qignored))
  9854                     outbytes -= 2 * num_8_bit;
  9855                   else if (EQ (handle_8_bit, Qt))
  9856                     outbytes -= num_8_bit;
  9857                   else
  9858                     return Qnil;
  9859                 }
  9860               if (num_over_4 + num_over_5 > 0)
  9861                 {
  9862                   if (CHARACTERP (handle_over_uni))
  9863                     replace_over_uni = get_char_bytes (XFIXNUM (handle_over_uni),
  9864                                                        &replace_over_uni_len);
  9865                   else if (STRINGP (handle_over_uni))
  9866                     {
  9867                       replace_over_uni = SDATA (handle_over_uni);
  9868                       replace_over_uni_len = SBYTES (handle_over_uni);
  9869                     }
  9870                   if (num_over_4 > 0)
  9871                     {
  9872                       if (replace_over_uni)
  9873                         outbytes += (replace_over_uni_len - 4) * num_over_4;
  9874                       else if (EQ (handle_over_uni, Qignored))
  9875                         outbytes -= 4 * num_over_4;
  9876                       else if (! EQ (handle_over_uni, Qt))
  9877                         return Qnil;
  9878                     }
  9879                   if (num_over_5 > 0)
  9880                     {
  9881                       if (replace_over_uni)
  9882                         outbytes += (replace_over_uni_len - 5) * num_over_5;
  9883                       else if (EQ (handle_over_uni, Qignored))
  9884                         outbytes -= 5 * num_over_5;
  9885                       else if (! EQ (handle_over_uni, Qt))
  9886                         return Qnil;
  9887                     }
  9888                 }
  9889             }
  9890 
  9891           /* Prepare return value and space to store the encoded bytes.  */
  9892           if (BUFFERP (buffer))
  9893             {
  9894               val = make_fixnum (outbytes);
  9895               dst = get_buffer_gap_address (buffer, nbytes);
  9896             }
  9897           else
  9898             {
  9899               if (nocopy && (num_8_bit + num_over_4 + num_over_5) == 0)
  9900                 return string;
  9901               val = make_uninit_string (outbytes);
  9902               dst = SDATA (val);
  9903             }
  9904           p = src = SDATA (string);
  9905         }
  9906     }
  9907 
  9908   if (src < pend)
  9909     memcpy (dst, src, pend - src);
  9910   if (BUFFERP (buffer))
  9911     {
  9912       struct buffer *oldb = current_buffer;
  9913 
  9914       current_buffer = XBUFFER (buffer);
  9915       insert_from_gap (outbytes, outbytes, false);
  9916       current_buffer = oldb;
  9917     }
  9918   return val;
  9919 }
  9920 
  9921 /* Decode input string by the coding system utf-8-unix.
  9922 
  9923    This function is optimized for speed when the input string is
  9924    already a valid UTF-8 sequence, i.e. there are neither 8-bit raw
  9925    bytes nor any UTF-8 sequences longer than 4 bytes in the string's
  9926    contents.
  9927 
  9928    Ignore any :post-read-conversion and :decode-translation-table
  9929    properties.
  9930 
  9931    Assume that arguments have values as described below.
  9932    The validity must be enforced and ensured by the caller.
  9933 
  9934    STRING is a unibyte string, an ASCII-only multibyte string, or Qnil.
  9935    If STRING is Qnil, the input is a C string pointed by STR whose
  9936    length in bytes is in STR_LEN.
  9937 
  9938    BUFFER is a multibyte buffer or Qnil.
  9939    If BUFFER is a multibyte buffer, insert the decoding result of
  9940    Unicode characters after point of the buffer, and return the number
  9941    of inserted characters.  The caller should have made BUFFER ready
  9942    for modifying in advance (e.g., by calling invalidate_buffer_caches).
  9943 
  9944    If BUFFER is Qnil, return a multibyte string from the decoded result.
  9945 
  9946    NOCOPY non-zero means it is OK to return the input STRING if it
  9947    contains only ASCII characters or only valid UTF-8 sequences of 2
  9948    to 4 bytes.  WARNING: This will return a _unibyte_ string, something
  9949    that callers might not expect, especially if STRING is not
  9950    pure-ASCII; only use NOCOPY non-zero if the caller will only use
  9951    the byte sequence of the decoded result accessed via SDATA or
  9952    SSDATA, and if the original STRING will _not_ be modified after the
  9953    decoding.  When in doubt, always pass NOCOPY as zero.  You _have_
  9954    been warned!
  9955 
  9956    If STRING is Qnil, and the original string is passed via STR, NOCOPY
  9957    is ignored.
  9958 
  9959    HANDLE-8-BIT and HANDLE-OVER-UNI specify how to handle a invalid
  9960    byte sequence.  The former is for a 1-byte invalid sequence that
  9961    violates the fundamental UTF-8 encoding rules.  The latter is for a
  9962    4 or 5-byte overlong sequences that Emacs internally uses to
  9963    represent characters beyond the Unicode range (characters whose
  9964    codepoints are greater than #x10FFFF).  Note that this function does
  9965    not in general treat such overlong UTF-8 sequences as invalid.
  9966 
  9967    If these two arguments are strings (typically a 1-char string of
  9968    the Unicode REPLACEMENT CHARACTER #xFFFD), decode an invalid byte
  9969    sequence into that string.  They must be multibyte strings if they
  9970    contain a non-ASCII character.
  9971 
  9972    If the two arguments are characters, decode an invalid byte
  9973    sequence into the corresponding multibyte representation of the
  9974    respective character.
  9975 
  9976    If they are Qignored, skip an invalid byte sequence without
  9977    producing anything in the decoded string.
  9978 
  9979    If HANDLE-8-BIT is Qt, decode a 1-byte invalid sequence into the
  9980    corresponding eight-bit multibyte representation, like the usual
  9981    Emacs decoding does.
  9982 
  9983    If HANDLE-OVER-UNI is Qt, decode a 4 or 5-byte overlong sequence
  9984    that follows Emacs' internal representation for a character beyond
  9985    Unicode range into the corresponding character, like the usual
  9986    Emacs decoding does.
  9987 
  9988    If the two arguments are Qnil, return Qnil if the input string has
  9989    raw bytes or overlong sequences.  This allows the caller to signal
  9990    an error if such inputs are not allowed.  */
  9991 
  9992 Lisp_Object
  9993 decode_string_utf_8 (Lisp_Object string, const char *str, ptrdiff_t str_len,
  9994                      Lisp_Object buffer, bool nocopy,
  9995                      Lisp_Object handle_8_bit, Lisp_Object handle_over_uni)
  9996 {
  9997   /* This is like BYTES_BY_CHAR_HEAD, but it is assured that C >= 0x80
  9998      and it returns 0 for an invalid sequence.  */
  9999 #define UTF_8_SEQUENCE_LENGTH(c)        \
 10000   ((c) < 0xC2 ? 0                       \
 10001    : (c) < 0xE0 ? 2                     \
 10002    : (c) < 0xF0 ? 3                     \
 10003    : (c) < 0xF8 ? 4                     \
 10004    : (c) == 0xF8 ? 5                    \
 10005    : 0)
 10006 
 10007   ptrdiff_t nbytes = STRINGP (string) ? SBYTES (string) : str_len;
 10008   unsigned char *p = STRINGP (string) ? SDATA (string) : (unsigned char *) str;
 10009   unsigned char *str_orig = p;
 10010   unsigned char *pend = p + nbytes;
 10011   ptrdiff_t num_8_bit = 0;   /* number of invalid 1-byte sequences */
 10012   ptrdiff_t num_over_4 = 0;  /* number of invalid 4-byte sequences */
 10013   ptrdiff_t num_over_5 = 0;  /* number of invalid 5-byte sequences */
 10014   ptrdiff_t outbytes = nbytes;  /* number of decoded bytes */
 10015   ptrdiff_t outchars = 0;    /* number of decoded characters */
 10016   unsigned char *src = NULL, *dst = NULL;
 10017   bool change_byte_sequence = false;
 10018 
 10019   /* Scan input bytes twice.  The first scan is to count invalid
 10020      sequences, and the second scan is to decode input.  If the
 10021      decoding is trivial (no need of changing the byte sequence),
 10022      the second scan is avoided.  */
 10023   while (p < pend)
 10024     {
 10025       src = p;
 10026       /* Try short cut for an ASCII-only case.  */
 10027       while (p < pend && *p < 0x80) p++;
 10028       outchars += (p - src);
 10029       if (p == pend)
 10030         break;
 10031       int c = *p;
 10032       outchars++;
 10033       int len = UTF_8_SEQUENCE_LENGTH (c);
 10034       /* len == 0, 2, 3, 4, 5.  */
 10035       if (UTF_8_EXTRA_OCTET_P (p[1])
 10036           && (len == 2
 10037               || (UTF_8_EXTRA_OCTET_P (p[2])
 10038                   && (len == 3
 10039                       || (UTF_8_EXTRA_OCTET_P (p[3])
 10040                           && len == 4
 10041                           && STRING_CHAR (p) <= MAX_UNICODE_CHAR)))))
 10042         {
 10043           p += len;
 10044           continue;
 10045         }
 10046 
 10047       /* A sequence to change on decoding was found.  A rare case.  */
 10048       if (len == 0)
 10049         {
 10050           if (NILP (handle_8_bit))
 10051             return Qnil;
 10052           num_8_bit++;
 10053           len = 1;
 10054         }
 10055       else                      /* len == 4 or 5 */
 10056         {
 10057           if (NILP (handle_over_uni))
 10058             return Qnil;
 10059           if (len == 4)
 10060             num_over_4++;
 10061           else
 10062             num_over_5++;
 10063         }
 10064       change_byte_sequence = true;
 10065       p += len;
 10066     }
 10067 
 10068   Lisp_Object val;           /* the return value */
 10069 
 10070   if (! change_byte_sequence
 10071       && NILP (buffer))
 10072     {
 10073       if (nocopy && STRINGP (string))
 10074         return string;
 10075       val = make_uninit_multibyte_string (outchars, outbytes);
 10076       memcpy (SDATA (val), str_orig, pend - str_orig);
 10077       return val;
 10078     }
 10079 
 10080   /* Count the number of resulting chars and bytes.  */
 10081   unsigned char *replace_8_bit = NULL, *replace_over_uni = NULL;
 10082   int replace_8_bit_len = 0, replace_over_uni_len = 0;
 10083 
 10084   if (change_byte_sequence)
 10085     {
 10086       if (num_8_bit > 0)
 10087         {
 10088           if (CHARACTERP (handle_8_bit))
 10089             replace_8_bit = get_char_bytes (XFIXNUM (handle_8_bit),
 10090                                             &replace_8_bit_len);
 10091           else if (STRINGP (handle_8_bit))
 10092             {
 10093               replace_8_bit = SDATA (handle_8_bit);
 10094               replace_8_bit_len = SBYTES (handle_8_bit);
 10095             }
 10096           if (replace_8_bit)
 10097             outbytes += (replace_8_bit_len - 1) * num_8_bit;
 10098           else if (EQ (handle_8_bit, Qignored))
 10099             {
 10100               outbytes -= num_8_bit;
 10101               outchars -= num_8_bit;
 10102             }
 10103           else /* EQ (handle_8_bit, Qt)) */
 10104             outbytes += num_8_bit;
 10105         }
 10106       else if (num_over_4 + num_over_5 > 0)
 10107         {
 10108           if (CHARACTERP (handle_over_uni))
 10109             replace_over_uni = get_char_bytes (XFIXNUM (handle_over_uni),
 10110                                                &replace_over_uni_len);
 10111           else if (STRINGP (handle_over_uni))
 10112             {
 10113               replace_over_uni = SDATA (handle_over_uni);
 10114               replace_over_uni_len = SBYTES (handle_over_uni);
 10115             }
 10116           if (num_over_4 > 0)
 10117             {
 10118               if (replace_over_uni)
 10119                 outbytes += (replace_over_uni_len - 4) * num_over_4;
 10120               else if (EQ (handle_over_uni, Qignored))
 10121                 {
 10122                   outbytes -= 4 * num_over_4;
 10123                   outchars -= num_over_4;
 10124                 }
 10125             }
 10126           if (num_over_5 > 0)
 10127             {
 10128               if (replace_over_uni)
 10129                 outbytes += (replace_over_uni_len - 5) * num_over_5;
 10130               else if (EQ (handle_over_uni, Qignored))
 10131                 {
 10132                   outbytes -= 5 * num_over_5;
 10133                   outchars -= num_over_5;
 10134                 }
 10135             }
 10136         }
 10137     }
 10138 
 10139   /* Prepare return value and  space to store the decoded bytes.  */
 10140   if (BUFFERP (buffer))
 10141     {
 10142       val = make_fixnum (outchars);
 10143       dst = get_buffer_gap_address (buffer, outbytes);
 10144     }
 10145   else
 10146     {
 10147       if (nocopy && (num_8_bit + num_over_4 + num_over_5) == 0
 10148           && STRINGP (string))
 10149         return string;
 10150       val = make_uninit_multibyte_string (outchars, outbytes);
 10151       dst = SDATA (val);
 10152     }
 10153 
 10154   src = str_orig;
 10155   if (change_byte_sequence)
 10156     {
 10157       p = src;
 10158       while (p < pend)
 10159         {
 10160           /* Try short cut for an ASCII-only case.  */
 10161           /* while (p < pend && *p < 0x80) p++; */
 10162           /* if (p == pend) */
 10163           /*   break; */
 10164           int c = *p;
 10165           if (c < 0x80)
 10166             {
 10167               p++;
 10168               continue;
 10169             }
 10170           int len = UTF_8_SEQUENCE_LENGTH (c);
 10171           if (len > 1)
 10172             {
 10173               int mlen;
 10174               for (mlen = 1; mlen < len && UTF_8_EXTRA_OCTET_P (p[mlen]);
 10175                    mlen++);
 10176               if (mlen == len
 10177                   && (len <= 3
 10178                       || (len == 4 && STRING_CHAR (p) <= MAX_UNICODE_CHAR)
 10179                       || EQ (handle_over_uni, Qt)))
 10180                 {
 10181                   p += len;
 10182                   continue;
 10183                 }
 10184             }
 10185 
 10186           if (src < p)
 10187             {
 10188               memcpy (dst, src, p - src);
 10189               dst += p - src;
 10190             }
 10191           if (len == 0)
 10192             {
 10193               if (replace_8_bit)
 10194                 {
 10195                   memcpy (dst, replace_8_bit, replace_8_bit_len);
 10196                   dst += replace_8_bit_len;
 10197                 }
 10198               else if (EQ (handle_8_bit, Qt))
 10199                 {
 10200                   dst += BYTE8_STRING (c, dst);
 10201                 }
 10202               len = 1;
 10203             }
 10204           else                  /* len == 4 or 5 */
 10205             {
 10206               /* Handle p[0]... by handle_over_uni.  */
 10207               if (replace_over_uni)
 10208                 {
 10209                   memcpy (dst, replace_over_uni, replace_over_uni_len);
 10210                   dst += replace_over_uni_len;
 10211                 }
 10212             }
 10213           p += len;
 10214           src = p;
 10215         }
 10216     }
 10217 
 10218   if (src < pend)
 10219     memcpy (dst, src, pend - src);
 10220   if (BUFFERP (buffer))
 10221     {
 10222       struct buffer *oldb = current_buffer;
 10223 
 10224       current_buffer = XBUFFER (buffer);
 10225       insert_from_gap (outchars, outbytes, false);
 10226       current_buffer = oldb;
 10227     }
 10228   return val;
 10229 }
 10230 
 10231 /* #define ENABLE_UTF_8_CONVERTER_TEST */
 10232 
 10233 #ifdef ENABLE_UTF_8_CONVERTER_TEST
 10234 
 10235 /* These functions are useful for testing and benchmarking
 10236    encode_string_utf_8 and decode_string_utf_8.  */
 10237 
 10238 /* ENCODE_METHOD specifies which internal decoder to use.
 10239    If it is Qnil, use encode_string_utf_8.
 10240    Otherwise, use code_convert_string.
 10241 
 10242    COUNT, if integer, specifies how many times to call those functions
 10243    with the same arguments (for benchmarking). */
 10244 
 10245 DEFUN ("internal-encode-string-utf-8", Finternal_encode_string_utf_8,
 10246        Sinternal_encode_string_utf_8, 7, 7, 0,
 10247        doc: /* Internal use only.*/)
 10248   (Lisp_Object string, Lisp_Object buffer, Lisp_Object nocopy,
 10249    Lisp_Object handle_8_bit, Lisp_Object handle_over_uni,
 10250    Lisp_Object encode_method, Lisp_Object count)
 10251 {
 10252   int repeat_count;
 10253   Lisp_Object val;
 10254 
 10255   /* Check arguments.  Return Qnil when an argument is invalid.  */
 10256   if (! STRINGP (string))
 10257     return Qnil;
 10258   if (! NILP (buffer)
 10259       && (! BUFFERP (buffer)
 10260           || ! NILP (BVAR (XBUFFER (buffer), enable_multibyte_characters))))
 10261     return Qnil;
 10262   if (! NILP (handle_8_bit) && ! EQ (handle_8_bit, Qt)
 10263       && ! EQ (handle_8_bit, Qignored)
 10264       && ! CHARACTERP (handle_8_bit)
 10265       && (! STRINGP (handle_8_bit) || STRING_MULTIBYTE (handle_8_bit)))
 10266     return Qnil;
 10267   if (! NILP (handle_over_uni) && ! EQ (handle_over_uni, Qt)
 10268       && ! EQ (handle_over_uni, Qignored)
 10269       && ! CHARACTERP (handle_over_uni)
 10270       && (! STRINGP (handle_over_uni) || STRING_MULTIBYTE (handle_over_uni)))
 10271     return Qnil;
 10272 
 10273   CHECK_FIXNUM (count);
 10274   repeat_count = XFIXNUM (count);
 10275 
 10276   val = Qnil;
 10277   /* Run an encoder according to ENCODE_METHOD.  */
 10278   if (NILP (encode_method))
 10279     {
 10280       for (int i = 0; i < repeat_count; i++)
 10281         val = encode_string_utf_8 (string, buffer, ! NILP (nocopy),
 10282                                    handle_8_bit, handle_over_uni);
 10283     }
 10284   else
 10285     {
 10286       for (int i = 0; i < repeat_count; i++)
 10287         val = code_convert_string (string, Qutf_8_unix, Qnil, true,
 10288                                    ! NILP (nocopy), true);
 10289     }
 10290   return val;
 10291 }
 10292 
 10293 /* DECODE_METHOD specifies which internal decoder to use.
 10294    If it is Qnil, use decode_string_utf_8.
 10295    If it is Qt, use code_convert_string.
 10296    Otherwise, use make_string_from_utf8.
 10297 
 10298    COUNT, if integer, specifies how many times to call those functions
 10299    with the same arguments (for benchmarking).  */
 10300 
 10301 DEFUN ("internal-decode-string-utf-8", Finternal_decode_string_utf_8,
 10302        Sinternal_decode_string_utf_8, 7, 7, 0,
 10303        doc: /* Internal use only.*/)
 10304   (Lisp_Object string, Lisp_Object buffer, Lisp_Object nocopy,
 10305    Lisp_Object handle_8_bit, Lisp_Object handle_over_uni,
 10306    Lisp_Object decode_method, Lisp_Object count)
 10307 {
 10308   int repeat_count;
 10309   Lisp_Object val;
 10310 
 10311   /* Check arguments.  Return Qnil when an argument is invalid.  */
 10312   if (! STRINGP (string))
 10313     return Qnil;
 10314   if (! NILP (buffer)
 10315       && (! BUFFERP (buffer)
 10316           || NILP (BVAR (XBUFFER (buffer), enable_multibyte_characters))))
 10317     return Qnil;
 10318   if (! NILP (handle_8_bit) && ! EQ (handle_8_bit, Qt)
 10319       && ! EQ (handle_8_bit, Qignored)
 10320       && ! CHARACTERP (handle_8_bit)
 10321       && (! STRINGP (handle_8_bit) || ! STRING_MULTIBYTE (handle_8_bit)))
 10322     return Qnil;
 10323   if (! NILP (handle_over_uni) && ! EQ (handle_over_uni, Qt)
 10324       && ! EQ (handle_over_uni, Qignored)
 10325       && ! CHARACTERP (handle_over_uni)
 10326       && (! STRINGP (handle_over_uni) || ! STRING_MULTIBYTE (handle_over_uni)))
 10327     return Qnil;
 10328 
 10329   CHECK_FIXNUM (count);
 10330   repeat_count = XFIXNUM (count);
 10331 
 10332   val = Qnil;
 10333   /* Run a decoder according to DECODE_METHOD.  */
 10334   if (NILP (decode_method))
 10335     {
 10336       for (int i = 0; i < repeat_count; i++)
 10337         val = decode_string_utf_8 (string, buffer, ! NILP (nocopy),
 10338                                    handle_8_bit, handle_over_uni);
 10339     }
 10340   else if (EQ (decode_method, Qt))
 10341     {
 10342       if (! BUFFERP (buffer))
 10343         buffer = Qt;
 10344       for (int i = 0; i < repeat_count; i++)
 10345         val = code_convert_string (string, Qutf_8_unix, buffer, false,
 10346                                    ! NILP (nocopy), true);
 10347     }
 10348   else if (! NILP (decode_method))
 10349     {
 10350       for (int i = 0; i < repeat_count; i++)
 10351         val = make_string_from_utf8 ((char *) SDATA (string), SBYTES (string));
 10352     }
 10353   return val;
 10354 }
 10355 
 10356 #endif  /* ENABLE_UTF_8_CONVERTER_TEST */
 10357 
 10358 /* Encode or decode STRING using CODING_SYSTEM, with the possibility of
 10359    returning STRING itself if it equals the result.
 10360    Do not set Vlast_coding_system_used.  */
 10361 static Lisp_Object
 10362 convert_string_nocopy (Lisp_Object string, Lisp_Object coding_system,
 10363                        bool encodep)
 10364 {
 10365   return code_convert_string (string, coding_system, Qt, encodep, 1, 1);
 10366 }
 10367 
 10368 /* Encode or decode a file name, to or from a unibyte string suitable
 10369    for passing to C library functions.  */
 10370 Lisp_Object
 10371 decode_file_name (Lisp_Object fname)
 10372 {
 10373 #ifdef WINDOWSNT
 10374   /* The w32 build pretends to use UTF-8 for file-name encoding, and
 10375      converts the file names either to UTF-16LE or to the system ANSI
 10376      codepage internally, depending on the underlying OS; see w32.c.  */
 10377   if (! NILP (Fcoding_system_p (Qutf_8)))
 10378     return convert_string_nocopy (fname, Qutf_8, 0);
 10379   return fname;
 10380 #else  /* !WINDOWSNT */
 10381   if (! NILP (Vfile_name_coding_system))
 10382     return convert_string_nocopy (fname, Vfile_name_coding_system, 0);
 10383   else if (! NILP (Vdefault_file_name_coding_system))
 10384     return convert_string_nocopy (fname, Vdefault_file_name_coding_system, 0);
 10385   else
 10386     return fname;
 10387 #endif
 10388 }
 10389 
 10390 static Lisp_Object
 10391 encode_file_name_1 (Lisp_Object fname)
 10392 {
 10393   /* This is especially important during bootstrap and dumping, when
 10394      file-name encoding is not yet known, and therefore any non-ASCII
 10395      file names are unibyte strings, and could only be thrashed if we
 10396      try to encode them.  */
 10397   if (!STRING_MULTIBYTE (fname))
 10398     return fname;
 10399 #ifdef WINDOWSNT
 10400   /* The w32 build pretends to use UTF-8 for file-name encoding, and
 10401      converts the file names either to UTF-16LE or to the system ANSI
 10402      codepage internally, depending on the underlying OS; see w32.c.  */
 10403   if (! NILP (Fcoding_system_p (Qutf_8)))
 10404     return convert_string_nocopy (fname, Qutf_8, 1);
 10405   return fname;
 10406 #else  /* !WINDOWSNT */
 10407   if (! NILP (Vfile_name_coding_system))
 10408     return convert_string_nocopy (fname, Vfile_name_coding_system, 1);
 10409   else if (! NILP (Vdefault_file_name_coding_system))
 10410     return convert_string_nocopy (fname, Vdefault_file_name_coding_system, 1);
 10411   else
 10412     return fname;
 10413 #endif
 10414 }
 10415 
 10416 Lisp_Object
 10417 encode_file_name (Lisp_Object fname)
 10418 {
 10419   Lisp_Object encoded = encode_file_name_1 (fname);
 10420   /* No system accepts NUL bytes in filenames.  Allowing them can
 10421      cause subtle bugs because the system would silently use a
 10422      different filename than expected.  Perform this check after
 10423      encoding to not miss NUL bytes introduced through encoding.  */
 10424   CHECK_STRING_NULL_BYTES (encoded);
 10425   return encoded;
 10426 }
 10427 
 10428 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
 10429        2, 4, 0,
 10430        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
 10431 
 10432 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
 10433 if the decoding operation is trivial.
 10434 
 10435 Optional fourth arg BUFFER non-nil means that the decoded text is
 10436 inserted in that buffer after point (point does not move).  In this
 10437 case, the return value is the length of the decoded text.  If that
 10438 buffer is unibyte, it receives the individual bytes of the internal
 10439 representation of the decoded text.
 10440 
 10441 This function sets `last-coding-system-used' to the precise coding system
 10442 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
 10443 not fully specified.)  The function does not change the match data.  */)
 10444   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
 10445 {
 10446   return code_convert_string (string, coding_system, buffer,
 10447                               0, ! NILP (nocopy), 0);
 10448 }
 10449 
 10450 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
 10451        2, 4, 0,
 10452        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
 10453 
 10454 Optional third arg NOCOPY non-nil means it is OK to return STRING
 10455 itself if the encoding operation is trivial.
 10456 
 10457 Optional fourth arg BUFFER non-nil means that the encoded text is
 10458 inserted in that buffer after point (point does not move).  In this
 10459 case, the return value is the length of the encoded text.
 10460 
 10461 This function sets `last-coding-system-used' to the precise coding system
 10462 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
 10463 not fully specified.)  The function does not change the match data.  */)
 10464   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
 10465 {
 10466   return code_convert_string (string, coding_system, buffer,
 10467                               1, ! NILP (nocopy), 0);
 10468 }
 10469 
 10470 
 10471 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
 10472        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
 10473 Return the corresponding character.  */)
 10474   (Lisp_Object code)
 10475 {
 10476   Lisp_Object spec, attrs, val;
 10477   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
 10478   EMACS_INT ch;
 10479   int c;
 10480 
 10481   CHECK_FIXNAT (code);
 10482   ch = XFIXNAT (code);
 10483   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
 10484   attrs = AREF (spec, 0);
 10485 
 10486   if (ASCII_CHAR_P (ch)
 10487       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
 10488     return code;
 10489 
 10490   val = CODING_ATTR_CHARSET_LIST (attrs);
 10491   charset_roman = CHARSET_FROM_ID (XFIXNUM (XCAR (val))), val = XCDR (val);
 10492   charset_kana = CHARSET_FROM_ID (XFIXNUM (XCAR (val))), val = XCDR (val);
 10493   charset_kanji = CHARSET_FROM_ID (XFIXNUM (XCAR (val)));
 10494 
 10495   if (ch <= 0x7F)
 10496     {
 10497       c = ch;
 10498       charset = charset_roman;
 10499     }
 10500   else if (ch >= 0xA0 && ch < 0xDF)
 10501     {
 10502       c = ch - 0x80;
 10503       charset = charset_kana;
 10504     }
 10505   else
 10506     {
 10507       EMACS_INT c1 = ch >> 8;
 10508       int c2 = ch & 0xFF;
 10509 
 10510       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
 10511           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
 10512         error ("Invalid code: %"pI"d", ch);
 10513       c = ch;
 10514       SJIS_TO_JIS (c);
 10515       charset = charset_kanji;
 10516     }
 10517   c = DECODE_CHAR (charset, c);
 10518   if (c < 0)
 10519     error ("Invalid code: %"pI"d", ch);
 10520   return make_fixnum (c);
 10521 }
 10522 
 10523 
 10524 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
 10525        doc: /* Encode a Japanese character CH to shift_jis encoding.
 10526 Return the corresponding code in SJIS.  */)
 10527   (Lisp_Object ch)
 10528 {
 10529   Lisp_Object spec, attrs, charset_list;
 10530   int c;
 10531   struct charset *charset;
 10532   unsigned code;
 10533 
 10534   CHECK_CHARACTER (ch);
 10535   c = XFIXNAT (ch);
 10536   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
 10537   attrs = AREF (spec, 0);
 10538 
 10539   if (ASCII_CHAR_P (c)
 10540       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
 10541     return ch;
 10542 
 10543   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
 10544   charset = char_charset (c, charset_list, &code);
 10545   if (code == CHARSET_INVALID_CODE (charset))
 10546     error ("Can't encode by shift_jis encoding: %c", c);
 10547   JIS_TO_SJIS (code);
 10548 
 10549   return make_fixnum (code);
 10550 }
 10551 
 10552 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
 10553        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
 10554 Return the corresponding character.  */)
 10555   (Lisp_Object code)
 10556 {
 10557   Lisp_Object spec, attrs, val;
 10558   struct charset *charset_roman, *charset_big5, *charset;
 10559   EMACS_INT ch;
 10560   int c;
 10561 
 10562   CHECK_FIXNAT (code);
 10563   ch = XFIXNAT (code);
 10564   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
 10565   attrs = AREF (spec, 0);
 10566 
 10567   if (ASCII_CHAR_P (ch)
 10568       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
 10569     return code;
 10570 
 10571   val = CODING_ATTR_CHARSET_LIST (attrs);
 10572   charset_roman = CHARSET_FROM_ID (XFIXNUM (XCAR (val))), val = XCDR (val);
 10573   charset_big5 = CHARSET_FROM_ID (XFIXNUM (XCAR (val)));
 10574 
 10575   if (ch <= 0x7F)
 10576     {
 10577       c = ch;
 10578       charset = charset_roman;
 10579     }
 10580   else
 10581     {
 10582       EMACS_INT b1 = ch >> 8;
 10583       int b2 = ch & 0x7F;
 10584       if (b1 < 0xA1 || b1 > 0xFE
 10585           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
 10586         error ("Invalid code: %"pI"d", ch);
 10587       c = ch;
 10588       charset = charset_big5;
 10589     }
 10590   c = DECODE_CHAR (charset, c);
 10591   if (c < 0)
 10592     error ("Invalid code: %"pI"d", ch);
 10593   return make_fixnum (c);
 10594 }
 10595 
 10596 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
 10597        doc: /* Encode the Big5 character CH to BIG5 coding system.
 10598 Return the corresponding character code in Big5.  */)
 10599   (Lisp_Object ch)
 10600 {
 10601   Lisp_Object spec, attrs, charset_list;
 10602   struct charset *charset;
 10603   int c;
 10604   unsigned code;
 10605 
 10606   CHECK_CHARACTER (ch);
 10607   c = XFIXNAT (ch);
 10608   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
 10609   attrs = AREF (spec, 0);
 10610   if (ASCII_CHAR_P (c)
 10611       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
 10612     return ch;
 10613 
 10614   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
 10615   charset = char_charset (c, charset_list, &code);
 10616   if (code == CHARSET_INVALID_CODE (charset))
 10617     error ("Can't encode by Big5 encoding: %c", c);
 10618 
 10619   return make_fixnum (code);
 10620 }
 10621 
 10622 
 10623 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
 10624        Sset_terminal_coding_system_internal, 1, 2, 0,
 10625        doc: /* Internal use only.  */)
 10626   (Lisp_Object coding_system, Lisp_Object terminal)
 10627 {
 10628   struct terminal *term = decode_live_terminal (terminal);
 10629   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
 10630   CHECK_SYMBOL (coding_system);
 10631   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
 10632   /* We had better not send unsafe characters to terminal.  */
 10633   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
 10634   /* Character composition should be disabled.  */
 10635   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
 10636   terminal_coding->src_multibyte = 1;
 10637   terminal_coding->dst_multibyte = 0;
 10638   tset_charset_list
 10639     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
 10640             ? coding_charset_list (terminal_coding)
 10641             : list1i (charset_ascii)));
 10642   return Qnil;
 10643 }
 10644 
 10645 DEFUN ("set-safe-terminal-coding-system-internal",
 10646        Fset_safe_terminal_coding_system_internal,
 10647        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
 10648        doc: /* Internal use only.  */)
 10649   (Lisp_Object coding_system)
 10650 {
 10651   CHECK_SYMBOL (coding_system);
 10652   setup_coding_system (Fcheck_coding_system (coding_system),
 10653                        &safe_terminal_coding);
 10654   /* Character composition should be disabled.  */
 10655   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
 10656   safe_terminal_coding.src_multibyte = 1;
 10657   safe_terminal_coding.dst_multibyte = 0;
 10658   return Qnil;
 10659 }
 10660 
 10661 DEFUN ("terminal-coding-system", Fterminal_coding_system,
 10662        Sterminal_coding_system, 0, 1, 0,
 10663        doc: /* Return coding system specified for terminal output on the given terminal.
 10664 TERMINAL may be a terminal object, a frame, or nil for the selected
 10665 frame's terminal device.  */)
 10666   (Lisp_Object terminal)
 10667 {
 10668   struct coding_system *terminal_coding
 10669     = TERMINAL_TERMINAL_CODING (decode_live_terminal (terminal));
 10670   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
 10671 
 10672   /* For backward compatibility, return nil if it is `undecided'.  */
 10673   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
 10674 }
 10675 
 10676 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
 10677        Sset_keyboard_coding_system_internal, 1, 2, 0,
 10678        doc: /* Internal use only.  */)
 10679   (Lisp_Object coding_system, Lisp_Object terminal)
 10680 {
 10681   struct terminal *t = decode_live_terminal (terminal);
 10682   CHECK_SYMBOL (coding_system);
 10683   if (NILP (coding_system))
 10684     coding_system = Qno_conversion;
 10685   else
 10686     Fcheck_coding_system (coding_system);
 10687   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
 10688   /* Character composition should be disabled.  */
 10689   TERMINAL_KEYBOARD_CODING (t)->common_flags
 10690     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
 10691   return Qnil;
 10692 }
 10693 
 10694 DEFUN ("keyboard-coding-system",
 10695        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
 10696        doc: /* Return coding system specified for decoding keyboard input.  */)
 10697   (Lisp_Object terminal)
 10698 {
 10699   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
 10700                          (decode_live_terminal (terminal))->id);
 10701 }
 10702 
 10703 
 10704 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
 10705        Sfind_operation_coding_system,  1, MANY, 0,
 10706        doc: /* Choose a coding system for an operation based on the target name.
 10707 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
 10708 DECODING-SYSTEM is the coding system to use for decoding
 10709 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
 10710 for encoding (in case OPERATION does encoding).
 10711 
 10712 The first argument OPERATION specifies an I/O primitive:
 10713   For file I/O, `insert-file-contents' or `write-region'.
 10714   For process I/O, `call-process', `call-process-region', or `start-process'.
 10715   For network I/O, `open-network-stream'.
 10716 
 10717 The remaining arguments should be the same arguments that were passed
 10718 to the primitive.  Depending on which primitive, one of those arguments
 10719 is selected as the TARGET.  For example, if OPERATION does file I/O,
 10720 whichever argument specifies the file name is TARGET.
 10721 
 10722 TARGET has a meaning which depends on OPERATION:
 10723   For file I/O, TARGET is a file name (except for the special case below).
 10724   For process I/O, TARGET is a process name.
 10725   For network I/O, TARGET is a service name or a port number.
 10726 
 10727 This function looks up what is specified for TARGET in
 10728 `file-coding-system-alist', `process-coding-system-alist',
 10729 or `network-coding-system-alist' depending on OPERATION.
 10730 They may specify a coding system, a cons of coding systems,
 10731 or a function symbol to call.
 10732 In the last case, we call the function with one argument,
 10733 which is a list of all the arguments given to this function.
 10734 If the function can't decide a coding system, it can return
 10735 `undecided' so that the normal code-detection is performed.
 10736 
 10737 If OPERATION is `insert-file-contents', the argument corresponding to
 10738 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
 10739 file name to look up, and BUFFER is a buffer that contains the file's
 10740 contents (not yet decoded).  If `file-coding-system-alist' specifies a
 10741 function to call for FILENAME, that function should examine the
 10742 contents of BUFFER instead of reading the file.
 10743 
 10744 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
 10745   (ptrdiff_t nargs, Lisp_Object *args)
 10746 {
 10747   Lisp_Object operation, target_idx, target, val;
 10748   register Lisp_Object chain;
 10749 
 10750   if (nargs < 2)
 10751     error ("Too few arguments");
 10752   operation = args[0];
 10753   if (!SYMBOLP (operation)
 10754       || (target_idx = Fget (operation, Qtarget_idx), !FIXNATP (target_idx)))
 10755     error ("Invalid first argument");
 10756   if (nargs <= 1 + XFIXNAT (target_idx))
 10757     error ("Too few arguments for operation `%s'",
 10758            SDATA (SYMBOL_NAME (operation)));
 10759   target = args[XFIXNAT (target_idx) + 1];
 10760   if (!(STRINGP (target)
 10761         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
 10762             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
 10763         || (EQ (operation, Qopen_network_stream)
 10764             && (FIXNUMP (target) || EQ (target, Qt)))))
 10765     error ("Invalid argument %"pI"d of operation `%s'",
 10766            XFIXNAT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
 10767   if (CONSP (target))
 10768     target = XCAR (target);
 10769 
 10770   chain = ((EQ (operation, Qinsert_file_contents)
 10771             || EQ (operation, Qwrite_region))
 10772            ? Vfile_coding_system_alist
 10773            : (EQ (operation, Qopen_network_stream)
 10774               ? Vnetwork_coding_system_alist
 10775               : Vprocess_coding_system_alist));
 10776   if (NILP (chain))
 10777     return Qnil;
 10778 
 10779   for (; CONSP (chain); chain = XCDR (chain))
 10780     {
 10781       Lisp_Object elt;
 10782 
 10783       elt = XCAR (chain);
 10784       if (CONSP (elt)
 10785           && ((STRINGP (target)
 10786                && STRINGP (XCAR (elt))
 10787                && fast_string_match (XCAR (elt), target) >= 0)
 10788               || (FIXNUMP (target) && BASE_EQ (target, XCAR (elt)))))
 10789         {
 10790           val = XCDR (elt);
 10791           /* Here, if VAL is both a valid coding system and a valid
 10792              function symbol, we return VAL as a coding system.  */
 10793           if (CONSP (val))
 10794             return val;
 10795           if (! SYMBOLP (val))
 10796             return Qnil;
 10797           if (! NILP (Fcoding_system_p (val)))
 10798             return Fcons (val, val);
 10799           if (! NILP (Ffboundp (val)))
 10800             {
 10801               /* We use call1 rather than safe_call1
 10802                  so as to get bug reports about functions called here
 10803                  which don't handle the current interface.  */
 10804               val = call1 (val, Flist (nargs, args));
 10805               if (CONSP (val))
 10806                 return val;
 10807               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
 10808                 return Fcons (val, val);
 10809             }
 10810           return Qnil;
 10811         }
 10812     }
 10813   return Qnil;
 10814 }
 10815 
 10816 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
 10817        Sset_coding_system_priority, 0, MANY, 0,
 10818        doc: /* Assign higher priority to the coding systems given as arguments.
 10819 If multiple coding systems belong to the same category,
 10820 all but the first one are ignored.
 10821 
 10822 usage: (set-coding-system-priority &rest coding-systems)  */)
 10823   (ptrdiff_t nargs, Lisp_Object *args)
 10824 {
 10825   ptrdiff_t i, j;
 10826   bool changed[coding_category_max];
 10827   enum coding_category priorities[coding_category_max];
 10828 
 10829   memset (changed, 0, sizeof changed);
 10830 
 10831   for (i = j = 0; i < nargs; i++)
 10832     {
 10833       enum coding_category category;
 10834       Lisp_Object spec, attrs;
 10835 
 10836       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
 10837       attrs = AREF (spec, 0);
 10838       category = XFIXNUM (CODING_ATTR_CATEGORY (attrs));
 10839       if (changed[category])
 10840         /* Ignore this coding system because a coding system of the
 10841            same category already had a higher priority.  */
 10842         continue;
 10843       changed[category] = 1;
 10844       priorities[j++] = category;
 10845       if (coding_categories[category].id >= 0
 10846           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
 10847         setup_coding_system (args[i], &coding_categories[category]);
 10848       Fset (AREF (Vcoding_category_table, category), args[i]);
 10849     }
 10850 
 10851   /* Now we have decided top J priorities.  Reflect the order of the
 10852      original priorities to the remaining priorities.  */
 10853 
 10854   for (i = j, j = 0; i < coding_category_max; i++, j++)
 10855     {
 10856       while (j < coding_category_max
 10857              && changed[coding_priorities[j]])
 10858         j++;
 10859       if (j == coding_category_max)
 10860         emacs_abort ();
 10861       priorities[i] = coding_priorities[j];
 10862     }
 10863 
 10864   memcpy (coding_priorities, priorities, sizeof priorities);
 10865 
 10866   /* Update `coding-category-list'.  */
 10867   Vcoding_category_list = Qnil;
 10868   for (i = coding_category_max; i-- > 0; )
 10869     Vcoding_category_list
 10870       = Fcons (AREF (Vcoding_category_table, priorities[i]),
 10871                Vcoding_category_list);
 10872 
 10873   return Qnil;
 10874 }
 10875 
 10876 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
 10877        Scoding_system_priority_list, 0, 1, 0,
 10878        doc: /* Return a list of coding systems ordered by their priorities.
 10879 The list contains a subset of coding systems; i.e. coding systems
 10880 assigned to each coding category (see `coding-category-list').
 10881 
 10882 HIGHESTP non-nil means just return the highest priority one.  */)
 10883   (Lisp_Object highestp)
 10884 {
 10885   int i;
 10886   Lisp_Object val;
 10887 
 10888   for (i = 0, val = Qnil; i < coding_category_max; i++)
 10889     {
 10890       enum coding_category category = coding_priorities[i];
 10891       int id = coding_categories[category].id;
 10892       Lisp_Object attrs;
 10893 
 10894       if (id < 0)
 10895         continue;
 10896       attrs = CODING_ID_ATTRS (id);
 10897       if (! NILP (highestp))
 10898         return CODING_ATTR_BASE_NAME (attrs);
 10899       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
 10900     }
 10901   return Fnreverse (val);
 10902 }
 10903 
 10904 static Lisp_Object
 10905 make_subsidiaries (Lisp_Object base)
 10906 {
 10907   static char const suffixes[][8] = { "-unix", "-dos", "-mac" };
 10908   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
 10909   USE_SAFE_ALLOCA;
 10910   char *buf = SAFE_ALLOCA (base_name_len + 6);
 10911 
 10912   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
 10913   Lisp_Object subsidiaries = make_nil_vector (3);
 10914   for (int i = 0; i < 3; i++)
 10915     {
 10916       strcpy (buf + base_name_len, suffixes[i]);
 10917       ASET (subsidiaries, i, intern (buf));
 10918     }
 10919   SAFE_FREE ();
 10920   return subsidiaries;
 10921 }
 10922 
 10923 
 10924 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
 10925        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
 10926        doc: /* For internal use only.
 10927 usage: (define-coding-system-internal ...)  */)
 10928   (ptrdiff_t nargs, Lisp_Object *args)
 10929 {
 10930   enum coding_category category;
 10931   int max_charset_id = 0;
 10932 
 10933   if (nargs < coding_arg_max)
 10934     goto short_args;
 10935 
 10936   Lisp_Object attrs = make_nil_vector (coding_attr_last_index);
 10937 
 10938   Lisp_Object name = args[coding_arg_name];
 10939   CHECK_SYMBOL (name);
 10940   ASET (attrs, coding_attr_base_name, name);
 10941 
 10942   Lisp_Object val = args[coding_arg_mnemonic];
 10943   /* decode_mode_spec_coding assumes the mnemonic is a single character.  */
 10944   if (STRINGP (val))
 10945     val = make_fixnum (STRING_CHAR (SDATA (val)));
 10946   else
 10947     CHECK_CHARACTER (val);
 10948   ASET (attrs, coding_attr_mnemonic, val);
 10949 
 10950   Lisp_Object coding_type = args[coding_arg_coding_type];
 10951   CHECK_SYMBOL (coding_type);
 10952   ASET (attrs, coding_attr_type, coding_type);
 10953 
 10954   Lisp_Object charset_list = args[coding_arg_charset_list];
 10955   if (SYMBOLP (charset_list))
 10956     {
 10957       if (EQ (charset_list, Qiso_2022))
 10958         {
 10959           if (! EQ (coding_type, Qiso_2022))
 10960             error ("Invalid charset-list");
 10961           charset_list = Viso_2022_charset_list;
 10962         }
 10963       else if (EQ (charset_list, Qemacs_mule))
 10964         {
 10965           if (! EQ (coding_type, Qemacs_mule))
 10966             error ("Invalid charset-list");
 10967           charset_list = Vemacs_mule_charset_list;
 10968         }
 10969       for (Lisp_Object tail = charset_list; CONSP (tail); tail = XCDR (tail))
 10970         {
 10971           if (! RANGED_FIXNUMP (0, XCAR (tail), INT_MAX - 1))
 10972             error ("Invalid charset-list");
 10973           if (max_charset_id < XFIXNAT (XCAR (tail)))
 10974             max_charset_id = XFIXNAT (XCAR (tail));
 10975         }
 10976     }
 10977   else
 10978     {
 10979       charset_list = Fcopy_sequence (charset_list);
 10980       for (Lisp_Object tail = charset_list; CONSP (tail); tail = XCDR (tail))
 10981         {
 10982           struct charset *charset;
 10983 
 10984           val = XCAR (tail);
 10985           CHECK_CHARSET_GET_CHARSET (val, charset);
 10986           if (EQ (coding_type, Qiso_2022)
 10987               ? CHARSET_ISO_FINAL (charset) < 0
 10988               : EQ (coding_type, Qemacs_mule)
 10989               ? CHARSET_EMACS_MULE_ID (charset) < 0
 10990               : 0)
 10991             error ("Can't handle charset `%s'",
 10992                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
 10993 
 10994           XSETCAR (tail, make_fixnum (charset->id));
 10995           if (max_charset_id < charset->id)
 10996             max_charset_id = charset->id;
 10997         }
 10998     }
 10999   ASET (attrs, coding_attr_charset_list, charset_list);
 11000 
 11001   Lisp_Object safe_charsets = make_uninit_string (max_charset_id + 1);
 11002   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
 11003   for (Lisp_Object tail = charset_list; CONSP (tail); tail = XCDR (tail))
 11004     SSET (safe_charsets, XFIXNAT (XCAR (tail)), 0);
 11005   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
 11006 
 11007   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
 11008 
 11009   val = args[coding_arg_decode_translation_table];
 11010   if (! CHAR_TABLE_P (val) && ! CONSP (val))
 11011     CHECK_SYMBOL (val);
 11012   ASET (attrs, coding_attr_decode_tbl, val);
 11013 
 11014   val = args[coding_arg_encode_translation_table];
 11015   if (! CHAR_TABLE_P (val) && ! CONSP (val))
 11016     CHECK_SYMBOL (val);
 11017   ASET (attrs, coding_attr_encode_tbl, val);
 11018 
 11019   val = args[coding_arg_post_read_conversion];
 11020   CHECK_SYMBOL (val);
 11021   ASET (attrs, coding_attr_post_read, val);
 11022 
 11023   val = args[coding_arg_pre_write_conversion];
 11024   CHECK_SYMBOL (val);
 11025   ASET (attrs, coding_attr_pre_write, val);
 11026 
 11027   val = args[coding_arg_default_char];
 11028   if (NILP (val))
 11029     ASET (attrs, coding_attr_default_char, make_fixnum (' '));
 11030   else
 11031     {
 11032       CHECK_CHARACTER (val);
 11033       ASET (attrs, coding_attr_default_char, val);
 11034     }
 11035 
 11036   val = args[coding_arg_for_unibyte];
 11037   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
 11038 
 11039   val = args[coding_arg_plist];
 11040   CHECK_LIST (val);
 11041   ASET (attrs, coding_attr_plist, val);
 11042 
 11043   if (EQ (coding_type, Qcharset))
 11044     {
 11045       /* Generate a lisp vector of 256 elements.  Each element is nil,
 11046          integer, or a list of charset IDs.
 11047 
 11048          If Nth element is nil, the byte code N is invalid in this
 11049          coding system.
 11050 
 11051          If Nth element is a number NUM, N is the first byte of a
 11052          charset whose ID is NUM.
 11053 
 11054          If Nth element is a list of charset IDs, N is the first byte
 11055          of one of them.  The list is sorted by dimensions of the
 11056          charsets.  A charset of smaller dimension comes first. */
 11057       val = make_nil_vector (256);
 11058 
 11059       for (Lisp_Object tail = charset_list; CONSP (tail); tail = XCDR (tail))
 11060         {
 11061           struct charset *charset = CHARSET_FROM_ID (XFIXNAT (XCAR (tail)));
 11062           int dim = CHARSET_DIMENSION (charset);
 11063           int idx = (dim - 1) * 4;
 11064 
 11065           if (CHARSET_ASCII_COMPATIBLE_P (charset))
 11066             ASET (attrs, coding_attr_ascii_compat, Qt);
 11067 
 11068           for (int i = charset->code_space[idx];
 11069                i <= charset->code_space[idx + 1]; i++)
 11070             {
 11071               Lisp_Object tmp, tmp2;
 11072               int dim2;
 11073 
 11074               tmp = AREF (val, i);
 11075               if (NILP (tmp))
 11076                 tmp = XCAR (tail);
 11077               else if (FIXNATP (tmp))
 11078                 {
 11079                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFIXNAT (tmp)));
 11080                   if (dim < dim2)
 11081                     tmp = list2 (XCAR (tail), tmp);
 11082                   else
 11083                     tmp = list2 (tmp, XCAR (tail));
 11084                 }
 11085               else
 11086                 {
 11087                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
 11088                     {
 11089                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFIXNAT (XCAR (tmp2))));
 11090                       if (dim < dim2)
 11091                         break;
 11092                     }
 11093                   if (NILP (tmp2))
 11094                     tmp = nconc2 (tmp, list1 (XCAR (tail)));
 11095                   else
 11096                     {
 11097                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
 11098                       XSETCAR (tmp2, XCAR (tail));
 11099                     }
 11100                 }
 11101               ASET (val, i, tmp);
 11102             }
 11103         }
 11104       ASET (attrs, coding_attr_charset_valids, val);
 11105       category = coding_category_charset;
 11106     }
 11107   else if (EQ (coding_type, Qccl))
 11108     {
 11109       Lisp_Object valids;
 11110 
 11111       if (nargs < coding_arg_ccl_max)
 11112         goto short_args;
 11113 
 11114       val = args[coding_arg_ccl_decoder];
 11115       CHECK_CCL_PROGRAM (val);
 11116       if (VECTORP (val))
 11117         val = Fcopy_sequence (val);
 11118       ASET (attrs, coding_attr_ccl_decoder, val);
 11119 
 11120       val = args[coding_arg_ccl_encoder];
 11121       CHECK_CCL_PROGRAM (val);
 11122       if (VECTORP (val))
 11123         val = Fcopy_sequence (val);
 11124       ASET (attrs, coding_attr_ccl_encoder, val);
 11125 
 11126       val = args[coding_arg_ccl_valids];
 11127       valids = Fmake_string (make_fixnum (256), make_fixnum (0), Qnil);
 11128       for (Lisp_Object tail = val; CONSP (tail); tail = XCDR (tail))
 11129         {
 11130           int from, to;
 11131 
 11132           val = XCAR (tail);
 11133           if (FIXNUMP (val))
 11134             {
 11135               if (! (0 <= XFIXNUM (val) && XFIXNUM (val) <= 255))
 11136                 args_out_of_range_3 (val, make_fixnum (0), make_fixnum (255));
 11137               from = to = XFIXNUM (val);
 11138             }
 11139           else
 11140             {
 11141               CHECK_CONS (val);
 11142               from = check_integer_range (XCAR (val), 0, 255);
 11143               to = check_integer_range (XCDR (val), from, 255);
 11144             }
 11145           for (int i = from; i <= to; i++)
 11146             SSET (valids, i, 1);
 11147         }
 11148       ASET (attrs, coding_attr_ccl_valids, valids);
 11149 
 11150       category = coding_category_ccl;
 11151     }
 11152   else if (EQ (coding_type, Qutf_16))
 11153     {
 11154       Lisp_Object bom, endian;
 11155 
 11156       ASET (attrs, coding_attr_ascii_compat, Qnil);
 11157 
 11158       if (nargs < coding_arg_utf16_max)
 11159         goto short_args;
 11160 
 11161       bom = args[coding_arg_utf16_bom];
 11162       if (! NILP (bom) && ! EQ (bom, Qt))
 11163         {
 11164           CHECK_CONS (bom);
 11165           val = XCAR (bom);
 11166           CHECK_CODING_SYSTEM (val);
 11167           val = XCDR (bom);
 11168           CHECK_CODING_SYSTEM (val);
 11169         }
 11170       ASET (attrs, coding_attr_utf_bom, bom);
 11171 
 11172       endian = args[coding_arg_utf16_endian];
 11173       CHECK_SYMBOL (endian);
 11174       if (NILP (endian))
 11175         endian = Qbig;
 11176       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
 11177         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
 11178       ASET (attrs, coding_attr_utf_16_endian, endian);
 11179 
 11180       category = (CONSP (bom)
 11181                   ? coding_category_utf_16_auto
 11182                   : NILP (bom)
 11183                   ? (EQ (endian, Qbig)
 11184                      ? coding_category_utf_16_be_nosig
 11185                      : coding_category_utf_16_le_nosig)
 11186                   : (EQ (endian, Qbig)
 11187                      ? coding_category_utf_16_be
 11188                      : coding_category_utf_16_le));
 11189     }
 11190   else if (EQ (coding_type, Qiso_2022))
 11191     {
 11192       Lisp_Object initial, reg_usage, request, flags;
 11193 
 11194       if (nargs < coding_arg_iso2022_max)
 11195         goto short_args;
 11196 
 11197       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
 11198       CHECK_VECTOR (initial);
 11199       for (int i = 0; i < 4; i++)
 11200         {
 11201           val = AREF (initial, i);
 11202           if (! NILP (val))
 11203             {
 11204               struct charset *charset;
 11205 
 11206               CHECK_CHARSET_GET_CHARSET (val, charset);
 11207               ASET (initial, i, make_fixnum (CHARSET_ID (charset)));
 11208               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
 11209                 ASET (attrs, coding_attr_ascii_compat, Qt);
 11210             }
 11211           else
 11212             ASET (initial, i, make_fixnum (-1));
 11213         }
 11214 
 11215       reg_usage = args[coding_arg_iso2022_reg_usage];
 11216       CHECK_CONS (reg_usage);
 11217       CHECK_FIXNUM (XCAR (reg_usage));
 11218       CHECK_FIXNUM (XCDR (reg_usage));
 11219 
 11220       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
 11221       for (Lisp_Object tail = request; CONSP (tail); tail = XCDR (tail))
 11222         {
 11223           int id;
 11224 
 11225           val = XCAR (tail);
 11226           CHECK_CONS (val);
 11227           CHECK_CHARSET_GET_ID (XCAR (val), id);
 11228           check_integer_range (XCDR (val), 0, 3);
 11229           XSETCAR (val, make_fixnum (id));
 11230         }
 11231 
 11232       flags = args[coding_arg_iso2022_flags];
 11233       CHECK_FIXNAT (flags);
 11234       int i = XFIXNUM (flags) & INT_MAX;
 11235       if (EQ (args[coding_arg_charset_list], Qiso_2022))
 11236         i |= CODING_ISO_FLAG_FULL_SUPPORT;
 11237       flags = make_fixnum (i);
 11238 
 11239       ASET (attrs, coding_attr_iso_initial, initial);
 11240       ASET (attrs, coding_attr_iso_usage, reg_usage);
 11241       ASET (attrs, coding_attr_iso_request, request);
 11242       ASET (attrs, coding_attr_iso_flags, flags);
 11243       setup_iso_safe_charsets (attrs);
 11244 
 11245       if (i & CODING_ISO_FLAG_SEVEN_BITS)
 11246         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
 11247                           | CODING_ISO_FLAG_SINGLE_SHIFT))
 11248                     ? coding_category_iso_7_else
 11249                     : EQ (args[coding_arg_charset_list], Qiso_2022)
 11250                     ? coding_category_iso_7
 11251                     : coding_category_iso_7_tight);
 11252       else
 11253         {
 11254           int id = XFIXNUM (AREF (initial, 1));
 11255 
 11256           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
 11257                        || EQ (args[coding_arg_charset_list], Qiso_2022)
 11258                        || id < 0)
 11259                       ? coding_category_iso_8_else
 11260                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
 11261                       ? coding_category_iso_8_1
 11262                       : coding_category_iso_8_2);
 11263         }
 11264       if (category != coding_category_iso_8_1
 11265           && category != coding_category_iso_8_2)
 11266         ASET (attrs, coding_attr_ascii_compat, Qnil);
 11267     }
 11268   else if (EQ (coding_type, Qemacs_mule))
 11269     {
 11270       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
 11271         ASET (attrs, coding_attr_emacs_mule_full, Qt);
 11272       ASET (attrs, coding_attr_ascii_compat, Qt);
 11273       category = coding_category_emacs_mule;
 11274     }
 11275   else if (EQ (coding_type, Qshift_jis))
 11276     {
 11277       ptrdiff_t charset_list_len = list_length (charset_list);
 11278       if (charset_list_len != 3 && charset_list_len != 4)
 11279         error ("There should be three or four charsets");
 11280 
 11281       struct charset *charset = CHARSET_FROM_ID (XFIXNUM (XCAR (charset_list)));
 11282       if (CHARSET_DIMENSION (charset) != 1)
 11283         error ("Dimension of charset %s is not one",
 11284                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
 11285       if (CHARSET_ASCII_COMPATIBLE_P (charset))
 11286         ASET (attrs, coding_attr_ascii_compat, Qt);
 11287 
 11288       charset_list = XCDR (charset_list);
 11289       charset = CHARSET_FROM_ID (XFIXNUM (XCAR (charset_list)));
 11290       if (CHARSET_DIMENSION (charset) != 1)
 11291         error ("Dimension of charset %s is not one",
 11292                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
 11293 
 11294       charset_list = XCDR (charset_list);
 11295       charset = CHARSET_FROM_ID (XFIXNUM (XCAR (charset_list)));
 11296       if (CHARSET_DIMENSION (charset) != 2)
 11297         error ("Dimension of charset %s is not two",
 11298                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
 11299 
 11300       charset_list = XCDR (charset_list);
 11301       if (! NILP (charset_list))
 11302         {
 11303           charset = CHARSET_FROM_ID (XFIXNUM (XCAR (charset_list)));
 11304           if (CHARSET_DIMENSION (charset) != 2)
 11305             error ("Dimension of charset %s is not two",
 11306                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
 11307         }
 11308 
 11309       category = coding_category_sjis;
 11310       Vsjis_coding_system = name;
 11311     }
 11312   else if (EQ (coding_type, Qbig5))
 11313     {
 11314       struct charset *charset;
 11315 
 11316       if (list_length (charset_list) != 2)
 11317         error ("There should be just two charsets");
 11318 
 11319       charset = CHARSET_FROM_ID (XFIXNUM (XCAR (charset_list)));
 11320       if (CHARSET_DIMENSION (charset) != 1)
 11321         error ("Dimension of charset %s is not one",
 11322                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
 11323       if (CHARSET_ASCII_COMPATIBLE_P (charset))
 11324         ASET (attrs, coding_attr_ascii_compat, Qt);
 11325 
 11326       charset_list = XCDR (charset_list);
 11327       charset = CHARSET_FROM_ID (XFIXNUM (XCAR (charset_list)));
 11328       if (CHARSET_DIMENSION (charset) != 2)
 11329         error ("Dimension of charset %s is not two",
 11330                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
 11331 
 11332       category = coding_category_big5;
 11333       Vbig5_coding_system = name;
 11334     }
 11335   else if (EQ (coding_type, Qraw_text))
 11336     {
 11337       category = coding_category_raw_text;
 11338       ASET (attrs, coding_attr_ascii_compat, Qt);
 11339     }
 11340   else if (EQ (coding_type, Qutf_8))
 11341     {
 11342       Lisp_Object bom;
 11343 
 11344       if (nargs < coding_arg_utf8_max)
 11345         goto short_args;
 11346 
 11347       bom = args[coding_arg_utf8_bom];
 11348       if (! NILP (bom) && ! EQ (bom, Qt))
 11349         {
 11350           CHECK_CONS (bom);
 11351           val = XCAR (bom);
 11352           CHECK_CODING_SYSTEM (val);
 11353           val = XCDR (bom);
 11354           CHECK_CODING_SYSTEM (val);
 11355         }
 11356       ASET (attrs, coding_attr_utf_bom, bom);
 11357       if (NILP (bom))
 11358         ASET (attrs, coding_attr_ascii_compat, Qt);
 11359 
 11360       category = (CONSP (bom) ? coding_category_utf_8_auto
 11361                   : NILP (bom) ? coding_category_utf_8_nosig
 11362                   : coding_category_utf_8_sig);
 11363     }
 11364   else if (EQ (coding_type, Qundecided))
 11365     {
 11366       if (nargs < coding_arg_undecided_max)
 11367         goto short_args;
 11368       ASET (attrs, coding_attr_undecided_inhibit_null_byte_detection,
 11369             args[coding_arg_undecided_inhibit_null_byte_detection]);
 11370       ASET (attrs, coding_attr_undecided_inhibit_iso_escape_detection,
 11371             args[coding_arg_undecided_inhibit_iso_escape_detection]);
 11372       ASET (attrs, coding_attr_undecided_prefer_utf_8,
 11373             args[coding_arg_undecided_prefer_utf_8]);
 11374       category = coding_category_undecided;
 11375     }
 11376   else
 11377     error ("Invalid coding system type: %s",
 11378            SDATA (SYMBOL_NAME (coding_type)));
 11379 
 11380   ASET (attrs, coding_attr_category, make_fixnum (category));
 11381   ASET (attrs, coding_attr_plist,
 11382         Fcons (QCcategory,
 11383                Fcons (AREF (Vcoding_category_table, category),
 11384                       CODING_ATTR_PLIST (attrs))));
 11385   ASET (attrs, coding_attr_plist,
 11386         Fcons (QCascii_compatible_p,
 11387                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
 11388                       CODING_ATTR_PLIST (attrs))));
 11389 
 11390   Lisp_Object eol_type = args[coding_arg_eol_type];
 11391   if (! NILP (eol_type)
 11392       && ! EQ (eol_type, Qunix)
 11393       && ! EQ (eol_type, Qdos)
 11394       && ! EQ (eol_type, Qmac))
 11395     error ("Invalid eol-type");
 11396 
 11397   Lisp_Object aliases = list1 (name);
 11398 
 11399   if (NILP (eol_type))
 11400     {
 11401       eol_type = make_subsidiaries (name);
 11402       for (int i = 0; i < 3; i++)
 11403         {
 11404           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
 11405 
 11406           this_name = AREF (eol_type, i);
 11407           this_aliases = list1 (this_name);
 11408           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
 11409           this_spec = make_uninit_vector (3);
 11410           ASET (this_spec, 0, attrs);
 11411           ASET (this_spec, 1, this_aliases);
 11412           ASET (this_spec, 2, this_eol_type);
 11413           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
 11414           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
 11415           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist, Qnil);
 11416           if (NILP (val))
 11417             Vcoding_system_alist
 11418               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
 11419                        Vcoding_system_alist);
 11420         }
 11421     }
 11422 
 11423   Lisp_Object spec_vec = make_uninit_vector (3);
 11424   ASET (spec_vec, 0, attrs);
 11425   ASET (spec_vec, 1, aliases);
 11426   ASET (spec_vec, 2, eol_type);
 11427 
 11428   Fputhash (name, spec_vec, Vcoding_system_hash_table);
 11429   Vcoding_system_list = Fcons (name, Vcoding_system_list);
 11430   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist, Qnil);
 11431   if (NILP (val))
 11432     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
 11433                                   Vcoding_system_alist);
 11434 
 11435   int id = coding_categories[category].id;
 11436   if (id < 0 || EQ (name, CODING_ID_NAME (id)))
 11437       setup_coding_system (name, &coding_categories[category]);
 11438 
 11439   return Qnil;
 11440 
 11441  short_args:
 11442   Fsignal (Qwrong_number_of_arguments,
 11443            Fcons (intern ("define-coding-system-internal"),
 11444                   make_fixnum (nargs)));
 11445 }
 11446 
 11447 
 11448 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
 11449        3, 3, 0,
 11450        doc: /* Change value of CODING-SYSTEM's property PROP to VAL.
 11451 
 11452 The following properties, if set by this function, override the values
 11453 of the corresponding attributes set by `define-coding-system':
 11454 
 11455   `:mnemonic', `:default-char', `:ascii-compatible-p'
 11456   `:decode-translation-table', `:encode-translation-table',
 11457   `:post-read-conversion', `:pre-write-conversion'
 11458 
 11459 See `define-coding-system' for the description of these properties.
 11460 See `coding-system-get' and `coding-system-plist' for accessing the
 11461 property list of a coding-system.  */)
 11462   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
 11463 {
 11464   Lisp_Object spec, attrs;
 11465 
 11466   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
 11467   attrs = AREF (spec, 0);
 11468   if (EQ (prop, QCmnemonic))
 11469     {
 11470       /* decode_mode_spec_coding assumes the mnemonic is a single character.  */
 11471       if (STRINGP (val))
 11472         val = make_fixnum (STRING_CHAR (SDATA (val)));
 11473       else
 11474         CHECK_CHARACTER (val);
 11475       ASET (attrs, coding_attr_mnemonic, val);
 11476     }
 11477   else if (EQ (prop, QCdefault_char))
 11478     {
 11479       if (NILP (val))
 11480         val = make_fixnum (' ');
 11481       else
 11482         CHECK_CHARACTER (val);
 11483       ASET (attrs, coding_attr_default_char, val);
 11484     }
 11485   else if (EQ (prop, QCdecode_translation_table))
 11486     {
 11487       if (! CHAR_TABLE_P (val) && ! CONSP (val))
 11488         CHECK_SYMBOL (val);
 11489       ASET (attrs, coding_attr_decode_tbl, val);
 11490     }
 11491   else if (EQ (prop, QCencode_translation_table))
 11492     {
 11493       if (! CHAR_TABLE_P (val) && ! CONSP (val))
 11494         CHECK_SYMBOL (val);
 11495       ASET (attrs, coding_attr_encode_tbl, val);
 11496     }
 11497   else if (EQ (prop, QCpost_read_conversion))
 11498     {
 11499       CHECK_SYMBOL (val);
 11500       ASET (attrs, coding_attr_post_read, val);
 11501     }
 11502   else if (EQ (prop, QCpre_write_conversion))
 11503     {
 11504       CHECK_SYMBOL (val);
 11505       ASET (attrs, coding_attr_pre_write, val);
 11506     }
 11507   else if (EQ (prop, QCascii_compatible_p))
 11508     {
 11509       ASET (attrs, coding_attr_ascii_compat, val);
 11510     }
 11511 
 11512   ASET (attrs, coding_attr_plist,
 11513         plist_put (CODING_ATTR_PLIST (attrs), prop, val));
 11514   return val;
 11515 }
 11516 
 11517 
 11518 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
 11519        Sdefine_coding_system_alias, 2, 2, 0,
 11520        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
 11521   (Lisp_Object alias, Lisp_Object coding_system)
 11522 {
 11523   Lisp_Object spec, aliases, eol_type, val;
 11524 
 11525   CHECK_SYMBOL (alias);
 11526   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
 11527   aliases = AREF (spec, 1);
 11528   /* ALIASES should be a list of length more than zero, and the first
 11529      element is a base coding system.  Append ALIAS at the tail of the
 11530      list.  */
 11531   while (!NILP (XCDR (aliases)))
 11532     aliases = XCDR (aliases);
 11533   XSETCDR (aliases, list1 (alias));
 11534 
 11535   eol_type = AREF (spec, 2);
 11536   if (VECTORP (eol_type))
 11537     {
 11538       Lisp_Object subsidiaries;
 11539       int i;
 11540 
 11541       subsidiaries = make_subsidiaries (alias);
 11542       for (i = 0; i < 3; i++)
 11543         Fdefine_coding_system_alias (AREF (subsidiaries, i),
 11544                                      AREF (eol_type, i));
 11545     }
 11546 
 11547   Fputhash (alias, spec, Vcoding_system_hash_table);
 11548   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
 11549   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist, Qnil);
 11550   if (NILP (val))
 11551     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
 11552                                   Vcoding_system_alist);
 11553 
 11554   return Qnil;
 11555 }
 11556 
 11557 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
 11558        1, 1, 0,
 11559        doc: /* Return the base of CODING-SYSTEM.
 11560 Any alias or subsidiary coding system is not a base coding system.  */)
 11561   (Lisp_Object coding_system)
 11562 {
 11563   Lisp_Object spec, attrs;
 11564 
 11565   if (NILP (coding_system))
 11566     return (Qno_conversion);
 11567   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
 11568   attrs = AREF (spec, 0);
 11569   return CODING_ATTR_BASE_NAME (attrs);
 11570 }
 11571 
 11572 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
 11573        1, 1, 0,
 11574        doc: /* Return the property list of CODING-SYSTEM.  */)
 11575   (Lisp_Object coding_system)
 11576 {
 11577   Lisp_Object spec, attrs;
 11578 
 11579   if (NILP (coding_system))
 11580     coding_system = Qno_conversion;
 11581   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
 11582   attrs = AREF (spec, 0);
 11583   return CODING_ATTR_PLIST (attrs);
 11584 }
 11585 
 11586 
 11587 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
 11588        1, 1, 0,
 11589        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
 11590   (Lisp_Object coding_system)
 11591 {
 11592   Lisp_Object spec;
 11593 
 11594   if (NILP (coding_system))
 11595     coding_system = Qno_conversion;
 11596   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
 11597   return AREF (spec, 1);
 11598 }
 11599 
 11600 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
 11601        Scoding_system_eol_type, 1, 1, 0,
 11602        doc: /* Return eol-type of CODING-SYSTEM.
 11603 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
 11604 
 11605 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
 11606 and CR respectively.
 11607 
 11608 A vector value indicates that a format of end-of-line should be
 11609 detected automatically.  Nth element of the vector is the subsidiary
 11610 coding system whose eol-type is N.  */)
 11611   (Lisp_Object coding_system)
 11612 {
 11613   Lisp_Object spec, eol_type;
 11614   int n;
 11615 
 11616   if (NILP (coding_system))
 11617     coding_system = Qno_conversion;
 11618   if (! CODING_SYSTEM_P (coding_system))
 11619     return Qnil;
 11620   spec = CODING_SYSTEM_SPEC (coding_system);
 11621   eol_type = AREF (spec, 2);
 11622   if (VECTORP (eol_type))
 11623     return Fcopy_sequence (eol_type);
 11624   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
 11625   return make_fixnum (n);
 11626 }
 11627 
 11628 
 11629 /*** 9. Post-amble ***/
 11630 
 11631 void
 11632 init_coding_once (void)
 11633 {
 11634   int i;
 11635 
 11636   for (i = 0; i < coding_category_max; i++)
 11637     {
 11638       coding_categories[i].id = -1;
 11639       coding_priorities[i] = i;
 11640     }
 11641 
 11642   PDUMPER_REMEMBER_SCALAR (coding_categories);
 11643   PDUMPER_REMEMBER_SCALAR (coding_priorities);
 11644 
 11645   /* ISO2022 specific initialize routine.  */
 11646   for (i = 0; i < 0x20; i++)
 11647     iso_code_class[i] = ISO_control_0;
 11648   for (i = 0x21; i < 0x7F; i++)
 11649     iso_code_class[i] = ISO_graphic_plane_0;
 11650   for (i = 0x80; i < 0xA0; i++)
 11651     iso_code_class[i] = ISO_control_1;
 11652   for (i = 0xA1; i < 0xFF; i++)
 11653     iso_code_class[i] = ISO_graphic_plane_1;
 11654   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
 11655   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
 11656   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
 11657   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
 11658   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
 11659   iso_code_class[ISO_CODE_ESC] = ISO_escape;
 11660   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
 11661   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
 11662   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
 11663 
 11664   PDUMPER_REMEMBER_SCALAR (iso_code_class);
 11665 
 11666   for (i = 0; i < 256; i++)
 11667     {
 11668       emacs_mule_bytes[i] = 1;
 11669     }
 11670   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
 11671   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
 11672   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
 11673   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
 11674 
 11675   PDUMPER_REMEMBER_SCALAR (emacs_mule_bytes);
 11676 }
 11677 
 11678 static void reset_coding_after_pdumper_load (void);
 11679 
 11680 void
 11681 syms_of_coding (void)
 11682 {
 11683   staticpro (&Vcoding_system_hash_table);
 11684   Vcoding_system_hash_table = CALLN (Fmake_hash_table, QCtest, Qeq);
 11685 
 11686   staticpro (&Vsjis_coding_system);
 11687   Vsjis_coding_system = Qnil;
 11688 
 11689   staticpro (&Vbig5_coding_system);
 11690   Vbig5_coding_system = Qnil;
 11691 
 11692   staticpro (&Vcode_conversion_reused_workbuf);
 11693   Vcode_conversion_reused_workbuf = Qnil;
 11694 
 11695   staticpro (&Vcode_conversion_workbuf_name);
 11696   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
 11697 
 11698   reused_workbuf_in_use = false;
 11699   PDUMPER_REMEMBER_SCALAR (reused_workbuf_in_use);
 11700 
 11701   DEFSYM (Qcharset, "charset");
 11702   DEFSYM (Qtarget_idx, "target-idx");
 11703   DEFSYM (Qcoding_system_history, "coding-system-history");
 11704   Fset (Qcoding_system_history, Qnil);
 11705 
 11706   /* Target FILENAME is the first argument.  */
 11707   Fput (Qinsert_file_contents, Qtarget_idx, make_fixnum (0));
 11708   /* Target FILENAME is the third argument.  */
 11709   Fput (Qwrite_region, Qtarget_idx, make_fixnum (2));
 11710 
 11711   DEFSYM (Qcall_process, "call-process");
 11712   /* Target PROGRAM is the first argument.  */
 11713   Fput (Qcall_process, Qtarget_idx, make_fixnum (0));
 11714 
 11715   DEFSYM (Qcall_process_region, "call-process-region");
 11716   /* Target PROGRAM is the third argument.  */
 11717   Fput (Qcall_process_region, Qtarget_idx, make_fixnum (2));
 11718 
 11719   DEFSYM (Qstart_process, "start-process");
 11720   /* Target PROGRAM is the third argument.  */
 11721   Fput (Qstart_process, Qtarget_idx, make_fixnum (2));
 11722 
 11723   DEFSYM (Qopen_network_stream, "open-network-stream");
 11724   /* Target SERVICE is the fourth argument.  */
 11725   Fput (Qopen_network_stream, Qtarget_idx, make_fixnum (3));
 11726 
 11727   DEFSYM (Qunix, "unix");
 11728   DEFSYM (Qdos, "dos");
 11729   DEFSYM (Qmac, "mac");
 11730 
 11731   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
 11732   DEFSYM (Qundecided, "undecided");
 11733   DEFSYM (Qno_conversion, "no-conversion");
 11734   DEFSYM (Qraw_text, "raw-text");
 11735   DEFSYM (Qus_ascii, "us-ascii");
 11736 
 11737   DEFSYM (Qiso_2022, "iso-2022");
 11738 
 11739   DEFSYM (Qutf_8, "utf-8");
 11740   DEFSYM (Qutf_8_unix, "utf-8-unix");
 11741   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
 11742 
 11743 #if defined (WINDOWSNT) || defined (CYGWIN)
 11744   /* No, not utf-16-le: that one has a BOM.  */
 11745   DEFSYM (Qutf_16le, "utf-16le");
 11746 #endif
 11747 
 11748   DEFSYM (Qutf_16, "utf-16");
 11749   DEFSYM (Qbig, "big");
 11750   DEFSYM (Qlittle, "little");
 11751 
 11752   DEFSYM (Qshift_jis, "shift-jis");
 11753   DEFSYM (Qbig5, "big5");
 11754 
 11755   DEFSYM (Qcoding_system_p, "coding-system-p");
 11756 
 11757   /* Error signaled when there's a problem with detecting a coding system.  */
 11758   DEFSYM (Qcoding_system_error, "coding-system-error");
 11759   Fput (Qcoding_system_error, Qerror_conditions,
 11760         pure_list (Qcoding_system_error, Qerror));
 11761   Fput (Qcoding_system_error, Qerror_message,
 11762         build_pure_c_string ("Invalid coding system"));
 11763 
 11764   DEFSYM (Qtranslation_table, "translation-table");
 11765   Fput (Qtranslation_table, Qchar_table_extra_slots, make_fixnum (2));
 11766   DEFSYM (Qtranslation_table_id, "translation-table-id");
 11767 
 11768   /* Coding system emacs-mule and raw-text are for converting only
 11769      end-of-line format.  */
 11770   DEFSYM (Qemacs_mule, "emacs-mule");
 11771 
 11772   DEFSYM (QCcategory, ":category");
 11773   DEFSYM (QCmnemonic, ":mnemonic");
 11774   DEFSYM (QCdefault_char, ":default-char");
 11775   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
 11776   DEFSYM (QCencode_translation_table, ":encode-translation-table");
 11777   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
 11778   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
 11779   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
 11780 
 11781   Vcoding_category_table = make_nil_vector (coding_category_max);
 11782   staticpro (&Vcoding_category_table);
 11783   /* Followings are target of code detection.  */
 11784   ASET (Vcoding_category_table, coding_category_iso_7,
 11785         intern_c_string ("coding-category-iso-7"));
 11786   ASET (Vcoding_category_table, coding_category_iso_7_tight,
 11787         intern_c_string ("coding-category-iso-7-tight"));
 11788   ASET (Vcoding_category_table, coding_category_iso_8_1,
 11789         intern_c_string ("coding-category-iso-8-1"));
 11790   ASET (Vcoding_category_table, coding_category_iso_8_2,
 11791         intern_c_string ("coding-category-iso-8-2"));
 11792   ASET (Vcoding_category_table, coding_category_iso_7_else,
 11793         intern_c_string ("coding-category-iso-7-else"));
 11794   ASET (Vcoding_category_table, coding_category_iso_8_else,
 11795         intern_c_string ("coding-category-iso-8-else"));
 11796   ASET (Vcoding_category_table, coding_category_utf_8_auto,
 11797         intern_c_string ("coding-category-utf-8-auto"));
 11798   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
 11799         intern_c_string ("coding-category-utf-8"));
 11800   ASET (Vcoding_category_table, coding_category_utf_8_sig,
 11801         intern_c_string ("coding-category-utf-8-sig"));
 11802   ASET (Vcoding_category_table, coding_category_utf_16_be,
 11803         intern_c_string ("coding-category-utf-16-be"));
 11804   ASET (Vcoding_category_table, coding_category_utf_16_auto,
 11805         intern_c_string ("coding-category-utf-16-auto"));
 11806   ASET (Vcoding_category_table, coding_category_utf_16_le,
 11807         intern_c_string ("coding-category-utf-16-le"));
 11808   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
 11809         intern_c_string ("coding-category-utf-16-be-nosig"));
 11810   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
 11811         intern_c_string ("coding-category-utf-16-le-nosig"));
 11812   ASET (Vcoding_category_table, coding_category_charset,
 11813         intern_c_string ("coding-category-charset"));
 11814   ASET (Vcoding_category_table, coding_category_sjis,
 11815         intern_c_string ("coding-category-sjis"));
 11816   ASET (Vcoding_category_table, coding_category_big5,
 11817         intern_c_string ("coding-category-big5"));
 11818   ASET (Vcoding_category_table, coding_category_ccl,
 11819         intern_c_string ("coding-category-ccl"));
 11820   ASET (Vcoding_category_table, coding_category_emacs_mule,
 11821         intern_c_string ("coding-category-emacs-mule"));
 11822   /* Followings are NOT target of code detection.  */
 11823   ASET (Vcoding_category_table, coding_category_raw_text,
 11824         intern_c_string ("coding-category-raw-text"));
 11825   ASET (Vcoding_category_table, coding_category_undecided,
 11826         intern_c_string ("coding-category-undecided"));
 11827 
 11828   DEFSYM (Qinsufficient_source, "insufficient-source");
 11829   DEFSYM (Qinvalid_source, "invalid-source");
 11830   DEFSYM (Qinterrupted, "interrupted");
 11831 
 11832   /* If a symbol has this property, evaluate the value to define the
 11833      symbol as a coding system.  */
 11834   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
 11835 
 11836   DEFSYM (Qignored, "ignored");
 11837 
 11838   DEFSYM (Qutf_8_string_p, "utf-8-string-p");
 11839   DEFSYM (Qfilenamep, "filenamep");
 11840 
 11841   defsubr (&Scoding_system_p);
 11842   defsubr (&Sread_coding_system);
 11843   defsubr (&Sread_non_nil_coding_system);
 11844   defsubr (&Scheck_coding_system);
 11845   defsubr (&Sdetect_coding_region);
 11846   defsubr (&Sdetect_coding_string);
 11847   defsubr (&Sfind_coding_systems_region_internal);
 11848   defsubr (&Sunencodable_char_position);
 11849   defsubr (&Scheck_coding_systems_region);
 11850   defsubr (&Sdecode_coding_region);
 11851   defsubr (&Sencode_coding_region);
 11852   defsubr (&Sdecode_coding_string);
 11853   defsubr (&Sencode_coding_string);
 11854 #ifdef ENABLE_UTF_8_CONVERTER_TEST
 11855   defsubr (&Sinternal_encode_string_utf_8);
 11856   defsubr (&Sinternal_decode_string_utf_8);
 11857 #endif  /* ENABLE_UTF_8_CONVERTER_TEST */
 11858   defsubr (&Sdecode_sjis_char);
 11859   defsubr (&Sencode_sjis_char);
 11860   defsubr (&Sdecode_big5_char);
 11861   defsubr (&Sencode_big5_char);
 11862   defsubr (&Sset_terminal_coding_system_internal);
 11863   defsubr (&Sset_safe_terminal_coding_system_internal);
 11864   defsubr (&Sterminal_coding_system);
 11865   defsubr (&Sset_keyboard_coding_system_internal);
 11866   defsubr (&Skeyboard_coding_system);
 11867   defsubr (&Sfind_operation_coding_system);
 11868   defsubr (&Sset_coding_system_priority);
 11869   defsubr (&Sdefine_coding_system_internal);
 11870   defsubr (&Sdefine_coding_system_alias);
 11871   defsubr (&Scoding_system_put);
 11872   defsubr (&Scoding_system_base);
 11873   defsubr (&Scoding_system_plist);
 11874   defsubr (&Scoding_system_aliases);
 11875   defsubr (&Scoding_system_eol_type);
 11876   defsubr (&Scoding_system_priority_list);
 11877 
 11878   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
 11879                doc: /* List of coding systems.
 11880 
 11881 Do not alter the value of this variable manually.  This variable should be
 11882 updated by the functions `define-coding-system' and
 11883 `define-coding-system-alias'.  */);
 11884   Vcoding_system_list = Qnil;
 11885 
 11886   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
 11887                doc: /* Alist of coding system names.
 11888 Each element is one element list of coding system name.
 11889 This variable is given to `completing-read' as COLLECTION argument.
 11890 
 11891 Do not alter the value of this variable manually.  This variable should be
 11892 updated by `define-coding-system-alias'.  */);
 11893   Vcoding_system_alist = Qnil;
 11894 
 11895   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
 11896                doc: /* List of coding-categories (symbols) ordered by priority.
 11897 
 11898 On detecting a coding system, Emacs tries code detection algorithms
 11899 associated with each coding-category one by one in this order.  When
 11900 one algorithm agrees with a byte sequence of source text, the coding
 11901 system bound to the corresponding coding-category is selected.
 11902 
 11903 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
 11904   {
 11905     int i;
 11906 
 11907     Vcoding_category_list = Qnil;
 11908     for (i = coding_category_max - 1; i >= 0; i--)
 11909       Vcoding_category_list
 11910         = Fcons (AREF (Vcoding_category_table, i),
 11911                  Vcoding_category_list);
 11912   }
 11913 
 11914   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
 11915                doc: /* Specify the coding system for read operations.
 11916 It is useful to bind this variable with `let', but do not set it globally.
 11917 If the value is a coding system, it is used for decoding on read operation.
 11918 If not, an appropriate element is used from one of the coding system alists.
 11919 There are three such tables: `file-coding-system-alist',
 11920 `process-coding-system-alist', and `network-coding-system-alist'.  */);
 11921   Vcoding_system_for_read = Qnil;
 11922 
 11923   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
 11924                doc: /* Specify the coding system for write operations.
 11925 Programs bind this variable with `let', but you should not set it globally.
 11926 If the value is a coding system, it is used for encoding of output,
 11927 when writing it to a file and when sending it to a file or subprocess.
 11928 
 11929 If this does not specify a coding system, an appropriate element
 11930 is used from one of the coding system alists.
 11931 There are three such tables: `file-coding-system-alist',
 11932 `process-coding-system-alist', and `network-coding-system-alist'.
 11933 For output to files, if the above procedure does not specify a coding system,
 11934 the value of `buffer-file-coding-system' is used.  */);
 11935   Vcoding_system_for_write = Qnil;
 11936 
 11937   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
 11938                doc: /*
 11939 Coding system used in the latest file or process I/O.  */);
 11940   Vlast_coding_system_used = Qnil;
 11941 
 11942   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
 11943                doc: /*
 11944 Error status of the last code conversion.
 11945 
 11946 When an error was detected in the last code conversion, this variable
 11947 is set to one of the following symbols.
 11948   `insufficient-source'
 11949   `inconsistent-eol'
 11950   `invalid-source'
 11951   `interrupted'
 11952   `insufficient-memory'
 11953 When no error was detected, the value doesn't change.  So, to check
 11954 the error status of a code conversion by this variable, you must
 11955 explicitly set this variable to nil before performing code
 11956 conversion.  */);
 11957   Vlast_code_conversion_error = Qnil;
 11958 
 11959   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
 11960                doc: /*
 11961 Non-nil means always inhibit code conversion of end-of-line format.
 11962 See info node `Coding Systems' and info node `Text and Binary' concerning
 11963 such conversion.  */);
 11964   inhibit_eol_conversion = 0;
 11965 
 11966   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
 11967                doc: /*
 11968 Non-nil means process buffer inherits coding system of process output.
 11969 Bind it to t if the process output is to be treated as if it were a file
 11970 read from some filesystem.  */);
 11971   inherit_process_coding_system = 0;
 11972 
 11973   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
 11974                doc: /*
 11975 Alist to decide a coding system to use for a file I/O operation.
 11976 The format is ((PATTERN . VAL) ...),
 11977 where PATTERN is a regular expression matching a file name,
 11978 VAL is a coding system, a cons of coding systems, or a function symbol.
 11979 If VAL is a coding system, it is used for both decoding and encoding
 11980 the file contents.
 11981 If VAL is a cons of coding systems, the car part is used for decoding,
 11982 and the cdr part is used for encoding.
 11983 If VAL is a function symbol, the function must return a coding system
 11984 or a cons of coding systems which are used as above.  The function is
 11985 called with an argument that is a list of the arguments with which
 11986 `find-operation-coding-system' was called.  If the function can't decide
 11987 a coding system, it can return `undecided' so that the normal
 11988 code-detection is performed.
 11989 
 11990 See also the function `find-operation-coding-system'
 11991 and the variable `auto-coding-alist'.  */);
 11992   Vfile_coding_system_alist = Qnil;
 11993 
 11994   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
 11995                doc: /*
 11996 Alist to decide a coding system to use for a process I/O operation.
 11997 The format is ((PATTERN . VAL) ...),
 11998 where PATTERN is a regular expression matching a program name,
 11999 VAL is a coding system, a cons of coding systems, or a function symbol.
 12000 If VAL is a coding system, it is used for both decoding what received
 12001 from the program and encoding what sent to the program.
 12002 If VAL is a cons of coding systems, the car part is used for decoding,
 12003 and the cdr part is used for encoding.
 12004 If VAL is a function symbol, the function must return a coding system
 12005 or a cons of coding systems which are used as above.
 12006 
 12007 See also the function `find-operation-coding-system'.  */);
 12008   Vprocess_coding_system_alist = Qnil;
 12009 
 12010   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
 12011                doc: /*
 12012 Alist to decide a coding system to use for a network I/O operation.
 12013 The format is ((PATTERN . VAL) ...),
 12014 where PATTERN is a regular expression matching a network service name
 12015 or is a port number to connect to,
 12016 VAL is a coding system, a cons of coding systems, or a function symbol.
 12017 If VAL is a coding system, it is used for both decoding what received
 12018 from the network stream and encoding what sent to the network stream.
 12019 If VAL is a cons of coding systems, the car part is used for decoding,
 12020 and the cdr part is used for encoding.
 12021 If VAL is a function symbol, the function must return a coding system
 12022 or a cons of coding systems which are used as above.
 12023 
 12024 See also the function `find-operation-coding-system'.  */);
 12025   Vnetwork_coding_system_alist = Qnil;
 12026 
 12027   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
 12028     doc: /* Coding system to use with system messages.
 12029 Potentially also used for decoding keyboard input on X Windows, and is
 12030 used for encoding standard output and error streams.  */);
 12031   Vlocale_coding_system = Qnil;
 12032 
 12033   /* The eol mnemonics are reset in startup.el system-dependently.  */
 12034   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
 12035                doc: /*
 12036 String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
 12037   eol_mnemonic_unix = build_pure_c_string (":");
 12038 
 12039   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
 12040                doc: /*
 12041 String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
 12042   eol_mnemonic_dos = build_pure_c_string ("\\");
 12043 
 12044   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
 12045                doc: /*
 12046 String displayed in mode line for MAC-like (CR) end-of-line format.  */);
 12047   eol_mnemonic_mac = build_pure_c_string ("/");
 12048 
 12049   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
 12050                doc: /*
 12051 String displayed in mode line when end-of-line format is not yet determined.  */);
 12052   eol_mnemonic_undecided = build_pure_c_string (":");
 12053 
 12054   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
 12055                doc: /*
 12056 Non-nil enables character translation while encoding and decoding.  */);
 12057   Venable_character_translation = Qt;
 12058 
 12059   DEFVAR_LISP ("standard-translation-table-for-decode",
 12060                Vstandard_translation_table_for_decode,
 12061                doc: /* Table for translating characters while decoding.  */);
 12062   Vstandard_translation_table_for_decode = Qnil;
 12063 
 12064   DEFVAR_LISP ("standard-translation-table-for-encode",
 12065                Vstandard_translation_table_for_encode,
 12066                doc: /* Table for translating characters while encoding.  */);
 12067   Vstandard_translation_table_for_encode = Qnil;
 12068 
 12069   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
 12070                doc: /* Alist of charsets vs revision numbers.
 12071 While encoding, if a charset (car part of an element) is found,
 12072 designate it with the escape sequence identifying revision (cdr part
 12073 of the element).  */);
 12074   Vcharset_revision_table = Qnil;
 12075 
 12076   DEFVAR_LISP ("default-process-coding-system",
 12077                Vdefault_process_coding_system,
 12078                doc: /* Cons of coding systems used for process I/O by default.
 12079 The car part is used for decoding a process output,
 12080 the cdr part is used for encoding a text to be sent to a process.  */);
 12081   Vdefault_process_coding_system = Qnil;
 12082 
 12083   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
 12084                doc: /*
 12085 Table of extra Latin codes in the range 128..159 (inclusive).
 12086 This is a vector of length 256.
 12087 If Nth element is non-nil, the existence of code N in a file
 12088 \(or output of subprocess) doesn't prevent it to be detected as
 12089 a coding system of ISO 2022 variant which has a flag
 12090 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
 12091 or reading output of a subprocess.
 12092 Only 128th through 159th elements have a meaning.  */);
 12093   Vlatin_extra_code_table = make_nil_vector (256);
 12094 
 12095   DEFVAR_LISP ("select-safe-coding-system-function",
 12096                Vselect_safe_coding_system_function,
 12097                doc: /*
 12098 Function to call to select safe coding system for encoding a text.
 12099 
 12100 If set, this function is called to force a user to select a proper
 12101 coding system which can encode the text in the case that a default
 12102 coding system used in each operation can't encode the text.  The
 12103 function should take care that the buffer is not modified while
 12104 the coding system is being selected.
 12105 
 12106 The default value is `select-safe-coding-system' (which see).  */);
 12107   Vselect_safe_coding_system_function = Qnil;
 12108 
 12109   DEFVAR_BOOL ("coding-system-require-warning",
 12110                coding_system_require_warning,
 12111                doc: /* Internal use only.
 12112 If non-nil, on writing a file, `select-safe-coding-system-function' is
 12113 called even if `coding-system-for-write' is non-nil.  The command
 12114 `universal-coding-system-argument' binds this variable to t temporarily.  */);
 12115   coding_system_require_warning = 0;
 12116 
 12117 
 12118   DEFVAR_BOOL ("inhibit-iso-escape-detection",
 12119                inhibit_iso_escape_detection,
 12120                doc: /*
 12121 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
 12122 
 12123 When Emacs reads text, it tries to detect how the text is encoded.
 12124 This code detection is sensitive to escape sequences.  If Emacs sees
 12125 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
 12126 of the ISO2022 encodings, and decodes text by the corresponding coding
 12127 system (e.g. `iso-2022-7bit').
 12128 
 12129 However, there may be a case that you want to read escape sequences in
 12130 a file as is.  In such a case, you can set this variable to non-nil.
 12131 Then the code detection will ignore any escape sequences, and no text is
 12132 detected as encoded in some ISO-2022 encoding.  The result is that all
 12133 escape sequences become visible in a buffer.
 12134 
 12135 The default value is nil, and it is strongly recommended not to change
 12136 it.  That is because many Emacs Lisp source files that contain
 12137 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
 12138 in Emacs's distribution, and they won't be decoded correctly on
 12139 reading if you suppress escape sequence detection.
 12140 
 12141 The other way to read escape sequences in a file without decoding is
 12142 to explicitly specify some coding system that doesn't use ISO-2022
 12143 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument].  */);
 12144   inhibit_iso_escape_detection = 0;
 12145 
 12146   DEFVAR_BOOL ("inhibit-null-byte-detection",
 12147                inhibit_null_byte_detection,
 12148                doc: /* If non-nil, Emacs ignores null bytes on code detection.
 12149 By default, Emacs treats it as binary data, and does not attempt to
 12150 decode it.  The effect is as if you specified `no-conversion' for
 12151 reading that text.
 12152 
 12153 Set this to non-nil when a regular text happens to include null bytes.
 12154 Examples are Index nodes of Info files and null-byte delimited output
 12155 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
 12156 decode text as usual.  */);
 12157   inhibit_null_byte_detection = 0;
 12158 
 12159   DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
 12160                doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
 12161 Internal use only.  Remove after the experimental optimizer becomes stable.  */);
 12162   disable_ascii_optimization = 0;
 12163 
 12164   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
 12165                doc: /* Char table for translating self-inserting characters.
 12166 This is applied to the result of input methods, not their input.
 12167 See also `keyboard-translate-table'.
 12168 
 12169 Use of this variable for character code unification was rendered
 12170 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
 12171 internal character representation.  */);
 12172   Vtranslation_table_for_input = Qnil;
 12173 
 12174   Lisp_Object args[coding_arg_undecided_max];
 12175   memclear (args, sizeof args);
 12176 
 12177   Lisp_Object plist[] =
 12178     {
 12179       QCname,
 12180       args[coding_arg_name] = Qno_conversion,
 12181       QCmnemonic,
 12182       args[coding_arg_mnemonic] = make_fixnum ('='),
 12183       intern_c_string (":coding-type"),
 12184       args[coding_arg_coding_type] = Qraw_text,
 12185       QCascii_compatible_p,
 12186       args[coding_arg_ascii_compatible_p] = Qt,
 12187       QCdefault_char,
 12188       args[coding_arg_default_char] = make_fixnum (0),
 12189       intern_c_string (":for-unibyte"),
 12190       args[coding_arg_for_unibyte] = Qt,
 12191       intern_c_string (":docstring"),
 12192       (build_pure_c_string
 12193        ("Do no conversion.\n"
 12194         "\n"
 12195         "When you visit a file with this coding, the file is read into a\n"
 12196         "unibyte buffer as is, thus each byte of a file is treated as a\n"
 12197         "character.")),
 12198       intern_c_string (":eol-type"),
 12199       args[coding_arg_eol_type] = Qunix,
 12200     };
 12201   args[coding_arg_plist] = CALLMANY (Flist, plist);
 12202   Fdefine_coding_system_internal (coding_arg_max, args);
 12203 
 12204   plist[1] = args[coding_arg_name] = Qundecided;
 12205   plist[3] = args[coding_arg_mnemonic] = make_fixnum ('-');
 12206   plist[5] = args[coding_arg_coding_type] = Qundecided;
 12207   /* This is already set.
 12208      plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
 12209   plist[8] = intern_c_string (":charset-list");
 12210   plist[9] = args[coding_arg_charset_list] = list1 (Qascii);
 12211   plist[11] = args[coding_arg_for_unibyte] = Qnil;
 12212   plist[13] = build_pure_c_string ("No conversion on encoding, "
 12213                                    "automatic conversion on decoding.");
 12214   plist[15] = args[coding_arg_eol_type] = Qnil;
 12215   args[coding_arg_plist] = CALLMANY (Flist, plist);
 12216   args[coding_arg_undecided_inhibit_null_byte_detection] = make_fixnum (0);
 12217   args[coding_arg_undecided_inhibit_iso_escape_detection] = make_fixnum (0);
 12218   Fdefine_coding_system_internal (coding_arg_undecided_max, args);
 12219 
 12220   setup_coding_system (Qno_conversion, &safe_terminal_coding);
 12221 
 12222   for (int i = 0; i < coding_category_max; i++)
 12223     Fset (AREF (Vcoding_category_table, i), Qno_conversion);
 12224 
 12225   pdumper_do_now_and_after_load (reset_coding_after_pdumper_load);
 12226 }
 12227 
 12228 static void
 12229 reset_coding_after_pdumper_load (void)
 12230 {
 12231   if (!dumped_with_pdumper_p ())
 12232     return;
 12233   for (struct coding_system *this = &coding_categories[0];
 12234        this < &coding_categories[coding_category_max];
 12235        ++this)
 12236     {
 12237       int id = this->id;
 12238       if (id >= 0)
 12239         {
 12240           /* Need to rebuild the coding system object because we
 12241              persisted it as a scalar and it's full of gunk that's now
 12242              invalid.  */
 12243           memset (this, 0, sizeof (*this));
 12244           setup_coding_system (CODING_ID_NAME (id), this);
 12245         }
 12246     }
 12247   /* In temacs the below is done by mule-conf.el, because we need to
 12248      define us-ascii first.  But in dumped Emacs us-ascii is restored
 12249      by the above loop, and mule-conf.el will not be loaded, so we set
 12250      it up now; otherwise safe_terminal_coding will remain zeroed.  */
 12251   Fset_safe_terminal_coding_system_internal (Qus_ascii);
 12252 }
/* [<][>][^][v][top][bottom][index][help] */
root/src/coding.c

DEFINITIONS