root/src/coding.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. detect_coding_XXX
  2. decode_coding_XXXX
  3. encode_coding_XXX
  4. encode_inhibit_flag
  5. inhibit_flag
  6. growable_destination
  7. record_conversion_result
  8. coding_set_source
  9. coding_change_source
  10. coding_set_destination
  11. coding_change_destination
  12. coding_alloc_by_realloc
  13. coding_alloc_by_making_gap
  14. alloc_destination
  15. detect_coding_utf_8
  16. decode_coding_utf_8
  17. encode_coding_utf_8
  18. detect_coding_utf_16
  19. decode_coding_utf_16
  20. encode_coding_utf_16
  21. detect_coding_emacs_mule
  22. emacs_mule_char
  23. emacs_mule_finish_composition
  24. decode_coding_emacs_mule
  25. encode_coding_emacs_mule
  26. setup_iso_safe_charsets
  27. detect_coding_iso_2022
  28. finish_composition
  29. decode_coding_iso_2022
  30. encode_invocation_designation
  31. encode_designation_at_bol
  32. encode_coding_iso_2022
  33. detect_coding_sjis
  34. detect_coding_big5
  35. decode_coding_sjis
  36. decode_coding_big5
  37. encode_coding_sjis
  38. encode_coding_big5
  39. detect_coding_ccl
  40. decode_coding_ccl
  41. encode_coding_ccl
  42. decode_coding_raw_text
  43. encode_coding_raw_text
  44. detect_coding_charset
  45. decode_coding_charset
  46. encode_coding_charset
  47. setup_coding_system
  48. coding_charset_list
  49. coding_system_charset_list
  50. raw_text_coding_system
  51. raw_text_coding_system_p
  52. coding_inherit_eol_type
  53. complement_process_encoding_system
  54. check_ascii
  55. check_utf_8
  56. utf8_string_p
  57. make_string_from_utf8
  58. detect_eol
  59. adjust_coding_eol_type
  60. detect_coding
  61. decode_eol
  62. get_translation_table
  63. get_translation
  64. produce_chars
  65. produce_composition
  66. produce_charset
  67. produce_annotation
  68. decode_coding
  69. handle_composition_annotation
  70. handle_charset_annotation
  71. consume_chars
  72. encode_coding
  73. code_conversion_restore
  74. code_conversion_save
  75. coding_restore_undo_list
  76. decode_coding_gap
  77. decode_coding_object
  78. encode_coding_object
  79. preferred_coding_system
  80. from_unicode
  81. from_unicode_buffer
  82. to_unicode
  83. DEFUN
  84. DEFUN
  85. DEFUN
  86. detect_coding_system
  87. char_encodable_p
  88. code_convert_region
  89. string_ascii_p
  90. code_convert_string
  91. code_convert_string_norecord
  92. get_buffer_gap_address
  93. get_char_bytes
  94. encode_string_utf_8
  95. decode_string_utf_8
  96. convert_string_nocopy
  97. decode_file_name
  98. encode_file_name_1
  99. encode_file_name
  100. DEFUN
  101. DEFUN
  102. DEFUN
  103. DEFUN
  104. DEFUN
  105. DEFUN
  106. DEFUN
  107. DEFUN
  108. make_subsidiaries
  109. DEFUN
  110. DEFUN
  111. DEFUN
  112. DEFUN
  113. init_coding_once
  114. syms_of_coding
  115. reset_coding_after_pdumper_load

     1 /* Coding system handler (conversion, detection, etc).
     2    Copyright (C) 2001-2023 Free Software Foundation, Inc.
     3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
     4      2005, 2006, 2007, 2008, 2009, 2010, 2011
     5      National Institute of Advanced Industrial Science and Technology (AIST)
     6      Registration Number H14PRO021
     7    Copyright (C) 2003
     8      National Institute of Advanced Industrial Science and Technology (AIST)
     9      Registration Number H13PRO009
    10 
    11 This file is part of GNU Emacs.
    12 
    13 GNU Emacs is free software: you can redistribute it and/or modify
    14 it under the terms of the GNU General Public License as published by
    15 the Free Software Foundation, either version 3 of the License, or (at
    16 your option) any later version.
    17 
    18 GNU Emacs is distributed in the hope that it will be useful,
    19 but WITHOUT ANY WARRANTY; without even the implied warranty of
    20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    21 GNU General Public License for more details.
    22 
    23 You should have received a copy of the GNU General Public License
    24 along with GNU Emacs.  If not, see <https://www.gnu.org/licenses/>.  */
    25 
    26 /*** TABLE OF CONTENTS ***
    27 
    28   0. General comments
    29   1. Preamble
    30   2. Emacs' internal format (emacs-utf-8) handlers
    31   3. UTF-8 handlers
    32   4. UTF-16 handlers
    33   5. Charset-base coding systems handlers
    34   6. emacs-mule (old Emacs' internal format) handlers
    35   7. ISO2022 handlers
    36   8. Shift-JIS and BIG5 handlers
    37   9. CCL handlers
    38   10. C library functions
    39   11. Emacs Lisp library functions
    40   12. Postamble
    41 
    42 */
    43 
    44 /*** 0. General comments ***
    45 
    46 
    47 CODING SYSTEM
    48 
    49   A coding system is an object for an encoding mechanism that contains
    50   information about how to convert byte sequences to character
    51   sequences and vice versa.  When we say "decode", it means converting
    52   a byte sequence of a specific coding system into a character
    53   sequence that is represented by Emacs' internal coding system
    54   `emacs-utf-8', and when we say "encode", it means converting a
    55   character sequence of emacs-utf-8 to a byte sequence of a specific
    56   coding system.
    57 
    58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
    59   the C level, a coding system is represented by a vector of attributes
    60   stored in the hash table Vcoding_system_hash_table.  The conversion
    61   from coding system symbol to attributes vector is done by looking up
    62   Vcoding_system_hash_table by the symbol.
    63 
    64   Coding systems are classified into the following types depending on
    65   the encoding mechanism.  Here's a brief description of the types.
    66 
    67   o UTF-8
    68 
    69   o UTF-16
    70 
    71   o Charset-base coding system
    72 
    73   A coding system defined by one or more (coded) character sets.
    74   Decoding and encoding are done by a code converter defined for each
    75   character set.
    76 
    77   o Old Emacs internal format (emacs-mule)
    78 
    79   The coding system adopted by old versions of Emacs (20 and 21).
    80 
    81   o ISO2022-base coding system
    82 
    83   The most famous coding system for multiple character sets.  X's
    84   Compound Text, various EUCs (Extended Unix Code), and coding systems
    85   used in the Internet communication such as ISO-2022-JP are all
    86   variants of ISO2022.
    87 
    88   o SJIS (or Shift-JIS or MS-Kanji-Code)
    89 
    90   A coding system to encode character sets: ASCII, JISX0201, and
    91   JISX0208.  Widely used for PC's in Japan.  Details are described in
    92   section 8.
    93 
    94   o BIG5
    95 
    96   A coding system to encode character sets: ASCII and Big5.  Widely
    97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
    98   described in section 8.  In this file, when we write "big5" (all
    99   lowercase), we mean the coding system, and when we write "Big5"
   100   (capitalized), we mean the character set.
   101 
   102   o CCL
   103 
   104   If a user wants to decode/encode text encoded in a coding system
   105   not listed above, he can supply a decoder and an encoder for it in
   106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
   107   program while decoding/encoding.
   108 
   109   o Raw-text
   110 
   111   A coding system for text containing raw eight-bit data.  Emacs
   112   treats each byte of source text as a character (except for
   113   end-of-line conversion).
   114 
   115   o No-conversion
   116 
   117   Like raw text, but don't do end-of-line conversion.
   118 
   119 
   120 END-OF-LINE FORMAT
   121 
   122   How text end-of-line is encoded depends on operating system.  For
   123   instance, Unix's format is just one byte of LF (line-feed) code,
   124   whereas DOS's format is two-byte sequence of `carriage-return' and
   125   `line-feed' codes.  Classic Mac OS's format is usually one byte of
   126   `carriage-return'.
   127 
   128   Since text character encoding and end-of-line encoding are
   129   independent, any coding system described above can take any format
   130   of end-of-line (except for no-conversion).
   131 
   132 STRUCT CODING_SYSTEM
   133 
   134   Before using a coding system for code conversion (i.e. decoding and
   135   encoding), we setup a structure of type `struct coding_system'.
   136   This structure keeps various information about a specific code
   137   conversion (e.g. the location of source and destination data).
   138 
   139 */
   140 
   141 /* COMMON MACROS */
   142 
   143 
   144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
   145 
   146   These functions check if a byte sequence specified as a source in
   147   CODING conforms to the format of XXX, and update the members of
   148   DETECT_INFO.
   149 
   150   Return true if the byte sequence conforms to XXX.
   151 
   152   Below is the template of these functions.  */
   153 
   154 #if 0
   155 static bool
   156 detect_coding_XXX (struct coding_system *coding,
   157                    struct coding_detection_info *detect_info)
   158 {
   159   const unsigned char *src = coding->source;
   160   const unsigned char *src_end = coding->source + coding->src_bytes;
   161   bool multibytep = coding->src_multibyte;
   162   ptrdiff_t consumed_chars = 0;
   163   int found = 0;
   164   ...;  /* Including C and SRC_BASE, which ONE_MORE_BYTE refers to.  */
   165 
   166   while (1)
   167     {
   168       /* Get one byte from the source.  If the source is exhausted, jump
   169          to no_more_source:.  */
   170       ONE_MORE_BYTE (c);
   171       /* A nonconforming byte rejects XXX; a telltale byte marks XXX as found.  */
   172       if (! __C_conforms_to_XXX___ (c))
   173         break;
   174       if (__C_strongly_suggests_XXX__ (c))
   175         found = CATEGORY_MASK_XXX;
   176     }
   177   /* The byte sequence is invalid for XXX.  */
   178   detect_info->rejected |= CATEGORY_MASK_XXX;
   179   return 0;
   180 
   181  no_more_source:
   182   /* The source exhausted successfully.  */
   183   detect_info->found |= found;
   184   return 1;
   185 }
   186 #endif
   187 
   188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
   189 
   190   These functions decode a byte sequence specified as a source by
   191   CODING.  The resulting multibyte text goes to a place pointed to by
   192   CODING->charbuf, the length of which should not exceed
   193   CODING->charbuf_size.
   194 
   195   These functions set the information of original and decoded texts in
   196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
   197   They also set CODING->result to one of CODING_RESULT_XXX indicating
   198   how the decoding is finished.
   199 
   200   Below is the template of these functions.  */
   201 
   202 #if 0
   203 static void
   204 decode_coding_XXXX (struct coding_system *coding)
   205 {
   206   const unsigned char *src = coding->source + coding->consumed;
   207   const unsigned char *src_end = coding->source + coding->src_bytes;
   208   /* SRC_BASE remembers the start position in source in each loop.
   209      The loop will be exited when there's not enough source code, or
   210      when there's no room in CHARBUF for a decoded character.  */
   211   const unsigned char *src_base;
   212   /* A buffer to produce decoded characters.  */
   213   int *charbuf = coding->charbuf + coding->charbuf_used;
   214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
   215   bool multibytep = coding->src_multibyte;
   216   /* A real decoder also declares C and CONSUMED_CHARS for ONE_MORE_BYTE.  */
   217   while (1)
   218     {
   219       src_base = src;
   220       if (charbuf >= charbuf_end)
   221         /* No more room to produce a decoded character.  */
   222         break;
   223       ONE_MORE_BYTE (c);
   224       /* Decode it. */
   225     }
   226 
   227  no_more_source:
   228   if (src_base < src_end
   229       && coding->mode & CODING_MODE_LAST_BLOCK)
   230     /* If the source ends by partial bytes to construct a character,
   231        treat them as eight-bit raw data.  */
   232     while (src_base < src_end && charbuf < charbuf_end)
   233       *charbuf++ = *src_base++;
   234   /* Remember how many bytes and characters we consumed.  If the source is
   235      multibyte, they differ (NOTE(review): this template stores one value in both).  */
   236   coding->consumed = coding->consumed_char = src_base - coding->source;
   237   /* Remember how many characters we produced.  */
   238   coding->charbuf_used = charbuf - coding->charbuf;
   239 }
   240 #endif
   241 
   242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
   243 
   244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
   245   internal multibyte format by CODING.  The resulting byte sequence
   246   goes to a place pointed to by DESTINATION, the length of which
   247   should not exceed DST_BYTES.
   248 
   249   These functions set the information of original and encoded texts in
   250   the members produced, produced_char, consumed, and consumed_char of
   251   the structure *CODING.  They also set the member result to one of
   252   CODING_RESULT_XXX indicating how the encoding finished.
   253 
   254   DST_BYTES zero means that source area and destination area are
   255   overlapped, which means that we can produce an encoded text until it
   256   reaches at the head of not-yet-encoded source text.
   257 
   258   Below is a template of these functions.  */
   259 #if 0
   260 static void
   261 encode_coding_XXX (struct coding_system *coding)
   262 {
   263   bool multibytep = coding->dst_multibyte;
   264   int *charbuf = coding->charbuf;
   265   int *charbuf_end = charbuf + coding->charbuf_used;
   266   unsigned char *dst = coding->destination + coding->produced;
   267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
   268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
   269   ptrdiff_t produced_chars = 0;
   270   /* Stop at ADJUSTED_DST_END so that one iteration cannot write past DST_END.  */
   271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
   272     {
   273       int c = *charbuf;
   274       /* Encode C into DST, and increment DST.  */
   275     }
   276  label_no_more_destination:
   277   /* How many chars and bytes we produced.  */
   278   coding->produced_char += produced_chars;
   279   coding->produced = dst - coding->destination;
   280 }
   281 #endif
   282 
   283 
   284 /*** 1. Preamble ***/
   285 
   286 #include <config.h>
   287 
   288 #ifdef HAVE_WCHAR_H
   289 #include <wchar.h>
   290 #endif /* HAVE_WCHAR_H */
   291 
   292 #include "lisp.h"
   293 #include "character.h"
   294 #include "buffer.h"
   295 #include "charset.h"
   296 #include "ccl.h"
   297 #include "composite.h"
   298 #include "coding.h"
   299 #include "termhooks.h"
   300 #include "pdumper.h"
   301 
   302 Lisp_Object Vcoding_system_hash_table;
   303 
   304 /* Coding-systems are handed between Emacs Lisp programs and C internal
   305    routines by the following three variables.  */
   306 /* Coding system to be used to encode text for terminal display when
   307    terminal coding system is nil.  */
   308 struct coding_system safe_terminal_coding;
   309 
   310 /* Two special coding systems.  */
   311 static Lisp_Object Vsjis_coding_system;
   312 static Lisp_Object Vbig5_coding_system;
   313 
   314 /* ISO2022 section */
   315 /* Fixnum stored at index REG of CODING's coding_attr_iso_initial attribute.  */
   316 #define CODING_ISO_INITIAL(coding, reg)                 \
   317   (XFIXNUM (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
   318                      coding_attr_iso_initial),          \
   319                reg)))
   320 
   321 /* CODING's safe_charsets[CHARSET_ID], or -1 if CHARSET_ID exceeds max_charset_id or the entry is 255.  */
   322 #define CODING_ISO_REQUEST(coding, charset_id)          \
   323   (((charset_id) <= (coding)->max_charset_id            \
   324     ? ((coding)->safe_charsets[charset_id] != 255       \
   325        ? (coding)->safe_charsets[charset_id]            \
   326        : -1)                                            \
   327     : -1))
   328 
   329 /* Accessors for the members of CODING's spec.iso_2022.  */
   330 #define CODING_ISO_FLAGS(coding)        \
   331   ((coding)->spec.iso_2022.flags)
   332 #define CODING_ISO_DESIGNATION(coding, reg)     \
   333   ((coding)->spec.iso_2022.current_designation[reg])
   334 #define CODING_ISO_INVOCATION(coding, plane)    \
   335   ((coding)->spec.iso_2022.current_invocation[plane])
   336 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
   337   ((coding)->spec.iso_2022.single_shifting)
   338 #define CODING_ISO_BOL(coding)  \
   339   ((coding)->spec.iso_2022.bol)
   340 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
   341   (CODING_ISO_INVOCATION (coding, plane) < 0 ? -1       \
   342    : CODING_ISO_DESIGNATION (coding, CODING_ISO_INVOCATION (coding, plane)))
   343 #define CODING_ISO_CMP_STATUS(coding)   \
   344   (&(coding)->spec.iso_2022.cmp_status)
   345 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
   346   ((coding)->spec.iso_2022.ctext_extended_segment_len)
   347 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
   348   ((coding)->spec.iso_2022.embedded_utf_8)
   349 
   350 /* Control characters of ISO2022.  */
   351                         /* code */      /* function */
   352 #define ISO_CODE_SO     0x0E            /* shift-out */
   353 #define ISO_CODE_SI     0x0F            /* shift-in */
   354 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
   355 #define ISO_CODE_ESC    0x1B            /* escape */
   356 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
   357 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
   358 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
   359 
   360 /* Every 1-byte code of ISO2022 is classified into one of the
   361    following classes.  */
   362 enum iso_code_class_type
   363   {
   364     ISO_control_0,              /* Control codes in the range
   365                                    0x00..0x1F and 0x7F, except for the
   366                                    following 4 codes.  */
   367     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
   368     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
   369     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
   370     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
   371     ISO_control_1,              /* Control codes in the range
   372                                    0x80..0x9F, except for the
   373                                    following 3 codes.  */
   374     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
   375     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
   376     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
   377     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
   378     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
   379     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
   380     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
   381   };
   382 
   383 /** The macros CODING_ISO_FLAG_XXX define the flag bits of the
   384     `iso-flags' attribute of an iso2022 coding system.  */
   385 
   386 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
   387    instead of the correct short-form sequence (e.g. ESC $ A).  */
   388 #define CODING_ISO_FLAG_LONG_FORM       0x0001
   389 
   390 /* If set, reset graphic planes and registers at end-of-line to the
   391    initial state.  */
   392 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
   393 
   394 /* If set, reset graphic planes and registers before any control
   395    characters to the initial state.  */
   396 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
   397 
   398 /* If set, encode by 7-bit environment.  */
   399 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
   400 
   401 /* If set, use locking-shift function.  */
   402 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
   403 
   404 /* If set, use single-shift function.  Overwrite
   405    CODING_ISO_FLAG_LOCKING_SHIFT.  */
   406 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
   407 
   408 /* If set, use designation escape sequence.  */
   409 #define CODING_ISO_FLAG_DESIGNATION     0x0040
   410 
   411 /* If set, produce revision number sequence.  */
   412 #define CODING_ISO_FLAG_REVISION        0x0080
   413 
   414 /* If set, produce ISO6429's direction specifying sequence.  */
   415 #define CODING_ISO_FLAG_DIRECTION       0x0100
   416 
   417 /* If set, assume designation states are reset at beginning of line on
   418    output.  */
   419 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
   420 
   421 /* If set, designation sequence should be placed at beginning of line
   422    on output.  */
   423 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
   424 
   425 /* If set, do not encode unsafe characters on output.  */
   426 #define CODING_ISO_FLAG_SAFE            0x0800
   427 
   428 /* If set, extra latin codes (128..159) are accepted as a valid code
   429    on input.  */
   430 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
   431 
   432 #define CODING_ISO_FLAG_COMPOSITION     0x2000
   433 
   434 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
   435 
   436 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
   437 
   438 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
   439 
   440 #define CODING_ISO_FLAG_LEVEL_4         0x20000
   441 
   442 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
   443 
   444 /* A character to be produced on output if encoding of the original
   445    character is prohibited by CODING_ISO_FLAG_SAFE.  */
   446 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
   447 
   448 /* UTF-8 section: accessor for the utf_8_bom member of CODING's spec.  */
   449 #define CODING_UTF_8_BOM(coding)        \
   450   ((coding)->spec.utf_8_bom)
   451 
   452 /* UTF-16 section: accessors for the members of CODING's spec.utf_16.  */
   453 #define CODING_UTF_16_BOM(coding)       \
   454   ((coding)->spec.utf_16.bom)
   455 
   456 #define CODING_UTF_16_ENDIAN(coding)    \
   457   ((coding)->spec.utf_16.endian)
   458 
   459 #define CODING_UTF_16_SURROGATE(coding) \
   460   ((coding)->spec.utf_16.surrogate)
   461 
   462 
   463 /* CCL section: CODING's ccl_decoder and ccl_encoder attributes, and the SDATA of its ccl_valids attribute.  */
   464 #define CODING_CCL_DECODER(coding)      \
   465   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
   466 #define CODING_CCL_ENCODER(coding)      \
   467   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
   468 #define CODING_CCL_VALIDS(coding)                                          \
   469   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
   470 
   471 /* Index for each coding category in `coding_categories'.  The
   472    CATEGORY_MASK_XXX bits are formed by shifting 1 by these values.  */
   473 enum coding_category
   474   {
   475     coding_category_iso_7,
   476     coding_category_iso_7_tight,
   477     coding_category_iso_8_1,
   478     coding_category_iso_8_2,
   479     coding_category_iso_7_else,
   480     coding_category_iso_8_else,
   481     coding_category_utf_8_auto,
   482     coding_category_utf_8_nosig,
   483     coding_category_utf_8_sig,
   484     coding_category_utf_16_auto,
   485     coding_category_utf_16_be,
   486     coding_category_utf_16_le,
   487     coding_category_utf_16_be_nosig,
   488     coding_category_utf_16_le_nosig,
   489     coding_category_charset,
   490     coding_category_sjis,
   491     coding_category_big5,
   492     coding_category_ccl,
   493     coding_category_emacs_mule,
   494     /* All above are targets of code detection.  */
   495     coding_category_raw_text,
   496     coding_category_undecided,
   497     coding_category_max         /* Dimension of coding_priorities and coding_categories.  */
   498   };
   499 
   500 /* Definitions of flag bits used in detect_coding_XXXX.  */
   501 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
   502 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
   503 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
   504 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
   505 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
   506 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
   507 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
   508 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
   509 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
   510 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
   511 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
   512 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
   513 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
   514 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
   515 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
   516 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
   517 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
   518 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
   519 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
   520 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
   521 
   522 /* This value is returned if detect_coding_mask () finds nothing other
   523    than ASCII characters.  */
   524 #define CATEGORY_MASK_ANY               \
   525   (CATEGORY_MASK_ISO_7                  \
   526    | CATEGORY_MASK_ISO_7_TIGHT          \
   527    | CATEGORY_MASK_ISO_8_1              \
   528    | CATEGORY_MASK_ISO_8_2              \
   529    | CATEGORY_MASK_ISO_7_ELSE           \
   530    | CATEGORY_MASK_ISO_8_ELSE           \
   531    | CATEGORY_MASK_UTF_8_AUTO           \
   532    | CATEGORY_MASK_UTF_8_NOSIG          \
   533    | CATEGORY_MASK_UTF_8_SIG            \
   534    | CATEGORY_MASK_UTF_16_AUTO          \
   535    | CATEGORY_MASK_UTF_16_BE            \
   536    | CATEGORY_MASK_UTF_16_LE            \
   537    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
   538    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
   539    | CATEGORY_MASK_CHARSET              \
   540    | CATEGORY_MASK_SJIS                 \
   541    | CATEGORY_MASK_BIG5                 \
   542    | CATEGORY_MASK_CCL                  \
   543    | CATEGORY_MASK_EMACS_MULE)
   544 
   545 
   546 #define CATEGORY_MASK_ISO_7BIT \
   547   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
   548 
   549 #define CATEGORY_MASK_ISO_8BIT \
   550   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
   551 
   552 #define CATEGORY_MASK_ISO_ELSE \
   553   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
   554 
   555 #define CATEGORY_MASK_ISO_ESCAPE        \
   556   (CATEGORY_MASK_ISO_7                  \
   557    | CATEGORY_MASK_ISO_7_TIGHT          \
   558    | CATEGORY_MASK_ISO_7_ELSE           \
   559    | CATEGORY_MASK_ISO_8_ELSE)
   560 
   561 #define CATEGORY_MASK_ISO       \
   562   (  CATEGORY_MASK_ISO_7BIT     \
   563      | CATEGORY_MASK_ISO_8BIT   \
   564      | CATEGORY_MASK_ISO_ELSE)
   565 
   566 #define CATEGORY_MASK_UTF_16            \
   567   (CATEGORY_MASK_UTF_16_AUTO            \
   568    | CATEGORY_MASK_UTF_16_BE            \
   569    | CATEGORY_MASK_UTF_16_LE            \
   570    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
   571    | CATEGORY_MASK_UTF_16_LE_NOSIG)
   572 
   573 #define CATEGORY_MASK_UTF_8     \
   574   (CATEGORY_MASK_UTF_8_AUTO     \
   575    | CATEGORY_MASK_UTF_8_NOSIG  \
   576    | CATEGORY_MASK_UTF_8_SIG)
   577 
   578 /* Table of coding categories (Lisp symbols).  This variable is for
   579    internal use only.  */
   580 static Lisp_Object Vcoding_category_table;
   581 
   582 /* Table of coding-categories ordered by priority.  */
   583 static enum coding_category coding_priorities[coding_category_max];
   584 
   585 /* Nth element is a coding context for the coding system bound to the
   586    Nth coding category.  */
   587 static struct coding_system coding_categories[coding_category_max];
   588 
   589 /* Map FLAG onto a tri-state int: -1 for nil, 1 for t, 0 for anything else.  */
   590 static int
   591 encode_inhibit_flag (Lisp_Object flag)
   592 {
   593   int when_non_nil = EQ (flag, Qt);
   594   return NILP (flag) ? -1 : when_non_nil;
   595 }
   596 
   597 /* Report whether a flag should be treated as set, given its tri-state
   598    ENCODED_FLAG: 1 means yes, -1 means no, 0 means use the user variable VAR.  */
   599 static bool
   600 inhibit_flag (int encoded_flag, bool var)
   601 {
   602   int total = encoded_flag + var;
   603   return total > 0;
   604 }
   605 /* Set ATTRS to CODING_ID_ATTRS of CODING's id, and CHARSET_LIST to the charset list of ATTRS.  */
   606 #define CODING_GET_INFO(coding, attrs, charset_list)    \
   607   do {                                                  \
   608     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
   609     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
   610   } while (false)
   611 
   612 /* Return true if CODING's destination can be grown, i.e. its dst_object is a buffer or a string.  */
   613 static bool
   614 growable_destination (struct coding_system *coding)
   615 {
   616   return (BUFFERP (coding->dst_object)
   617           || STRINGP (coding->dst_object));
   618 }
   619 
   620 
   621 /* Safely get one byte from the source text pointed by SRC which ends
   622    at SRC_END, and set C to that byte.  If the source is exhausted, jump
   623    to 'no_more_source' (recording CODING_RESULT_INSUFFICIENT_SRC first
   624    when SRC_BASE < SRC).  If MULTIBYTEP and the byte is 0xC0 or 0xC1,
   625    combine it with the next byte into C; for any other byte with the
   626    high bit set, set C to the negative value of the character code at
   627    SRC and record CODING_RESULT_INVALID_SRC.  The caller should declare and
   628    set in advance: src, src_end, src_base, multibytep, consumed_chars, coding  */
   629 #define ONE_MORE_BYTE(c)                                \
   630   do {                                                  \
   631     if (src == src_end)                                 \
   632       {                                                 \
   633         if (src_base < src)                             \
   634           record_conversion_result                      \
   635             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
   636         goto no_more_source;                            \
   637       }                                                 \
   638     c = *src++;                                         \
   639     if (multibytep && (c & 0x80))                       \
   640       {                                                 \
   641         if ((c & 0xFE) == 0xC0)                         \
   642           c = ((c & 1) << 6) | *src++;                  \
   643         else                                            \
   644           {                                             \
   645             src--;                                      \
   646             c = - string_char_advance (&src);           \
   647             record_conversion_result                    \
   648               (coding, CODING_RESULT_INVALID_SRC);      \
   649           }                                             \
   650       }                                                 \
   651     consumed_chars++;                                   \
   652   } while (0)
   653 
   654 /* Suppress clang warnings about consumed_chars never being used.
   655    Although correct, the warnings are too much trouble to code around.  */
   656 #if 13 <= __clang_major__ - defined __apple_build_version__
   657 # pragma clang diagnostic ignored "-Wunused-but-set-variable"
   658 #endif
   659 
   660 /* Safely get two bytes from the source text pointed by SRC which ends
   661    at SRC_END, and set C1 and C2 to those bytes while skipping the
   662    heading multibyte characters.  If there are not enough bytes in the
   663    source, it jumps to 'no_more_source'.  If MULTIBYTEP and
   664    a multibyte character is found for C2, set C2 to the negative value
   665    of the character code.  The caller should declare and set these
   666    variables appropriately in advance:
   667         src, src_end, multibytep
   668    It is intended that this macro is used in detect_coding_utf_16.  */
   669 
   670 #define TWO_MORE_BYTES(c1, c2)                          \
   671   do {                                                  \
   672     do {                                                \
   673       if (src == src_end)                               \
   674         goto no_more_source;                            \
   675       c1 = *src++;                                      \
   676       if (multibytep && (c1 & 0x80))                    \
   677         {                                               \
   678           if ((c1 & 0xFE) == 0xC0)                      \
   679             c1 = ((c1 & 1) << 6) | *src++;              \
   680           else                                          \
   681             {                                           \
   682               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
   683               c1 = -1;                                  \
   684             }                                           \
   685         }                                               \
   686     } while (c1 < 0);                                   \
   687     if (src == src_end)                                 \
   688       goto no_more_source;                              \
   689     c2 = *src++;                                        \
   690     if (multibytep && (c2 & 0x80))                      \
   691       {                                                 \
   692         if ((c2 & 0xFE) == 0xC0)                        \
   693           c2 = ((c2 & 1) << 6) | *src++;                \
   694         else                                            \
   695           c2 = -1;                                      \
   696       }                                                 \
   697   } while (0)
   698 
   699 
   700 /* Store a byte C in the place pointed by DST and increment DST to the
   701    next free point, and increment PRODUCED_CHARS.  The caller should
   702    assure that C is 0..127, and declare and set the variable `dst'
   703    appropriately in advance.
   704 */
   705 
   706 
   707 #define EMIT_ONE_ASCII_BYTE(c)  \
   708   do {                          \
   709     produced_chars++;           \
   710     *dst++ = (c);               \
   711   } while (0)
   712 
   713 
   714 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  */
   715 
   716 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
   717   do {                                  \
   718     produced_chars += 2;                \
   719     *dst++ = (c1), *dst++ = (c2);       \
   720   } while (0)
   721 
   722 
   723 /* Store a byte C in the place pointed by DST and increment DST to the
   724    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
   725    store in an appropriate multibyte form.  The caller should
   726    declare and set the variables `dst' and `multibytep' appropriately
   727    in advance.  */
   728 
   729 #define EMIT_ONE_BYTE(c)                \
   730   do {                                  \
   731     produced_chars++;                   \
   732     if (multibytep)                     \
   733       {                                 \
   734         unsigned ch = (c);              \
   735         if (ch >= 0x80)                 \
   736           ch = BYTE8_TO_CHAR (ch);      \
   737         dst += CHAR_STRING (ch, dst);   \
   738       }                                 \
   739     else                                \
   740       *dst++ = (c);                     \
   741   } while (0)
   742 
   743 
   744 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
   745 
   746 #define EMIT_TWO_BYTES(c1, c2)          \
   747   do {                                  \
   748     produced_chars += 2;                \
   749     if (multibytep)                     \
   750       {                                 \
   751         unsigned ch;                    \
   752                                         \
   753         ch = (c1);                      \
   754         if (ch >= 0x80)                 \
   755           ch = BYTE8_TO_CHAR (ch);      \
   756         dst += CHAR_STRING (ch, dst);   \
   757         ch = (c2);                      \
   758         if (ch >= 0x80)                 \
   759           ch = BYTE8_TO_CHAR (ch);      \
   760         dst += CHAR_STRING (ch, dst);   \
   761       }                                 \
   762     else                                \
   763       {                                 \
   764         *dst++ = (c1);                  \
   765         *dst++ = (c2);                  \
   766       }                                 \
   767   } while (0)
   768 
   769 
   770 #define EMIT_THREE_BYTES(c1, c2, c3)    \
   771   do {                                  \
   772     EMIT_ONE_BYTE (c1);                 \
   773     EMIT_TWO_BYTES (c2, c3);            \
   774   } while (0)
   775 
   776 
   777 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
   778   do {                                          \
   779     EMIT_TWO_BYTES (c1, c2);                    \
   780     EMIT_TWO_BYTES (c3, c4);                    \
   781   } while (0)
   782 
   783 
   784 static void
   785 record_conversion_result (struct coding_system *coding,
   786                           enum coding_result_code result)
   787 {
   788   coding->result = result;
   789   switch (result)
   790     {
   791     case CODING_RESULT_INSUFFICIENT_SRC:
   792       Vlast_code_conversion_error = Qinsufficient_source;
   793       break;
   794     case CODING_RESULT_INVALID_SRC:
   795       Vlast_code_conversion_error = Qinvalid_source;
   796       break;
   797     case CODING_RESULT_INTERRUPT:
   798       Vlast_code_conversion_error = Qinterrupted;
   799       break;
   800     case CODING_RESULT_INSUFFICIENT_DST:
   801       /* Don't record this error in Vlast_code_conversion_error
   802          because it happens just temporarily and is resolved when the
   803          whole conversion is finished.  */
   804       break;
   805     case CODING_RESULT_SUCCESS:
   806       break;
   807     default:
   808       Vlast_code_conversion_error = intern ("Unknown error");
   809     }
   810 }
   811 
   812 /* These wrapper macros are used to preserve validity of pointers into
   813    buffer text across calls to decode_char, encode_char, etc, which
   814    could cause relocation of buffers if it loads a charset map,
   815    because loading a charset map allocates large structures.  */
   816 
   817 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
   818   do {                                                                       \
   819     ptrdiff_t offset;                                                        \
   820                                                                              \
   821     charset_map_loaded = 0;                                                  \
   822     c = DECODE_CHAR (charset, code);                                         \
   823     if (charset_map_loaded                                                   \
   824         && (offset = coding_change_source (coding)))                         \
   825       {                                                                      \
   826         src += offset;                                                       \
   827         src_base += offset;                                                  \
   828         src_end += offset;                                                   \
   829       }                                                                      \
   830   } while (0)
   831 
   832 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
   833   do {                                                                  \
   834     ptrdiff_t offset;                                                   \
   835                                                                         \
   836     charset_map_loaded = 0;                                             \
   837     code = ENCODE_CHAR (charset, c);                                    \
   838     if (charset_map_loaded                                              \
   839         && (offset = coding_change_destination (coding)))               \
   840       {                                                                 \
   841         dst += offset;                                                  \
   842         dst_end += offset;                                              \
   843       }                                                                 \
   844   } while (0)
   845 
   846 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
   847   do {                                                                  \
   848     ptrdiff_t offset;                                                   \
   849                                                                         \
   850     charset_map_loaded = 0;                                             \
   851     charset = char_charset (c, charset_list, code_return);              \
   852     if (charset_map_loaded                                              \
   853         && (offset = coding_change_destination (coding)))               \
   854       {                                                                 \
   855         dst += offset;                                                  \
   856         dst_end += offset;                                              \
   857       }                                                                 \
   858   } while (0)
   859 
   860 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
   861   do {                                                                  \
   862     ptrdiff_t offset;                                                   \
   863                                                                         \
   864     charset_map_loaded = 0;                                             \
   865     result = CHAR_CHARSET_P (c, charset);                               \
   866     if (charset_map_loaded                                              \
   867         && (offset = coding_change_destination (coding)))               \
   868       {                                                                 \
   869         dst += offset;                                                  \
   870         dst_end += offset;                                              \
   871       }                                                                 \
   872   } while (0)
   873 
   874 
   875 /* If there are at least BYTES length of room at dst, allocate memory
   876    for coding->destination and update dst and dst_end.  We don't have
   877    to take care of coding->source which will be relocated.  It is
   878    handled by calling coding_set_source in encode_coding.  */
   879 
   880 #define ASSURE_DESTINATION(bytes)                               \
   881   do {                                                          \
   882     if (dst + (bytes) >= dst_end)                               \
   883       {                                                         \
   884         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
   885                                                                 \
   886         dst = alloc_destination (coding, more_bytes, dst);      \
   887         dst_end = coding->destination + coding->dst_bytes;      \
   888       }                                                         \
   889   } while (0)
   890 
   891 
   892 /* Store multibyte form of the character C in P, and advance P to the
   893    end of the multibyte form.  This used to be like adding CHAR_STRING
   894    without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
   895    MAYBE_UNIFY_CHAR in CHAR_STRING.  */
   896 
   897 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p) ((p) += CHAR_STRING (c, p))
   898 
   899 /* Return the character code of character whose multibyte form is at
   900    P, and advance P to the end of the multibyte form.  This used to be
   901    like string_char_advance without ever calling MAYBE_UNIFY_CHAR, but
   902    nowadays string_char_advance doesn't call MAYBE_UNIFY_CHAR.  */
   903 
   904 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) string_char_advance (&(p))
   905 
   906 /* Set coding->source from coding->src_object.  */
   907 
   908 static void
   909 coding_set_source (struct coding_system *coding)
   910 {
   911   if (BUFFERP (coding->src_object))
   912     {
   913       struct buffer *buf = XBUFFER (coding->src_object);
   914 
   915       if (coding->src_pos < 0)
   916         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
   917       else
   918         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
   919     }
   920   else if (STRINGP (coding->src_object))
   921     {
   922       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
   923     }
   924   else
   925     {
   926       /* Otherwise, the source is C string and is never relocated
   927          automatically.  Thus we don't have to update anything.  */
   928     }
   929 }
   930 
   931 
   932 /* Set coding->source from coding->src_object, and return how many
   933    bytes coding->source was changed.  */
   934 
   935 static ptrdiff_t
   936 coding_change_source (struct coding_system *coding)
   937 {
   938   const unsigned char *orig = coding->source;
   939   coding_set_source (coding);
   940   return coding->source - orig;
   941 }
   942 
   943 
   944 /* Set coding->destination from coding->dst_object.  */
   945 
   946 static void
   947 coding_set_destination (struct coding_system *coding)
   948 {
   949   if (BUFFERP (coding->dst_object))
   950     {
   951       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
   952         {
   953           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
   954           coding->dst_bytes = (GAP_END_ADDR
   955                                - (coding->src_bytes - coding->consumed)
   956                                - coding->destination);
   957         }
   958       else
   959         {
   960           /* We are sure that coding->dst_pos_byte is before the gap
   961              of the buffer. */
   962           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
   963                                  + coding->dst_pos_byte - BEG_BYTE);
   964           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
   965                                - coding->destination);
   966         }
   967     }
   968   else
   969     {
   970       /* Otherwise, the destination is C string and is never relocated
   971          automatically.  Thus we don't have to update anything.  */
   972     }
   973 }
   974 
   975 
   976 /* Set coding->destination from coding->dst_object, and return how
   977    many bytes coding->destination was changed.  */
   978 
   979 static ptrdiff_t
   980 coding_change_destination (struct coding_system *coding)
   981 {
   982   const unsigned char *orig = coding->destination;
   983   coding_set_destination (coding);
   984   return coding->destination - orig;
   985 }
   986 
   987 
   988 static void
   989 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
   990 {
   991   ptrdiff_t newbytes;
   992   if (ckd_add (&newbytes, coding->dst_bytes, bytes)
   993       || SIZE_MAX < newbytes)
   994     string_overflow ();
   995   coding->destination = xrealloc (coding->destination, newbytes);
   996   coding->dst_bytes = newbytes;
   997 }
   998 
   999 static void
  1000 coding_alloc_by_making_gap (struct coding_system *coding,
  1001                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
  1002 {
  1003   if (EQ (coding->src_object, coding->dst_object))
  1004     {
  1005       /* The gap may contain the produced data at the head and not-yet
  1006          consumed data at the tail.  To preserve those data, we at
  1007          first make the gap size to zero, then increase the gap
  1008          size.  */
  1009       ptrdiff_t add = GAP_SIZE;
  1010 
  1011       GPT += gap_head_used, GPT_BYTE += gap_head_used;
  1012       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
  1013       make_gap (bytes);
  1014       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
  1015       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
  1016     }
  1017   else
  1018     make_gap_1 (XBUFFER (coding->dst_object), bytes);
  1019 }
  1020 
  1021 
  1022 static unsigned char *
  1023 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
  1024                    unsigned char *dst)
  1025 {
  1026   ptrdiff_t offset = dst - coding->destination;
  1027 
  1028   if (BUFFERP (coding->dst_object))
  1029     {
  1030       struct buffer *buf = XBUFFER (coding->dst_object);
  1031 
  1032       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
  1033     }
  1034   else
  1035     coding_alloc_by_realloc (coding, nbytes);
  1036   coding_set_destination (coding);
  1037   dst = coding->destination + offset;
  1038   return dst;
  1039 }
  1040 
  1041 /** Macros for annotations.  */
  1042 
  1043 /* An annotation data is stored in the array coding->charbuf in this
  1044    format:
  1045      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
  1046    LENGTH is the number of elements in the annotation.
  1047    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
  1048    NCHARS is the number of characters in the text annotated.
  1049 
   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
     ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
  1055 
  1056    NBYTES is the number of bytes specified in the header part of
  1057    old-style emacs-mule encoding, or 0 for the other kind of
  1058    composition.
  1059 
  1060    METHOD is one of enum composition_method.
  1061 
  1062    Optional COMPOSITION-COMPONENTS are characters and composition
  1063    rules.
  1064 
  1065    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
  1066    follows.
  1067 
  1068    If ANNOTATION_MASK is 0, this annotation is just a space holder to
  1069    recover from an invalid annotation, and should be skipped by
  1070    produce_annotation.  */
  1071 
  1072 /* Maximum length of the header of annotation data.  */
  1073 #define MAX_ANNOTATION_LENGTH 5
  1074 
  1075 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  1076   do {                                                  \
  1077     *(buf)++ = -(len);                                  \
  1078     *(buf)++ = (mask);                                  \
  1079     *(buf)++ = (nchars);                                \
  1080     coding->annotated = 1;                              \
  1081   } while (0);
  1082 
  1083 #define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  1084   do {                                                                      \
  1085     ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
  1086     *buf++ = nbytes;                                                        \
  1087     *buf++ = method;                                                        \
  1088   } while (0)
  1089 
  1090 
  1091 #define ADD_CHARSET_DATA(buf, nchars, id)                               \
  1092   do {                                                                  \
  1093     ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
  1094     *buf++ = id;                                                        \
  1095   } while (0)
  1096 
  1097 
  1098 /* Bitmasks for coding->eol_seen.  */
  1099 
  1100 #define EOL_SEEN_NONE   0
  1101 #define EOL_SEEN_LF     1
  1102 #define EOL_SEEN_CR     2
  1103 #define EOL_SEEN_CRLF   4
  1104 
  1105 
  1106 /*** 2. Emacs' internal format (emacs-utf-8) ***/
  1107 
  1108 
  1109 
  1110 
  1111 /*** 3. UTF-8 ***/
  1112 
  1113 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
  1114    Return true if a text is encoded in UTF-8.  */
  1115 
  1116 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
  1117 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
  1118 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
  1119 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
  1120 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
  1121 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
  1122 
  1123 #define UTF_8_BOM_1 0xEF
  1124 #define UTF_8_BOM_2 0xBB
  1125 #define UTF_8_BOM_3 0xBF
  1126 
  1127 /* Unlike the other detect_coding_XXX, this function counts the number
  1128    of characters and checks the EOL format.  */
  1129 
  1130 static bool
  1131 detect_coding_utf_8 (struct coding_system *coding,
  1132                      struct coding_detection_info *detect_info)
  1133 {
  1134   const unsigned char *src = coding->source, *src_base;
  1135   const unsigned char *src_end = coding->source + coding->src_bytes;
  1136   bool multibytep = coding->src_multibyte;
  1137   ptrdiff_t consumed_chars = 0;
  1138   bool bom_found = 0;
  1139   ptrdiff_t nchars = coding->head_ascii;
  1140 
  1141   detect_info->checked |= CATEGORY_MASK_UTF_8;
  1142   /* A coding system of this category is always ASCII compatible.  */
  1143   src += nchars;
  1144 
  1145   if (src == coding->source     /* BOM should be at the head.  */
  1146       && src + 3 < src_end      /* BOM is 3-byte long.  */
  1147       && src[0] == UTF_8_BOM_1
  1148       && src[1] == UTF_8_BOM_2
  1149       && src[2] == UTF_8_BOM_3)
  1150     {
  1151       bom_found = 1;
  1152       src += 3;
  1153       nchars++;
  1154     }
  1155 
  1156   while (1)
  1157     {
  1158       int c, c1, c2, c3, c4;
  1159 
  1160       src_base = src;
  1161       ONE_MORE_BYTE (c);
  1162       if (c < 0 || UTF_8_1_OCTET_P (c))
  1163         {
  1164           nchars++;
  1165           if (c == '\r')
  1166             {
  1167               if (src < src_end && *src == '\n')
  1168                 {
  1169                   src++;
  1170                   nchars++;
  1171                 }
  1172             }
  1173           continue;
  1174         }
  1175       ONE_MORE_BYTE (c1);
  1176       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
  1177         break;
  1178       if (UTF_8_2_OCTET_LEADING_P (c))
  1179         {
  1180           nchars++;
  1181           continue;
  1182         }
  1183       ONE_MORE_BYTE (c2);
  1184       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
  1185         break;
  1186       if (UTF_8_3_OCTET_LEADING_P (c))
  1187         {
  1188           nchars++;
  1189           continue;
  1190         }
  1191       ONE_MORE_BYTE (c3);
  1192       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
  1193         break;
  1194       if (UTF_8_4_OCTET_LEADING_P (c))
  1195         {
  1196           nchars++;
  1197           continue;
  1198         }
  1199       ONE_MORE_BYTE (c4);
  1200       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
  1201         break;
  1202       if (UTF_8_5_OCTET_LEADING_P (c)
  1203           /* If we ever need to increase MAX_CHAR, the below may need
  1204              to be reviewed.  */
  1205           && c < MAX_MULTIBYTE_LEADING_CODE)
  1206         {
  1207           nchars++;
  1208           continue;
  1209         }
  1210       break;
  1211     }
  1212   detect_info->rejected |= CATEGORY_MASK_UTF_8;
  1213   return 0;
  1214 
  1215  no_more_source:
  1216   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
  1217     {
  1218       detect_info->rejected |= CATEGORY_MASK_UTF_8;
  1219       return 0;
  1220     }
  1221   if (bom_found)
  1222     {
  1223       /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
  1224       detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
  1225     }
  1226   else
  1227     {
  1228       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
  1229       if (nchars < src_end - coding->source)
  1230         /* The found characters are less than source bytes, which
  1231            means that we found a valid non-ASCII characters.  */
  1232         detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
  1233     }
  1234   coding->detected_utf8_bytes = src_base - coding->source;
  1235   coding->detected_utf8_chars = nchars;
  1236   return 1;
  1237 }
  1238 
  1239 
  1240 static void
  1241 decode_coding_utf_8 (struct coding_system *coding)
  1242 {
  1243   const unsigned char *src = coding->source + coding->consumed;
  1244   const unsigned char *src_end = coding->source + coding->src_bytes;
  1245   const unsigned char *src_base;
  1246   int *charbuf = coding->charbuf + coding->charbuf_used;
  1247   int *charbuf_end = coding->charbuf + coding->charbuf_size;
  1248   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
  1249   bool multibytep = coding->src_multibyte;
  1250   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
  1251   bool eol_dos
  1252     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  1253   int byte_after_cr = -1;
  1254 
  1255   if (bom != utf_without_bom)
  1256     {
  1257       int c1, c2, c3;
  1258 
  1259       src_base = src;
  1260       ONE_MORE_BYTE (c1);
  1261       if (! UTF_8_3_OCTET_LEADING_P (c1))
  1262         src = src_base;
  1263       else
  1264         {
  1265           ONE_MORE_BYTE (c2);
  1266           if (! UTF_8_EXTRA_OCTET_P (c2))
  1267             src = src_base;
  1268           else
  1269             {
  1270               ONE_MORE_BYTE (c3);
  1271               if (! UTF_8_EXTRA_OCTET_P (c3))
  1272                 src = src_base;
  1273               else
  1274                 {
  1275                   if ((c1 != UTF_8_BOM_1)
  1276                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
  1277                     src = src_base;
  1278                   else
  1279                     CODING_UTF_8_BOM (coding) = utf_without_bom;
  1280                 }
  1281             }
  1282         }
  1283     }
  1284   CODING_UTF_8_BOM (coding) = utf_without_bom;
  1285 
  1286   while (1)
  1287     {
  1288       int c, c1, c2, c3, c4, c5;
  1289 
  1290       src_base = src;
  1291       consumed_chars_base = consumed_chars;
  1292 
  1293       if (charbuf >= charbuf_end)
  1294         {
  1295           if (byte_after_cr >= 0)
  1296             src_base--;
  1297           break;
  1298         }
  1299 
  1300       /* In the simple case, rapidly handle ordinary characters */
  1301       if (multibytep && ! eol_dos
  1302           && charbuf < charbuf_end - 6 && src < src_end - 6)
  1303         {
  1304           while (charbuf < charbuf_end - 6 && src < src_end - 6)
  1305             {
  1306               c1 = *src;
  1307               if (c1 & 0x80)
  1308                 break;
  1309               src++;
  1310               consumed_chars++;
  1311               *charbuf++ = c1;
  1312 
  1313               c1 = *src;
  1314               if (c1 & 0x80)
  1315                 break;
  1316               src++;
  1317               consumed_chars++;
  1318               *charbuf++ = c1;
  1319 
  1320               c1 = *src;
  1321               if (c1 & 0x80)
  1322                 break;
  1323               src++;
  1324               consumed_chars++;
  1325               *charbuf++ = c1;
  1326 
  1327               c1 = *src;
  1328               if (c1 & 0x80)
  1329                 break;
  1330               src++;
  1331               consumed_chars++;
  1332               *charbuf++ = c1;
  1333             }
  1334           /* If we handled at least one character, restart the main loop.  */
  1335           if (src != src_base)
  1336             continue;
  1337         }
  1338 
  1339       if (byte_after_cr >= 0)
  1340         c1 = byte_after_cr, byte_after_cr = -1;
  1341       else
  1342         ONE_MORE_BYTE (c1);
  1343       if (c1 < 0)
  1344         {
  1345           c = - c1;
  1346         }
  1347       else if (UTF_8_1_OCTET_P (c1))
  1348         {
  1349           if (eol_dos && c1 == '\r')
  1350             ONE_MORE_BYTE (byte_after_cr);
  1351           c = c1;
  1352         }
  1353       else
  1354         {
  1355           ONE_MORE_BYTE (c2);
  1356           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
  1357             goto invalid_code;
  1358           if (UTF_8_2_OCTET_LEADING_P (c1))
  1359             {
  1360               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
  1361               /* Reject overlong sequences here and below.  Encoders
  1362                  producing them are incorrect, they can be misleading,
  1363                  and they mess up read/write invariance.  */
  1364               if (c < 128)
  1365                 goto invalid_code;
  1366             }
  1367           else
  1368             {
  1369               ONE_MORE_BYTE (c3);
  1370               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
  1371                 goto invalid_code;
  1372               if (UTF_8_3_OCTET_LEADING_P (c1))
  1373                 {
  1374                   c = (((c1 & 0xF) << 12)
  1375                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
  1376                   if (c < 0x800
  1377                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
  1378                     goto invalid_code;
  1379                 }
  1380               else
  1381                 {
  1382                   ONE_MORE_BYTE (c4);
  1383                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
  1384                     goto invalid_code;
  1385                   if (UTF_8_4_OCTET_LEADING_P (c1))
  1386                     {
  1387                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
  1388                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
  1389                     if (c < 0x10000)
  1390                       goto invalid_code;
  1391                     }
  1392                   else
  1393                     {
  1394                       ONE_MORE_BYTE (c5);
  1395                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
  1396                         goto invalid_code;
  1397                       if (UTF_8_5_OCTET_LEADING_P (c1))
  1398                         {
  1399                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
  1400                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
  1401                                | (c5 & 0x3F));
  1402                           if ((c > MAX_CHAR) || (c < 0x200000))
  1403                             goto invalid_code;
  1404                         }
  1405                       else
  1406                         goto invalid_code;
  1407                     }
  1408                 }
  1409             }
  1410         }
  1411 
  1412       *charbuf++ = c;
  1413       continue;
  1414 
  1415     invalid_code:
  1416       src = src_base;
  1417       consumed_chars = consumed_chars_base;
  1418       ONE_MORE_BYTE (c);
  1419       *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
  1420     }
  1421 
  1422  no_more_source:
  1423   coding->consumed_char += consumed_chars_base;
  1424   coding->consumed = src_base - coding->source;
  1425   coding->charbuf_used = charbuf - coding->charbuf;
  1426 }
  1427 
  1428 
  1429 bool
  1430 encode_coding_utf_8 (struct coding_system *coding)
  1431 {
  1432   bool multibytep = coding->dst_multibyte;
  1433   int *charbuf = coding->charbuf;
  1434   int *charbuf_end = charbuf + coding->charbuf_used;
  1435   unsigned char *dst = coding->destination + coding->produced;
  1436   unsigned char *dst_end = coding->destination + coding->dst_bytes;
  1437   ptrdiff_t produced_chars = 0;
  1438   int c;
  1439 
  1440   if (CODING_UTF_8_BOM (coding) != utf_without_bom)
  1441     {
  1442       ASSURE_DESTINATION (3);
  1443       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
  1444       CODING_UTF_8_BOM (coding) = utf_without_bom;
  1445     }
  1446 
  1447   if (multibytep)
  1448     {
  1449       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
  1450 
  1451       while (charbuf < charbuf_end)
  1452         {
  1453           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
  1454 
  1455           ASSURE_DESTINATION (safe_room);
  1456           c = *charbuf++;
  1457           if (CHAR_BYTE8_P (c))
  1458             {
  1459               c = CHAR_TO_BYTE8 (c);
  1460               EMIT_ONE_BYTE (c);
  1461             }
  1462           else
  1463             {
  1464               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
  1465               for (p = str; p < pend; p++)
  1466                 EMIT_ONE_BYTE (*p);
  1467             }
  1468         }
  1469     }
  1470   else
  1471     {
  1472       int safe_room = MAX_MULTIBYTE_LENGTH;
  1473 
  1474       while (charbuf < charbuf_end)
  1475         {
  1476           ASSURE_DESTINATION (safe_room);
  1477           c = *charbuf++;
  1478           if (CHAR_BYTE8_P (c))
  1479             *dst++ = CHAR_TO_BYTE8 (c);
  1480           else
  1481             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
  1482         }
  1483       produced_chars = dst - (coding->destination + coding->produced);
  1484     }
  1485   record_conversion_result (coding, CODING_RESULT_SUCCESS);
  1486   coding->produced_char += produced_chars;
  1487   coding->produced = dst - coding->destination;
  1488   return 0;
  1489 }
  1490 
  1491 
  1492 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
  1493    Return true if a text is encoded in one of UTF-16 based coding systems.  */
  1494 
  1495 static bool
  1496 detect_coding_utf_16 (struct coding_system *coding,
  1497                       struct coding_detection_info *detect_info)
  1498 {
  1499   const unsigned char *src = coding->source;
  1500   const unsigned char *src_end = coding->source + coding->src_bytes;
  1501   bool multibytep = coding->src_multibyte;
  1502   int c1, c2;
  1503 
  1504   detect_info->checked |= CATEGORY_MASK_UTF_16;
  1505   if (coding->mode & CODING_MODE_LAST_BLOCK
  1506       && (coding->src_chars & 1))
  1507     {
  1508       detect_info->rejected |= CATEGORY_MASK_UTF_16;
  1509       return 0;
  1510     }
  1511 
  1512   TWO_MORE_BYTES (c1, c2);
  1513   if ((c1 == 0xFF) && (c2 == 0xFE))
  1514     {
  1515       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
  1516                              | CATEGORY_MASK_UTF_16_AUTO);
  1517       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
  1518                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
  1519                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
  1520     }
  1521   else if ((c1 == 0xFE) && (c2 == 0xFF))
  1522     {
  1523       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
  1524                              | CATEGORY_MASK_UTF_16_AUTO);
  1525       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
  1526                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
  1527                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
  1528     }
  1529   else if (c2 < 0)
  1530     {
  1531       detect_info->rejected |= CATEGORY_MASK_UTF_16;
  1532       return 0;
  1533     }
  1534   else
  1535     {
  1536       /* We check the dispersion of Eth and Oth bytes where E is even and
  1537          O is odd.  If both are high, we assume binary data.*/
  1538       unsigned char e[256], o[256];
  1539       unsigned e_num = 1, o_num = 1;
  1540 
  1541       memset (e, 0, 256);
  1542       memset (o, 0, 256);
  1543       e[c1] = 1;
  1544       o[c2] = 1;
  1545 
  1546       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
  1547                                 |CATEGORY_MASK_UTF_16_BE
  1548                                 | CATEGORY_MASK_UTF_16_LE);
  1549 
  1550       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
  1551              != CATEGORY_MASK_UTF_16)
  1552         {
  1553           TWO_MORE_BYTES (c1, c2);
  1554           if (c2 < 0)
  1555             break;
  1556           if (! e[c1])
  1557             {
  1558               e[c1] = 1;
  1559               e_num++;
  1560               if (e_num >= 128)
  1561                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
  1562             }
  1563           if (! o[c2])
  1564             {
  1565               o[c2] = 1;
  1566               o_num++;
  1567               if (o_num >= 128)
  1568                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
  1569             }
  1570         }
  1571       return 0;
  1572     }
  1573 
  1574  no_more_source:
  1575   return 1;
  1576 }
  1577 
  1578 static void
  1579 decode_coding_utf_16 (struct coding_system *coding)
  1580 {
  1581   const unsigned char *src = coding->source + coding->consumed;
  1582   const unsigned char *src_end = coding->source + coding->src_bytes;
  1583   const unsigned char *src_base;
  1584   int *charbuf = coding->charbuf + coding->charbuf_used;
  1585   /* We may produces at most 3 chars in one loop.  */
  1586   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
  1587   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
  1588   bool multibytep = coding->src_multibyte;
  1589   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
  1590   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
  1591   int surrogate = CODING_UTF_16_SURROGATE (coding);
  1592   bool eol_dos
  1593     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  1594   int byte_after_cr1 = -1, byte_after_cr2 = -1;
  1595 
  1596   if (bom == utf_with_bom)
  1597     {
  1598       int c, c1, c2;
  1599 
  1600       src_base = src;
  1601       ONE_MORE_BYTE (c1);
  1602       ONE_MORE_BYTE (c2);
  1603       c = (c1 << 8) | c2;
  1604 
  1605       if (endian == utf_16_big_endian
  1606           ? c != 0xFEFF : c != 0xFFFE)
  1607         {
  1608           /* The first two bytes are not BOM.  Treat them as bytes
  1609              for a normal character.  */
  1610           src = src_base;
  1611         }
  1612       CODING_UTF_16_BOM (coding) = utf_without_bom;
  1613     }
  1614   else if (bom == utf_detect_bom)
  1615     {
  1616       /* We have already tried to detect BOM and failed in
  1617          detect_coding.  */
  1618       CODING_UTF_16_BOM (coding) = utf_without_bom;
  1619     }
  1620 
  1621   while (1)
  1622     {
  1623       int c, c1, c2;
  1624 
  1625       src_base = src;
  1626       consumed_chars_base = consumed_chars;
  1627 
  1628       if (charbuf >= charbuf_end)
  1629         {
  1630           if (byte_after_cr1 >= 0)
  1631             src_base -= 2;
  1632           break;
  1633         }
  1634 
  1635       if (byte_after_cr1 >= 0)
  1636         c1 = byte_after_cr1, byte_after_cr1 = -1;
  1637       else
  1638         ONE_MORE_BYTE (c1);
  1639       if (c1 < 0)
  1640         {
  1641           *charbuf++ = -c1;
  1642           continue;
  1643         }
  1644       if (byte_after_cr2 >= 0)
  1645         c2 = byte_after_cr2, byte_after_cr2 = -1;
  1646       else
  1647         ONE_MORE_BYTE (c2);
  1648       if (c2 < 0)
  1649         {
  1650           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
  1651           *charbuf++ = -c2;
  1652           continue;
  1653         }
  1654       c = (endian == utf_16_big_endian
  1655            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
  1656 
  1657       if (surrogate)
  1658         {
  1659           if (! UTF_16_LOW_SURROGATE_P (c))
  1660             {
  1661               if (endian == utf_16_big_endian)
  1662                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
  1663               else
  1664                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
  1665               *charbuf++ = c1;
  1666               *charbuf++ = c2;
  1667               if (UTF_16_HIGH_SURROGATE_P (c))
  1668                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
  1669               else
  1670                 *charbuf++ = c;
  1671             }
  1672           else
  1673             {
  1674               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
  1675               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
  1676               *charbuf++ = 0x10000 + c;
  1677             }
  1678         }
  1679       else
  1680         {
  1681           if (UTF_16_HIGH_SURROGATE_P (c))
  1682             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
  1683           else
  1684             {
  1685               if (eol_dos && c == '\r')
  1686                 {
  1687                   ONE_MORE_BYTE (byte_after_cr1);
  1688                   ONE_MORE_BYTE (byte_after_cr2);
  1689                 }
  1690               *charbuf++ = c;
  1691             }
  1692         }
  1693     }
  1694 
  1695  no_more_source:
  1696   coding->consumed_char += consumed_chars_base;
  1697   coding->consumed = src_base - coding->source;
  1698   coding->charbuf_used = charbuf - coding->charbuf;
  1699 }
  1700 
  1701 static bool
  1702 encode_coding_utf_16 (struct coding_system *coding)
  1703 {
  1704   bool multibytep = coding->dst_multibyte;
  1705   int *charbuf = coding->charbuf;
  1706   int *charbuf_end = charbuf + coding->charbuf_used;
  1707   unsigned char *dst = coding->destination + coding->produced;
  1708   unsigned char *dst_end = coding->destination + coding->dst_bytes;
  1709   int safe_room = 8;
  1710   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
  1711   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
  1712   ptrdiff_t produced_chars = 0;
  1713   int c;
  1714 
  1715   if (bom != utf_without_bom)
  1716     {
  1717       ASSURE_DESTINATION (safe_room);
  1718       if (big_endian)
  1719         EMIT_TWO_BYTES (0xFE, 0xFF);
  1720       else
  1721         EMIT_TWO_BYTES (0xFF, 0xFE);
  1722       CODING_UTF_16_BOM (coding) = utf_without_bom;
  1723     }
  1724 
  1725   while (charbuf < charbuf_end)
  1726     {
  1727       ASSURE_DESTINATION (safe_room);
  1728       c = *charbuf++;
  1729       if (c > MAX_UNICODE_CHAR)
  1730         c = coding->default_char;
  1731 
  1732       if (c < 0x10000)
  1733         {
  1734           if (big_endian)
  1735             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
  1736           else
  1737             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
  1738         }
  1739       else
  1740         {
  1741           int c1, c2;
  1742 
  1743           c -= 0x10000;
  1744           c1 = (c >> 10) + 0xD800;
  1745           c2 = (c & 0x3FF) + 0xDC00;
  1746           if (big_endian)
  1747             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
  1748           else
  1749             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
  1750         }
  1751     }
  1752   record_conversion_result (coding, CODING_RESULT_SUCCESS);
  1753   coding->produced = dst - coding->destination;
  1754   coding->produced_char += produced_chars;
  1755   return 0;
  1756 }
  1757 
  1758 
  1759 /*** 6. Old Emacs' internal format (emacs-mule) ***/
  1760 
  1761 /* Emacs' internal format for representation of multiple character
  1762    sets is a kind of multi-byte encoding, i.e. characters are
  1763    represented by variable-length sequences of one-byte codes.
  1764 
  1765    ASCII characters and control characters (e.g. `tab', `newline') are
  1766    represented by one-byte sequences which are their ASCII codes, in
  1767    the range 0x00 through 0x7F.
  1768 
  1769    8-bit characters of the range 0x80..0x9F are represented by
  1770    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
  1771    code + 0x20).
  1772 
  1773    8-bit characters of the range 0xA0..0xFF are represented by
  1774    one-byte sequences which are their 8-bit code.
  1775 
  1776    The other characters are represented by a sequence of `base
  1777    leading-code', optional `extended leading-code', and one or two
  1778    `position-code's.  The length of the sequence is determined by the
  1779    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
  1780    whereas extended leading-code and position-code take the range 0xA0
  1781    through 0xFF.  See `charset.h' for more details about leading-code
  1782    and position-code.
  1783 
  1784    --- CODE RANGE of Emacs' internal format ---
  1785    character set        range
  1786    -------------        -----
  1787    ascii                0x00..0x7F
  1788    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
  1789    eight-bit-graphic    0xA0..0xFF
  1790    ELSE                 0x81..0x9D + [0xA0..0xFF]+
  1791    ---------------------------------------------
  1792 
  1793    As this is the internal character representation, the format is
  1794    usually not used externally (i.e. in a file or in data sent to a
  1795    process).  However, it is possible to have text externally in this
  1796    format (i.e. by encoding it with the coding system `emacs-mule').
  1797 
  1798    In that case, a sequence of one-byte codes has a slightly different
  1799    form.
  1800 
  1801    First, all characters in eight-bit-control are represented by
  1802    one-byte sequences which are their 8-bit code.
  1803 
  1804    Next, character composition data are represented by the byte
  1805    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
  1806    where,
  1807         METHOD is 0xF2 plus one of composition method (enum
  1808         composition_method),
  1809 
  1810         BYTES is 0xA0 plus a byte length of this composition data,
  1811 
  1812         CHARS is 0xA0 plus a number of characters composed by this
  1813         data,
  1814 
  1815         COMPONENTs are characters of multibyte form or composition
  1816         rules encoded by two-byte of ASCII codes.
  1817 
  1818    In addition, for backward compatibility, the following formats are
  1819    also recognized as composition data on decoding.
  1820 
  1821    0x80 MSEQ ...
  1822    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
  1823 
  1824    Here,
  1825         MSEQ is a multibyte form but in this special format:
  1826           ASCII: 0xA0 ASCII_CODE+0x80,
  1827           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
  1828         RULE is a one byte code of the range 0xA0..0xF0 that
  1829         represents a composition rule.
  1830   */
  1831 
  1832 char emacs_mule_bytes[256];  /* Indexed by a byte value.  The code in this
                                       file reads an entry as the total length
                                       in bytes (1 to 4) of the sequence that
                                       starts with that byte:
                                       detect_coding_emacs_mule expects
                                       ENTRY - 1 further bytes, and
                                       emacs_mule_char aborts on any other
                                       value.  */
  1833 
  1834 
  1835 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
  1836    Return true if a text is encoded in 'emacs-mule'.

          The scan starts after the first CODING->head_ascii bytes of the
          source.  The category is rejected (and 0 returned) on ESC, SI or
          SO, on a 0x80 introducer followed by too short a run of bytes, or
          on a multibyte sequence whose trailing bytes are not all >= 0xA0.
          If the source runs out instead, 1 is returned and the category is
          marked as found when at least one composition or multibyte
          sequence was recognized -- unless the text ends inside a sequence
          in the last block, which is a rejection too.
          NOTE(review): ONE_MORE_BYTE is defined elsewhere; it is assumed to
          jump to `no_more_source' at the end of the input.  */
  1837 
  1838 static bool
  1839 detect_coding_emacs_mule (struct coding_system *coding,
  1840                           struct coding_detection_info *detect_info)
  1841 {
  1842   const unsigned char *src = coding->source, *src_base;
  1843   const unsigned char *src_end = coding->source + coding->src_bytes;
  1844   bool multibytep = coding->src_multibyte;
  1845   ptrdiff_t consumed_chars = 0;
  1846   int c;
  1847   int found = 0;
  1848 
  1849   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
  1850   /* A coding system of this category is always ASCII compatible.  */
  1851   src += coding->head_ascii;
  1852 
  1853   while (1)
  1854     {
             /* Remember where the current sequence starts, so that a
                truncated sequence can be recognized at `no_more_source'.  */
  1855       src_base = src;
  1856       ONE_MORE_BYTE (c);
             /* NOTE(review): a negative value presumably stands for a
                character of a multibyte source; it is simply skipped.  */
  1857       if (c < 0)
  1858         continue;
  1859       if (c == 0x80)
  1860         {
  1861           /* Perhaps the start of composite character.  We simply skip
  1862              it because analyzing it is too heavy for detecting.  But,
  1863              at least, we check that the composite character
  1864              consists of more than 4 bytes.  */
  1865           const unsigned char *src_start;
  1866 
  1867         repeat:
  1868           src_start = src;
                 /* Skip bytes up to and including the first one below 0xA0.  */
  1869           do
  1870             {
  1871               ONE_MORE_BYTE (c);
  1872             }
  1873           while (c >= 0xA0);
  1874 
                 /* Too short for a composition: leave the loop and reject.  */
  1875           if (src - src_start <= 4)
  1876             break;
  1877           found = CATEGORY_MASK_EMACS_MULE;
                 /* The byte that ended the run starts another composition.  */
  1878           if (c == 0x80)
  1879             goto repeat;
  1880         }
  1881 
             /* C is now either the byte read at the top of the loop or the
                byte that ended a composition run.  */
  1882       if (c < 0x80)
  1883         {
                 /* ESC, SI and SO leave the loop and reject the category.  */
  1884           if (c < 0x20
  1885               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
  1886             break;
  1887         }
  1888       else
  1889         {
                 /* Number of bytes that must follow the leading byte C.  */
  1890           int more_bytes = emacs_mule_bytes[c] - 1;
  1891 
  1892           while (more_bytes > 0)
  1893             {
  1894               ONE_MORE_BYTE (c);
  1895               if (c < 0xA0)
  1896                 {
  1897                   src--;        /* Unread the last byte.  */
  1898                   break;
  1899                 }
  1900               more_bytes--;
  1901             }
                 /* A following byte was below 0xA0: leave the loop and
                    reject.  */
  1902           if (more_bytes != 0)
  1903             break;
  1904           found = CATEGORY_MASK_EMACS_MULE;
  1905         }
  1906     }
         /* Reached only by the `break's above.  */
  1907   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
  1908   return 0;
  1909 
  1910  no_more_source:
         /* src_base < src means that at least one byte of an unfinished
            sequence was read before the source ran out.  */
  1911   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
  1912     {
  1913       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
  1914       return 0;
  1915     }
  1916   detect_info->found |= found;
  1917   return 1;
  1918 }
  1919 
  1920 
  1921 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
  1922    character.  If CMP_STATUS indicates that we must expect MSEQ or
  1923    RULE described above, decode it and return the negative value of
  1924    the decoded character or rule.  If an invalid byte is found, return
  1925    -1.  If SRC is too short, return -2.

          When a character is returned, *NBYTES is set to the number of
          source bytes read, *NCHARS to the local consumed_chars count
          (NOTE(review): presumably advanced by ONE_MORE_BYTE, which is
          defined elsewhere), and *ID, if ID is non-null, to the charset id
          of the character.  In the RULE case only *NBYTES and *NCHARS are
          set, and the value returned is the negated RULE byte itself.
          Nothing is stored on the -1 and -2 returns.  Reading is bounded by
          CODING->source + CODING->src_bytes.  */
  1926 
  1927 static int
  1928 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
  1929                  int *nbytes, int *nchars, int *id,
  1930                  struct composition_status *cmp_status)
  1931 {
  1932   const unsigned char *src_end = coding->source + coding->src_bytes;
  1933   const unsigned char *src_base = src;
  1934   bool multibytep = coding->src_multibyte;
  1935   int charset_ID;
  1936   unsigned code;
  1937   int c;
  1938   ptrdiff_t consumed_chars = 0;
  1939   bool mseq_found = 0;
  1940 
  1941   ONE_MORE_BYTE (c);
  1942   if (c < 0)
  1943     {
             /* NOTE(review): a negative value presumably is the negated code
                of a character found in a multibyte source.  It is returned
                without further decoding, with the charset id taken from
                emacs_mule_charset[0].  */
  1944       c = -c;
  1945       charset_ID = emacs_mule_charset[0];
  1946     }
  1947   else
  1948     {
  1949       if (c >= 0xA0)
  1950         {
                 /* A first byte >= 0xA0 is accepted only while an old-form
                    composition is being read.  */
  1951           if (cmp_status->state != COMPOSING_NO
  1952               && cmp_status->old_form)
  1953             {
  1954               if (cmp_status->state == COMPOSING_CHAR)
  1955                 {
                         /* Undo the MSEQ disguise to get the real first
                            byte: 0xA0 introduces ASCII_CODE+0x80, any other
                            byte is LEADING_CODE+0x20.  */
  1956                   if (c == 0xA0)
  1957                     {
  1958                       ONE_MORE_BYTE (c);
  1959                       c -= 0x80;
  1960                       if (c < 0)
  1961                         goto invalid_code;
  1962                     }
  1963                   else
  1964                     c -= 0x20;
  1965                   mseq_found = 1;
  1966                 }
  1967               else
  1968                 {
                         /* A character is not expected in this state, so
                            the byte is handed back negated as a RULE.  */
  1969                   *nbytes = src - src_base;
  1970                   *nchars = consumed_chars;
  1971                   return -c;
  1972                 }
  1973             }
  1974           else
  1975             goto invalid_code;
  1976         }
  1977 
             /* Dispatch on the total byte length of the sequence that
                starts with C.  */
  1978       switch (emacs_mule_bytes[c])
  1979         {
  1980         case 2:
                 /* C selects the charset; one more byte >= 0xA0 gives the
                    code.  */
  1981           if ((charset_ID = emacs_mule_charset[c]) < 0)
  1982             goto invalid_code;
  1983           ONE_MORE_BYTE (c);
  1984           if (c < 0xA0)
  1985             goto invalid_code;
  1986           code = c & 0x7F;
  1987           break;
  1988 
  1989         case 3:
  1990           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
  1991               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
  1992             {
                     /* The second byte selects the charset, the third one
                        gives a 7-bit code.  */
  1993               ONE_MORE_BYTE (c);
  1994               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
  1995                 goto invalid_code;
  1996               ONE_MORE_BYTE (c);
  1997               if (c < 0xA0)
  1998                 goto invalid_code;
  1999               code = c & 0x7F;
  2000             }
  2001           else
  2002             {
                     /* C selects the charset; the next two bytes give the
                        high and low 7 bits of the code.  */
  2003               if ((charset_ID = emacs_mule_charset[c]) < 0)
  2004                 goto invalid_code;
  2005               ONE_MORE_BYTE (c);
  2006               if (c < 0xA0)
  2007                 goto invalid_code;
  2008               code = (c & 0x7F) << 8;
  2009               ONE_MORE_BYTE (c);
  2010               if (c < 0xA0)
  2011                 goto invalid_code;
  2012               code |= c & 0x7F;
  2013             }
  2014           break;
  2015 
  2016         case 4:
                 /* The second byte selects the charset; the third and
                    fourth give the high and low 7 bits of the code.  */
  2017           ONE_MORE_BYTE (c);
  2018           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
  2019             goto invalid_code;
  2020           ONE_MORE_BYTE (c);
  2021           if (c < 0xA0)
  2022             goto invalid_code;
  2023           code = (c & 0x7F) << 8;
  2024           ONE_MORE_BYTE (c);
  2025           if (c < 0xA0)
  2026             goto invalid_code;
  2027           code |= c & 0x7F;
  2028           break;
  2029 
  2030         case 1:
                 /* A single byte stands for itself.  */
  2031           code = c;
  2032           charset_ID = ASCII_CHAR_P (code) ? charset_ascii : charset_eight_bit;
  2033           break;
  2034 
  2035         default:
  2036           emacs_abort ();
  2037         }
             /* Turn (charset, code) into a character stored in C; a negative
                result is treated as invalid.  */
  2038       CODING_DECODE_CHAR (coding, src, src_base, src_end,
  2039                           CHARSET_FROM_ID (charset_ID), code, c);
  2040       if (c < 0)
  2041         goto invalid_code;
  2042     }
  2043   *nbytes = src - src_base;
  2044   *nchars = consumed_chars;
  2045   if (id)
  2046     *id = charset_ID;
         /* A character that came from an MSEQ is reported negated.  */
  2047   return (mseq_found ? -c : c);
  2048 
  2049  no_more_source:
  2050   return -2;
  2051 
  2052  invalid_code:
  2053   return -1;
  2054 }
  2055 
  2056 
  2057 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
  2058 
  2059 /* Handle these composition sequences ('|': the end of header elements,
  2060    BYTES and CHARS >= 0xA0):
  2061 
  2062    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
  2063    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
  2064    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
  2065 
  2066    and these old forms:
  2067 
  2068    (4) relative composition: 0x80 | MSEQ ... MSEQ
  2069    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
  2070 
  2071    When the starter 0x80 and the following header elements are found,
  2072    this annotation header is produced.
  2073 
  2074         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
  2075 
  2076    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
  2077    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
  2078 
  2079    Then, upon reading the following elements, these codes are produced
  2080    until the composition end is found:
  2081 
  2082    (1) CHAR ... CHAR
  2083    (2) ALT ... ALT CHAR ... CHAR
  2084    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
  2085    (4) CHAR ... CHAR
  2086    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
  2087 
  2088    When the composition end is found, LENGTH and NCHARS in the
  2089    annotation header are updated as below:
  2090 
  2091    (1) LENGTH: unchanged, NCHARS: unchanged
  2092    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
  2093    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
  2094    (4) LENGTH: unchanged,  NCHARS: number of CHARs
  2095    (5) LENGTH: unchanged,  NCHARS: number of CHARs
  2096 
  2097    If an error is found while composing, the annotation header is
  2098    changed to the original composition header (plus filler -1s) as
  2099    below:
  2100 
  2101    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
  2102    (5)          [ 0x80 0xFF -1 -1 -1 ]
  2103 
  2104    and the sequence [ -2 DECODED-RULE ] is changed to the original
  2105    byte sequence as below:
  2106         o the original byte sequence is B: [ B -1 ]
  2107         o the original byte sequence is B1 B2: [ B1 B2 ]
  2108 
  2109    Most of the routines are implemented by macros because many
  2110    variables and labels in the caller decode_coding_emacs_mule must be
  2111    accessible, and they are usually called just once (thus they don't
  2112    increase the size of the compiled object).  */
  2113 
  2114 /* Decode a composition rule represented by C as a component of
  2115    composition sequence of Emacs 20 style.  Set RULE to the decoded
  2116    rule.

          C is modified in place: 0xA0 is subtracted, and the result must
          lie in 0..80, otherwise control jumps to the `invalid_code' label
          of the function using the macro.  The value is then split into
          GREF = C / 9 and NREF = C % 9, a reference equal to 4 is replaced
          by 10, and the two are combined with COMPOSITION_ENCODE_RULE.  */
  2117 
  2118 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
  2119   do {                                                  \
  2120     int gref, nref;                                     \
  2121                                                         \
  2122     c -= 0xA0;                                          \
  2123     if (c < 0 || c >= 81)                               \
  2124       goto invalid_code;                                \
  2125     gref = c / 9, nref = c % 9;                         \
  2126     if (gref == 4) gref = 10;                           \
  2127     if (nref == 4) nref = 10;                           \
  2128     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  2129   } while (0)
  2130 
  2131 
  2132 /* Decode a composition rule represented by C and the following byte
  2133    at SRC as a component of composition sequence of Emacs 21 style.
  2134    Set RULE to the decoded rule.

          GREF is C - 0x20; NREF is the next byte, read into C with
          ONE_MORE_BYTE, minus 0x20.  Each must lie in 0..80, otherwise
          control jumps to the `invalid_code' label of the function using
          the macro.  The macro also relies on whatever ONE_MORE_BYTE needs
          from its user (that macro is defined elsewhere).  */
  2135 
  2136 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
  2137   do {                                                  \
  2138     int gref, nref;                                     \
  2139                                                         \
  2140     gref = c - 0x20;                                    \
  2141     if (gref < 0 || gref >= 81)                         \
  2142       goto invalid_code;                                \
  2143     ONE_MORE_BYTE (c);                                  \
  2144     nref = c - 0x20;                                    \
  2145     if (nref < 0 || nref >= 81)                         \
  2146       goto invalid_code;                                \
  2147     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  2148   } while (0)
  2149 
  2150 
  2151 /* Start of Emacs 21 style format.  The first three bytes at SRC are
  2152    (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
  2153    byte length of this composition information, CHARS is the number of
  2154    characters composed by this composition.

          When the macro is entered, C already holds the first of these
          bytes, so the method is C - 0xF2; the other two are read with
          ONE_MORE_BYTE.  Control jumps to `invalid_code' if the second byte
          is negative, if BYTES - 0xA0 is below 3 (or differs from 4 for
          COMPOSITION_RELATIVE), or if CHARS - 0xA0 is not within
          1..MAX_COMPOSITION_COMPONENTS - 1.

          Otherwise *cmp_status is set up for a new-form composition: the
          state is COMPOSING_CHAR for COMPOSITION_RELATIVE and
          COMPOSING_COMPONENT_CHAR for the other methods, the length is
          MAX_ANNOTATION_LENGTH, and ncomps is BYTES - 0xA0 - 4.  Finally
          ADD_COMPOSITION_DATA is called on `charbuf' with the character
          count, the byte count and the method.  `c', `cmp_status' and
          `charbuf' are variables of the function using the macro.  */
  2155 
  2156 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
  2157   do {                                                                  \
  2158     enum composition_method method = c - 0xF2;                          \
  2159     int nbytes, nchars;                                                 \
  2160                                                                         \
  2161     ONE_MORE_BYTE (c);                                                  \
  2162     if (c < 0)                                                          \
  2163       goto invalid_code;                                                \
  2164     nbytes = c - 0xA0;                                                  \
  2165     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
  2166       goto invalid_code;                                                \
  2167     ONE_MORE_BYTE (c);                                                  \
  2168     nchars = c - 0xA0;                                                  \
  2169     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
  2170       goto invalid_code;                                                \
  2171     cmp_status->old_form = 0;                                           \
  2172     cmp_status->method = method;                                        \
  2173     if (method == COMPOSITION_RELATIVE)                                 \
  2174       cmp_status->state = COMPOSING_CHAR;                               \
  2175     else                                                                \
  2176       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
  2177     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
  2178     cmp_status->nchars = nchars;                                        \
  2179     cmp_status->ncomps = nbytes - 4;                                    \
  2180     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
  2181   } while (0)
  2182 
  2183 
  2184 /* Start of Emacs 20 style format for relative composition.

          Marks *cmp_status as an old-form composition with method
          COMPOSITION_RELATIVE in state COMPOSING_CHAR, sets its length to
          MAX_ANNOTATION_LENGTH and its nchars and ncomps to 0, then calls
          ADD_COMPOSITION_DATA on `charbuf' with zero character and byte
          counts.  `cmp_status' and `charbuf' are variables of the function
          using the macro.  */
  2185 
  2186 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
  2187   do {                                                          \
  2188     cmp_status->old_form = 1;                                   \
  2189     cmp_status->method = COMPOSITION_RELATIVE;                  \
  2190     cmp_status->state = COMPOSING_CHAR;                         \
  2191     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
  2192     cmp_status->nchars = cmp_status->ncomps = 0;                \
  2193     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  2194   } while (0)
  2195 
  2196 
  2197 /* Start of Emacs 20 style format for rule-base composition.

          Marks *cmp_status as an old-form composition with method
          COMPOSITION_WITH_RULE in state COMPOSING_CHAR, sets its length to
          MAX_ANNOTATION_LENGTH and its nchars and ncomps to 0, then calls
          ADD_COMPOSITION_DATA on `charbuf' with zero character and byte
          counts.  `cmp_status' and `charbuf' are variables of the function
          using the macro.  */
  2198 
  2199 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
  2200   do {                                                          \
  2201     cmp_status->old_form = 1;                                   \
  2202     cmp_status->method = COMPOSITION_WITH_RULE;                 \
  2203     cmp_status->state = COMPOSING_CHAR;                         \
  2204     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
  2205     cmp_status->nchars = cmp_status->ncomps = 0;                \
  2206     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  2207   } while (0)
  2208 
  2209 
       /* Read one byte into C with ONE_MORE_BYTE and start the kind of
          composition it announces:

            - C - 0xF2 within COMPOSITION_RELATIVE ..
              COMPOSITION_WITH_RULE_ALTCHARS: Emacs 21 style header
              (DECODE_EMACS_MULE_21_COMPOSITION);
            - 0xA0 <= C < 0xC0: Emacs 20 style relative composition; SRC is
              moved back so that the byte is read again as the first
              component;
            - C == 0xFF: Emacs 20 style rule-base composition.

          A negative byte or any other value makes control jump to the
          `invalid_code' label.  `c', `src' and that label belong to the
          function using the macro.  */
  2210 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
  2211   do {                                                  \
  2212     const unsigned char *current_src = src;             \
  2213                                                         \
  2214     ONE_MORE_BYTE (c);                                  \
  2215     if (c < 0)                                          \
  2216       goto invalid_code;                                \
  2217     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
  2218         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
  2219       DECODE_EMACS_MULE_21_COMPOSITION ();              \
  2220     else if (c < 0xA0)                                  \
  2221       goto invalid_code;                                \
  2222     else if (c < 0xC0)                                  \
  2223       {                                                 \
  2224         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
  2225         /* Re-read C as a composition component.  */    \
  2226         src = current_src;                              \
  2227       }                                                 \
  2228     else if (c == 0xFF)                                 \
  2229       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
  2230     else                                                \
  2231       goto invalid_code;                                \
  2232   } while (0)
  2233 
       /* Close the composition that is being decoded.  The elements of the
          composition start CMP_STATUS->length elements before CHARBUF.
          For the old (Emacs 20) form, store the number of characters read
          in the third of those elements.  Otherwise, if the method is
          greater than COMPOSITION_RELATIVE, store in the first element the
          value of the third element minus CMP_STATUS->length.  In both
          cases set the state to COMPOSING_NO.  The macro expands in a
          scope that provides `cmp_status' and `charbuf'.  */
  2234 #define EMACS_MULE_COMPOSITION_END()                            \
  2235   do {                                                          \
  2236     int idx = - cmp_status->length;                             \
  2237                                                                 \
  2238     if (cmp_status->old_form)                                   \
  2239       charbuf[idx + 2] = cmp_status->nchars;                    \
  2240     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
  2241       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
  2242     cmp_status->state = COMPOSING_NO;                           \
  2243   } while (0)
  2244 
  2245 
       /* Finish a composition that was cut short.  CHARBUF points just
          past the elements produced so far for the composition, which
          begin CMP_STATUS->length elements earlier; CMP_STATUS describes
          the composition.

          If the composition is in the old form and at least one character
          was read, keep it and record the character count in the third
          element.  If in addition the method is COMPOSITION_WITH_RULE and
          a character was still expected, replace the trailing rule by a
          raw-byte character.

          Otherwise overwrite the first elements with raw-byte characters:
          0x80 followed by 0xFF for COMPOSITION_WITH_RULE, or 0x80 followed
          by 0xF2 + method and two bytes rebuilt from the stored values.

          Set CMP_STATUS->state to COMPOSING_NO.  Return the count that the
          caller adds to its character offset.  */
  2246 static int
  2247 emacs_mule_finish_composition (int *charbuf,
  2248                                struct composition_status *cmp_status)
  2249 {
  2250   int idx = - cmp_status->length;
  2251   int new_chars;
  2252 
  2253   if (cmp_status->old_form && cmp_status->nchars > 0)
  2254     {
  2255       charbuf[idx + 2] = cmp_status->nchars;
  2256       new_chars = 0;
  2257       if (cmp_status->method == COMPOSITION_WITH_RULE
  2258           && cmp_status->state == COMPOSING_CHAR)
  2259         {
  2260           /* The last rule was invalid.  */
  2261           int rule = charbuf[-1] + 0xA0;
  2262 
                 /* NOTE(review): the -1 stored below presumably marks an
                    element for later stages to ignore -- confirm.  */
  2263           charbuf[-2] = BYTE8_TO_CHAR (rule);
  2264           charbuf[-1] = -1;
  2265           new_chars = 1;
  2266         }
  2267     }
  2268   else
  2269     {
  2270       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
  2271 
  2272       if (cmp_status->method == COMPOSITION_WITH_RULE)
  2273         {
                 /* NOTE(review): the meaning of the -3 and 0 elements is
                    not visible here; confirm against the code that
                    consumes CHARBUF.  */
  2274           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
  2275           charbuf[idx++] = -3;
  2276           charbuf[idx++] = 0;
  2277           new_chars = 1;
  2278         }
  2279       else
  2280         {
                 /* Read both stored values before the assignments below
                    overwrite the elements that hold them.  */
  2281           int nchars = charbuf[idx + 1] + 0xA0;
  2282           int nbytes = charbuf[idx + 2] + 0xA0;
  2283 
  2284           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
  2285           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
  2286           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
  2287           charbuf[idx++] = -1;
  2288           new_chars = 4;
  2289         }
  2290     }
  2291   cmp_status->state = COMPOSING_NO;
  2292   return new_chars;
  2293 }
  2294 
       /* If a composition is in progress (the state is not COMPOSING_NO),
          finish it with emacs_mule_finish_composition and add the value it
          returns to CHAR_OFFSET.  The macro expands in a scope that
          provides `cmp_status', `charbuf' and `char_offset'.  */
  2295 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
  2296   do {                                                                    \
  2297     if (cmp_status->state != COMPOSING_NO)                                \
  2298       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
  2299   } while (0)
  2300 
  2301 
       /* Decode the emacs-mule encoded bytes of CODING->source, starting at
          offset CODING->consumed, and append the result to CODING->charbuf
          starting at index CODING->charbuf_used.  Decoded characters are
          stored directly; charset changes and compositions are recorded
          through ADD_CHARSET_DATA and the composition macros.

          The loop stops when CHARBUF reaches CHARBUF_END, when
          emacs_mule_char returns -2, or at the label `no_more_source'
          (NOTE(review): presumably reached from ONE_MORE_BYTE when the
          source is exhausted -- confirm).  A byte sequence that cannot be
          decoded is handled at `invalid_code', which emits its first byte
          by itself.

          If a composition is still open at the end and this is not the
          last block, its elements are saved in CMP_STATUS->carryover and
          put back at the start of the next call.  On return,
          CODING->consumed, CODING->consumed_char and CODING->charbuf_used
          are updated.  */
  2302 static void
  2303 decode_coding_emacs_mule (struct coding_system *coding)
  2304 {
  2305   const unsigned char *src = coding->source + coding->consumed;
  2306   const unsigned char *src_end = coding->source + coding->src_bytes;
  2307   const unsigned char *src_base;
  2308   int *charbuf = coding->charbuf + coding->charbuf_used;
  2309   /* We may produce two annotations (charset and composition) in one
  2310      loop and one more charset annotation at the end.  */
  2311   int *charbuf_end
  2312     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
  2313       /* We can produce up to 2 characters in a loop.  */
  2314       - 1;
  2315   ptrdiff_t consumed_chars = 0, consumed_chars_base;
  2316   bool multibytep = coding->src_multibyte;
  2317   ptrdiff_t char_offset = coding->produced_char;
  2318   ptrdiff_t last_offset = char_offset;
  2319   int last_id = charset_ascii;
  2320   bool eol_dos
  2321     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  2322   int byte_after_cr = -1;
  2323   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
  2324 
         /* A composition was left open by the previous call: put its
            saved elements back at the head of CHARBUF.  */
  2325   if (cmp_status->state != COMPOSING_NO)
  2326     {
  2327       int i;
  2328 
  2329       if (charbuf_end - charbuf < cmp_status->length)
  2330         emacs_abort ();
  2331       for (i = 0; i < cmp_status->length; i++)
  2332         *charbuf++ = cmp_status->carryover[i];
  2333       coding->annotated = 1;
  2334     }
  2335 
  2336   while (1)
  2337     {
  2338       int c;
  2339       int id UNINIT;
  2340 
             /* Remember where this iteration starts, so that it can be
                undone at `invalid_code' and reported as the amount
                consumed on exit.  */
  2341       src_base = src;
  2342       consumed_chars_base = consumed_chars;
  2343 
  2344       if (charbuf >= charbuf_end)
  2345         {
                 /* The byte read ahead after a CR has not been decoded
                    yet; do not count it as consumed.  */
  2346           if (byte_after_cr >= 0)
  2347             src_base--;
  2348           break;
  2349         }
  2350 
  2351       if (byte_after_cr >= 0)
  2352         c = byte_after_cr, byte_after_cr = -1;
  2353       else
  2354         ONE_MORE_BYTE (c);
  2355 
             /* A negative C is stored negated as a character of its own;
                0x80 is the leader of a composition.  Either one ends a
                composition in progress.  */
  2356       if (c < 0 || c == 0x80)
  2357         {
  2358           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
  2359           if (c < 0)
  2360             {
  2361               *charbuf++ = -c;
  2362               char_offset++;
  2363             }
  2364           else
  2365             DECODE_EMACS_MULE_COMPOSITION_START ();
  2366           continue;
  2367         }
  2368 
  2369       if (c < 0x80)
  2370         {
                 /* With DOS end-of-line conversion, read the byte after a
                    CR now; the next iteration decodes it.  */
  2371           if (eol_dos && c == '\r')
  2372             ONE_MORE_BYTE (byte_after_cr);
  2373           id = charset_ascii;
  2374           if (cmp_status->state != COMPOSING_NO)
  2375             {
  2376               if (cmp_status->old_form)
  2377                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
  2378               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
  2379                 cmp_status->ncomps--;
  2380             }
  2381         }
  2382       else
  2383         {
  2384           int nchars UNINIT, nbytes UNINIT;
  2385           /* emacs_mule_char can load a charset map from a file, which
  2386              allocates a large structure and might cause buffer text
  2387              to be relocated as result.  Thus, we need to remember the
  2388              original pointer to buffer text, and fix up all related
  2389              pointers after the call.  */
  2390           const unsigned char *orig = coding->source;
  2391           ptrdiff_t offset;
  2392 
  2393           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
  2394                                cmp_status);
  2395           offset = coding->source - orig;
  2396           if (offset)
  2397             {
  2398               src += offset;
  2399               src_base += offset;
  2400               src_end += offset;
  2401             }
  2402           if (c < 0)
  2403             {
                     /* -1 means the sequence is invalid and -2 stops the
                        loop; other negative values fall through and are
                        handled as old-style composition elements below.  */
  2404               if (c == -1)
  2405                 goto invalid_code;
  2406               if (c == -2)
  2407                 break;
  2408             }
  2409           src = src_base + nbytes;
  2410           consumed_chars = consumed_chars_base + nchars;
  2411           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
  2412             cmp_status->ncomps -= nchars;
  2413         }
  2414 
  2415       /* Now if C >= 0, we found a normally encoded character, if C <
  2416          0, we found an old-style composition component character or
  2417          rule.  */
  2418 
  2419       if (cmp_status->state == COMPOSING_NO)
  2420         {
                 /* On a charset change, record the run of the previous
                    charset unless that charset was ASCII.  */
  2421           if (last_id != id)
  2422             {
  2423               if (last_id != charset_ascii)
  2424                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
  2425                                   last_id);
  2426               last_id = id;
  2427               last_offset = char_offset;
  2428             }
  2429           *charbuf++ = c;
  2430           char_offset++;
  2431         }
  2432       else if (cmp_status->state == COMPOSING_CHAR)
  2433         {
  2434           if (cmp_status->old_form)
  2435             {
                     /* In the old form NCHARS counts up; a normally
                        encoded character (C >= 0) ends the composition.  */
  2436               if (c >= 0)
  2437                 {
  2438                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
  2439                   *charbuf++ = c;
  2440                   char_offset++;
  2441                 }
  2442               else
  2443                 {
  2444                   *charbuf++ = -c;
  2445                   cmp_status->nchars++;
  2446                   cmp_status->length++;
  2447                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
  2448                     EMACS_MULE_COMPOSITION_END ();
  2449                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
  2450                     cmp_status->state = COMPOSING_RULE;
  2451                 }
  2452             }
  2453           else
  2454             {
                     /* In the new form NCHARS counts down to zero.  */
  2455               *charbuf++ = c;
  2456               cmp_status->length++;
  2457               cmp_status->nchars--;
  2458               if (cmp_status->nchars == 0)
  2459                 EMACS_MULE_COMPOSITION_END ();
  2460             }
  2461         }
  2462       else if (cmp_status->state == COMPOSING_RULE)
  2463         {
  2464           int rule;
  2465 
  2466           if (c >= 0)
  2467             {
  2468               EMACS_MULE_COMPOSITION_END ();
  2469               *charbuf++ = c;
  2470               char_offset++;
  2471             }
  2472           else
  2473             {
  2474               c = -c;
  2475               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
  2476               if (rule < 0)
  2477                 goto invalid_code;
                     /* A rule takes two elements: the marker -2 and the
                        rule itself.  */
  2478               *charbuf++ = -2;
  2479               *charbuf++ = rule;
  2480               cmp_status->length += 2;
  2481               cmp_status->state = COMPOSING_CHAR;
  2482             }
  2483         }
  2484       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
  2485         {
  2486           *charbuf++ = c;
  2487           cmp_status->length++;
  2488           if (cmp_status->ncomps == 0)
  2489             cmp_status->state = COMPOSING_CHAR;
  2490           else if (cmp_status->ncomps > 0)
  2491             {
  2492               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
  2493                 cmp_status->state = COMPOSING_COMPONENT_RULE;
  2494             }
  2495           else
  2496             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
  2497         }
  2498       else                      /* COMPOSING_COMPONENT_RULE */
  2499         {
  2500           int rule;
  2501 
  2502           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
  2503           if (rule < 0)
  2504             goto invalid_code;
  2505           *charbuf++ = -2;
  2506           *charbuf++ = rule;
  2507           cmp_status->length += 2;
  2508           cmp_status->ncomps--;
  2509           if (cmp_status->ncomps > 0)
  2510             cmp_status->state = COMPOSING_COMPONENT_CHAR;
  2511           else
  2512             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
  2513         }
  2514       continue;
  2515 
  2516     invalid_code:
             /* Go back to the start of this iteration and emit its first
                byte by itself, as a raw-byte character unless it is
                ASCII.  */
  2517       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
  2518       src = src_base;
  2519       consumed_chars = consumed_chars_base;
  2520       ONE_MORE_BYTE (c);
  2521       *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
  2522       char_offset++;
  2523     }
  2524 
  2525  no_more_source:
         /* An open composition is finished if this is the last block;
            otherwise its elements are taken back out of CHARBUF and saved
            for the next call.  */
  2526   if (cmp_status->state != COMPOSING_NO)
  2527     {
  2528       if (coding->mode & CODING_MODE_LAST_BLOCK)
  2529         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
  2530       else
  2531         {
  2532           int i;
  2533 
  2534           charbuf -= cmp_status->length;
  2535           for (i = 0; i < cmp_status->length; i++)
  2536             cmp_status->carryover[i] = charbuf[i];
  2537         }
  2538     }
  2539   if (last_id != charset_ascii)
  2540     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
         /* Report only what was consumed up to the start of the last
            iteration.  */
  2541   coding->consumed_char += consumed_chars_base;
  2542   coding->consumed = src_base - coding->source;
  2543   coding->charbuf_used = charbuf - coding->charbuf;
  2544 }
  2545 
  2546 
  /* Store in CODES[0] and CODES[1] the leading code(s) that precede the
     code point of a character whose charset has the emacs-mule ID `id':

       id < 0xA0          CODES[0] = id,   CODES[1] = 0 (no second code)
       0xA0 <= id < 0xE0  CODES[0] = 0x9A, CODES[1] = id
       0xE0 <= id < 0xF0  CODES[0] = 0x9B, CODES[1] = id
       0xF0 <= id < 0xF5  CODES[0] = 0x9C, CODES[1] = id
       0xF5 <= id         CODES[0] = 0x9D, CODES[1] = id

     The `id' argument is evaluated exactly once.  The expansion has no
     trailing semicolon, so `EMACS_MULE_LEADING_CODES (id, codes);' is
     one statement and may be the body of an `if' that has an `else'.  */
  #define EMACS_MULE_LEADING_CODES(id, codes)                     \
    do {                                                          \
      const int leading_codes_id_ = (id);                         \
                                                                  \
      if (leading_codes_id_ < 0xA0)                               \
        (codes)[0] = leading_codes_id_, (codes)[1] = 0;           \
      else if (leading_codes_id_ < 0xE0)                          \
        (codes)[0] = 0x9A, (codes)[1] = leading_codes_id_;        \
      else if (leading_codes_id_ < 0xF0)                          \
        (codes)[0] = 0x9B, (codes)[1] = leading_codes_id_;        \
      else if (leading_codes_id_ < 0xF5)                          \
        (codes)[0] = 0x9C, (codes)[1] = leading_codes_id_;        \
      else                                                        \
        (codes)[0] = 0x9D, (codes)[1] = leading_codes_id_;        \
    } while (0)
  2560 
  2561 
       /* Encode the contents of CODING->charbuf (CODING->charbuf_used
          elements) in emacs-mule form, writing the bytes at
          CODING->destination + CODING->produced.

          A negative element starts an annotation whose length is the
          negated value.  A charset annotation sets the preferred charset
          for the characters that follow, provided the charset is in the
          charset list; a composition annotation is skipped (not yet
          implemented); any other annotation aborts.

          ASCII characters and raw-byte characters are emitted as a single
          byte.  Any other character is emitted as the leading code(s) of
          its charset followed by its code point, one byte for a
          dimension-1 charset and two bytes otherwise, with the high bit
          of each byte set.  A character for which no charset is found is
          replaced by CODING->default_char.

          Record CODING_RESULT_SUCCESS, update CODING->produced_char and
          CODING->produced, and return false.  */
  2562 static bool
  2563 encode_coding_emacs_mule (struct coding_system *coding)
  2564 {
  2565   bool multibytep = coding->dst_multibyte;
  2566   int *charbuf = coding->charbuf;
  2567   int *charbuf_end = charbuf + coding->charbuf_used;
  2568   unsigned char *dst = coding->destination + coding->produced;
  2569   unsigned char *dst_end = coding->destination + coding->dst_bytes;
  2570   int safe_room = 8;
  2571   ptrdiff_t produced_chars = 0;
  2572   Lisp_Object attrs, charset_list;
  2573   int c;
  2574   int preferred_charset_id = -1;
  2575 
         /* This coding system always works with Vemacs_mule_charset_list;
            install that list in ATTRS if another one is there.  */
  2576   CODING_GET_INFO (coding, attrs, charset_list);
  2577   if (! EQ (charset_list, Vemacs_mule_charset_list))
  2578     {
  2579       charset_list = Vemacs_mule_charset_list;
  2580       ASET (attrs, coding_attr_charset_list, charset_list);
  2581     }
  2582 
  2583   while (charbuf < charbuf_end)
  2584     {
             /* NOTE(review): presumably makes sure that SAFE_ROOM bytes
                can be written at DST before the next character is
                emitted -- confirm against the macro definition.  */
  2585       ASSURE_DESTINATION (safe_room);
  2586       c = *charbuf++;
  2587 
  2588       if (c < 0)
  2589         {
  2590           /* Handle an annotation.  */
  2591           switch (*charbuf)
  2592             {
  2593             case CODING_ANNOTATE_COMPOSITION_MASK:
  2594               /* Not yet implemented.  */
  2595               break;
  2596             case CODING_ANNOTATE_CHARSET_MASK:
  2597               preferred_charset_id = charbuf[3];
  2598               if (preferred_charset_id >= 0
  2599                   && NILP (Fmemq (make_fixnum (preferred_charset_id),
  2600                                   charset_list)))
  2601                 preferred_charset_id = -1;
  2602               break;
  2603             default:
  2604               emacs_abort ();
  2605             }
                 /* -C is the length of the annotation, and its first
                    element has already been read.  */
  2606           charbuf += -c - 1;
  2607           continue;
  2608         }
  2609 
  2610       if (ASCII_CHAR_P (c))
  2611         EMIT_ONE_ASCII_BYTE (c);
  2612       else if (CHAR_BYTE8_P (c))
  2613         {
  2614           c = CHAR_TO_BYTE8 (c);
  2615           EMIT_ONE_BYTE (c);
  2616         }
  2617       else
  2618         {
  2619           struct charset *charset;
  2620           unsigned code;
  2621           int dimension;
  2622           int emacs_mule_id;
  2623           unsigned char leading_codes[2];
  2624 
                 /* Try the preferred charset first; if it cannot be used
                    for C, search the whole charset list.  */
  2625           if (preferred_charset_id >= 0)
  2626             {
  2627               bool result;
  2628 
  2629               charset = CHARSET_FROM_ID (preferred_charset_id);
  2630               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
  2631               if (result)
  2632                 code = ENCODE_CHAR (charset, c);
  2633               else
  2634                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
  2635                                      &code, charset);
  2636             }
  2637           else
  2638             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
  2639                                  &code, charset);
  2640           if (! charset)
  2641             {
                     /* No charset was found for C; encode the default
                        character instead.  NOTE(review): CHARSET is used
                        below without a second null check -- confirm that
                        the default character always has a charset.  */
  2642               c = coding->default_char;
  2643               if (ASCII_CHAR_P (c))
  2644                 {
  2645                   EMIT_ONE_ASCII_BYTE (c);
  2646                   continue;
  2647                 }
  2648               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
  2649                                    &code, charset);
  2650             }
  2651           dimension = CHARSET_DIMENSION (charset);
  2652           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
  2653           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
  2654           EMIT_ONE_BYTE (leading_codes[0]);
  2655           if (leading_codes[1])
  2656             EMIT_ONE_BYTE (leading_codes[1]);
  2657           if (dimension == 1)
  2658             EMIT_ONE_BYTE (code | 0x80);
  2659           else
  2660             {
  2661               code |= 0x8080;
  2662               EMIT_ONE_BYTE (code >> 8);
  2663               EMIT_ONE_BYTE (code & 0xFF);
  2664             }
  2665         }
  2666     }
  2667   record_conversion_result (coding, CODING_RESULT_SUCCESS);
  2668   coding->produced_char += produced_chars;
  2669   coding->produced = dst - coding->destination;
  2670   return 0;
  2671 }
  2672 
  2673 
  2674 /*** 7. ISO2022 handlers ***/
  2675 
  2676 /* The following note describes the coding system ISO2022 briefly.
  2677    Since the intention of this note is to help understand the
  2678    functions in this file, some parts are NOT ACCURATE or are OVERLY
  2679    SIMPLIFIED.  For thorough understanding, please refer to the
  2680    original document of ISO2022.  This is equivalent to the standard
  2681    ECMA-35, obtainable from <URL:https://www.ecma.ch/> (*).
  2682 
  2683    ISO2022 provides many mechanisms to encode several character sets
  2684    in 7-bit and 8-bit environments.  For 7-bit environments, all text
  2685    is encoded using bytes less than 128.  This may make the encoded
  2686    text a little bit longer, but the text passes more easily through
  2687    several types of gateway, some of which strip off the MSB (Most
  2688    Significant Bit).
  2689 
  2690    There are two kinds of character sets: control character sets and
  2691    graphic character sets.  The former contain control characters such
  2692    as `newline' and `escape' to provide control functions (control
  2693    functions are also provided by escape sequences).  The latter
  2694    contain graphic characters such as 'A' and '-'.  Emacs recognizes
  2695    two control character sets and many graphic character sets.
  2696 
  2697    Graphic character sets are classified into one of the following
  2698    four classes, according to the number of bytes (DIMENSION) and
  2699    number of characters in one dimension (CHARS) of the set:
  2700    - DIMENSION1_CHARS94
  2701    - DIMENSION1_CHARS96
  2702    - DIMENSION2_CHARS94
  2703    - DIMENSION2_CHARS96
  2704 
  2705    In addition, each character set is assigned an identification tag,
  2706    unique for each set, called the "final character" (denoted as <F>
  2707    hereafter).  The <F> of each character set is decided by ECMA(*)
  2708    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
  2709    (0x30..0x3F are for private use only).
  2710 
  2711    Note (*): ECMA = European Computer Manufacturers Association
  2712 
  2713    Here are examples of graphic character sets [NAME(<F>)]:
  2714         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
  2715         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
  2716         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
  2717         o DIMENSION2_CHARS96 -- none for the moment
  2718 
  2719    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
  2720         C0 [0x00..0x1F] -- control character plane 0
  2721         GL [0x20..0x7F] -- graphic character plane 0
  2722         C1 [0x80..0x9F] -- control character plane 1
  2723         GR [0xA0..0xFF] -- graphic character plane 1
  2724 
  2725    A control character set is directly designated and invoked to C0 or
  2726    C1 by an escape sequence.  The most common case is that:
  2727    - ISO646's  control character set is designated/invoked to C0, and
  2728    - ISO6429's control character set is designated/invoked to C1,
  2729    and usually these designations/invocations are omitted in encoded
  2730    text.  In a 7-bit environment, only C0 can be used, and a control
  2731    character for C1 is encoded by an appropriate escape sequence to
  2732    fit into the environment.  All control characters for C1 are
  2733    defined to have corresponding escape sequences.
  2734 
  2735    A graphic character set is at first designated to one of four
  2736    graphic registers (G0 through G3), then these graphic registers are
  2737    invoked to GL or GR.  These designations and invocations can be
  2738    done independently.  The most common case is that G0 is invoked to
  2739    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
  2740    these invocations and designations are omitted in encoded text.
  2741    In a 7-bit environment, only GL can be used.
  2742 
  2743    When a graphic character set of CHARS94 is invoked to GL, codes
  2744    0x20 and 0x7F of the GL area work as control characters SPACE and
  2745    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
  2746    be used.
  2747 
  2748    There are two ways of invocation: locking-shift and single-shift.
  2749    With locking-shift, the invocation lasts until the next different
  2750    invocation, whereas with single-shift, the invocation affects the
  2751    following character only and doesn't affect the locking-shift
  2752    state.  Invocations are done by the following control characters or
  2753    escape sequences:
  2754 
  2755    ----------------------------------------------------------------------
  2756    abbrev  function                  cntrl escape seq   description
  2757    ----------------------------------------------------------------------
  2758    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
  2759    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
  2760    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
  2761    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
  2762    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
  2763    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
  2764    LS3R    (locking-shift-3 right)   none  ESC '|'      invoke G3 into GR (*)
  2765    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
  2766    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
  2767    ----------------------------------------------------------------------
  2768    (*) These are not used by any known coding system.
  2769 
  2770    Control characters for these functions are defined by macros
  2771    ISO_CODE_XXX in `coding.h'.
  2772 
  2773    Designations are done by the following escape sequences:
  2774    ----------------------------------------------------------------------
  2775    escape sequence      description
  2776    ----------------------------------------------------------------------
  2777    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
  2778    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
  2779    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
  2780    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
  2781    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
  2782    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
  2783    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
  2784    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
  2785    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
  2786    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
  2787    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
  2788    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
  2789    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
  2790    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
  2791    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
  2792    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
  2793    ----------------------------------------------------------------------
  2794 
  2795    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
  2796    of dimension 1, chars 94, and final character <F>, etc...
  2797 
  2798    Note (*): Although these designations are not allowed in ISO2022,
  2799    Emacs accepts them on decoding, and produces them on encoding
  2800    CHARS96 character sets in a coding system which is characterized as
  2801    7-bit environment, non-locking-shift, and non-single-shift.
  2802 
  2803    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
  2804    '(' must be omitted.  We refer to this as "short-form" hereafter.
  2805 
  2806    Now you may notice that there are a lot of ways of encoding the
  2807    same multilingual text in ISO2022.  Actually, there exist many
  2808    coding systems such as Compound Text (used in X11's inter-client
  2809    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
  2810    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
  2811    localized platforms), and all of these are variants of ISO2022.
  2812 
  2813    In addition to the above, Emacs handles two more kinds of escape
  2814    sequences: ISO6429's direction specification and Emacs' private
  2815    sequence for specifying character composition.
  2816 
  2817    ISO6429's direction specification takes the following form:
  2818         o CSI ']'      -- end of the current direction
  2819         o CSI '0' ']'  -- end of the current direction
  2820         o CSI '1' ']'  -- start of left-to-right text
  2821         o CSI '2' ']'  -- start of right-to-left text
  2822    The control character CSI (0x9B: control sequence introducer) is
  2823    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
  2824 
  2825    Character composition specification takes the following form:
  2826         o ESC '0' -- start relative composition
  2827         o ESC '1' -- end composition
  2828         o ESC '2' -- start rule-base composition (*)
  2829         o ESC '3' -- start relative composition with alternate chars  (**)
  2830         o ESC '4' -- start rule-base composition with alternate chars  (**)
  2831   Since these are not standard escape sequences of any ISO standard,
  2832   the use of them with these meanings is restricted to Emacs only.
  2833 
  2834   (*) This form is used only in Emacs 20.7 and older versions,
  2835   but newer versions can safely decode it.
  2836   (**) This form is used only in Emacs 21.1 and newer versions,
  2837   and older versions can't decode it.
  2838 
  2839   Here's a list of example usages of these composition escape
  2840   sequences (categorized by `enum composition_method').
  2841 
  2842   COMPOSITION_RELATIVE:
  2843         ESC 0 CHAR [ CHAR ] ESC 1
  2844   COMPOSITION_WITH_RULE:
  2845         ESC 2 CHAR [ RULE CHAR ] ESC 1
  2846   COMPOSITION_WITH_ALTCHARS:
  2847         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
  2848   COMPOSITION_WITH_RULE_ALTCHARS:
  2849         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
  2850 
       /* Table with one entry for each of the 256 possible byte values.
          NOTE(review): presumably gives the ISO-2022 code class of each
          byte and is filled in elsewhere in this file -- confirm.  */
  2851 static enum iso_code_class_type iso_code_class[256];
  2852 
       /* True if the charset whose ID is ID is safe for CODING: ID must
          not exceed CODING->max_charset_id, and its byte in
          CODING->safe_charsets must not be 255.  The value 255 is what
          setup_iso_safe_charsets fills the string with before it assigns
          registers.  ID is not checked against a lower bound.  */
  2853 #define SAFE_CHARSET_P(coding, id)      \
  2854   ((id) <= (coding)->max_charset_id     \
  2855    && (coding)->safe_charsets[id] != 255)
  2856 
       /* Compute the safe-charsets string for the coding system whose
          attribute vector is ATTRS and store it in the
          coding_attr_safe_charsets slot of ATTRS.

          If the ISO flags include CODING_ISO_FLAG_FULL_SUPPORT and the
          charset list is not Viso_2022_charset_list, first install that
          list and clear the slot.  If the slot then already holds a
          string, do nothing more.

          The string has one byte for each charset ID from 0 up to the
          largest ID in the charset list, initially 255.  For each listed
          charset the byte becomes the register given for it in the
          coding_attr_iso_request slot if there is one; otherwise the car
          or cdr of the coding_attr_iso_usage slot, chosen by the
          charset's iso_chars_96 flag (the cdr when it is set), provided
          that value is less than 4.  */
  2857 static void
  2858 setup_iso_safe_charsets (Lisp_Object attrs)
  2859 {
  2860   Lisp_Object charset_list, safe_charsets;
  2861   Lisp_Object request;
  2862   Lisp_Object reg_usage;
  2863   Lisp_Object tail;
  2864   EMACS_INT reg94, reg96;
  2865   int flags = XFIXNUM (AREF (attrs, coding_attr_iso_flags));
  2866   int max_charset_id;
  2867 
  2868   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
  2869   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
  2870       && ! EQ (charset_list, Viso_2022_charset_list))
  2871     {
             /* The charset list changes, so a string computed for the
                old list must be recomputed.  */
  2872       charset_list = Viso_2022_charset_list;
  2873       ASET (attrs, coding_attr_charset_list, charset_list);
  2874       ASET (attrs, coding_attr_safe_charsets, Qnil);
  2875     }
  2876 
  2877   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
  2878     return;
  2879 
         /* Find the largest charset ID; it determines the string size.  */
  2880   max_charset_id = 0;
  2881   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
  2882     {
  2883       int id = XFIXNUM (XCAR (tail));
  2884       if (max_charset_id < id)
  2885         max_charset_id = id;
  2886     }
  2887 
         /* Every charset starts out as 255, which SAFE_CHARSET_P treats
            as not safe.  */
  2888   safe_charsets = make_uninit_string (max_charset_id + 1);
  2889   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
  2890   request = AREF (attrs, coding_attr_iso_request);
  2891   reg_usage = AREF (attrs, coding_attr_iso_usage);
  2892   reg94 = XFIXNUM (XCAR (reg_usage));
  2893   reg96 = XFIXNUM (XCDR (reg_usage));
  2894 
  2895   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
  2896     {
  2897       Lisp_Object id;
  2898       Lisp_Object reg;
  2899       struct charset *charset;
  2900 
  2901       id = XCAR (tail);
  2902       charset = CHARSET_FROM_ID (XFIXNUM (id));
             /* An explicit request for this charset takes precedence over
                the default registers.  */
  2903       reg = Fcdr (Fassq (id, request));
  2904       if (! NILP (reg))
  2905         SSET (safe_charsets, XFIXNUM (id), XFIXNUM (reg));
  2906       else if (charset->iso_chars_96)
  2907         {
  2908           if (reg96 < 4)
  2909             SSET (safe_charsets, XFIXNUM (id), reg96);
  2910         }
  2911       else
  2912         {
  2913           if (reg94 < 4)
  2914             SSET (safe_charsets, XFIXNUM (id), reg94);
  2915         }
  2916     }
  2917   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
  2918 }
  2919 
  2920 
  2921 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
  2922    Return true if a text is encoded in one of ISO-2022 based coding
  2923    systems.

          The scan starts CODING->head_ascii bytes into CODING's source.
          Categories ruled out by a byte or an escape sequence accumulate
          in the local REJECTED mask, categories positively matched in
          FOUND.

          Control reaches the label `no_more_source' without any explicit
          goto in this function (NOTE(review): presumably ONE_MORE_BYTE
          jumps there when the source is exhausted -- confirm against its
          definition).  There REJECTED is merged into DETECT_INFO, FOUND
          minus REJECTED is added to DETECT_INFO->found, and 1 is
          returned.  If REJECTED comes to equal CATEGORY_MASK_ISO first,
          the loop ends, all of CATEGORY_MASK_ISO is marked rejected in
          DETECT_INFO, and 0 is returned.  */
  2924 
  2925 static bool
  2926 detect_coding_iso_2022 (struct coding_system *coding,
  2927                         struct coding_detection_info *detect_info)
  2928 {
  2929   const unsigned char *src = coding->source, *src_base = src;
  2930   const unsigned char *src_end = coding->source + coding->src_bytes;
  2931   bool multibytep = coding->src_multibyte;
  2932   bool single_shifting = 0;
  2933   int id;
  2934   int c, c1;
  2935   ptrdiff_t consumed_chars = 0;
  2936   int i;
  2937   int rejected = 0;
  2938   int found = 0;
         /* -1 while outside a composition; otherwise the number of
            components counted since the composition start sequence.  */
  2939   int composition_count = -1;
  2940 
  2941   detect_info->checked |= CATEGORY_MASK_ISO;
  2942 
         /* For each ISO category whose coding id is not negative, cache its
            safe-charsets string in the category's coding_system.  The
            string is recomputed first when the category has
            CODING_ISO_FLAG_FULL_SUPPORT set and its charset list is not
            Viso_2022_charset_list.  */
  2943   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
  2944     {
  2945       struct coding_system *this = &(coding_categories[i]);
  2946       Lisp_Object attrs, val;
  2947 
  2948       if (this->id < 0)
  2949         continue;
  2950       attrs = CODING_ID_ATTRS (this->id);
  2951       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
  2952           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
  2953         setup_iso_safe_charsets (attrs);
  2954       val = CODING_ATTR_SAFE_CHARSETS (attrs);
  2955       this->max_charset_id = SCHARS (val) - 1;
  2956       this->safe_charsets = SDATA (val);
  2957     }
  2958 
  2959   /* A coding system of this category is always ASCII compatible.  */
  2960   src += coding->head_ascii;
  2961 
  2962   while (rejected != CATEGORY_MASK_ISO)
  2963     {
  2964       src_base = src;
  2965       ONE_MORE_BYTE (c);
  2966       switch (c)
  2967         {
  2968         case ISO_CODE_ESC:
  2969           if (inhibit_iso_escape_detection)
  2970             break;
  2971           single_shifting = 0;
  2972           ONE_MORE_BYTE (c);
  2973           if (c == 'N' || c == 'O')
  2974             {
  2975               /* ESC <Fe> for SS2 or SS3.  */
  2976               single_shifting = 1;
  2977               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
  2978             }
  2979           else if (c == '1')
  2980             {
  2981               /* End of composition.  */
  2982               if (composition_count < 0
  2983                   || composition_count > MAX_COMPOSITION_COMPONENTS)
  2984                 /* Invalid */
  2985                 break;
  2986               composition_count = -1;
  2987               found |= CATEGORY_MASK_ISO;
  2988             }
  2989           else if (c >= '0' && c <= '4')
  2990             {
  2991               /* ESC <Fp> for start of composition ('0', '2', '3', '4');
                        ESC 1 (end of composition) was handled above.  */
  2992               composition_count = 0;
  2993             }
  2994           else
  2995             {
  2996               if (c >= '(' && c <= '/')
  2997                 {
  2998                   /* Designation sequence for a charset of dimension 1.  */
  2999                   ONE_MORE_BYTE (c1);
  3000                   if (c1 < ' ' || c1 >= 0x80
  3001                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
  3002                     {
  3003                       /* Invalid designation sequence.  Just ignore.  */
  3004                       if (c1 >= 0x80)
  3005                         rejected |= (CATEGORY_MASK_ISO_7BIT
  3006                                      | CATEGORY_MASK_ISO_7_ELSE);
  3007                       break;
  3008                     }
  3009                 }
  3010               else if (c == '$')
  3011                 {
  3012                   /* Designation sequence for a charset of dimension 2.  */
  3013                   ONE_MORE_BYTE (c);
  3014                   if (c >= '@' && c <= 'B')
  3015                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
  3016                     id = iso_charset_table[1][0][c];
  3017                   else if (c >= '(' && c <= '/')
  3018                     {
  3019                       ONE_MORE_BYTE (c1);
  3020                       if (c1 < ' ' || c1 >= 0x80
  3021                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
  3022                         {
  3023                           /* Invalid designation sequence.  Just ignore.  */
  3024                           if (c1 >= 0x80)
  3025                             rejected |= (CATEGORY_MASK_ISO_7BIT
  3026                                          | CATEGORY_MASK_ISO_7_ELSE);
  3027                           break;
  3028                         }
  3029                     }
  3030                   else
  3031                     {
  3032                       /* Invalid designation sequence.  Just ignore it.  */
  3033                       if (c >= 0x80)
  3034                         rejected |= (CATEGORY_MASK_ISO_7BIT
  3035                                      | CATEGORY_MASK_ISO_7_ELSE);
  3036                       break;
  3037                     }
  3038                 }
  3039               else
  3040                 {
  3041                   /* Invalid escape sequence.  Just ignore it.  */
  3042                   if (c >= 0x80)
  3043                     rejected |= (CATEGORY_MASK_ISO_7BIT
  3044                                  | CATEGORY_MASK_ISO_7_ELSE);
  3045                   break;
  3046                 }
  3047 
  3048               /* We found a valid designation sequence for CHARSET.  */
                     /* The categories in CATEGORY_MASK_ISO_8BIT are rejected
                        outright; each of the four categories tested below is
                        marked found when charset ID is safe for it and
                        rejected otherwise.  */
  3049               rejected |= CATEGORY_MASK_ISO_8BIT;
  3050               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
  3051                                   id))
  3052                 found |= CATEGORY_MASK_ISO_7;
  3053               else
  3054                 rejected |= CATEGORY_MASK_ISO_7;
  3055               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
  3056                                   id))
  3057                 found |= CATEGORY_MASK_ISO_7_TIGHT;
  3058               else
  3059                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
  3060               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
  3061                                   id))
  3062                 found |= CATEGORY_MASK_ISO_7_ELSE;
  3063               else
  3064                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
  3065               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
  3066                                   id))
  3067                 found |= CATEGORY_MASK_ISO_8_ELSE;
  3068               else
  3069                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
  3070             }
  3071           break;
  3072 
  3073         case ISO_CODE_SO:
  3074         case ISO_CODE_SI:
  3075           /* Locking shift out/in.  */
  3076           if (inhibit_iso_escape_detection)
  3077             break;
  3078           single_shifting = 0;
  3079           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
  3080           break;
  3081 
  3082         case ISO_CODE_CSI:
  3083           /* Control sequence introducer.  */
  3084           single_shifting = 0;
  3085           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
  3086           found |= CATEGORY_MASK_ISO_8_ELSE;
  3087           goto check_extra_latin;
  3088 
  3089         case ISO_CODE_SS2:
  3090         case ISO_CODE_SS3:
  3091           /* Single shift.   */
  3092           if (inhibit_iso_escape_detection)
  3093             break;
  3094           single_shifting = 0;
  3095           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
  3096           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
  3097               & CODING_ISO_FLAG_SINGLE_SHIFT)
  3098             {
  3099               found |= CATEGORY_MASK_ISO_8_1;
  3100               single_shifting = 1;
  3101             }
  3102           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
  3103               & CODING_ISO_FLAG_SINGLE_SHIFT)
  3104             {
  3105               found |= CATEGORY_MASK_ISO_8_2;
  3106               single_shifting = 1;
  3107             }
                 /* If neither 8-bit category accepts single shifts, treat the
                    byte as a candidate Latin extra code instead.  */
  3108           if (single_shifting)
  3109             break;
  3110           goto check_extra_latin;
  3111 
  3112         default:
                 /* A negative C is skipped without classification
                    (NOTE(review): presumably how ONE_MORE_BYTE reports a
                    multibyte character that is not a raw byte -- confirm).  */
  3113           if (c < 0)
  3114             continue;
  3115           if (c < 0x80)
  3116             {
  3117               if (composition_count >= 0)
  3118                 composition_count++;
  3119               single_shifting = 0;
  3120               break;
  3121             }
  3122           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
  3123           if (c >= 0xA0)
  3124             {
  3125               found |= CATEGORY_MASK_ISO_8_1;
  3126               /* Check the length of succeeding codes of the range
  3127                  0xA0..0xFF.  If the byte length is even, we include
  3128                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
  3129                  only when we are not single shifting.  */
  3130               if (! single_shifting
  3131                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
  3132                 {
  3133                   ptrdiff_t len = 1;
                         /* Count the run of bytes >= 0xA0; the first byte
                            below 0xA0 is pushed back by resetting SRC.  */
  3134                   while (src < src_end)
  3135                     {
  3136                       src_base = src;
  3137                       ONE_MORE_BYTE (c);
  3138                       if (c < 0xA0)
  3139                         {
  3140                           src = src_base;
  3141                           break;
  3142                         }
  3143                       len++;
  3144                     }
  3145 
                         /* An odd-length run rejects ISO_8_2 only when more
                            source follows it; a run that reaches the end of
                            the source is given the benefit of the doubt.  */
  3146                   if (len & 1 && src < src_end)
  3147                     {
  3148                       rejected |= CATEGORY_MASK_ISO_8_2;
  3149                       if (composition_count >= 0)
  3150                         composition_count += len;
  3151                     }
  3152                   else
  3153                     {
  3154                       found |= CATEGORY_MASK_ISO_8_2;
  3155                       if (composition_count >= 0)
  3156                         composition_count += len / 2;
  3157                     }
  3158                 }
  3159               break;
  3160             }
                 /* Reached by fall-through for bytes 0x80..0x9F and by goto
                    from the CSI and SS2/SS3 cases.  The byte is acceptable
                    only if Vlatin_extra_code_table is a vector with a non-nil
                    entry for it; otherwise every ISO category is rejected.  */
  3161         check_extra_latin:
  3162           if (! VECTORP (Vlatin_extra_code_table)
  3163               || NILP (AREF (Vlatin_extra_code_table, c)))
  3164             {
  3165               rejected = CATEGORY_MASK_ISO;
  3166               break;
  3167             }
  3168           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
  3169               & CODING_ISO_FLAG_LATIN_EXTRA)
  3170             found |= CATEGORY_MASK_ISO_8_1;
  3171           else
  3172             rejected |= CATEGORY_MASK_ISO_8_1;
  3173           rejected |= CATEGORY_MASK_ISO_8_2;
  3174           break;
  3175         }
  3176     }
         /* The loop ended because every ISO category was rejected.  */
  3177   detect_info->rejected |= CATEGORY_MASK_ISO;
  3178   return 0;
  3179 
         /* Report what was learned; a category that was both found and
            rejected is reported as rejected only.  */
  3180  no_more_source:
  3181   detect_info->rejected |= rejected;
  3182   detect_info->found |= (found & ~rejected);
  3183   return 1;
  3184 }
  3185 
  3186 
  3187 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
  3188    escape sequence should be kept.

          REG is the register index passed to CODING_ISO_DESIGNATION; DIM,
          CHARS_96 and FINAL are the lookup keys for ISO_CHARSET_TABLE.
          The macro reads `coding' from the enclosing scope and assigns to
          CHARS_96, which must therefore be an lvalue.

          If FINAL is below '0' or not below 128, or the table holds a
          negative id for it, or the charset is not safe for CODING, the
          register is set to -2 and CHARS_96 to -1.  Otherwise the
          register is set to the charset id, after replacing
          charset_jisx0201_roman by charset_ascii when
          CODING_ISO_FLAG_USE_ROMAN is set, and charset_jisx0208_1978 by
          charset_jisx0208 when CODING_ISO_FLAG_USE_OLDJIS is set.  */
  3189 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
  3190   do {                                                                  \
  3191     int id, prev;                                                       \
  3192                                                                         \
  3193     if (final < '0' || final >= 128                                     \
  3194         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
  3195         || !SAFE_CHARSET_P (coding, id))                                \
  3196       {                                                                 \
  3197         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
  3198         chars_96 = -1;                                                  \
  3199         break;                                                          \
  3200       }                                                                 \
  3201     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
  3202     if (id == charset_jisx0201_roman)                                   \
  3203       {                                                                 \
  3204         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
  3205           id = charset_ascii;                                           \
  3206       }                                                                 \
  3207     else if (id == charset_jisx0208_1978)                               \
  3208       {                                                                 \
  3209         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
  3210           id = charset_jisx0208;                                        \
  3211       }                                                                 \
  3212     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
  3213     /* If there was an invalid designation to REG previously, and this  \
  3214        designation is ASCII to REG, we should keep this designation     \
  3215        sequence.  */                                                    \
  3216     if (prev == -2 && id == charset_ascii)                              \
  3217       chars_96 = -1;                                                    \
  3218   } while (0)
  3219 
  3220 
  3221 /* Handle these composition sequences (ALT: alternate char):
  3222 
  3223    (1) relative composition: ESC 0 CHAR ... ESC 1
  3224    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
  3225    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
  3226    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
  3227 
  3228    When the start sequence (ESC 0/2/3/4) is found, this annotation
  3229    header is produced.
  3230 
  3231         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
  3232 
  3233    Then, upon reading CHAR or RULE (one or two bytes), these codes are
  3234    produced until the end sequence (ESC 1) is found:
  3235 
  3236    (1) CHAR ... CHAR
  3237    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
  3238    (3) ALT ... ALT -1 -1 CHAR ... CHAR
  3239    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
  3240 
  3241    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
  3242    annotation header are updated as below:
  3243 
  3244    (1) LENGTH: unchanged,  NCHARS: number of CHARs
  3245    (2) LENGTH: unchanged,  NCHARS: number of CHARs
  3246    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
  3247    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
  3248 
  3249    If an error is found while composing, the annotation header is
  3250    changed to:
  3251 
  3252         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
  3253 
  3254    and the sequence [ -2 DECODED-RULE ] is changed to the original
  3255    byte sequence as below:
  3256         o the original byte sequence is B: [ B -1 ]
  3257         o the original byte sequence is B1 B2: [ B1 B2 ]
  3258    and the sequence [ -1 -1 ] is changed to the original byte
  3259    sequence:
  3260         [ ESC '0' ]
  3261 */
  3262 
  3263 /* Decode a composition rule C1 and maybe one more byte from the
  3264    source, and set RULE to the encoded composition rule.  If the rule
  3265    is invalid, goto invalid_code.

          C1 is read from the enclosing scope, and the expansion site must
          provide the label `invalid_code'.  With R = C1 - 32:

          o R < 0 is invalid.
          o R < 81 is the old one-byte format: gref = R / 9 and
            nref = R % 9, where a value of 4 is remapped to 10.
          o Otherwise it is the new two-byte format: a second byte B is
            fetched with ONE_MORE_BYTE, the pair (R - 81, B - 32) is
            validated, and 0x100 is added to the encoded rule so that it
            can be told apart from the old format.  */
  3266 
  3267 #define DECODE_COMPOSITION_RULE(rule)                                   \
  3268   do {                                                                  \
  3269     rule = c1 - 32;                                                     \
  3270     if (rule < 0)                                                       \
  3271       goto invalid_code;                                                \
  3272     if (rule < 81)              /* old format (before ver.21) */        \
  3273       {                                                                 \
  3274         int gref = (rule) / 9;                                          \
  3275         int nref = (rule) % 9;                                          \
  3276         if (gref == 4) gref = 10;                                       \
  3277         if (nref == 4) nref = 10;                                       \
  3278         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
  3279       }                                                                 \
  3280     else                        /* new format (after ver.21) */         \
  3281       {                                                                 \
  3282         int b;                                                          \
  3283                                                                         \
  3284         ONE_MORE_BYTE (b);                                              \
  3285         if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32))        \
  3286           goto invalid_code;                                            \
  3287         rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32);             \
  3288         rule += 0x100;   /* Distinguish it from the old format.  */     \
  3289       }                                                                 \
  3290   } while (0)
  3291 
       /* Turn the decoded composition rule RULE back into its source
          byte(s).  The bytes are stored in charbuf[idx] and
          charbuf[idx + 1], and the number of bytes stored is added to
          new_chars.  `charbuf', `idx' and `new_chars' come from the
          enclosing scope.

          gref and nref are the quotient and remainder of RULE % 0x100
          divided by 12 (NOTE(review): presumably the packing used by
          COMPOSITION_ENCODE_RULE -- confirm).  They are computed before
          any slot is written, so RULE may be charbuf[idx + 1] itself.

          o RULE < 0x100 (old format): one byte 32 + gref * 9 + nref, with
            10 mapped back to 4, followed by -1 in the second slot.
          o Otherwise (new format): two bytes, 32 + 81 + gref and
            32 + nref.  */
  3292 #define ENCODE_COMPOSITION_RULE(rule)                           \
  3293   do {                                                          \
  3294     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
  3295                                                                 \
  3296     if (rule < 0x100)           /* old format */                \
  3297       {                                                         \
  3298         if (gref == 10) gref = 4;                               \
  3299         if (nref == 10) nref = 4;                               \
  3300         charbuf[idx] = 32 + gref * 9 + nref;                    \
  3301         charbuf[idx + 1] = -1;                                  \
  3302         new_chars++;                                            \
  3303       }                                                         \
  3304     else                                /* new format */        \
  3305       {                                                         \
  3306         charbuf[idx] = 32 + 81 + gref;                          \
  3307         charbuf[idx + 1] = 32 + nref;                           \
  3308         new_chars += 2;                                         \
  3309       }                                                         \
  3310   } while (0)
  3311 
  3312 /* Finish the current composition as invalid.

          CHARBUF points just past the data produced so far for the
          composition, which occupies the CMP_STATUS->length slots before
          it.  The five header slots are overwritten with the original
          start sequence (ESC and the method's final byte) followed by
          -2, 0 and -1.  For methods at or above COMPOSITION_WITH_RULE,
          each [ -2 RULE ] pair is turned back into the rule's source
          byte(s) and each -1 marker pair into ESC '0'.

          CMP_STATUS->state is reset to COMPOSING_NO.  Return
          CMP_STATUS->nchars plus the number of elements recovered for
          rules and ESC '0' separators.  */
  3313 
  3314 static int
  3315 finish_composition (int *charbuf, struct composition_status *cmp_status)
  3316 {
         /* Negative index of the annotation header relative to CHARBUF.  */
  3317   int idx = - cmp_status->length;
  3318   int new_chars;
  3319 
  3320   /* Recover the original ESC sequence */
  3321   charbuf[idx++] = ISO_CODE_ESC;
  3322   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
  3323                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
  3324                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
  3325                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
  3326                     : '4');
         /* Fill the remaining three header slots.  */
  3327   charbuf[idx++] = -2;
  3328   charbuf[idx++] = 0;
  3329   charbuf[idx++] = -1;
  3330   new_chars = cmp_status->nchars;
  3331   if (cmp_status->method >= COMPOSITION_WITH_RULE)
  3332     for (; idx < 0; idx++)
  3333       {
  3334         int elt = charbuf[idx];
  3335 
  3336         if (elt == -2)
  3337           {
                   /* [ -2 RULE ]: rewrite both slots in place, then step
                      over the second one so that a -1 filler written there
                      is not taken for a separator.  */
  3338             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
  3339             idx++;
  3340           }
  3341         else if (elt == -1)
  3342           {
                   /* [ -1 -1 ] separator: restore the ESC '0' it stood for.  */
  3343             charbuf[idx++] = ISO_CODE_ESC;
  3344             charbuf[idx] = '0';
  3345             new_chars += 2;
  3346           }
  3347       }
  3348   cmp_status->state = COMPOSING_NO;
  3349   return new_chars;
  3350 }
  3351 
  3352 /* If characters are under composition, finish the composition.
          That is, when cmp_status->state is not COMPOSING_NO, call
          finish_composition and add its return value to char_offset.
          `cmp_status', `charbuf' and `char_offset' come from the
          enclosing scope.  */
  3353 #define MAYBE_FINISH_COMPOSITION()                              \
  3354   do {                                                          \
  3355     if (cmp_status->state != COMPOSING_NO)                      \
  3356       char_offset += finish_composition (charbuf, cmp_status);  \
  3357   } while (0)
  3358 
  3359 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
  3360 
  3361    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
  3362    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
  3363    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
  3364    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
  3365 
  3366    Produce this annotation sequence now:
  3367 
  3368    [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]

          (NOTE(review): five slots, following the composition-sequence
          comment earlier in this file and finish_composition, which
          rewrites five header slots -- confirm against
          ADD_COMPOSITION_DATA and MAX_ANNOTATION_LENGTH.)

          C1 is the byte after ESC.  An ESC 0 that arrives while the
          component part of an altchar composition is being read
          (COMPOSING_COMPONENT_CHAR with COMPOSITION_WITH_ALTCHARS, or
          COMPOSING_COMPONENT_RULE with COMPOSITION_WITH_RULE_ALTCHARS)
          does not start a new composition.  It stores the separator
          [ -1 -1 ] and switches the state to COMPOSING_CHAR.

          In every other case any composition in progress is first
          finished as invalid, and then a new annotation header is
          produced and cmp_status is reset for the new method.
          `cmp_status', `charbuf', `char_offset' and `coding' come from
          the enclosing scope.
  3369 */
  3370 
  3371 #define DECODE_COMPOSITION_START(c1)                                       \
  3372   do {                                                                     \
  3373     if (c1 == '0'                                                          \
  3374         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
  3375              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
  3376             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
  3377                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
  3378       {                                                                    \
  3379         *charbuf++ = -1;                                                   \
  3380         *charbuf++= -1;                                                    \
  3381         cmp_status->state = COMPOSING_CHAR;                                \
  3382         cmp_status->length += 2;                                           \
  3383       }                                                                    \
  3384     else                                                                   \
  3385       {                                                                    \
  3386         MAYBE_FINISH_COMPOSITION ();                                       \
  3387         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
  3388                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
  3389                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
  3390                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
  3391         cmp_status->state                                                  \
  3392           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
  3393         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
  3394         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
  3395         cmp_status->nchars = cmp_status->ncomps = 0;                       \
  3396         coding->annotated = 1;                                             \
  3397       }                                                                    \
  3398   } while (0)
  3399 
  3400 
  3401 /* Handle composition end sequence ESC 1.

          The composition is invalid if no CHAR has been stored
          (nchars == 0), or if the state does not fit the method: with
          COMPOSITION_WITH_RULE the state must not be COMPOSING_CHAR
          here, with every other method it must be.  An invalid
          composition is finished via MAYBE_FINISH_COMPOSITION and
          control goes to `invalid_code', which the expansion site must
          provide.

          Otherwise the annotation header, which starts
          cmp_status->length slots before charbuf, is updated.  Its first
          slot (the negated LENGTH) is decreased by ncomps + 2 for
          COMPOSITION_WITH_ALTCHARS or by ncomps * 3 for
          COMPOSITION_WITH_RULE_ALTCHARS, and its third slot receives
          nchars.  char_offset advances by nchars and the state returns
          to COMPOSING_NO.  */
  3402 
  3403 #define DECODE_COMPOSITION_END()                                        \
  3404   do {                                                                  \
  3405     if (cmp_status->nchars == 0                                         \
  3406         || ((cmp_status->state == COMPOSING_CHAR)                       \
  3407             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
  3408       {                                                                 \
  3409         MAYBE_FINISH_COMPOSITION ();                                    \
  3410         goto invalid_code;                                              \
  3411       }                                                                 \
  3412     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
  3413       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
  3414     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
  3415       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
  3416     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
  3417     char_offset += cmp_status->nchars;                                  \
  3418     cmp_status->state = COMPOSING_NO;                                   \
  3419   } while (0)
  3420 
  3421 /* Store a composition rule RULE in charbuf, and update cmp_status.
          Two slots are written, the marker -2 and then RULE, and
          cmp_status->length grows by 2.  The state is decremented
          (NOTE(review): presumably stepping from a ..._RULE state back
          to the matching ..._CHAR state; this depends on the order of
          the state enum -- confirm).  */
  3422 
  3423 #define STORE_COMPOSITION_RULE(rule)    \
  3424   do {                                  \
  3425     *charbuf++ = -2;                    \
  3426     *charbuf++ = rule;                  \
  3427     cmp_status->length += 2;            \
  3428     cmp_status->state--;                \
  3429   } while (0)
  3430 
  3431 /* Store a composed char or a component char C in charbuf, and update
  3432    cmp_status.

          C counts toward nchars when the state is COMPOSING_CHAR and
          toward ncomps otherwise.  For method COMPOSITION_WITH_RULE, or
          for COMPOSITION_WITH_RULE_ALTCHARS while in
          COMPOSING_COMPONENT_CHAR, the state is then incremented
          (NOTE(review): presumably to the matching ..._RULE state, so
          that a rule is read next -- confirm against the state enum).  */
  3433 
  3434 #define STORE_COMPOSITION_CHAR(c)                                       \
  3435   do {                                                                  \
  3436     *charbuf++ = (c);                                                   \
  3437     cmp_status->length++;                                               \
  3438     if (cmp_status->state == COMPOSING_CHAR)                            \
  3439       cmp_status->nchars++;                                             \
  3440     else                                                                \
  3441       cmp_status->ncomps++;                                             \
  3442     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
  3443         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
  3444             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
  3445       cmp_status->state++;                                              \
  3446   } while (0)
  3447 
  3448 
  3449 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
  3450 
  3451 static void
  3452 decode_coding_iso_2022 (struct coding_system *coding)
  3453 {
  3454   const unsigned char *src = coding->source + coding->consumed;
  3455   const unsigned char *src_end = coding->source + coding->src_bytes;
  3456   const unsigned char *src_base;
  3457   int *charbuf = coding->charbuf + coding->charbuf_used;
  3458   /* We may produce two annotations (charset and composition) in one
  3459      loop and one more charset annotation at the end.  */
  3460   int *charbuf_end
  3461     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
  3462   ptrdiff_t consumed_chars = 0, consumed_chars_base;
  3463   bool multibytep = coding->src_multibyte;
  3464   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
  3465   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
  3466   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
  3467   int charset_id_2, charset_id_3;
  3468   struct charset *charset;
  3469   int c;
  3470   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
  3471   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
  3472   ptrdiff_t char_offset = coding->produced_char;
  3473   ptrdiff_t last_offset = char_offset;
  3474   int last_id = charset_ascii;
  3475   bool eol_dos
  3476     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  3477   int byte_after_cr = -1;
  3478   int i;
  3479 
  3480   setup_iso_safe_charsets (attrs);
  3481   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
  3482 
  3483   if (cmp_status->state != COMPOSING_NO)
  3484     {
  3485       if (charbuf_end - charbuf < cmp_status->length)
  3486         emacs_abort ();
  3487       for (i = 0; i < cmp_status->length; i++)
  3488         *charbuf++ = cmp_status->carryover[i];
  3489       coding->annotated = 1;
  3490     }
  3491 
  3492   while (1)
  3493     {
  3494       int c1, c2, c3;
  3495 
  3496       src_base = src;
  3497       consumed_chars_base = consumed_chars;
  3498 
  3499       if (charbuf >= charbuf_end)
  3500         {
  3501           if (byte_after_cr >= 0)
  3502             src_base--;
  3503           break;
  3504         }
  3505 
  3506       if (byte_after_cr >= 0)
  3507         c1 = byte_after_cr, byte_after_cr = -1;
  3508       else
  3509         ONE_MORE_BYTE (c1);
  3510       if (c1 < 0)
  3511         goto invalid_code;
  3512 
  3513       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
  3514         {
  3515           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
  3516           char_offset++;
  3517           CODING_ISO_EXTSEGMENT_LEN (coding)--;
  3518           continue;
  3519         }
  3520 
  3521       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
  3522         {
  3523           if (c1 == ISO_CODE_ESC)
  3524             {
  3525               if (src + 1 >= src_end)
  3526                 goto no_more_source;
  3527               *charbuf++ = ISO_CODE_ESC;
  3528               char_offset++;
  3529               if (src[0] == '%' && src[1] == '@')
  3530                 {
  3531                   src += 2;
  3532                   consumed_chars += 2;
  3533                   char_offset += 2;
  3534                   /* We are sure charbuf can contain two more chars. */
  3535                   *charbuf++ = '%';
  3536                   *charbuf++ = '@';
  3537                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
  3538                 }
  3539             }
  3540           else
  3541             {
  3542               *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
  3543               char_offset++;
  3544             }
  3545           continue;
  3546         }
  3547 
  3548       if ((cmp_status->state == COMPOSING_RULE
  3549            || cmp_status->state == COMPOSING_COMPONENT_RULE)
  3550           && c1 != ISO_CODE_ESC)
  3551         {
  3552           int rule;
  3553 
  3554           DECODE_COMPOSITION_RULE (rule);
  3555           STORE_COMPOSITION_RULE (rule);
  3556           continue;
  3557         }
  3558 
  3559       /* We produce at most one character.  */
  3560       switch (iso_code_class [c1])
  3561         {
  3562         case ISO_0x20_or_0x7F:
  3563           if (charset_id_0 < 0
  3564               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
  3565             /* This is SPACE or DEL.  */
  3566             charset = CHARSET_FROM_ID (charset_ascii);
  3567           else
  3568             charset = CHARSET_FROM_ID (charset_id_0);
  3569           break;
  3570 
  3571         case ISO_graphic_plane_0:
  3572           if (charset_id_0 < 0)
  3573             charset = CHARSET_FROM_ID (charset_ascii);
  3574           else
  3575             charset = CHARSET_FROM_ID (charset_id_0);
  3576           break;
  3577 
  3578         case ISO_0xA0_or_0xFF:
  3579           if (charset_id_1 < 0
  3580               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
  3581               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
  3582             goto invalid_code;
  3583           /* This is a graphic character, we fall down ... */
  3584           FALLTHROUGH;
  3585         case ISO_graphic_plane_1:
  3586           if (charset_id_1 < 0)
  3587             goto invalid_code;
  3588           charset = CHARSET_FROM_ID (charset_id_1);
  3589           break;
  3590 
  3591         case ISO_control_0:
  3592           if (eol_dos && c1 == '\r')
  3593             ONE_MORE_BYTE (byte_after_cr);
  3594           MAYBE_FINISH_COMPOSITION ();
  3595           charset = CHARSET_FROM_ID (charset_ascii);
  3596           break;
  3597 
  3598         case ISO_control_1:
  3599           goto invalid_code;
  3600 
  3601         case ISO_shift_out:
  3602           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
  3603               || CODING_ISO_DESIGNATION (coding, 1) < 0)
  3604             goto invalid_code;
  3605           CODING_ISO_INVOCATION (coding, 0) = 1;
  3606           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
  3607           continue;
  3608 
  3609         case ISO_shift_in:
  3610           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
  3611             goto invalid_code;
  3612           CODING_ISO_INVOCATION (coding, 0) = 0;
  3613           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
  3614           continue;
  3615 
  3616         case ISO_single_shift_2_7:
  3617           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
  3618             goto invalid_code;
  3619           FALLTHROUGH;
  3620         case ISO_single_shift_2:
  3621           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
  3622             goto invalid_code;
  3623           /* SS2 is handled as an escape sequence of ESC 'N' */
  3624           c1 = 'N';
  3625           goto label_escape_sequence;
  3626 
  3627         case ISO_single_shift_3:
  3628           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
  3629             goto invalid_code;
  3630           /* SS3 is handled as an escape sequence of ESC 'O' */
  3631           c1 = 'O';
  3632           goto label_escape_sequence;
  3633 
  3634         case ISO_control_sequence_introducer:
  3635           /* CSI is handled as an escape sequence of ESC '[' ...  */
  3636           c1 = '[';
  3637           goto label_escape_sequence;
  3638 
  3639         case ISO_escape:
  3640           ONE_MORE_BYTE (c1);
  3641         label_escape_sequence:
  3642           /* Escape sequences handled here are invocation,
  3643              designation, direction specification, and character
  3644              composition specification.  */
  3645           switch (c1)
  3646             {
  3647             case '&':           /* revision of following character set */
  3648               ONE_MORE_BYTE (c1);
  3649               if (!(c1 >= '@' && c1 <= '~'))
  3650                 goto invalid_code;
  3651               ONE_MORE_BYTE (c1);
  3652               if (c1 != ISO_CODE_ESC)
  3653                 goto invalid_code;
  3654               ONE_MORE_BYTE (c1);
  3655               goto label_escape_sequence;
  3656 
  3657             case '$':           /* designation of 2-byte character set */
  3658               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
  3659                 goto invalid_code;
  3660               {
  3661                 int reg, chars96;
  3662 
  3663                 ONE_MORE_BYTE (c1);
  3664                 if (c1 >= '@' && c1 <= 'B')
  3665                   {     /* designation of JISX0208.1978, GB2312.1980,
  3666                            or JISX0208.1980 */
  3667                     reg = 0, chars96 = 0;
  3668                   }
  3669                 else if (c1 >= 0x28 && c1 <= 0x2B)
  3670                   { /* designation of DIMENSION2_CHARS94 character set */
  3671                     reg = c1 - 0x28, chars96 = 0;
  3672                     ONE_MORE_BYTE (c1);
  3673                   }
  3674                 else if (c1 >= 0x2C && c1 <= 0x2F)
  3675                   { /* designation of DIMENSION2_CHARS96 character set */
  3676                     reg = c1 - 0x2C, chars96 = 1;
  3677                     ONE_MORE_BYTE (c1);
  3678                   }
  3679                 else
  3680                   goto invalid_code;
  3681                 DECODE_DESIGNATION (reg, 2, chars96, c1);
  3682                 /* We must update these variables now.  */
  3683                 if (reg == 0)
  3684                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
  3685                 else if (reg == 1)
  3686                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
  3687                 if (chars96 < 0)
  3688                   goto invalid_code;
  3689               }
  3690               continue;
  3691 
  3692             case 'n':           /* invocation of locking-shift-2 */
  3693               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
  3694                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
  3695                 goto invalid_code;
  3696               CODING_ISO_INVOCATION (coding, 0) = 2;
  3697               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
  3698               continue;
  3699 
  3700             case 'o':           /* invocation of locking-shift-3 */
  3701               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
  3702                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
  3703                 goto invalid_code;
  3704               CODING_ISO_INVOCATION (coding, 0) = 3;
  3705               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
  3706               continue;
  3707 
  3708             case 'N':           /* invocation of single-shift-2 */
  3709               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
  3710                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
  3711                 goto invalid_code;
  3712               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
  3713               if (charset_id_2 < 0)
  3714                 charset = CHARSET_FROM_ID (charset_ascii);
  3715               else
  3716                 charset = CHARSET_FROM_ID (charset_id_2);
  3717               ONE_MORE_BYTE (c1);
  3718               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
  3719                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
  3720                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
  3721                           ? c1 >= 0x80 : c1 < 0x80)))
  3722                 goto invalid_code;
  3723               break;
  3724 
  3725             case 'O':           /* invocation of single-shift-3 */
  3726               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
  3727                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
  3728                 goto invalid_code;
  3729               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
  3730               if (charset_id_3 < 0)
  3731                 charset = CHARSET_FROM_ID (charset_ascii);
  3732               else
  3733                 charset = CHARSET_FROM_ID (charset_id_3);
  3734               ONE_MORE_BYTE (c1);
  3735               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
  3736                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
  3737                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
  3738                           ? c1 >= 0x80 : c1 < 0x80)))
  3739                 goto invalid_code;
  3740               break;
  3741 
  3742             case '0': case '2': case '3': case '4': /* start composition */
  3743               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
  3744                 goto invalid_code;
  3745               if (last_id != charset_ascii)
  3746                 {
  3747                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
  3748                   last_id = charset_ascii;
  3749                   last_offset = char_offset;
  3750                 }
  3751               DECODE_COMPOSITION_START (c1);
  3752               continue;
  3753 
  3754             case '1':           /* end composition */
  3755               if (cmp_status->state == COMPOSING_NO)
  3756                 goto invalid_code;
  3757               DECODE_COMPOSITION_END ();
  3758               continue;
  3759 
  3760             case '[':           /* specification of direction */
  3761               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
  3762                 goto invalid_code;
  3763               /* For the moment, nested direction is not supported.
  3764                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
  3765                  left-to-right, and nonzero means right-to-left.  */
  3766               ONE_MORE_BYTE (c1);
  3767               switch (c1)
  3768                 {
  3769                 case ']':       /* end of the current direction */
  3770                   coding->mode &= ~CODING_MODE_DIRECTION;
  3771                   break;
  3772 
  3773                 case '0':       /* end of the current direction */
  3774                 case '1':       /* start of left-to-right direction */
  3775                   ONE_MORE_BYTE (c1);
  3776                   if (c1 == ']')
  3777                     coding->mode &= ~CODING_MODE_DIRECTION;
  3778                   else
  3779                     goto invalid_code;
  3780                   break;
  3781 
  3782                 case '2':       /* start of right-to-left direction */
  3783                   ONE_MORE_BYTE (c1);
  3784                   if (c1 == ']')
  3785                     coding->mode |= CODING_MODE_DIRECTION;
  3786                   else
  3787                     goto invalid_code;
  3788                   break;
  3789 
  3790                 default:
  3791                   goto invalid_code;
  3792                 }
  3793               continue;
  3794 
  3795             case '%':
  3796               ONE_MORE_BYTE (c1);
  3797               if (c1 == '/')
  3798                 {
  3799                   /* CTEXT extended segment:
  3800                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
  3801                      We keep these bytes as is for the moment.
  3802                      They may be decoded by post-read-conversion.  */
  3803                   int dim, M, L;
  3804                   int size;
  3805 
  3806                   ONE_MORE_BYTE (dim);
  3807                   if (dim < '0' || dim > '4')
  3808                     goto invalid_code;
  3809                   ONE_MORE_BYTE (M);
  3810                   if (M < 128)
  3811                     goto invalid_code;
  3812                   ONE_MORE_BYTE (L);
  3813                   if (L < 128)
  3814                     goto invalid_code;
  3815                   size = ((M - 128) * 128) + (L - 128);
  3816                   if (charbuf + 6 > charbuf_end)
  3817                     goto break_loop;
  3818                   *charbuf++ = ISO_CODE_ESC;
  3819                   *charbuf++ = '%';
  3820                   *charbuf++ = '/';
  3821                   *charbuf++ = dim;
  3822                   *charbuf++ = BYTE8_TO_CHAR (M);
  3823                   *charbuf++ = BYTE8_TO_CHAR (L);
  3824                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
  3825                 }
  3826               else if (c1 == 'G')
  3827                 {
  3828                   /* XFree86 extension for embedding UTF-8 in CTEXT:
  3829                      ESC % G --UTF-8-BYTES-- ESC % @
  3830                      We keep these bytes as is for the moment.
  3831                      They may be decoded by post-read-conversion.  */
  3832                   if (charbuf + 3 > charbuf_end)
  3833                     goto break_loop;
  3834                   *charbuf++ = ISO_CODE_ESC;
  3835                   *charbuf++ = '%';
  3836                   *charbuf++ = 'G';
  3837                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
  3838                 }
  3839               else
  3840                 goto invalid_code;
  3841               continue;
  3842               break;
  3843 
  3844             default:
  3845               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
  3846                 goto invalid_code;
  3847               {
  3848                 int reg, chars96;
  3849 
  3850                 if (c1 >= 0x28 && c1 <= 0x2B)
  3851                   { /* designation of DIMENSION1_CHARS94 character set */
  3852                     reg = c1 - 0x28, chars96 = 0;
  3853                     ONE_MORE_BYTE (c1);
  3854                   }
  3855                 else if (c1 >= 0x2C && c1 <= 0x2F)
  3856                   { /* designation of DIMENSION1_CHARS96 character set */
  3857                     reg = c1 - 0x2C, chars96 = 1;
  3858                     ONE_MORE_BYTE (c1);
  3859                   }
  3860                 else
  3861                   goto invalid_code;
  3862                 DECODE_DESIGNATION (reg, 1, chars96, c1);
  3863                 /* We must update these variables now.  */
  3864                 if (reg == 0)
  3865                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
  3866                 else if (reg == 1)
  3867                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
  3868                 if (chars96 < 0)
  3869                   goto invalid_code;
  3870               }
  3871               continue;
  3872             }
  3873           break;
  3874 
  3875         default:
  3876           emacs_abort ();
  3877         }
  3878 
  3879       if (cmp_status->state == COMPOSING_NO
  3880           && charset->id != charset_ascii
  3881           && last_id != charset->id)
  3882         {
  3883           if (last_id != charset_ascii)
  3884             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  3885           last_id = charset->id;
  3886           last_offset = char_offset;
  3887         }
  3888 
  3889       /* Now we know CHARSET and 1st position code C1 of a character.
  3890          Produce a decoded character while getting 2nd and 3rd
  3891          position codes C2, C3 if necessary.  */
  3892       if (CHARSET_DIMENSION (charset) > 1)
  3893         {
  3894           ONE_MORE_BYTE (c2);
  3895           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
  3896               || ((c1 & 0x80) != (c2 & 0x80)))
  3897             /* C2 is not in a valid range.  */
  3898             goto invalid_code;
  3899           if (CHARSET_DIMENSION (charset) == 2)
  3900             c1 = (c1 << 8) | c2;
  3901           else
  3902             {
  3903               ONE_MORE_BYTE (c3);
  3904               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
  3905                   || ((c1 & 0x80) != (c3 & 0x80)))
  3906                 /* C3 is not in a valid range.  */
  3907                 goto invalid_code;
  3908               c1 = (c1 << 16) | (c2 << 8) | c2;
  3909             }
  3910         }
  3911       c1 &= 0x7F7F7F;
  3912       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
  3913       if (c < 0)
  3914         {
  3915           MAYBE_FINISH_COMPOSITION ();
  3916           for (; src_base < src; src_base++, char_offset++)
  3917             {
  3918               if (ASCII_CHAR_P (*src_base))
  3919                 *charbuf++ = *src_base;
  3920               else
  3921                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
  3922             }
  3923         }
  3924       else if (cmp_status->state == COMPOSING_NO)
  3925         {
  3926           *charbuf++ = c;
  3927           char_offset++;
  3928         }
  3929       else if ((cmp_status->state == COMPOSING_CHAR
  3930                 ? cmp_status->nchars
  3931                 : cmp_status->ncomps)
  3932                >= MAX_COMPOSITION_COMPONENTS)
  3933         {
  3934           /* Too long composition.  */
  3935           MAYBE_FINISH_COMPOSITION ();
  3936           *charbuf++ = c;
  3937           char_offset++;
  3938         }
  3939       else
  3940         STORE_COMPOSITION_CHAR (c);
  3941       continue;
  3942 
  3943     invalid_code:
  3944       MAYBE_FINISH_COMPOSITION ();
  3945       src = src_base;
  3946       consumed_chars = consumed_chars_base;
  3947       ONE_MORE_BYTE (c);
  3948       *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
  3949       char_offset++;
  3950       /* Reset the invocation and designation status to the safest
  3951          one; i.e. designate ASCII to the graphic register 0, and
  3952          invoke that register to the graphic plane 0.  This typically
  3953          helps the case that a designation sequence for ASCII "ESC (
  3954          B" is somehow broken (e.g. broken by a newline).  */
  3955       CODING_ISO_INVOCATION (coding, 0) = 0;
  3956       CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
  3957       charset_id_0 = charset_ascii;
  3958       continue;
  3959 
  3960     break_loop:
  3961       break;
  3962     }
  3963 
  3964  no_more_source:
  3965   if (cmp_status->state != COMPOSING_NO)
  3966     {
  3967       if (coding->mode & CODING_MODE_LAST_BLOCK)
  3968         MAYBE_FINISH_COMPOSITION ();
  3969       else
  3970         {
  3971           charbuf -= cmp_status->length;
  3972           for (i = 0; i < cmp_status->length; i++)
  3973             cmp_status->carryover[i] = charbuf[i];
  3974         }
  3975     }
  3976   else if (last_id != charset_ascii)
  3977     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  3978   coding->consumed_char += consumed_chars_base;
  3979   coding->consumed = src_base - coding->source;
  3980   coding->charbuf_used = charbuf - coding->charbuf;
  3981 }
  3982 
  3983 
  3984 /* ISO2022 encoding stuff.  */
  3985 
  3986 /*
  3987    It is not enough to say just "ISO2022" on encoding, we have to
  3988    specify more details.  In Emacs, each coding system of ISO2022
  3989    variant has the following specifications:
  3990         1. Initial designation to G0 thru G3.
  3991         2. Allows short-form designation?
  3992         3. ASCII should be designated to G0 before control characters?
  3993         4. ASCII should be designated to G0 at end of line?
  3994         5. 7-bit environment or 8-bit environment?
  3995         6. Use locking-shift?
  3996         7. Use Single-shift?
  3997    And the following two are only for Japanese:
  3998         8. Use ASCII in place of JIS0201-1976-Roman?
  3999         9. Use JISX0208-1983 in place of JISX0208-1978?
  4000    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
  4001    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
  4002    details.
  4003 */
  4004 
  4005 /* Produce codes (escape sequence) for designating CHARSET to graphic
  4006    register REG at DST, and increment DST.  If <final-char> of CHARSET is
  4007    '@', 'A', or 'B' and the coding system CODING allows, produce
  4008    designation sequence of short-form.  */
  4009 
  4010 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
  4011   do {                                                                  \
  4012     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
  4013     const char *intermediate_char_94 = "()*+";                          \
  4014     const char *intermediate_char_96 = ",-./";                          \
  4015     int revision = -1;                                                  \
  4016                                                                         \
  4017     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
  4018       revision = CHARSET_ISO_REVISION (charset);                        \
  4019                                                                         \
  4020     if (revision >= 0)                                                  \
  4021       {                                                                 \
  4022         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
  4023         EMIT_ONE_BYTE ('@' + revision);                                 \
  4024       }                                                                 \
  4025     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
  4026     if (CHARSET_DIMENSION (charset) == 1)                               \
  4027       {                                                                 \
  4028         int b;                                                          \
  4029         if (! CHARSET_ISO_CHARS_96 (charset))                           \
  4030           b = intermediate_char_94[reg];                                \
  4031         else                                                            \
  4032           b = intermediate_char_96[reg];                                \
  4033         EMIT_ONE_ASCII_BYTE (b);                                        \
  4034       }                                                                 \
  4035     else                                                                \
  4036       {                                                                 \
  4037         EMIT_ONE_ASCII_BYTE ('$');                                      \
  4038         if (! CHARSET_ISO_CHARS_96 (charset))                           \
  4039           {                                                             \
  4040             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
  4041                 || reg != 0                                             \
  4042                 || final_char < '@' || final_char > 'B')                \
  4043               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
  4044           }                                                             \
  4045         else                                                            \
  4046           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
  4047       }                                                                 \
  4048     EMIT_ONE_ASCII_BYTE (final_char);                                   \
  4049                                                                         \
  4050     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  4051   } while (0)
  4052 
  4053 
  4054 /* The following two macros produce codes (control character or escape
  4055    sequence) for ISO2022 single-shift functions (single-shift-2 and
  4056    single-shift-3).  */
  4057 
  4058 #define ENCODE_SINGLE_SHIFT_2                                           \
  4059   do {                                                                  \
  4060     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
  4061       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
  4062     else                                                                \
  4063       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
  4064     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  4065   } while (0)
  4066 
  4067 
  4068 #define ENCODE_SINGLE_SHIFT_3                                           \
  4069   do {                                                                  \
  4070     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
  4071       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
  4072     else                                                                \
  4073       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
  4074     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  4075   } while (0)
  4076 
  4077 
  4078 /* The following four macros produce codes (control character or
  4079    escape sequence) for ISO2022 locking-shift functions (shift-in,
  4080    shift-out, locking-shift-2, and locking-shift-3).  */
  4081 
  4082 #define ENCODE_SHIFT_IN                                 \
  4083   do {                                                  \
  4084     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
  4085     CODING_ISO_INVOCATION (coding, 0) = 0;              \
  4086   } while (0)
  4087 
  4088 
  4089 #define ENCODE_SHIFT_OUT                                \
  4090   do {                                                  \
  4091     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
  4092     CODING_ISO_INVOCATION (coding, 0) = 1;              \
  4093   } while (0)
  4094 
  4095 
  4096 #define ENCODE_LOCKING_SHIFT_2                          \
  4097   do {                                                  \
  4098     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
  4099     CODING_ISO_INVOCATION (coding, 0) = 2;              \
  4100   } while (0)
  4101 
  4102 
  4103 #define ENCODE_LOCKING_SHIFT_3                          \
  4104   do {                                                  \
  4105     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
  4106     CODING_ISO_INVOCATION (coding, 0) = 3;              \
  4107   } while (0)
  4108 
  4109 
  4110 /* Produce codes for a DIMENSION1 character whose character set is
  4111    CHARSET and whose position-code is C1.  Designation and invocation
  4112    sequences are also produced in advance if necessary.  */
  4113 
  4114 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  4115   do {                                                                  \
  4116     int id = CHARSET_ID (charset);                                      \
  4117                                                                         \
  4118     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
  4119         && id == charset_ascii)                                         \
  4120       {                                                                 \
  4121         id = charset_jisx0201_roman;                                    \
  4122         charset = CHARSET_FROM_ID (id);                                 \
  4123       }                                                                 \
  4124                                                                         \
  4125     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
  4126       {                                                                 \
  4127         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
  4128           EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                              \
  4129         else                                                            \
  4130           EMIT_ONE_BYTE (c1 | 0x80);                                    \
  4131         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
  4132         break;                                                          \
  4133       }                                                                 \
  4134     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
  4135       {                                                                 \
  4136         EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                                \
  4137         break;                                                          \
  4138       }                                                                 \
  4139     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
  4140       {                                                                 \
  4141         EMIT_ONE_BYTE (c1 | 0x80);                                      \
  4142         break;                                                          \
  4143       }                                                                 \
  4144     else                                                                \
  4145       /* Since CHARSET is not yet invoked to any graphic planes, we     \
  4146          must invoke it, or, at first, designate it to some graphic     \
  4147          register.  Then repeat the loop to actually produce the        \
  4148          character.  */                                                 \
  4149       dst = encode_invocation_designation (charset, coding, dst,        \
  4150                                            &produced_chars);            \
  4151   } while (1)
  4152 
  4153 
  4154 /* Produce codes for a DIMENSION2 character whose character set is
  4155    CHARSET and whose position-codes are C1 and C2.  Designation and
  4156    invocation codes are also produced in advance if necessary.  */
  4157 
  4158 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  4159   do {                                                                  \
  4160     int id = CHARSET_ID (charset);                                      \
  4161                                                                         \
  4162     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
  4163         && id == charset_jisx0208)                                      \
  4164       {                                                                 \
  4165         id = charset_jisx0208_1978;                                     \
  4166         charset = CHARSET_FROM_ID (id);                                 \
  4167       }                                                                 \
  4168                                                                         \
  4169     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
  4170       {                                                                 \
  4171         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
  4172           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
  4173         else                                                            \
  4174           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
  4175         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
  4176         break;                                                          \
  4177       }                                                                 \
  4178     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
  4179       {                                                                 \
  4180         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
  4181         break;                                                          \
  4182       }                                                                 \
  4183     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
  4184       {                                                                 \
  4185         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
  4186         break;                                                          \
  4187       }                                                                 \
  4188     else                                                                \
  4189       /* Since CHARSET is not yet invoked to any graphic planes, we     \
  4190          must invoke it, or, at first, designate it to some graphic     \
  4191          register.  Then repeat the loop to actually produce the        \
  4192          character.  */                                                 \
  4193       dst = encode_invocation_designation (charset, coding, dst,        \
  4194                                            &produced_chars);            \
  4195   } while (1)
  4196 
  4197 
  4198 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
  4199   do {                                                                     \
  4200     unsigned code;                                                         \
  4201     CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);       \
  4202                                                                            \
  4203     if (CHARSET_DIMENSION (charset) == 1)                                  \
  4204       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
  4205     else                                                                   \
  4206       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
  4207   } while (0)
  4208 
  4209 
  4210 /* Produce designation and invocation codes at a place pointed by DST
  4211    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
  4212    Return new DST.

          *P_NCHARS is copied into the local PRODUCED_CHARS on entry, and
          PRODUCED_CHARS is stored back into *P_NCHARS just before
          returning.

          The work is done in two steps.  First, look for a graphic
          register that is already designated to CHARSET; if there is none,
          designate CHARSET to the register it requests, or to register 0
          when it requests none.  Second, if that register is invoked to
          neither graphic plane 0 nor graphic plane 1, expand the shift
          macro that corresponds to the register.  */
  4213 
  4214 static unsigned char *
  4215 encode_invocation_designation (struct charset *charset,
  4216                                struct coding_system *coding,
  4217                                unsigned char *dst, ptrdiff_t *p_nchars)
  4218 {
         /* MULTIBYTEP is not referenced by name in this function;
            presumably the ENCODE_* macros expanded below read it.
            NOTE(review): confirm against the macro definitions.  */
  4219   bool multibytep = coding->dst_multibyte;
  4220   ptrdiff_t produced_chars = *p_nchars;
  4221   int reg;                      /* graphic register number */
  4222   int id = CHARSET_ID (charset);
  4223 
  4224   /* At first, check designations.  */
  4225   for (reg = 0; reg < 4; reg++)
  4226     if (id == CODING_ISO_DESIGNATION (coding, reg))
  4227       break;
  4228 
         /* REG == 4 here means that the loop above found no register.  */
  4229   if (reg >= 4)
  4230     {
  4231       /* CHARSET is not yet designated to any graphic registers.  */
  4232       /* At first check the requested designation.  */
  4233       reg = CODING_ISO_REQUEST (coding, id);
  4234       if (reg < 0)
  4235         /* Since CHARSET requests no special designation, designate it
  4236            to graphic register 0.  */
  4237         reg = 0;
  4238 
  4239       ENCODE_DESIGNATION (charset, reg, coding);
  4240     }
  4241 
  4242   if (CODING_ISO_INVOCATION (coding, 0) != reg
  4243       && CODING_ISO_INVOCATION (coding, 1) != reg)
  4244     {
  4245       /* Since the graphic register REG is not invoked to any graphic
  4246          planes, invoke it to graphic plane 0.  */
  4247       switch (reg)
  4248         {
  4249         case 0:                 /* graphic register 0 */
  4250           ENCODE_SHIFT_IN;
  4251           break;
  4252 
  4253         case 1:                 /* graphic register 1 */
  4254           ENCODE_SHIFT_OUT;
  4255           break;
  4256 
               /* For registers 2 and 3 the coding system's flags select
                  between a single shift and a locking shift.  */
  4257         case 2:                 /* graphic register 2 */
  4258           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
  4259             ENCODE_SINGLE_SHIFT_2;
  4260           else
  4261             ENCODE_LOCKING_SHIFT_2;
  4262           break;
  4263 
  4264         case 3:                 /* graphic register 3 */
  4265           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
  4266             ENCODE_SINGLE_SHIFT_3;
  4267           else
  4268             ENCODE_LOCKING_SHIFT_3;
  4269           break;
  4270 
  4271         default:
  4272           break;
  4273         }
  4274     }
  4275 
         /* Hand the character count back to the caller.  */
  4276   *p_nchars = produced_chars;
  4277   return dst;
  4278 }
  4279 
  4280 
  4281 /* Produce codes for designation and invocation to reset the graphic
  4282    planes and registers to initial state.

          The expansion refers to CODING, which must be in scope where the
          macro is used.  First, if graphic plane 0 does not currently
          invoke register 0, ENCODE_SHIFT_IN is expanded.  Then every
          register whose initial charset is set (>= 0) and differs from the
          register's current designation is designated back to that initial
          charset.  REG and CHARSET are locals of the macro's own block.  */
  4283 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  4284   do {                                                                  \
  4285     int reg;                                                            \
  4286     struct charset *charset;                                            \
  4287                                                                         \
  4288     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
  4289       ENCODE_SHIFT_IN;                                                  \
  4290     for (reg = 0; reg < 4; reg++)                                       \
  4291       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
  4292           && (CODING_ISO_DESIGNATION (coding, reg)                      \
  4293               != CODING_ISO_INITIAL (coding, reg)))                     \
  4294         {                                                               \
  4295           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
  4296           ENCODE_DESIGNATION (charset, reg, coding);                    \
  4297         }                                                               \
  4298   } while (0)
  4299 
  4300 
  4301 /* Produce designation sequences of charsets in the line started from
  4302    CHARBUF to a place pointed by DST, and return the number of
  4303    produced bytes.  DST should not directly point a buffer text area
  4304    which may be relocated by char_charset call.
  4305 
  4306    If the current block ends before any end-of-line, we may fail to
  4307    find all the necessary designations.

          The scan stops at the first '\n', at CHARBUF_END, or as soon as
          all four graphic registers have a candidate charset.

          A negative element of CHARBUF heads an annotation; its absolute
          value is the number of elements the annotation occupies, counting
          the head itself (this is how encode_coding_iso_2022 steps over
          annotations).  Annotations are skipped here, so that their
          payload is never looked up as if it were character data.

          A character for which char_charset returns a null pointer (no
          charset of the coding system's charset list contains it) cannot
          request a designation and is ignored.  */
  4308 
  4309 static ptrdiff_t
  4310 encode_designation_at_bol (struct coding_system *coding,
  4311                            int *charbuf, int *charbuf_end,
  4312                            unsigned char *dst)
  4313 {
  4314   unsigned char *orig = dst;
  4315   struct charset *charset;
  4316   /* Table of charsets to be designated to each graphic register.  */
  4317   int r[4];
  4318   int c, found = 0, reg;
         /* PRODUCED_CHARS and MULTIBYTEP are not referenced by name below;
            presumably ENCODE_DESIGNATION uses them.  NOTE(review): confirm
            against the macro definition.  */
  4319   ptrdiff_t produced_chars = 0;
  4320   bool multibytep = coding->dst_multibyte;
  4321   Lisp_Object attrs;
  4322   Lisp_Object charset_list;
  4323 
  4324   attrs = CODING_ID_ATTRS (coding->id);
  4325   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
  4326   if (EQ (charset_list, Qiso_2022))
  4327     charset_list = Viso_2022_charset_list;
  4328 
         /* -1 marks a register for which no charset has been found yet.  */
  4329   for (reg = 0; reg < 4; reg++)
  4330     r[reg] = -1;
  4331 
  4332   while (charbuf < charbuf_end && found < 4)
  4333     {
  4334       int id;
  4335 
  4336       c = *charbuf++;
  4337       if (c == '\n')
  4338         break;
             if (c < 0)
               {
                 /* C heads an annotation that spans -C elements, one of
                    which has just been consumed.  Skip the remaining ones;
                    the loop condition copes with running past
                    CHARBUF_END.  */
                 charbuf += -c - 1;
                 continue;
               }
  4339       charset = char_charset (c, charset_list, NULL);
             if (! charset)
               /* No charset was found for C.  Do not dereference the null
                  pointer; such a character requests no designation.  */
               continue;
  4340       id = CHARSET_ID (charset);
  4341       reg = CODING_ISO_REQUEST (coding, id);
             /* Only the first charset that requests a given register is
                recorded for it.  */
  4342       if (reg >= 0 && r[reg] < 0)
  4343         {
  4344           found++;
  4345           r[reg] = id;
  4346         }
  4347     }
  4348 
  4349   if (found)
  4350     {
             /* Designate only where the register does not already hold the
                wanted charset.  */
  4351       for (reg = 0; reg < 4; reg++)
  4352         if (r[reg] >= 0
  4353             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
  4354           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
  4355     }
  4356 
  4357   return dst - orig;
  4358 }
  4359 
  4360 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".

          Encode the elements of CODING->charbuf (CODING->charbuf_used of
          them) into CODING->destination, starting at offset
          CODING->produced.  A negative element heads an annotation; any
          other element is a character.  When the loop is done, the result
          is recorded as CODING_RESULT_SUCCESS, CODING->produced_char and
          CODING->produced are updated, and the pending
          designate-at-beginning-of-line state is saved through
          CODING_ISO_BOL.  The return statement of this function always
          yields 0.  */
  4361 
  4362 static bool
  4363 encode_coding_iso_2022 (struct coding_system *coding)
  4364 {
         /* MULTIBYTEP is not referenced by name below; presumably the
            EMIT_* and ENCODE_* macros read it.  NOTE(review): confirm.  */
  4365   bool multibytep = coding->dst_multibyte;
  4366   int *charbuf = coding->charbuf;
  4367   int *charbuf_end = charbuf + coding->charbuf_used;
  4368   unsigned char *dst = coding->destination + coding->produced;
  4369   unsigned char *dst_end = coding->destination + coding->dst_bytes;
         /* Amount passed to ASSURE_DESTINATION before each step.  */
  4370   int safe_room = 16;
         /* True when designations must be produced before the next
            character: the coding system designates at beginning of line
            and the saved state says we are at one.  */
  4371   bool bol_designation
  4372     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
  4373        && CODING_ISO_BOL (coding));
  4374   ptrdiff_t produced_chars = 0;
  4375   Lisp_Object attrs, eol_type, charset_list;
  4376   bool ascii_compatible;
  4377   int c;
         /* Charset ID taken from the latest charset annotation, or -1.  */
  4378   int preferred_charset_id = -1;
  4379 
  4380   CODING_GET_INFO (coding, attrs, charset_list);
  4381   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
         /* A vector-valued EOL type is handled as Qunix here.  */
  4382   if (VECTORP (eol_type))
  4383     eol_type = Qunix;
  4384 
  4385   setup_iso_safe_charsets (attrs);
  4386   /* Charset list may have been changed.  */
  4387   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
  4388   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
  4389 
         /* ASCII can be emitted directly only if the coding system is
            marked ASCII compatible and uses neither designation nor
            locking shift.  */
  4390   ascii_compatible
  4391     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
  4392        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
  4393                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
  4394 
  4395   while (charbuf < charbuf_end)
  4396     {
  4397       ASSURE_DESTINATION (safe_room);
  4398 
  4399       if (bol_designation)
  4400         {
  4401           /* We have to produce designation sequences if any now.  */
  4402           unsigned char desig_buf[16];
  4403           ptrdiff_t nbytes;
  4404           ptrdiff_t offset;
  4405 
                 /* The designations are built in DESIG_BUF and copied
                    afterwards.  If a charset map was loaded meanwhile,
                    coding_change_destination reports by how much DST and
                    DST_END must be moved.  */
  4406           charset_map_loaded = 0;
  4407           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
  4408                                               desig_buf);
  4409           if (charset_map_loaded
  4410               && (offset = coding_change_destination (coding)))
  4411             {
  4412               dst += offset;
  4413               dst_end += offset;
  4414             }
  4415           memcpy (dst, desig_buf, nbytes);
  4416           dst += nbytes;
  4417           /* We are sure that designation sequences are all ASCII bytes.  */
  4418           produced_chars += nbytes;
  4419           bol_designation = 0;
  4420           ASSURE_DESTINATION (safe_room);
  4421         }
  4422 
  4423       c = *charbuf++;
  4424 
  4425       if (c < 0)
  4426         {
  4427           /* Handle an annotation.  */
                 /* -C is the number of elements of the annotation, counting
                    C itself.  CHARBUF now points at the element that tells
                    the kind of annotation.  */
  4428           switch (*charbuf)
  4429             {
  4430             case CODING_ANNOTATE_COMPOSITION_MASK:
  4431               /* Not yet implemented.  */
  4432               break;
  4433             case CODING_ANNOTATE_CHARSET_MASK:
                     /* Remember the annotated charset, but only if it is a
                        member of CHARSET_LIST.  */
  4434               preferred_charset_id = charbuf[2];
  4435               if (preferred_charset_id >= 0
  4436                   && NILP (Fmemq (make_fixnum (preferred_charset_id),
  4437                                   charset_list)))
  4438                 preferred_charset_id = -1;
  4439               break;
  4440             default:
  4441               emacs_abort ();
  4442             }
                 /* Skip the rest of the annotation.  */
  4443           charbuf += -c - 1;
  4444           continue;
  4445         }
  4446 
  4447       /* Now encode the character C.  */
  4448       if (c < 0x20 || c == 0x7F)
  4449         {
                 /* Control character.  A line end ('\n', or '\r' when the
                    EOL type is Qmac) may reset planes and registers, may
                    restore the recorded designations to their initial
                    values, and re-arms BOL_DESIGNATION.  */
  4450           if (c == '\n'
  4451               || (c == '\r' && EQ (eol_type, Qmac)))
  4452             {
  4453               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
  4454                 ENCODE_RESET_PLANE_AND_REGISTER ();
  4455               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
  4456                 {
  4457                   int i;
  4458 
  4459                   for (i = 0; i < 4; i++)
  4460                     CODING_ISO_DESIGNATION (coding, i)
  4461                       = CODING_ISO_INITIAL (coding, i);
  4462                 }
  4463               bol_designation = ((CODING_ISO_FLAGS (coding)
  4464                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
  4465                                  != 0);
  4466             }
  4467           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
  4468             ENCODE_RESET_PLANE_AND_REGISTER ();
  4469           EMIT_ONE_ASCII_BYTE (c);
  4470         }
  4471       else if (ASCII_CHAR_P (c))
  4472         {
  4473           if (ascii_compatible)
  4474             EMIT_ONE_ASCII_BYTE (c);
  4475           else
  4476             {
  4477               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
  4478               ENCODE_ISO_CHARACTER (charset, c);
  4479             }
  4480         }
  4481       else if (CHAR_BYTE8_P (c))
  4482         {
                 /* A raw 8-bit byte is emitted as that byte.  */
  4483           c = CHAR_TO_BYTE8 (c);
  4484           EMIT_ONE_BYTE (c);
  4485         }
  4486       else
  4487         {
  4488           struct charset *charset;
  4489 
                 /* Try the preferred charset first, then fall back to a
                    search through CHARSET_LIST.  */
  4490           if (preferred_charset_id >= 0)
  4491             {
  4492               bool result;
  4493 
  4494               charset = CHARSET_FROM_ID (preferred_charset_id);
  4495               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
  4496               if (! result)
  4497                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
  4498                                      NULL, charset);
  4499             }
  4500           else
  4501             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
  4502                                  NULL, charset);
  4503           if (!charset)
  4504             {
                     /* C cannot be encoded.  Substitute either the
                        inhibit-substitution character (as ASCII) or the
                        coding system's default character.  */
  4505               if (coding->mode & CODING_MODE_SAFE_ENCODING)
  4506                 {
  4507                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
  4508                   charset = CHARSET_FROM_ID (charset_ascii);
  4509                 }
  4510               else
  4511                 {
  4512                   c = coding->default_char;
  4513                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
  4514                                        charset_list, NULL, charset);
  4515                 }
  4516             }
  4517           ENCODE_ISO_CHARACTER (charset, c);
  4518         }
  4519     }
  4520 
         /* At the end of the last block, bring planes and registers back to
            the initial state if the coding system resets at end of line.  */
  4521   if (coding->mode & CODING_MODE_LAST_BLOCK
  4522       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
  4523     {
  4524       ASSURE_DESTINATION (safe_room);
  4525       ENCODE_RESET_PLANE_AND_REGISTER ();
  4526     }
  4527   record_conversion_result (coding, CODING_RESULT_SUCCESS);
         /* Save the beginning-of-line state for the next call.  */
  4528   CODING_ISO_BOL (coding) = bol_designation;
  4529   coding->produced_char += produced_chars;
  4530   coding->produced = dst - coding->destination;
  4531   return 0;
  4532 }
  4533 
  4534 
  4535 /*** 8,9. SJIS and BIG5 handlers ***/
  4536 
  4537 /* Although SJIS and BIG5 are not ISO coding systems, they are used
  4538    quite widely.  So, for the moment, Emacs supports them directly in
  4539    C code.  In the future, however, they may be supported only by CCL.  */
  4540 
  4541 /* SJIS is a coding system encoding three character sets: ASCII, right
  4542    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
  4543    as is.  A character of charset katakana-jisx0201 is encoded by
  4544    "position-code + 0x80".  A character of charset japanese-jisx0208
  4545    is encoded in two bytes, but its two position-codes are divided and
  4546    shifted so that they fit in the ranges below.
  4547 
  4548    --- CODE RANGE of SJIS ---
  4549    (character set)      (range)
  4550    ASCII                0x00 .. 0x7F
  4551    KATAKANA-JISX0201    0xA1 .. 0xDF
  4552    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
  4553             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
  4554    -------------------------------
  4555 
  4556 */
  4557 
  4558 /* BIG5 is a coding system encoding two character sets: ASCII and
  4559    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
  4560    character set and each of its characters is encoded in two bytes.
  4561 
  4562    --- CODE RANGE of BIG5 ---
  4563    (character set)      (range)
  4564    ASCII                0x00 .. 0x7F
  4565    Big5 (1st byte)      0xA1 .. 0xFE
  4566         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
  4567    --------------------------
  4568 
  4569   */
  4570 
  4571 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
  4572    Return true if a text is encoded in SJIS.

          The bytes of CODING->source after the first CODING->head_ascii
          ones are scanned.  A byte below 0x80 is accepted.  A byte in
          0x81..0x9F, or in 0xE0..MAX_FIRST_BYTE_OF_2_BYTE_CODE, must be
          followed by a byte in 0x40..0xFC other than 0x7F.  A byte in
          0xA0..0xDF stands alone.  Anything else makes the function set
          CATEGORY_MASK_SJIS in DETECT_INFO->rejected and return false.  */
  4573 
  4574 static bool
  4575 detect_coding_sjis (struct coding_system *coding,
  4576                     struct coding_detection_info *detect_info)
  4577 {
         /* SRC, SRC_END, MULTIBYTEP and CONSUMED_CHARS are presumably used
            by ONE_MORE_BYTE.  NOTE(review): confirm against the macro.  */
  4578   const unsigned char *src = coding->source, *src_base;
  4579   const unsigned char *src_end = coding->source + coding->src_bytes;
  4580   bool multibytep = coding->src_multibyte;
  4581   ptrdiff_t consumed_chars = 0;
  4582   int found = 0;
  4583   int c;
  4584   Lisp_Object attrs, charset_list;
  4585   int max_first_byte_of_2_byte_code;
  4586 
  4587   CODING_GET_INFO (coding, attrs, charset_list);
         /* With at most three charsets in the list, lead bytes stop at
            0xEF; with more, they extend to 0xFC.  */
  4588   max_first_byte_of_2_byte_code = list_length (charset_list) <= 3 ? 0xEF : 0xFC;
  4589 
  4590   detect_info->checked |= CATEGORY_MASK_SJIS;
  4591   /* A coding system of this category is always ASCII compatible.  */
  4592   src += coding->head_ascii;
  4593 
  4594   while (1)
  4595     {
             /* SRC_BASE marks the start of the sequence being examined.  */
  4596       src_base = src;
  4597       ONE_MORE_BYTE (c);
  4598       if (c < 0x80)
  4599         continue;
  4600       if ((c >= 0x81 && c <= 0x9F)
  4601           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
  4602         {
  4603           ONE_MORE_BYTE (c);
  4604           if (c < 0x40 || c == 0x7F || c > 0xFC)
  4605             break;
  4606           found = CATEGORY_MASK_SJIS;
  4607         }
  4608       else if (c >= 0xA0 && c < 0xE0)
  4609         found = CATEGORY_MASK_SJIS;
  4610       else
  4611         break;
  4612     }
         /* Leaving the loop with `break' means an invalid byte was seen.  */
  4613   detect_info->rejected |= CATEGORY_MASK_SJIS;
  4614   return 0;
  4615 
         /* No statement in this function jumps here by name; presumably
            ONE_MORE_BYTE does so when the source is exhausted.
            NOTE(review): confirm.  If a sequence was left unfinished
            (SRC_BASE < SRC) in the last block, the category is rejected;
            otherwise whatever was found is reported.  */
  4616  no_more_source:
  4617   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
  4618     {
  4619       detect_info->rejected |= CATEGORY_MASK_SJIS;
  4620       return 0;
  4621     }
  4622   detect_info->found |= found;
  4623   return 1;
  4624 }
  4625 
  4626 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
  4627    Return true if a text is encoded in BIG5.

          The bytes of CODING->source after the first CODING->head_ascii
          ones are scanned.  A byte below 0x80 is accepted.  A byte of 0xA1
          or more must be followed by a byte that is at least 0x40 and lies
          outside 0x7F..0xA0.  A lead byte in 0x80..0xA0 or an unacceptable
          second byte makes the function set CATEGORY_MASK_BIG5 in
          DETECT_INFO->rejected and return false, in the same way as
          detect_coding_sjis treats an unacceptable second byte.  */
  4628 
  4629 static bool
  4630 detect_coding_big5 (struct coding_system *coding,
  4631                     struct coding_detection_info *detect_info)
  4632 {
         /* SRC, SRC_END, MULTIBYTEP and CONSUMED_CHARS are presumably used
            by ONE_MORE_BYTE.  NOTE(review): confirm against the macro.  */
  4633   const unsigned char *src = coding->source, *src_base;
  4634   const unsigned char *src_end = coding->source + coding->src_bytes;
  4635   bool multibytep = coding->src_multibyte;
  4636   ptrdiff_t consumed_chars = 0;
  4637   int found = 0;
  4638   int c;
  4639 
  4640   detect_info->checked |= CATEGORY_MASK_BIG5;
  4641   /* A coding system of this category is always ASCII compatible.  */
  4642   src += coding->head_ascii;
  4643 
  4644   while (1)
  4645     {
             /* SRC_BASE marks the start of the sequence being examined.  */
  4646       src_base = src;
  4647       ONE_MORE_BYTE (c);
  4648       if (c < 0x80)
  4649         continue;
  4650       if (c >= 0xA1)
  4651         {
  4652           ONE_MORE_BYTE (c);
  4653           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
                   /* Leave through the common rejection path below, so
                      that the category is recorded as rejected instead of
                      being left undecided.  */
  4654             break;
  4655           found = CATEGORY_MASK_BIG5;
  4656         }
  4657       else
  4658         break;
  4659     }
         /* Leaving the loop with `break' means an invalid byte was seen.  */
  4660   detect_info->rejected |= CATEGORY_MASK_BIG5;
  4661   return 0;
  4662 
         /* No statement in this function jumps here by name; presumably
            ONE_MORE_BYTE does so when the source is exhausted.
            NOTE(review): confirm.  If a sequence was left unfinished
            (SRC_BASE < SRC) in the last block, the category is rejected;
            otherwise whatever was found is reported.  */
  4663  no_more_source:
  4664   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
  4665     {
  4666       detect_info->rejected |= CATEGORY_MASK_BIG5;
  4667       return 0;
  4668     }
  4669   detect_info->found |= found;
  4670   return 1;
  4671 }
  4672 
  4673 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

          Decode bytes of CODING->source, starting at offset
          CODING->consumed, into CODING->charbuf, starting at index
          CODING->charbuf_used.  The charsets are taken from the coding
          system's charset list in this order: roman, kana, kanji, and
          optionally a second kanji charset.  A byte sequence that cannot
          be decoded yields one element for its first byte only, after
          which decoding resumes at the following byte.  On exit
          CODING->consumed_char, CODING->consumed and CODING->charbuf_used
          are updated.  */
  4674 
  4675 static void
  4676 decode_coding_sjis (struct coding_system *coding)
  4677 {
  4678   const unsigned char *src = coding->source + coding->consumed;
  4679   const unsigned char *src_end = coding->source + coding->src_bytes;
         /* Start of the sequence being decoded; everything before it has
            been fully processed.  */
  4680   const unsigned char *src_base;
  4681   int *charbuf = coding->charbuf + coding->charbuf_used;
  4682   /* We may produce one charset annotation in one loop and one more at
  4683      the end.  */
  4684   int *charbuf_end
  4685     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  4686   ptrdiff_t consumed_chars = 0, consumed_chars_base;
         /* MULTIBYTEP is not referenced by name below; presumably
            ONE_MORE_BYTE reads it.  NOTE(review): confirm.  */
  4687   bool multibytep = coding->src_multibyte;
  4688   struct charset *charset_roman, *charset_kanji, *charset_kana;
  4689   struct charset *charset_kanji2;
  4690   Lisp_Object attrs, charset_list, val;
         /* CHAR_OFFSET counts decoded characters.  LAST_OFFSET and LAST_ID
            describe the run of characters of one charset that is currently
            being collected for a charset annotation.  */
  4691   ptrdiff_t char_offset = coding->produced_char;
  4692   ptrdiff_t last_offset = char_offset;
  4693   int last_id = charset_ascii;
  4694   bool eol_dos
  4695     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
         /* Byte read ahead after a '\r' when EOL_DOS is true, or -1.  */
  4696   int byte_after_cr = -1;
  4697 
  4698   CODING_GET_INFO (coding, attrs, charset_list);
  4699 
  4700   val = charset_list;
  4701   charset_roman = CHARSET_FROM_ID (XFIXNUM (XCAR (val))), val = XCDR (val);
  4702   charset_kana = CHARSET_FROM_ID (XFIXNUM (XCAR (val))), val = XCDR (val);
  4703   charset_kanji = CHARSET_FROM_ID (XFIXNUM (XCAR (val))), val = XCDR (val);
  4704   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XFIXNUM (XCAR (val)));
  4705 
  4706   while (1)
  4707     {
  4708       int c, c1;
  4709       struct charset *charset;
  4710 
  4711       src_base = src;
  4712       consumed_chars_base = consumed_chars;
  4713 
  4714       if (charbuf >= charbuf_end)
  4715         {
                 /* CHARBUF is full.  If a byte was read ahead after '\r',
                    step SRC_BASE back over it so that it is not reported
                    as consumed.  */
  4716           if (byte_after_cr >= 0)
  4717             src_base--;
  4718           break;
  4719         }
  4720 
  4721       if (byte_after_cr >= 0)
  4722         c = byte_after_cr, byte_after_cr = -1;
  4723       else
  4724         ONE_MORE_BYTE (c);
  4725       if (c < 0)
  4726         goto invalid_code;
  4727       if (c < 0x80)
  4728         {
  4729           if (eol_dos && c == '\r')
  4730             ONE_MORE_BYTE (byte_after_cr);
  4731           charset = charset_roman;
  4732         }
  4733       else if (c == 0x80 || c == 0xA0)
  4734         goto invalid_code;
  4735       else if (c >= 0xA1 && c <= 0xDF)
  4736         {
  4737           /* SJIS -> JISX0201-Kana */
  4738           c &= 0x7F;
  4739           charset = charset_kana;
  4740         }
  4741       else if (c <= 0xEF)
  4742         {
  4743           /* SJIS -> JISX0208 */
                 /* Reached for 0x81..0x9F and 0xE0..0xEF.  The second byte
                    must be in 0x40..0xFC and must not be 0x7F.  */
  4744           ONE_MORE_BYTE (c1);
  4745           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
  4746             goto invalid_code;
  4747           c = (c << 8) | c1;
  4748           SJIS_TO_JIS (c);
  4749           charset = charset_kanji;
  4750         }
  4751       else if (c <= 0xFC && charset_kanji2)
  4752         {
  4753           /* SJIS -> JISX0213-2 */
  4754           ONE_MORE_BYTE (c1);
  4755           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
  4756             goto invalid_code;
  4757           c = (c << 8) | c1;
  4758           SJIS_TO_JIS2 (c);
  4759           charset = charset_kanji2;
  4760         }
  4761       else
  4762         goto invalid_code;
             /* On a switch to a different non-ASCII charset, close the run
                collected so far (unless that run was ASCII) and start a
                new one at the current character.  */
  4763       if (charset->id != charset_ascii
  4764           && last_id != charset->id)
  4765         {
  4766           if (last_id != charset_ascii)
  4767             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  4768           last_id = charset->id;
  4769           last_offset = char_offset;
  4770         }
  4771       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
  4772       *charbuf++ = c;
  4773       char_offset++;
  4774       continue;
  4775 
  4776     invalid_code:
             /* Rewind to the start of the offending sequence and re-read
                its first byte alone.  It is stored through BYTE8_TO_CHAR,
                or as -C when ONE_MORE_BYTE yields a negative value.  */
  4777       src = src_base;
  4778       consumed_chars = consumed_chars_base;
  4779       ONE_MORE_BYTE (c);
  4780       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
  4781       char_offset++;
  4782     }
  4783 
         /* Reached by falling out of the loop; presumably also jumped to by
            ONE_MORE_BYTE when the source is exhausted.  NOTE(review):
            confirm.  Only what lies before SRC_BASE counts as consumed.  */
  4784  no_more_source:
  4785   if (last_id != charset_ascii)
  4786     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  4787   coding->consumed_char += consumed_chars_base;
  4788   coding->consumed = src_base - coding->source;
  4789   coding->charbuf_used = charbuf - coding->charbuf;
  4790 }
  4791 
       /* Decode bytes of CODING->source, starting at offset
          CODING->consumed, into CODING->charbuf, starting at index
          CODING->charbuf_used.  The first two entries of the coding
          system's charset list supply the roman and the Big5 charset.  A
          byte sequence that cannot be decoded yields one element for its
          first byte only, after which decoding resumes at the following
          byte.  On exit CODING->consumed_char, CODING->consumed and
          CODING->charbuf_used are updated.  */
  4792 static void
  4793 decode_coding_big5 (struct coding_system *coding)
  4794 {
  4795   const unsigned char *src = coding->source + coding->consumed;
  4796   const unsigned char *src_end = coding->source + coding->src_bytes;
         /* Start of the sequence being decoded; everything before it has
            been fully processed.  */
  4797   const unsigned char *src_base;
  4798   int *charbuf = coding->charbuf + coding->charbuf_used;
  4799   /* We may produce one charset annotation in one loop and one more at
  4800      the end.  */
  4801   int *charbuf_end
  4802     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  4803   ptrdiff_t consumed_chars = 0, consumed_chars_base;
         /* MULTIBYTEP is not referenced by name below; presumably
            ONE_MORE_BYTE reads it.  NOTE(review): confirm.  */
  4804   bool multibytep = coding->src_multibyte;
  4805   struct charset *charset_roman, *charset_big5;
  4806   Lisp_Object attrs, charset_list, val;
         /* CHAR_OFFSET counts decoded characters.  LAST_OFFSET and LAST_ID
            describe the run of characters of one charset that is currently
            being collected for a charset annotation.  */
  4807   ptrdiff_t char_offset = coding->produced_char;
  4808   ptrdiff_t last_offset = char_offset;
  4809   int last_id = charset_ascii;
  4810   bool eol_dos
  4811     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
         /* Byte read ahead after a '\r' when EOL_DOS is true, or -1.  */
  4812   int byte_after_cr = -1;
  4813 
  4814   CODING_GET_INFO (coding, attrs, charset_list);
  4815   val = charset_list;
  4816   charset_roman = CHARSET_FROM_ID (XFIXNUM (XCAR (val))), val = XCDR (val);
  4817   charset_big5 = CHARSET_FROM_ID (XFIXNUM (XCAR (val)));
  4818 
  4819   while (1)
  4820     {
  4821       int c, c1;
  4822       struct charset *charset;
  4823 
  4824       src_base = src;
  4825       consumed_chars_base = consumed_chars;
  4826 
  4827       if (charbuf >= charbuf_end)
  4828         {
                 /* CHARBUF is full.  If a byte was read ahead after '\r',
                    step SRC_BASE back over it so that it is not reported
                    as consumed.  */
  4829           if (byte_after_cr >= 0)
  4830             src_base--;
  4831           break;
  4832         }
  4833 
  4834       if (byte_after_cr >= 0)
  4835         c = byte_after_cr, byte_after_cr = -1;
  4836       else
  4837         ONE_MORE_BYTE (c);
  4838 
  4839       if (c < 0)
  4840         goto invalid_code;
  4841       if (c < 0x80)
  4842         {
  4843           if (eol_dos && c == '\r')
  4844             ONE_MORE_BYTE (byte_after_cr);
  4845           charset = charset_roman;
  4846         }
  4847       else
  4848         {
  4849           /* BIG5 -> Big5 */
                 /* The first byte must be in 0xA1..0xFE; the second in
                    0x40..0x7E or 0xA1..0xFE.  */
  4850           if (c < 0xA1 || c > 0xFE)
  4851             goto invalid_code;
  4852           ONE_MORE_BYTE (c1);
  4853           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
  4854             goto invalid_code;
  4855           c = c << 8 | c1;
  4856           charset = charset_big5;
  4857         }
             /* On a switch to a different non-ASCII charset, close the run
                collected so far (unless that run was ASCII) and start a
                new one at the current character.  */
  4858       if (charset->id != charset_ascii
  4859           && last_id != charset->id)
  4860         {
  4861           if (last_id != charset_ascii)
  4862             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  4863           last_id = charset->id;
  4864           last_offset = char_offset;
  4865         }
  4866       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
  4867       *charbuf++ = c;
  4868       char_offset++;
  4869       continue;
  4870 
  4871     invalid_code:
             /* Rewind to the start of the offending sequence and re-read
                its first byte alone.  It is stored through BYTE8_TO_CHAR,
                or as -C when ONE_MORE_BYTE yields a negative value.  */
  4872       src = src_base;
  4873       consumed_chars = consumed_chars_base;
  4874       ONE_MORE_BYTE (c);
  4875       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
  4876       char_offset++;
  4877     }
  4878 
         /* Reached by falling out of the loop; presumably also jumped to by
            ONE_MORE_BYTE when the source is exhausted.  NOTE(review):
            confirm.  Only what lies before SRC_BASE counts as consumed.  */
  4879  no_more_source:
  4880   if (last_id != charset_ascii)
  4881     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  4882   coding->consumed_char += consumed_chars_base;
  4883   coding->consumed = src_base - coding->source;
  4884   coding->charbuf_used = charbuf - coding->charbuf;
  4885 }
  4886 
  4887 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
  4888    This function can encode charsets `ascii', `katakana-jisx0201',
  4889    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
  4890    are sure that all these charsets are registered as official charset
  4891    (i.e. do not have extended leading-codes).  Characters of other
  4892    charsets are produced without any encoding.

          NOTE(review): the charset names listed above look inherited from
          an older implementation.  The code below takes its charsets from
          the coding system's charset list: the second entry is used as
          the kana charset, the third as the kanji charset, and an optional
          fourth as a second kanji charset.  Confirm and refresh the list.

          The elements of CODING->charbuf are encoded into
          CODING->destination starting at offset CODING->produced.  The
          return statement of this function always yields 0.  */
  4893 
  4894 static bool
  4895 encode_coding_sjis (struct coding_system *coding)
  4896 {
         /* MULTIBYTEP is not referenced by name below; presumably the
            EMIT_* macros read it.  NOTE(review): confirm.  */
  4897   bool multibytep = coding->dst_multibyte;
  4898   int *charbuf = coding->charbuf;
  4899   int *charbuf_end = charbuf + coding->charbuf_used;
  4900   unsigned char *dst = coding->destination + coding->produced;
  4901   unsigned char *dst_end = coding->destination + coding->dst_bytes;
         /* Amount passed to ASSURE_DESTINATION before each character.  */
  4902   int safe_room = 4;
  4903   ptrdiff_t produced_chars = 0;
  4904   Lisp_Object attrs, charset_list, val;
  4905   bool ascii_compatible;
  4906   struct charset *charset_kanji, *charset_kana;
  4907   struct charset *charset_kanji2;
  4908   int c;
  4909 
  4910   CODING_GET_INFO (coding, attrs, charset_list);
         /* The first entry of the charset list is skipped here.  */
  4911   val = XCDR (charset_list);
  4912   charset_kana = CHARSET_FROM_ID (XFIXNUM (XCAR (val))), val = XCDR (val);
  4913   charset_kanji = CHARSET_FROM_ID (XFIXNUM (XCAR (val))), val = XCDR (val);
  4914   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XFIXNUM (XCAR (val)));
  4915 
  4916   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
  4917 
  4918   while (charbuf < charbuf_end)
  4919     {
  4920       ASSURE_DESTINATION (safe_room);
  4921       c = *charbuf++;
  4922       /* Now encode the character C.  */
  4923       if (ASCII_CHAR_P (c) && ascii_compatible)
  4924         EMIT_ONE_ASCII_BYTE (c);
  4925       else if (CHAR_BYTE8_P (c))
  4926         {
                 /* A raw 8-bit byte is emitted as that byte.  */
  4927           c = CHAR_TO_BYTE8 (c);
  4928           EMIT_ONE_BYTE (c);
  4929         }
  4930       else
  4931         {
  4932           unsigned code;
  4933           struct charset *charset;
  4934           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
  4935                                &code, charset);
  4936 
  4937           if (!charset)
  4938             {
                     /* C cannot be encoded.  Substitute either the
                        inhibit-substitution character (as ASCII) or the
                        coding system's default character.  */
  4939               if (coding->mode & CODING_MODE_SAFE_ENCODING)
  4940                 {
  4941                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
  4942                   charset = CHARSET_FROM_ID (charset_ascii);
  4943                 }
  4944               else
  4945                 {
  4946                   c = coding->default_char;
  4947                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
  4948                                        charset_list, &code, charset);
  4949                 }
  4950             }
  4951           if (code == CHARSET_INVALID_CODE (charset))
  4952             emacs_abort ();
  4953           if (charset == charset_kanji)
  4954             {
  4955               int c1, c2;
  4956               JIS_TO_SJIS (code);
  4957               c1 = code >> 8, c2 = code & 0xFF;
  4958               EMIT_TWO_BYTES (c1, c2);
  4959             }
  4960           else if (charset == charset_kana)
  4961             EMIT_ONE_BYTE (code | 0x80);
  4962           else if (charset_kanji2 && charset == charset_kanji2)
  4963             {
  4964               int c1, c2;
  4965 
                     /* Only codes whose high byte is one of the values
                        tested below are converted with JIS_TO_SJIS2; for
                        any other code a single byte, CODE & 0x7F, is
                        emitted.  */
  4966               c1 = code >> 8;
  4967               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
  4968                   || c1 == 0x28
  4969                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
  4970                 {
  4971                   JIS_TO_SJIS2 (code);
  4972                   c1 = code >> 8, c2 = code & 0xFF;
  4973                   EMIT_TWO_BYTES (c1, c2);
  4974                 }
  4975               else
  4976                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
  4977             }
  4978           else
                   /* Any other charset: emit the low seven bits of CODE.  */
  4979             EMIT_ONE_ASCII_BYTE (code & 0x7F);
  4980         }
  4981     }
  4982   record_conversion_result (coding, CODING_RESULT_SUCCESS);
  4983   coding->produced_char += produced_chars;
  4984   coding->produced = dst - coding->destination;
  4985   return 0;
  4986 }
  4987 
       /* Encode the elements of CODING->charbuf into CODING->destination,
          starting at offset CODING->produced.  The second entry of the
          coding system's charset list is used as the Big5 charset; a
          character of that charset is emitted as the two bytes of its
          code, high byte first.  For a character of any other charset,
          the low seven bits of its code are emitted as one byte.  The
          return statement of this function always yields 0.  */
  4988 static bool
  4989 encode_coding_big5 (struct coding_system *coding)
  4990 {
         /* MULTIBYTEP is not referenced by name below; presumably the
            EMIT_* macros read it.  NOTE(review): confirm.  */
  4991   bool multibytep = coding->dst_multibyte;
  4992   int *charbuf = coding->charbuf;
  4993   int *charbuf_end = charbuf + coding->charbuf_used;
  4994   unsigned char *dst = coding->destination + coding->produced;
  4995   unsigned char *dst_end = coding->destination + coding->dst_bytes;
         /* Amount passed to ASSURE_DESTINATION before each character.  */
  4996   int safe_room = 4;
  4997   ptrdiff_t produced_chars = 0;
  4998   Lisp_Object attrs, charset_list, val;
  4999   bool ascii_compatible;
  5000   struct charset *charset_big5;
  5001   int c;
  5002 
  5003   CODING_GET_INFO (coding, attrs, charset_list);
         /* The first entry of the charset list is skipped here.  */
  5004   val = XCDR (charset_list);
  5005   charset_big5 = CHARSET_FROM_ID (XFIXNUM (XCAR (val)));
  5006   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
  5007 
  5008   while (charbuf < charbuf_end)
  5009     {
  5010       ASSURE_DESTINATION (safe_room);
  5011       c = *charbuf++;
  5012       /* Now encode the character C.  */
  5013       if (ASCII_CHAR_P (c) && ascii_compatible)
  5014         EMIT_ONE_ASCII_BYTE (c);
  5015       else if (CHAR_BYTE8_P (c))
  5016         {
                 /* A raw 8-bit byte is emitted as that byte.  */
  5017           c = CHAR_TO_BYTE8 (c);
  5018           EMIT_ONE_BYTE (c);
  5019         }
  5020       else
  5021         {
  5022           unsigned code;
  5023           struct charset *charset;
  5024           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
  5025                                &code, charset);
  5026 
  5027           if (! charset)
  5028             {
                     /* C cannot be encoded.  Substitute either the
                        inhibit-substitution character (as ASCII) or the
                        coding system's default character.  */
  5029               if (coding->mode & CODING_MODE_SAFE_ENCODING)
  5030                 {
  5031                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
  5032                   charset = CHARSET_FROM_ID (charset_ascii);
  5033                 }
  5034               else
  5035                 {
  5036                   c = coding->default_char;
  5037                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
  5038                                        charset_list, &code, charset);
  5039                 }
  5040             }
  5041           if (code == CHARSET_INVALID_CODE (charset))
  5042             emacs_abort ();
  5043           if (charset == charset_big5)
  5044             {
  5045               int c1, c2;
  5046 
  5047               c1 = code >> 8, c2 = code & 0xFF;
  5048               EMIT_TWO_BYTES (c1, c2);
  5049             }
  5050           else
  5051             EMIT_ONE_ASCII_BYTE (code & 0x7F);
  5052         }
  5053     }
  5054   record_conversion_result (coding, CODING_RESULT_SUCCESS);
  5055   coding->produced_char += produced_chars;
  5056   coding->produced = dst - coding->destination;
  5057   return 0;
  5058 }
  5059 
  5060 
  5061 /*** 10. CCL handlers ***/
  5062 
  5063 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
  5064    Return true if a text is encoded in a coding system of which
  5065    encoder/decoder are written in CCL program.  */
  5066 
  5067 static bool
  5068 detect_coding_ccl (struct coding_system *coding,
  5069                    struct coding_detection_info *detect_info)
  5070 {
  5071   const unsigned char *src = coding->source, *src_base;
  5072   const unsigned char *src_end = coding->source + coding->src_bytes;
  5073   bool multibytep = coding->src_multibyte;
  5074   ptrdiff_t consumed_chars = 0;
  5075   int found = 0;
  5076   unsigned char *valids;
  5077   ptrdiff_t head_ascii = coding->head_ascii;
  5078   Lisp_Object attrs;
  5079 
  5080   detect_info->checked |= CATEGORY_MASK_CCL;
  5081 
  5082   coding = &coding_categories[coding_category_ccl];
  5083   valids = CODING_CCL_VALIDS (coding);
  5084   attrs = CODING_ID_ATTRS (coding->id);
  5085   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
  5086     src += head_ascii;
  5087 
  5088   while (1)
  5089     {
  5090       int c;
  5091 
  5092       src_base = src;
  5093       ONE_MORE_BYTE (c);
  5094       if (c < 0 || ! valids[c])
  5095         break;
  5096       if ((valids[c] > 1))
  5097         found = CATEGORY_MASK_CCL;
  5098     }
  5099   detect_info->rejected |= CATEGORY_MASK_CCL;
  5100   return 0;
  5101 
  5102  no_more_source:
  5103   detect_info->found |= found;
  5104   return 1;
  5105 }
  5106 
  5107 static void
  5108 decode_coding_ccl (struct coding_system *coding)
  5109 {
  5110   const unsigned char *src = coding->source + coding->consumed;
  5111   const unsigned char *src_end = coding->source + coding->src_bytes;
  5112   int *charbuf = coding->charbuf + coding->charbuf_used;
  5113   int *charbuf_end = coding->charbuf + coding->charbuf_size;
  5114   ptrdiff_t consumed_chars = 0;
  5115   bool multibytep = coding->src_multibyte;
  5116   struct ccl_program *ccl = &coding->spec.ccl->ccl;
  5117   int source_charbuf[1024];
  5118   int source_byteidx[1025];
  5119   Lisp_Object attrs, charset_list;
  5120 
  5121   CODING_GET_INFO (coding, attrs, charset_list);
  5122 
  5123   while (1)
  5124     {
  5125       const unsigned char *p = src;
  5126       ptrdiff_t offset;
  5127       int i = 0;
  5128 
  5129       if (multibytep)
  5130         {
  5131           while (i < 1024 && p < src_end)
  5132             {
  5133               source_byteidx[i] = p - src;
  5134               source_charbuf[i++] = string_char_advance (&p);
  5135             }
  5136           source_byteidx[i] = p - src;
  5137         }
  5138       else
  5139         while (i < 1024 && p < src_end)
  5140           source_charbuf[i++] = *p++;
  5141 
  5142       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
  5143         ccl->last_block = true;
  5144       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
  5145       charset_map_loaded = 0;
  5146       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
  5147                   charset_list);
  5148       if (charset_map_loaded
  5149           && (offset = coding_change_source (coding)))
  5150         {
  5151           p += offset;
  5152           src += offset;
  5153           src_end += offset;
  5154         }
  5155       charbuf += ccl->produced;
  5156       if (multibytep)
  5157         src += source_byteidx[ccl->consumed];
  5158       else
  5159         src += ccl->consumed;
  5160       consumed_chars += ccl->consumed;
  5161       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
  5162         break;
  5163     }
  5164 
  5165   switch (ccl->status)
  5166     {
  5167     case CCL_STAT_SUSPEND_BY_SRC:
  5168       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
  5169       break;
  5170     case CCL_STAT_SUSPEND_BY_DST:
  5171       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
  5172       break;
  5173     case CCL_STAT_QUIT:
  5174     case CCL_STAT_INVALID_CMD:
  5175       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
  5176       break;
  5177     default:
  5178       record_conversion_result (coding, CODING_RESULT_SUCCESS);
  5179       break;
  5180     }
  5181   coding->consumed_char += consumed_chars;
  5182   coding->consumed = src - coding->source;
  5183   coding->charbuf_used = charbuf - coding->charbuf;
  5184 }
  5185 
  5186 static bool
  5187 encode_coding_ccl (struct coding_system *coding)
  5188 {
  5189   struct ccl_program *ccl = &coding->spec.ccl->ccl;
  5190   bool multibytep = coding->dst_multibyte;
  5191   int *charbuf = coding->charbuf;
  5192   int *charbuf_end = charbuf + coding->charbuf_used;
  5193   unsigned char *dst = coding->destination + coding->produced;
  5194   unsigned char *dst_end = coding->destination + coding->dst_bytes;
  5195   int destination_charbuf[1024];
  5196   ptrdiff_t produced_chars = 0;
  5197   int i;
  5198   Lisp_Object attrs, charset_list;
  5199 
  5200   CODING_GET_INFO (coding, attrs, charset_list);
  5201   if (coding->consumed_char == coding->src_chars
  5202       && coding->mode & CODING_MODE_LAST_BLOCK)
  5203     ccl->last_block = true;
  5204 
  5205   do
  5206     {
  5207       ptrdiff_t offset;
  5208 
  5209       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
  5210       charset_map_loaded = 0;
  5211       ccl_driver (ccl, charbuf, destination_charbuf,
  5212                   charbuf_end - charbuf, 1024, charset_list);
  5213       if (charset_map_loaded
  5214           && (offset = coding_change_destination (coding)))
  5215         dst += offset;
  5216       if (multibytep)
  5217         {
  5218           ASSURE_DESTINATION (ccl->produced * 2);
  5219           for (i = 0; i < ccl->produced; i++)
  5220             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
  5221         }
  5222       else
  5223         {
  5224           ASSURE_DESTINATION (ccl->produced);
  5225           for (i = 0; i < ccl->produced; i++)
  5226             *dst++ = destination_charbuf[i] & 0xFF;
  5227           produced_chars += ccl->produced;
  5228         }
  5229       charbuf += ccl->consumed;
  5230       if (ccl->status == CCL_STAT_QUIT
  5231           || ccl->status == CCL_STAT_INVALID_CMD)
  5232         break;
  5233     }
  5234   while (charbuf < charbuf_end);
  5235 
  5236   switch (ccl->status)
  5237     {
  5238     case CCL_STAT_SUSPEND_BY_SRC:
  5239       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
  5240       break;
  5241     case CCL_STAT_SUSPEND_BY_DST:
  5242       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
  5243       break;
  5244     case CCL_STAT_QUIT:
  5245     case CCL_STAT_INVALID_CMD:
  5246       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
  5247       break;
  5248     default:
  5249       record_conversion_result (coding, CODING_RESULT_SUCCESS);
  5250       break;
  5251     }
  5252 
  5253   coding->produced_char += produced_chars;
  5254   coding->produced = dst - coding->destination;
  5255   return 0;
  5256 }
  5257 
  5258 
  5259 /*** 10, 11. no-conversion handlers ***/
  5260 
  5261 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
  5262 
  5263 static void
  5264 decode_coding_raw_text (struct coding_system *coding)
  5265 {
  5266   bool eol_dos
  5267     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  5268 
  5269   coding->chars_at_source = 1;
  5270   coding->consumed_char = coding->src_chars;
  5271   coding->consumed = coding->src_bytes;
  5272   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
  5273     {
  5274       coding->consumed_char--;
  5275       coding->consumed--;
  5276       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
  5277     }
  5278   else
  5279     record_conversion_result (coding, CODING_RESULT_SUCCESS);
  5280 }
  5281 
  5282 static bool
  5283 encode_coding_raw_text (struct coding_system *coding)
  5284 {
  5285   bool multibytep = coding->dst_multibyte;
  5286   int *charbuf = coding->charbuf;
  5287   int *charbuf_end = coding->charbuf + coding->charbuf_used;
  5288   unsigned char *dst = coding->destination + coding->produced;
  5289   unsigned char *dst_end = coding->destination + coding->dst_bytes;
  5290   ptrdiff_t produced_chars = 0;
  5291   int c;
  5292 
  5293   if (multibytep)
  5294     {
  5295       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
  5296 
  5297       if (coding->src_multibyte)
  5298         while (charbuf < charbuf_end)
  5299           {
  5300             ASSURE_DESTINATION (safe_room);
  5301             c = *charbuf++;
  5302             if (ASCII_CHAR_P (c))
  5303               EMIT_ONE_ASCII_BYTE (c);
  5304             else if (CHAR_BYTE8_P (c))
  5305               {
  5306                 c = CHAR_TO_BYTE8 (c);
  5307                 EMIT_ONE_BYTE (c);
  5308               }
  5309             else
  5310               {
  5311                 unsigned char str[MAX_MULTIBYTE_LENGTH];
  5312                 int len = CHAR_STRING (c, str);
  5313                 for (int i = 0; i < len; i++)
  5314                   EMIT_ONE_BYTE (str[i]);
  5315               }
  5316           }
  5317       else
  5318         while (charbuf < charbuf_end)
  5319           {
  5320             ASSURE_DESTINATION (safe_room);
  5321             c = *charbuf++;
  5322             EMIT_ONE_BYTE (c);
  5323           }
  5324     }
  5325   else
  5326     {
  5327       if (coding->src_multibyte)
  5328         {
  5329           int safe_room = MAX_MULTIBYTE_LENGTH;
  5330 
  5331           while (charbuf < charbuf_end)
  5332             {
  5333               ASSURE_DESTINATION (safe_room);
  5334               c = *charbuf++;
  5335               if (ASCII_CHAR_P (c))
  5336                 *dst++ = c;
  5337               else if (CHAR_BYTE8_P (c))
  5338                 *dst++ = CHAR_TO_BYTE8 (c);
  5339               else
  5340                 dst += CHAR_STRING (c, dst);
  5341             }
  5342         }
  5343       else
  5344         {
  5345           ASSURE_DESTINATION (charbuf_end - charbuf);
  5346           while (charbuf < charbuf_end && dst < dst_end)
  5347             *dst++ = *charbuf++;
  5348         }
  5349       produced_chars = dst - (coding->destination + coding->produced);
  5350     }
  5351   record_conversion_result (coding, CODING_RESULT_SUCCESS);
  5352   coding->produced_char += produced_chars;
  5353   coding->produced = dst - coding->destination;
  5354   return 0;
  5355 }
  5356 
  5357 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
  5358    Return true if a text is encoded in a charset-based coding system.  */
  5359 
  5360 static bool
  5361 detect_coding_charset (struct coding_system *coding,
  5362                        struct coding_detection_info *detect_info)
  5363 {
  5364   const unsigned char *src = coding->source, *src_base;
  5365   const unsigned char *src_end = coding->source + coding->src_bytes;
  5366   bool multibytep = coding->src_multibyte;
  5367   ptrdiff_t consumed_chars = 0;
  5368   Lisp_Object attrs, valids, name;
  5369   int found = 0;
  5370   ptrdiff_t head_ascii = coding->head_ascii;
  5371   bool check_latin_extra = 0;
  5372 
  5373   detect_info->checked |= CATEGORY_MASK_CHARSET;
  5374 
  5375   coding = &coding_categories[coding_category_charset];
  5376   attrs = CODING_ID_ATTRS (coding->id);
  5377   valids = AREF (attrs, coding_attr_charset_valids);
  5378   name = CODING_ID_NAME (coding->id);
  5379   if (strncmp (SSDATA (SYMBOL_NAME (name)),
  5380                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
  5381       || strncmp (SSDATA (SYMBOL_NAME (name)),
  5382                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
  5383     check_latin_extra = 1;
  5384 
  5385   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
  5386     src += head_ascii;
  5387 
  5388   while (1)
  5389     {
  5390       int c;
  5391       Lisp_Object val;
  5392       struct charset *charset;
  5393       int dim, idx;
  5394 
  5395       src_base = src;
  5396       ONE_MORE_BYTE (c);
  5397       if (c < 0)
  5398         continue;
  5399       val = AREF (valids, c);
  5400       if (NILP (val))
  5401         break;
  5402       if (c >= 0x80)
  5403         {
  5404           if (c < 0xA0
  5405               && check_latin_extra
  5406               && (!VECTORP (Vlatin_extra_code_table)
  5407                   || NILP (AREF (Vlatin_extra_code_table, c))))
  5408             break;
  5409           found = CATEGORY_MASK_CHARSET;
  5410         }
  5411       if (FIXNUMP (val))
  5412         {
  5413           charset = CHARSET_FROM_ID (XFIXNAT (val));
  5414           dim = CHARSET_DIMENSION (charset);
  5415           for (idx = 1; idx < dim; idx++)
  5416             {
  5417               if (src == src_end)
  5418                 goto too_short;
  5419               ONE_MORE_BYTE (c);
  5420               if (c < charset->code_space[(dim - 1 - idx) * 4]
  5421                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
  5422                 break;
  5423             }
  5424           if (idx < dim)
  5425             break;
  5426         }
  5427       else
  5428         {
  5429           idx = 1;
  5430           for (; CONSP (val); val = XCDR (val))
  5431             {
  5432               charset = CHARSET_FROM_ID (XFIXNAT (XCAR (val)));
  5433               dim = CHARSET_DIMENSION (charset);
  5434               while (idx < dim)
  5435                 {
  5436                   if (src == src_end)
  5437                     goto too_short;
  5438                   ONE_MORE_BYTE (c);
  5439                   if (c < charset->code_space[(dim - 1 - idx) * 4]
  5440                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
  5441                     break;
  5442                   idx++;
  5443                 }
  5444               if (idx == dim)
  5445                 {
  5446                   val = Qnil;
  5447                   break;
  5448                 }
  5449             }
  5450           if (CONSP (val))
  5451             break;
  5452         }
  5453     }
  5454  too_short:
  5455   detect_info->rejected |= CATEGORY_MASK_CHARSET;
  5456   return 0;
  5457 
  5458  no_more_source:
  5459   detect_info->found |= found;
  5460   return 1;
  5461 }
  5462 
  5463 static void
  5464 decode_coding_charset (struct coding_system *coding)
  5465 {
  5466   const unsigned char *src = coding->source + coding->consumed;
  5467   const unsigned char *src_end = coding->source + coding->src_bytes;
  5468   const unsigned char *src_base;
  5469   int *charbuf = coding->charbuf + coding->charbuf_used;
  5470   /* We may produce one charset annotation in one loop and one more at
  5471      the end.  */
  5472   int *charbuf_end
  5473     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  5474   ptrdiff_t consumed_chars = 0, consumed_chars_base;
  5475   bool multibytep = coding->src_multibyte;
  5476   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
  5477   Lisp_Object valids;
  5478   ptrdiff_t char_offset = coding->produced_char;
  5479   ptrdiff_t last_offset = char_offset;
  5480   int last_id = charset_ascii;
  5481   bool eol_dos
  5482     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  5483   int byte_after_cr = -1;
  5484 
  5485   valids = AREF (attrs, coding_attr_charset_valids);
  5486 
  5487   while (1)
  5488     {
  5489       int c;
  5490       Lisp_Object val;
  5491       struct charset *charset;
  5492       int dim;
  5493       int len = 1;
  5494       unsigned code;
  5495 
  5496       src_base = src;
  5497       consumed_chars_base = consumed_chars;
  5498 
  5499       if (charbuf >= charbuf_end)
  5500         {
  5501           if (byte_after_cr >= 0)
  5502             src_base--;
  5503           break;
  5504         }
  5505 
  5506       if (byte_after_cr >= 0)
  5507         {
  5508           c = byte_after_cr;
  5509           byte_after_cr = -1;
  5510         }
  5511       else
  5512         {
  5513           ONE_MORE_BYTE (c);
  5514           if (eol_dos && c == '\r')
  5515             ONE_MORE_BYTE (byte_after_cr);
  5516         }
  5517       if (c < 0)
  5518         goto invalid_code;
  5519       code = c;
  5520 
  5521       val = AREF (valids, c);
  5522       if (! FIXNUMP (val) && ! CONSP (val))
  5523         goto invalid_code;
  5524       if (FIXNUMP (val))
  5525         {
  5526           charset = CHARSET_FROM_ID (XFIXNAT (val));
  5527           dim = CHARSET_DIMENSION (charset);
  5528           while (len < dim)
  5529             {
  5530               ONE_MORE_BYTE (c);
  5531               code = (code << 8) | c;
  5532               len++;
  5533             }
  5534           CODING_DECODE_CHAR (coding, src, src_base, src_end,
  5535                               charset, code, c);
  5536         }
  5537       else
  5538         {
  5539           /* VAL is a list of charset IDs.  It is assured that the
  5540              list is sorted by charset dimensions (smaller one
  5541              comes first).  */
  5542           while (CONSP (val))
  5543             {
  5544               charset = CHARSET_FROM_ID (XFIXNAT (XCAR (val)));
  5545               dim = CHARSET_DIMENSION (charset);
  5546               while (len < dim)
  5547                 {
  5548                   ONE_MORE_BYTE (c);
  5549                   code = (code << 8) | c;
  5550                   len++;
  5551                 }
  5552               CODING_DECODE_CHAR (coding, src, src_base,
  5553                                   src_end, charset, code, c);
  5554               if (c >= 0)
  5555                 break;
  5556               val = XCDR (val);
  5557             }
  5558         }
  5559       if (c < 0)
  5560         goto invalid_code;
  5561       if (charset->id != charset_ascii
  5562           && last_id != charset->id)
  5563         {
  5564           if (last_id != charset_ascii)
  5565             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  5566           last_id = charset->id;
  5567           last_offset = char_offset;
  5568         }
  5569 
  5570       *charbuf++ = c;
  5571       char_offset++;
  5572       continue;
  5573 
  5574     invalid_code:
  5575       src = src_base;
  5576       consumed_chars = consumed_chars_base;
  5577       ONE_MORE_BYTE (c);
  5578       *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
  5579       char_offset++;
  5580     }
  5581 
  5582  no_more_source:
  5583   if (last_id != charset_ascii)
  5584     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  5585   coding->consumed_char += consumed_chars_base;
  5586   coding->consumed = src_base - coding->source;
  5587   coding->charbuf_used = charbuf - coding->charbuf;
  5588 }
  5589 
  5590 static bool
  5591 encode_coding_charset (struct coding_system *coding)
  5592 {
  5593   bool multibytep = coding->dst_multibyte;
  5594   int *charbuf = coding->charbuf;
  5595   int *charbuf_end = charbuf + coding->charbuf_used;
  5596   unsigned char *dst = coding->destination + coding->produced;
  5597   unsigned char *dst_end = coding->destination + coding->dst_bytes;
  5598   int safe_room = MAX_MULTIBYTE_LENGTH;
  5599   ptrdiff_t produced_chars = 0;
  5600   Lisp_Object attrs, charset_list;
  5601   bool ascii_compatible;
  5602   int c;
  5603 
  5604   CODING_GET_INFO (coding, attrs, charset_list);
  5605   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
  5606 
  5607   while (charbuf < charbuf_end)
  5608     {
  5609       struct charset *charset;
  5610       unsigned code;
  5611 
  5612       ASSURE_DESTINATION (safe_room);
  5613       c = *charbuf++;
  5614       if (ascii_compatible && ASCII_CHAR_P (c))
  5615         EMIT_ONE_ASCII_BYTE (c);
  5616       else if (CHAR_BYTE8_P (c))
  5617         {
  5618           c = CHAR_TO_BYTE8 (c);
  5619           EMIT_ONE_BYTE (c);
  5620         }
  5621       else
  5622         {
  5623           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
  5624                                &code, charset);
  5625 
  5626           if (charset)
  5627             {
  5628               if (CHARSET_DIMENSION (charset) == 1)
  5629                 EMIT_ONE_BYTE (code);
  5630               else if (CHARSET_DIMENSION (charset) == 2)
  5631                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
  5632               else if (CHARSET_DIMENSION (charset) == 3)
  5633                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
  5634               else
  5635                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
  5636                                  (code >> 8) & 0xFF, code & 0xFF);
  5637             }
  5638           else
  5639             {
  5640               if (coding->mode & CODING_MODE_SAFE_ENCODING)
  5641                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
  5642               else
  5643                 c = coding->default_char;
  5644               EMIT_ONE_BYTE (c);
  5645             }
  5646         }
  5647     }
  5648 
  5649   record_conversion_result (coding, CODING_RESULT_SUCCESS);
  5650   coding->produced_char += produced_chars;
  5651   coding->produced = dst - coding->destination;
  5652   return 0;
  5653 }
  5654 
  5655 
  5656 /*** 7. C library functions ***/
  5657 
  5658 /* Setup coding context CODING from information about CODING_SYSTEM.
  5659    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
  5660    CODING_SYSTEM is invalid, signal an error.  */
  5661 
  5662 void
  5663 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
  5664 {
  5665   Lisp_Object attrs;
  5666   Lisp_Object eol_type;
  5667   Lisp_Object coding_type;
  5668   Lisp_Object val;
  5669 
  5670   if (NILP (coding_system))
  5671     coding_system = Qundecided;
  5672 
  5673   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
  5674 
  5675   attrs = CODING_ID_ATTRS (coding->id);
  5676   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
  5677 
  5678   coding->mode = 0;
  5679   if (VECTORP (eol_type))
  5680     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
  5681                             | CODING_REQUIRE_DETECTION_MASK);
  5682   else if (! EQ (eol_type, Qunix))
  5683     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
  5684                             | CODING_REQUIRE_ENCODING_MASK);
  5685   else
  5686     coding->common_flags = 0;
  5687   if (! NILP (CODING_ATTR_POST_READ (attrs)))
  5688     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
  5689   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
  5690     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
  5691   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
  5692     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
  5693 
  5694   val = CODING_ATTR_SAFE_CHARSETS (attrs);
  5695   coding->max_charset_id = SCHARS (val) - 1;
  5696   coding->safe_charsets = SDATA (val);
  5697   coding->default_char = XFIXNUM (CODING_ATTR_DEFAULT_CHAR (attrs));
  5698   coding->carryover_bytes = 0;
  5699   coding->raw_destination = 0;
  5700 
  5701   coding_type = CODING_ATTR_TYPE (attrs);
  5702   if (EQ (coding_type, Qundecided))
  5703     {
  5704       coding->detector = NULL;
  5705       coding->decoder = decode_coding_raw_text;
  5706       coding->encoder = encode_coding_raw_text;
  5707       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
  5708       coding->spec.undecided.inhibit_nbd
  5709         = (encode_inhibit_flag
  5710            (AREF (attrs, coding_attr_undecided_inhibit_null_byte_detection)));
  5711       coding->spec.undecided.inhibit_ied
  5712         = (encode_inhibit_flag
  5713            (AREF (attrs, coding_attr_undecided_inhibit_iso_escape_detection)));
  5714       coding->spec.undecided.prefer_utf_8
  5715         = ! NILP (AREF (attrs, coding_attr_undecided_prefer_utf_8));
  5716     }
  5717   else if (EQ (coding_type, Qiso_2022))
  5718     {
  5719       int i;
  5720       int flags = XFIXNUM (AREF (attrs, coding_attr_iso_flags));
  5721 
  5722       /* Invoke graphic register 0 to plane 0.  */
  5723       CODING_ISO_INVOCATION (coding, 0) = 0;
  5724       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
  5725       CODING_ISO_INVOCATION (coding, 1)
  5726         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
  5727       /* Setup the initial status of designation.  */
  5728       for (i = 0; i < 4; i++)
  5729         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
  5730       /* Not single shifting initially.  */
  5731       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
  5732       /* Beginning of buffer should also be regarded as bol. */
  5733       CODING_ISO_BOL (coding) = 1;
  5734       coding->detector = detect_coding_iso_2022;
  5735       coding->decoder = decode_coding_iso_2022;
  5736       coding->encoder = encode_coding_iso_2022;
  5737       if (flags & CODING_ISO_FLAG_SAFE)
  5738         coding->mode |= CODING_MODE_SAFE_ENCODING;
  5739       coding->common_flags
  5740         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
  5741             | CODING_REQUIRE_FLUSHING_MASK);
  5742       if (flags & CODING_ISO_FLAG_COMPOSITION)
  5743         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
  5744       if (flags & CODING_ISO_FLAG_DESIGNATION)
  5745         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
  5746       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
  5747         {
  5748           setup_iso_safe_charsets (attrs);
  5749           val = CODING_ATTR_SAFE_CHARSETS (attrs);
  5750           coding->max_charset_id = SCHARS (val) - 1;
  5751           coding->safe_charsets = SDATA (val);
  5752         }
  5753       CODING_ISO_FLAGS (coding) = flags;
  5754       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
  5755       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
  5756       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
  5757       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
  5758     }
  5759   else if (EQ (coding_type, Qcharset))
  5760     {
  5761       coding->detector = detect_coding_charset;
  5762       coding->decoder = decode_coding_charset;
  5763       coding->encoder = encode_coding_charset;
  5764       coding->common_flags
  5765         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
  5766     }
  5767   else if (EQ (coding_type, Qutf_8))
  5768     {
  5769       val = AREF (attrs, coding_attr_utf_bom);
  5770       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
  5771                                    : EQ (val, Qt) ? utf_with_bom
  5772                                    : utf_without_bom);
  5773       coding->detector = detect_coding_utf_8;
  5774       coding->decoder = decode_coding_utf_8;
  5775       coding->encoder = encode_coding_utf_8;
  5776       coding->common_flags
  5777         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
  5778       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
  5779         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
  5780     }
  5781   else if (EQ (coding_type, Qutf_16))
  5782     {
  5783       val = AREF (attrs, coding_attr_utf_bom);
  5784       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
  5785                                     : EQ (val, Qt) ? utf_with_bom
  5786                                     : utf_without_bom);
  5787       val = AREF (attrs, coding_attr_utf_16_endian);
  5788       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
  5789                                        : utf_16_little_endian);
  5790       CODING_UTF_16_SURROGATE (coding) = 0;
  5791       coding->detector = detect_coding_utf_16;
  5792       coding->decoder = decode_coding_utf_16;
  5793       coding->encoder = encode_coding_utf_16;
  5794       coding->common_flags
  5795         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
  5796       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
  5797         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
  5798     }
  5799   else if (EQ (coding_type, Qccl))
  5800     {
  5801       coding->detector = detect_coding_ccl;
  5802       coding->decoder = decode_coding_ccl;
  5803       coding->encoder = encode_coding_ccl;
  5804       coding->common_flags
  5805         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
  5806             | CODING_REQUIRE_FLUSHING_MASK);
  5807     }
  5808   else if (EQ (coding_type, Qemacs_mule))
  5809     {
  5810       coding->detector = detect_coding_emacs_mule;
  5811       coding->decoder = decode_coding_emacs_mule;
  5812       coding->encoder = encode_coding_emacs_mule;
  5813       coding->common_flags
  5814         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
  5815       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
  5816           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
  5817         {
  5818           Lisp_Object tail, safe_charsets;
  5819           int max_charset_id = 0;
  5820 
  5821           for (tail = Vemacs_mule_charset_list; CONSP (tail);
  5822                tail = XCDR (tail))
  5823             if (max_charset_id < XFIXNAT (XCAR (tail)))
  5824               max_charset_id = XFIXNAT (XCAR (tail));
  5825           safe_charsets = make_uninit_string (max_charset_id + 1);
  5826           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
  5827           for (tail = Vemacs_mule_charset_list; CONSP (tail);
  5828                tail = XCDR (tail))
  5829             SSET (safe_charsets, XFIXNAT (XCAR (tail)), 0);
  5830           coding->max_charset_id = max_charset_id;
  5831           coding->safe_charsets = SDATA (safe_charsets);
  5832         }
  5833       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
  5834       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
  5835     }
  5836   else if (EQ (coding_type, Qshift_jis))
  5837     {
  5838       coding->detector = detect_coding_sjis;
  5839       coding->decoder = decode_coding_sjis;
  5840       coding->encoder = encode_coding_sjis;
  5841       coding->common_flags
  5842         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
  5843     }
  5844   else if (EQ (coding_type, Qbig5))
  5845     {
  5846       coding->detector = detect_coding_big5;
  5847       coding->decoder = decode_coding_big5;
  5848       coding->encoder = encode_coding_big5;
  5849       coding->common_flags
  5850         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
  5851     }
  5852   else                          /* EQ (coding_type, Qraw_text) */
  5853     {
  5854       coding->detector = NULL;
  5855       coding->decoder = decode_coding_raw_text;
  5856       coding->encoder = encode_coding_raw_text;
  5857       if (! EQ (eol_type, Qunix))
  5858         {
  5859           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
  5860           if (! VECTORP (eol_type))
  5861             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
  5862         }
  5863 
  5864     }
  5865 
  5866   return;
  5867 }
  5868 
  5869 /* Return a list of charsets supported by CODING.  */
  5870 
  5871 Lisp_Object
  5872 coding_charset_list (struct coding_system *coding)
  5873 {
  5874   Lisp_Object attrs, charset_list;
  5875 
  5876   CODING_GET_INFO (coding, attrs, charset_list);
  5877   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
  5878     {
  5879       int flags = XFIXNUM (AREF (attrs, coding_attr_iso_flags));
  5880 
  5881       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
  5882         charset_list = Viso_2022_charset_list;
  5883     }
  5884   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
  5885     {
  5886       charset_list = Vemacs_mule_charset_list;
  5887     }
  5888   return charset_list;
  5889 }
  5890 
  5891 
  5892 /* Return a list of charsets supported by CODING-SYSTEM.  */
  5893 
  5894 Lisp_Object
  5895 coding_system_charset_list (Lisp_Object coding_system)
  5896 {
  5897   ptrdiff_t id;
  5898   Lisp_Object attrs, charset_list;
  5899 
  5900   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
  5901   attrs = CODING_ID_ATTRS (id);
  5902 
  5903   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
  5904     {
  5905       int flags = XFIXNUM (AREF (attrs, coding_attr_iso_flags));
  5906 
  5907       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
  5908         charset_list = Viso_2022_charset_list;
  5909       else
  5910         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
  5911     }
  5912   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
  5913     {
  5914       charset_list = Vemacs_mule_charset_list;
  5915     }
  5916   else
  5917     {
  5918       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
  5919     }
  5920   return charset_list;
  5921 }
  5922 
  5923 
  5924 /* Return raw-text or one of its subsidiaries that has the same
  5925    eol_type as CODING-SYSTEM.  */
  5926 
  5927 Lisp_Object
  5928 raw_text_coding_system (Lisp_Object coding_system)
  5929 {
  5930   Lisp_Object spec, attrs;
  5931   Lisp_Object eol_type, raw_text_eol_type;
  5932 
  5933   if (NILP (coding_system))
  5934     return Qraw_text;
  5935   spec = CODING_SYSTEM_SPEC (coding_system);
  5936   attrs = AREF (spec, 0);
  5937 
  5938   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
  5939     return coding_system;
  5940 
  5941   eol_type = AREF (spec, 2);
  5942   if (VECTORP (eol_type))
  5943     return Qraw_text;
  5944   spec = CODING_SYSTEM_SPEC (Qraw_text);
  5945   raw_text_eol_type = AREF (spec, 2);
  5946   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
  5947           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
  5948           : AREF (raw_text_eol_type, 2));
  5949 }
  5950 
  5951 /* Return true if CODING corresponds to raw-text coding-system.  */
  5952 
  5953 bool
  5954 raw_text_coding_system_p (struct coding_system *coding)
  5955 {
  5956   return (coding->decoder == decode_coding_raw_text
  5957           && coding->encoder == encode_coding_raw_text) ? true : false;
  5958 }
  5959 
  5960 
  5961 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
  5962    the subsidiary that has the same eol-spec as PARENT (if it is not
  5963    nil and specifies end-of-line format) or the system's setting.  */
  5964 
  5965 Lisp_Object
  5966 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
  5967 {
  5968   Lisp_Object spec, eol_type;
  5969 
  5970   if (NILP (coding_system))
  5971     coding_system = Qraw_text;
  5972   else
  5973     CHECK_CODING_SYSTEM (coding_system);
  5974   spec = CODING_SYSTEM_SPEC (coding_system);
  5975   eol_type = AREF (spec, 2);
  5976   if (VECTORP (eol_type))
  5977     {
  5978       /* Format of end-of-line decided by system.
  5979          This is Qunix on Unix and Mac, Qdos on DOS/Windows.
  5980          This has an effect only for external encoding (i.e., for output to
  5981          file and process), not for in-buffer or Lisp string encoding.  */
  5982       Lisp_Object system_eol_type = Qunix;
  5983       #ifdef DOS_NT
  5984        system_eol_type = Qdos;
  5985       #endif
  5986 
  5987       Lisp_Object parent_eol_type = system_eol_type;
  5988       if (! NILP (parent))
  5989         {
  5990           CHECK_CODING_SYSTEM (parent);
  5991           Lisp_Object parent_spec = CODING_SYSTEM_SPEC (parent);
  5992           Lisp_Object pspec_type = AREF (parent_spec, 2);
  5993           if (!VECTORP (pspec_type))
  5994             parent_eol_type = pspec_type;
  5995         }
  5996       if (EQ (parent_eol_type, Qunix))
  5997         coding_system = AREF (eol_type, 0);
  5998       else if (EQ (parent_eol_type, Qdos))
  5999         coding_system = AREF (eol_type, 1);
  6000       else if (EQ (parent_eol_type, Qmac))
  6001         coding_system = AREF (eol_type, 2);
  6002     }
  6003   return coding_system;
  6004 }
  6005 
  6006 
  6007 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
  6008    decided for writing to a process.  If not, complement them, and
  6009    return a new coding system.  */
  6010 
  6011 Lisp_Object
  6012 complement_process_encoding_system (Lisp_Object coding_system)
  6013 {
  6014   Lisp_Object coding_base = Qnil, eol_base = Qnil;
  6015   Lisp_Object spec, attrs;
  6016   int i;
  6017 
  6018   for (i = 0; i < 3; i++)
  6019     {
  6020       if (i == 1)
  6021         coding_system = CDR_SAFE (Vdefault_process_coding_system);
  6022       else if (i == 2)
  6023         coding_system = preferred_coding_system ();
  6024       spec = CODING_SYSTEM_SPEC (coding_system);
  6025       if (NILP (spec))
  6026         continue;
  6027       attrs = AREF (spec, 0);
  6028       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
  6029         coding_base = CODING_ATTR_BASE_NAME (attrs);
  6030       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
  6031         eol_base = coding_system;
  6032       if (! NILP (coding_base) && ! NILP (eol_base))
  6033         break;
  6034     }
  6035 
  6036   if (i > 0)
  6037     /* The original CODING_SYSTEM didn't specify text-conversion or
  6038        eol-conversion.  Be sure that we return a fully complemented
  6039        coding system.  */
  6040     coding_system = coding_inherit_eol_type (coding_base, eol_base);
  6041   return coding_system;
  6042 }
  6043 
  6044 
  6045 /* Emacs has a mechanism to automatically detect a coding system if it
  6046    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
  6047    it's impossible to distinguish some coding systems accurately
  6048    because they use the same range of codes.  So, at first, coding
  6049    systems are grouped into the following categories:
  6050 
  6051    o coding-category-emacs-mule
  6052 
  6053         The category for a coding system which has the same code range
  6054         as Emacs' internal format.  Assigned the coding-system (Lisp
  6055         symbol) `emacs-mule' by default.
  6056 
  6057    o coding-category-sjis
  6058 
  6059         The category for a coding system which has the same code range
  6060         as SJIS.  Assigned the coding-system (Lisp
  6061         symbol) `japanese-shift-jis' by default.
  6062 
  6063    o coding-category-iso-7
  6064 
  6065         The category for a coding system which has the same code range
  6066         as ISO2022 of 7-bit environment.  This doesn't use any locking
  6067         shift and single shift functions.  This can encode/decode all
  6068         charsets.  Assigned the coding-system (Lisp symbol)
  6069         `iso-2022-7bit' by default.
  6070 
  6071    o coding-category-iso-7-tight
  6072 
  6073         Same as coding-category-iso-7 except that this can
  6074         encode/decode only the specified charsets.
  6075 
  6076    o coding-category-iso-8-1
  6077 
  6078         The category for a coding system which has the same code range
  6079         as ISO2022 of 8-bit environment and graphic plane 1 used only
  6080         for DIMENSION1 charset.  This doesn't use any locking shift
  6081         and single shift functions.  Assigned the coding-system (Lisp
  6082         symbol) `iso-latin-1' by default.
  6083 
  6084    o coding-category-iso-8-2
  6085 
  6086         The category for a coding system which has the same code range
  6087         as ISO2022 of 8-bit environment and graphic plane 1 used only
  6088         for DIMENSION2 charset.  This doesn't use any locking shift
  6089         and single shift functions.  Assigned the coding-system (Lisp
  6090         symbol) `japanese-iso-8bit' by default.
  6091 
  6092    o coding-category-iso-7-else
  6093 
  6094         The category for a coding system which has the same code range
  6095         as ISO2022 of 7-bit environment but uses locking shift or
  6096         single shift functions.  Assigned the coding-system (Lisp
  6097         symbol) `iso-2022-7bit-lock' by default.
  6098 
  6099    o coding-category-iso-8-else
  6100 
  6101         The category for a coding system which has the same code range
  6102         as ISO2022 of 8-bit environment but uses locking shift or
  6103         single shift functions.  Assigned the coding-system (Lisp
  6104         symbol) `iso-2022-8bit-ss2' by default.
  6105 
  6106    o coding-category-big5
  6107 
  6108         The category for a coding system which has the same code range
  6109         as BIG5.  Assigned the coding-system (Lisp symbol)
  6110         `cn-big5' by default.
  6111 
  6112    o coding-category-utf-8
  6113 
  6114         The category for a coding system which has the same code range
  6115         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
  6116         symbol) `utf-8' by default.
  6117 
  6118    o coding-category-utf-16-be
  6119 
  6120         The category for a coding system in which a text has an
  6121         Unicode signature (cf. Unicode Standard) in the order of BIG
  6122         endian at the head.  Assigned the coding-system (Lisp symbol)
  6123         `utf-16-be' by default.
  6124 
  6125    o coding-category-utf-16-le
  6126 
  6127         The category for a coding system in which a text has an
  6128         Unicode signature (cf. Unicode Standard) in the order of
  6129         LITTLE endian at the head.  Assigned the coding-system (Lisp
  6130         symbol) `utf-16-le' by default.
  6131 
  6132    o coding-category-ccl
  6133 
  6134         The category for a coding system of which encoder/decoder is
  6135         written in CCL programs.  The default value is nil, i.e., no
  6136         coding system is assigned.
  6137 
  6138    o coding-category-binary
  6139 
  6140         The category for a coding system not categorized in any of the
  6141         above.  Assigned the coding-system (Lisp symbol)
  6142         `no-conversion' by default.
  6143 
  6144    Each of them is a Lisp symbol and the value is an actual
  6145    `coding-system's (this is also a Lisp symbol) assigned by a user.
  6146    What Emacs does actually is to detect a category of coding system.
  6147    Then, it uses a `coding-system' assigned to it.  If Emacs can't
  6148    decide only one possible category, it selects a category of the
  6149    highest priority.  Priorities of categories are also specified by a
  6150    user in a Lisp variable `coding-category-list'.
  6151 
  6152 */
  6153 
  6154 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
  6155                                            int eol_seen);
  6156 
  6157 
  6158 /* Return the number of ASCII characters at the head of the source.
  6159    By side effects, set coding->head_ascii and update
  6160    coding->eol_seen.  The value of coding->eol_seen is "logical or" of
  6161    EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
  6162    reliable only when all the source bytes are ASCII.  */
  6163 
  6164 static ptrdiff_t
  6165 check_ascii (struct coding_system *coding)
  6166 {
  6167   const unsigned char *src, *end;
  6168   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
  6169   int eol_seen = coding->eol_seen;
  6170 
  6171   coding_set_source (coding);
  6172   src = coding->source;
  6173   end = src + coding->src_bytes;
  6174 
  6175   if (inhibit_eol_conversion
  6176       || SYMBOLP (eol_type))
  6177     {
  6178       /* We don't have to check EOL format.  */
  6179       while (src < end && !( *src & 0x80))
  6180         {
  6181           if (*src++ == '\n')
  6182             eol_seen |= EOL_SEEN_LF;
  6183         }
  6184     }
  6185   else
  6186     {
  6187       end--;                /* We look ahead one byte for "CR LF".  */
  6188       while (src < end)
  6189         {
  6190           int c = *src;
  6191 
  6192           if (c & 0x80)
  6193             break;
  6194           src++;
  6195           if (c == '\r')
  6196             {
  6197               if (*src == '\n')
  6198                 {
  6199                   eol_seen |= EOL_SEEN_CRLF;
  6200                   src++;
  6201                 }
  6202               else
  6203                 eol_seen |= EOL_SEEN_CR;
  6204             }
  6205           else if (c == '\n')
  6206             eol_seen |= EOL_SEEN_LF;
  6207         }
  6208       if (src == end)
  6209         {
  6210           int c = *src;
  6211 
  6212           /* All bytes but the last one C are ASCII.  */
  6213           if (! (c & 0x80))
  6214             {
  6215               if (c == '\r')
  6216                 eol_seen |= EOL_SEEN_CR;
  6217               else if (c  == '\n')
  6218                 eol_seen |= EOL_SEEN_LF;
  6219               src++;
  6220             }
  6221         }
  6222     }
  6223   coding->head_ascii = src - coding->source;
  6224   coding->eol_seen = eol_seen;
  6225   return (coding->head_ascii);
  6226 }
  6227 
  6228 
  6229 /* Return the number of characters at the source if all the bytes are
  6230    valid UTF-8 (of Unicode range).  Otherwise, return -1.  By side
  6231    effects, update coding->eol_seen.  The value of coding->eol_seen is
  6232    "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
  6233    the value is reliable only when all the source bytes are valid
  6234    UTF-8.  */
  6235 
  6236 static ptrdiff_t
  6237 check_utf_8 (struct coding_system *coding)
  6238 {
  6239   const unsigned char *src, *end;
  6240   int eol_seen;
  6241   ptrdiff_t nchars = coding->head_ascii;
  6242 
  6243   if (coding->head_ascii < 0)
  6244     check_ascii (coding);
  6245   else
  6246     coding_set_source (coding);
  6247   src = coding->source + coding->head_ascii;
  6248   /* We look ahead one byte for CR LF.  */
  6249   end = coding->source + coding->src_bytes - 1;
  6250   eol_seen = coding->eol_seen;
  6251   while (src < end)
  6252     {
  6253       int c = *src;
  6254 
  6255       if (UTF_8_1_OCTET_P (*src))
  6256         {
  6257           src++;
  6258           if (c < 0x20)
  6259             {
  6260               if (c == '\r')
  6261                 {
  6262                   if (*src == '\n')
  6263                     {
  6264                       eol_seen |= EOL_SEEN_CRLF;
  6265                       src++;
  6266                       nchars++;
  6267                     }
  6268                   else
  6269                     eol_seen |= EOL_SEEN_CR;
  6270                 }
  6271               else if (c == '\n')
  6272                 eol_seen |= EOL_SEEN_LF;
  6273             }
  6274         }
  6275       else if (UTF_8_2_OCTET_LEADING_P (c))
  6276         {
  6277           if (c < 0xC2          /* overlong sequence */
  6278               || src + 1 >= end
  6279               || ! UTF_8_EXTRA_OCTET_P (src[1]))
  6280             return -1;
  6281           src += 2;
  6282         }
  6283       else if (UTF_8_3_OCTET_LEADING_P (c))
  6284         {
  6285           if (src + 2 >= end
  6286               || ! (UTF_8_EXTRA_OCTET_P (src[1])
  6287                     && UTF_8_EXTRA_OCTET_P (src[2])))
  6288             return -1;
  6289           c = (((c & 0xF) << 12)
  6290                | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
  6291           if (c < 0x800                       /* overlong sequence */
  6292               || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
  6293             return -1;
  6294           src += 3;
  6295         }
  6296       else if (UTF_8_4_OCTET_LEADING_P (c))
  6297         {
  6298           if (src + 3 >= end
  6299               || ! (UTF_8_EXTRA_OCTET_P (src[1])
  6300                     && UTF_8_EXTRA_OCTET_P (src[2])
  6301                     && UTF_8_EXTRA_OCTET_P (src[3])))
  6302             return -1;
  6303           c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
  6304                | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
  6305           if (c < 0x10000       /* overlong sequence */
  6306               || c >= 0x110000) /* non-Unicode character  */
  6307             return -1;
  6308           src += 4;
  6309         }
  6310       else
  6311         return -1;
  6312       nchars++;
  6313     }
  6314 
  6315   if (src == end)
  6316     {
  6317       if (! UTF_8_1_OCTET_P (*src))
  6318         return -1;
  6319       nchars++;
  6320       if (*src == '\r')
  6321         eol_seen |= EOL_SEEN_CR;
  6322       else if (*src  == '\n')
  6323         eol_seen |= EOL_SEEN_LF;
  6324     }
  6325   coding->eol_seen = eol_seen;
  6326   return nchars;
  6327 }
  6328 
  6329 
  6330 /* Return whether STRING is a valid UTF-8 string.  STRING must be a
  6331    unibyte string.  */
  6332 
  6333 bool
  6334 utf8_string_p (Lisp_Object string)
  6335 {
  6336   eassert (!STRING_MULTIBYTE (string));
  6337   struct coding_system coding;
  6338   setup_coding_system (Qutf_8_unix, &coding);
  6339   /* We initialize only the fields that check_utf_8 accesses.  */
  6340   coding.head_ascii = -1;
  6341   coding.src_pos = 0;
  6342   coding.src_pos_byte = 0;
  6343   coding.src_chars = SCHARS (string);
  6344   coding.src_bytes = SBYTES (string);
  6345   coding.src_object = string;
  6346   coding.eol_seen = EOL_SEEN_NONE;
  6347   return check_utf_8 (&coding) != -1;
  6348 }
  6349 
  6350 /* Like make_string, but always returns a multibyte Lisp string, and
  6351    avoids decoding if TEXT is encoded in UTF-8.  */
  6352 Lisp_Object
  6353 make_string_from_utf8 (const char *text, ptrdiff_t nbytes)
  6354 {
  6355 #if 0
  6356   /* This method is on average 2 times slower than if we use
  6357      decode_string_utf_8.  However, please leave the slower
  6358      implementation in the code for now, in case it needs to be reused
  6359      in some situations.  */
  6360   ptrdiff_t chars, bytes;
  6361   parse_str_as_multibyte ((const unsigned char *) text, nbytes,
  6362                           &chars, &bytes);
  6363   /* If TEXT is a valid UTF-8 string, we can convert it to a Lisp
  6364      string directly.  Otherwise, we need to decode it.  */
  6365   if (chars == nbytes || bytes == nbytes)
  6366     return make_specified_string (text, chars, nbytes, true);
  6367   else
  6368     {
  6369       struct coding_system coding;
  6370       setup_coding_system (Qutf_8_unix, &coding);
  6371       coding.mode |= CODING_MODE_LAST_BLOCK;
  6372       coding.source = (const unsigned char *) text;
  6373       decode_coding_object (&coding, Qnil, 0, 0, nbytes, nbytes, Qt);
  6374       return coding.dst_object;
  6375     }
  6376 #else
  6377   return decode_string_utf_8 (Qnil, text, nbytes, Qnil, false, Qt, Qt);
  6378 #endif
  6379 }
  6380 
  6381 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
  6382    SOURCE is encoded.  If CATEGORY is one of
  6383    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
  6384    two-byte, else they are encoded by one-byte.
  6385 
  6386    Return one of EOL_SEEN_XXX.  */
  6387 
  6388 #define MAX_EOL_CHECK_COUNT 3
  6389 
  6390 static int
  6391 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
  6392             enum coding_category category)
  6393 {
  6394   const unsigned char *src = source, *src_end = src + src_bytes;
  6395   unsigned char c;
  6396   int total  = 0;
  6397   int eol_seen = EOL_SEEN_NONE;
  6398 
  6399   if ((1 << category) & CATEGORY_MASK_UTF_16)
  6400     {
  6401       bool msb = category == (coding_category_utf_16_le
  6402                               | coding_category_utf_16_le_nosig);
  6403       bool lsb = !msb;
  6404 
  6405       while (src + 1 < src_end)
  6406         {
  6407           c = src[lsb];
  6408           if (src[msb] == 0 && (c == '\n' || c == '\r'))
  6409             {
  6410               int this_eol;
  6411 
  6412               if (c == '\n')
  6413                 this_eol = EOL_SEEN_LF;
  6414               else if (src + 3 >= src_end
  6415                        || src[msb + 2] != 0
  6416                        || src[lsb + 2] != '\n')
  6417                 this_eol = EOL_SEEN_CR;
  6418               else
  6419                 {
  6420                   this_eol = EOL_SEEN_CRLF;
  6421                   src += 2;
  6422                 }
  6423 
  6424               if (eol_seen == EOL_SEEN_NONE)
  6425                 /* This is the first end-of-line.  */
  6426                 eol_seen = this_eol;
  6427               else if (eol_seen != this_eol)
  6428                 {
  6429                   /* The found type is different from what found before.
  6430                      Allow for stray ^M characters in DOS EOL files.  */
  6431                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
  6432                       || (eol_seen == EOL_SEEN_CRLF
  6433                           && this_eol == EOL_SEEN_CR))
  6434                     eol_seen = EOL_SEEN_CRLF;
  6435                   else
  6436                     {
  6437                       eol_seen = EOL_SEEN_LF;
  6438                       break;
  6439                     }
  6440                 }
  6441               if (++total == MAX_EOL_CHECK_COUNT)
  6442                 break;
  6443             }
  6444           src += 2;
  6445         }
  6446     }
  6447   else
  6448     while (src < src_end)
  6449       {
  6450         c = *src++;
  6451         if (c == '\n' || c == '\r')
  6452           {
  6453             int this_eol;
  6454 
  6455             if (c == '\n')
  6456               this_eol = EOL_SEEN_LF;
  6457             else if (src >= src_end || *src != '\n')
  6458               this_eol = EOL_SEEN_CR;
  6459             else
  6460               this_eol = EOL_SEEN_CRLF, src++;
  6461 
  6462             if (eol_seen == EOL_SEEN_NONE)
  6463               /* This is the first end-of-line.  */
  6464               eol_seen = this_eol;
  6465             else if (eol_seen != this_eol)
  6466               {
  6467                 /* The found type is different from what found before.
  6468                    Allow for stray ^M characters in DOS EOL files.  */
  6469                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
  6470                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
  6471                   eol_seen = EOL_SEEN_CRLF;
  6472                 else
  6473                   {
  6474                     eol_seen = EOL_SEEN_LF;
  6475                     break;
  6476                   }
  6477               }
  6478             if (++total == MAX_EOL_CHECK_COUNT)
  6479               break;
  6480           }
  6481       }
  6482   return eol_seen;
  6483 }
  6484 
  6485 
  6486 static Lisp_Object
  6487 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
  6488 {
  6489   Lisp_Object eol_type;
  6490 
  6491   eol_type = CODING_ID_EOL_TYPE (coding->id);
  6492   if (! VECTORP (eol_type))
  6493     /* Already adjusted.  */
  6494     return eol_type;
  6495   if (eol_seen & EOL_SEEN_LF)
  6496     {
  6497       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
  6498       eol_type = Qunix;
  6499     }
  6500   else if (eol_seen & EOL_SEEN_CRLF)
  6501     {
  6502       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
  6503       eol_type = Qdos;
  6504     }
  6505   else if (eol_seen & EOL_SEEN_CR)
  6506     {
  6507       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
  6508       eol_type = Qmac;
  6509     }
  6510   return eol_type;
  6511 }
  6512 
  6513 /* Detect how a text specified in CODING is encoded.  If a coding
  6514    system is detected, update fields of CODING by the detected coding
  6515    system.  */
  6516 
  6517 static void
  6518 detect_coding (struct coding_system *coding)
  6519 {
  6520   const unsigned char *src, *src_end;
  6521   unsigned int saved_mode = coding->mode;
  6522   Lisp_Object found = Qnil;
  6523   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
  6524 
  6525   coding->consumed = coding->consumed_char = 0;
  6526   coding->produced = coding->produced_char = 0;
  6527   coding_set_source (coding);
  6528 
  6529   src_end = coding->source + coding->src_bytes;
  6530 
  6531   coding->eol_seen = EOL_SEEN_NONE;
  6532   /* If we have not yet decided the text encoding type, detect it
  6533      now.  */
  6534   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
  6535     {
  6536       int c, i;
  6537       struct coding_detection_info detect_info = {0};
  6538       bool null_byte_found = 0, eight_bit_found = 0;
  6539       bool inhibit_nbd = inhibit_flag (coding->spec.undecided.inhibit_nbd,
  6540                                        inhibit_null_byte_detection);
  6541       bool inhibit_ied = inhibit_flag (coding->spec.undecided.inhibit_ied,
  6542                                        inhibit_iso_escape_detection);
  6543       bool prefer_utf_8 = coding->spec.undecided.prefer_utf_8;
  6544 
  6545       coding->head_ascii = 0;
  6546       for (src = coding->source; src < src_end; src++)
  6547         {
  6548           c = *src;
  6549           if (c & 0x80)
  6550             {
  6551               eight_bit_found = 1;
  6552               if (null_byte_found)
  6553                 break;
  6554             }
  6555           else if (c < 0x20)
  6556             {
  6557               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
  6558                   && ! inhibit_ied
  6559                   && ! detect_info.checked)
  6560                 {
  6561                   if (detect_coding_iso_2022 (coding, &detect_info))
  6562                     {
  6563                       /* We have scanned the whole data.  */
  6564                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
  6565                         {
  6566                           /* We didn't find an 8-bit code.  We may
  6567                              have found a null-byte, but it's very
  6568                              rare that a binary file conforms to
  6569                              ISO-2022.  */
  6570                           src = src_end;
  6571                           coding->head_ascii = src - coding->source;
  6572                         }
  6573                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
  6574                       break;
  6575                     }
  6576                 }
  6577               else if (! c && !inhibit_nbd)
  6578                 {
  6579                   null_byte_found = 1;
  6580                   if (eight_bit_found)
  6581                     break;
  6582                 }
  6583               else if (! disable_ascii_optimization
  6584                        && ! inhibit_eol_conversion)
  6585                 {
  6586                   if (c == '\r')
  6587                     {
  6588                       if (src < src_end && src[1] == '\n')
  6589                         {
  6590                           coding->eol_seen |= EOL_SEEN_CRLF;
  6591                           src++;
  6592                           if (! eight_bit_found)
  6593                             coding->head_ascii++;
  6594                         }
  6595                       else
  6596                         coding->eol_seen |= EOL_SEEN_CR;
  6597                     }
  6598                   else if (c == '\n')
  6599                     {
  6600                       coding->eol_seen |= EOL_SEEN_LF;
  6601                     }
  6602                 }
  6603 
  6604               if (! eight_bit_found)
  6605                 coding->head_ascii++;
  6606             }
  6607           else if (! eight_bit_found)
  6608             coding->head_ascii++;
  6609         }
  6610 
  6611       if (null_byte_found || eight_bit_found
  6612           || coding->head_ascii < coding->src_bytes
  6613           || detect_info.found)
  6614         {
  6615           enum coding_category category;
  6616           struct coding_system *this;
  6617 
  6618           if (coding->head_ascii == coding->src_bytes)
  6619             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
  6620             for (i = 0; i < coding_category_raw_text; i++)
  6621               {
  6622                 category = coding_priorities[i];
  6623                 this = coding_categories + category;
  6624                 if (detect_info.found & (1 << category))
  6625                   break;
  6626               }
  6627           else
  6628             {
  6629               if (null_byte_found)
  6630                 {
  6631                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
  6632                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
  6633                 }
  6634               else if (prefer_utf_8
  6635                        && detect_coding_utf_8 (coding, &detect_info))
  6636                 {
  6637                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
  6638                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
  6639                 }
  6640               for (i = 0; i < coding_category_raw_text; i++)
  6641                 {
  6642                   category = coding_priorities[i];
  6643                   this = coding_categories + category;
  6644                   /* Some of this->detector (e.g. detect_coding_sjis)
  6645                      require this information.  */
  6646                   coding->id = this->id;
  6647                   if (this->id < 0)
  6648                     {
  6649                       /* No coding system of this category is defined.  */
  6650                       detect_info.rejected |= (1 << category);
  6651                     }
  6652                   else if (category >= coding_category_raw_text)
  6653                     continue;
  6654                   else if (detect_info.checked & (1 << category))
  6655                     {
  6656                       if (detect_info.found & (1 << category))
  6657                         break;
  6658                     }
  6659                   else if ((*(this->detector)) (coding, &detect_info)
  6660                            && detect_info.found & (1 << category))
  6661                     break;
  6662                 }
  6663             }
  6664 
  6665           if (i < coding_category_raw_text)
  6666             {
  6667               if (category == coding_category_utf_8_auto)
  6668                 {
  6669                   Lisp_Object coding_systems;
  6670 
  6671                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
  6672                                          coding_attr_utf_bom);
  6673                   if (CONSP (coding_systems))
  6674                     {
  6675                       if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
  6676                         found = XCAR (coding_systems);
  6677                       else
  6678                         found = XCDR (coding_systems);
  6679                     }
  6680                   else
  6681                     found = CODING_ID_NAME (this->id);
  6682                 }
  6683               else if (category == coding_category_utf_16_auto)
  6684                 {
  6685                   Lisp_Object coding_systems;
  6686 
  6687                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
  6688                                          coding_attr_utf_bom);
  6689                   if (CONSP (coding_systems))
  6690                     {
  6691                       if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
  6692                         found = XCAR (coding_systems);
  6693                       else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
  6694                         found = XCDR (coding_systems);
  6695                     }
  6696                   else
  6697                     found = CODING_ID_NAME (this->id);
  6698                 }
  6699               else
  6700                 found = CODING_ID_NAME (this->id);
  6701             }
  6702           else if (null_byte_found)
  6703             found = Qno_conversion;
  6704           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
  6705                    == CATEGORY_MASK_ANY)
  6706             found = Qraw_text;
  6707           else if (detect_info.rejected)
  6708             for (i = 0; i < coding_category_raw_text; i++)
  6709               if (! (detect_info.rejected & (1 << coding_priorities[i])))
  6710                 {
  6711                   this = coding_categories + coding_priorities[i];
  6712                   found = CODING_ID_NAME (this->id);
  6713                   break;
  6714                 }
  6715         }
  6716     }
  6717   else if (XFIXNUM (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
  6718            == coding_category_utf_8_auto)
  6719     {
  6720       Lisp_Object coding_systems
  6721         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
  6722       if (check_ascii (coding) == coding->src_bytes)
  6723         {
  6724           if (CONSP (coding_systems))
  6725             found = XCDR (coding_systems);
  6726         }
  6727       else
  6728         {
  6729           struct coding_detection_info detect_info = {0};
  6730           if (CONSP (coding_systems)
  6731               && detect_coding_utf_8 (coding, &detect_info))
  6732             {
  6733               if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
  6734                 found = XCAR (coding_systems);
  6735               else
  6736                 found = XCDR (coding_systems);
  6737             }
  6738         }
  6739     }
  6740   else if (XFIXNUM (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
  6741            == coding_category_utf_16_auto)
  6742     {
  6743       Lisp_Object coding_systems
  6744         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
  6745       coding->head_ascii = 0;
  6746       if (CONSP (coding_systems))
  6747         {
  6748           struct coding_detection_info detect_info = {0};
  6749           if (detect_coding_utf_16 (coding, &detect_info))
  6750             {
  6751               if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
  6752                 found = XCAR (coding_systems);
  6753               else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
  6754                 found = XCDR (coding_systems);
  6755             }
  6756         }
  6757     }
  6758 
  6759   if (! NILP (found))
  6760     {
  6761       int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
  6762                            : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
  6763                            : EQ (eol_type, Qmac) ? EOL_SEEN_CR
  6764                            : EOL_SEEN_LF);
  6765 
  6766       setup_coding_system (found, coding);
  6767       if (specified_eol != EOL_SEEN_NONE)
  6768         adjust_coding_eol_type (coding, specified_eol);
  6769     }
  6770 
  6771   coding->mode = saved_mode;
  6772 }
  6773 
  6774 
       /* Convert the end-of-line sequences in the CODING->produced bytes
          that decoding has written for CODING, according to the EOL type
          recorded for CODING->id.

          Nothing is done when that EOL type is Qunix or when
          inhibit_eol_conversion is set.  When the EOL type is a vector,
          the text is scanned first to see which EOL forms occur, and the
          result is passed to adjust_coding_eol_type, whose return value
          becomes the EOL type used below.  For Qmac every '\r' is
          overwritten with '\n' in place; for Qdos the '\r' of every
          "\r\n" pair is deleted from the buffer and CODING->produced and
          CODING->produced_char are reduced accordingly.  */
  6775 static void
  6776 decode_eol (struct coding_system *coding)
  6777 {
  6778   Lisp_Object eol_type;
  6779   unsigned char *p, *pbeg, *pend;
  6780 
  6781   eol_type = CODING_ID_EOL_TYPE (coding->id);
  6782   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
  6783     return;
  6784 
         /* The produced text starts at CODING->destination when there is
            no destination object, otherwise at the address BYTE_POS_ADDR
            gives for CODING->dst_pos_byte.  */
  6785   if (NILP (coding->dst_object))
  6786     pbeg = coding->destination;
  6787   else
  6788     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
  6789   pend = pbeg + coding->produced;
  6790 
  6791   if (VECTORP (eol_type))
  6792     {
  6793       int eol_seen = EOL_SEEN_NONE;
  6794 
             /* Record which of LF, CRLF and lone CR occur in the text.  */
  6795       for (p = pbeg; p < pend; p++)
  6796         {
  6797           if (*p == '\n')
  6798             eol_seen |= EOL_SEEN_LF;
  6799           else if (*p == '\r')
  6800             {
  6801               if (p + 1 < pend && *(p + 1) == '\n')
  6802                 {
                       /* Step over the '\n' too, so that it is not also
                          counted as a bare LF.  */
  6803                   eol_seen |= EOL_SEEN_CRLF;
  6804                   p++;
  6805                 }
  6806               else
  6807                 eol_seen |= EOL_SEEN_CR;
  6808             }
  6809         }
  6810       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
  6811       if ((eol_seen & EOL_SEEN_CRLF) != 0
  6812           && (eol_seen & EOL_SEEN_CR) != 0
  6813           && (eol_seen & EOL_SEEN_LF) == 0)
  6814         eol_seen = EOL_SEEN_CRLF;
             /* Any other value that is not exactly one of the three forms
                (presumably a mixture, the EOL_SEEN_* values being or'ed
                together above) is treated as LF.  */
  6815       else if (eol_seen != EOL_SEEN_NONE
  6816           && eol_seen != EOL_SEEN_LF
  6817           && eol_seen != EOL_SEEN_CRLF
  6818           && eol_seen != EOL_SEEN_CR)
  6819         eol_seen = EOL_SEEN_LF;
  6820       if (eol_seen != EOL_SEEN_NONE)
  6821         eol_type = adjust_coding_eol_type (coding, eol_seen);
  6822     }
  6823 
  6824   if (EQ (eol_type, Qmac))
  6825     {
  6826       for (p = pbeg; p < pend; p++)
  6827         if (*p == '\r')
  6828           *p = '\n';
  6829     }
  6830   else if (EQ (eol_type, Qdos))
  6831     {
  6832       ptrdiff_t n = 0;
  6833       ptrdiff_t pos = coding->dst_pos;
  6834       ptrdiff_t pos_byte = coding->dst_pos_byte;
  6835       ptrdiff_t pos_end = pos_byte + coding->produced - 1;
             /* POS_END is the byte position of the last produced byte; the
                loop stops before it, so the p[1] access below stays
                within the produced text.  N counts the deleted '\r's.  */
  6836 
  6837       /* This assertion is here instead of code, now deleted, that
  6838          handled the NILP case, which no longer happens with the
  6839          current codebase.  */
  6840       eassert (!NILP (coding->dst_object));
  6841 
  6842       while (pos_byte < pos_end)
  6843         {
  6844           int incr;
  6845 
  6846           p = BYTE_POS_ADDR (pos_byte);
                 /* INCR is the number of bytes by which POS_BYTE advances
                    for the one character by which POS advances.  */
  6847           if (coding->dst_multibyte)
  6848             incr = BYTES_BY_CHAR_HEAD (*p);
  6849           else
  6850             incr = 1;
  6851 
  6852           if (*p == '\r' && p[1] == '\n')
  6853             {
                     /* del_range_2 is asked to remove the one-character,
                        one-byte range holding the '\r'.  The text is then
                        one byte shorter, hence the POS_END adjustment;
                        the '\n' now sits at POS / POS_BYTE and is stepped
                        over by the increments below.  */
  6854               del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
  6855               n++;
  6856               pos_end--;
  6857             }
  6858           pos++;
  6859           pos_byte += incr;
  6860         }
  6861       coding->produced -= n;
  6862       coding->produced_char -= n;
  6863     }
  6864 }
  6865 
  6866 
  6867 /* MAX_LOOKUP's maximum value.  MAX_LOOKUP is an int and so cannot
  6868    exceed INT_MAX.  Also, MAX_LOOKUP is multiplied by sizeof (int) for
  6869    alloca, so it cannot exceed MAX_ALLOCA / sizeof (int).  */
       /* get_translation_table clamps the value it stores through its
          MAX_LOOKUP argument to this bound.  */
  6870 enum { MAX_LOOKUP_MAX = min (INT_MAX, MAX_ALLOCA / sizeof (int)) };
  6871 
  6872 /* Return a translation table (or list of them) from coding system
  6873    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
  6874    not ENCODEP). */
       /* If Venable_character_translation is nil, return Qnil (storing 0
          in *MAX_LOOKUP when MAX_LOOKUP is non-null).

          Otherwise start from the table in ATTRS.  If that is nil, the
          standard table for the requested direction is returned as is.
          If it is a symbol, it is replaced by the symbol's
          Qtranslation_table property; if it is a list, a copy is made in
          which each symbol element is replaced the same way.  Then, when
          the standard table is a char-table, it is appended, so the
          result becomes a list ending with the standard table.

          When MAX_LOOKUP is non-null, *MAX_LOOKUP is set to 1 or, if
          larger, to the biggest non-negative fixnum found in extra slot
          1 of the resulting char-table (or of the char-tables in the
          resulting list) that has more than one extra slot, capped at
          MAX_LOOKUP_MAX.  */
  6875 
  6876 static Lisp_Object
  6877 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
  6878 {
  6879   Lisp_Object standard, translation_table;
  6880   Lisp_Object val;
  6881 
  6882   if (NILP (Venable_character_translation))
  6883     {
  6884       if (max_lookup)
  6885         *max_lookup = 0;
  6886       return Qnil;
  6887     }
         /* Pick the coding system's own table and the standard table for
            the requested direction.  */
  6888   if (encodep)
  6889     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
  6890       standard = Vstandard_translation_table_for_encode;
  6891   else
  6892     translation_table = CODING_ATTR_DECODE_TBL (attrs),
  6893       standard = Vstandard_translation_table_for_decode;
  6894   if (NILP (translation_table))
  6895     translation_table = standard;
  6896   else
  6897     {
  6898       if (SYMBOLP (translation_table))
  6899         translation_table = Fget (translation_table, Qtranslation_table);
  6900       else if (CONSP (translation_table))
  6901         {
                 /* Copy the list: its symbol elements are replaced in
                    place just below.  */
  6902           translation_table = Fcopy_sequence (translation_table);
  6903           for (val = translation_table; CONSP (val); val = XCDR (val))
  6904             if (SYMBOLP (XCAR (val)))
  6905               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
  6906         }
             /* Put the standard table last, after the coding system's own
                table(s).  */
  6907       if (CHAR_TABLE_P (standard))
  6908         {
  6909           if (CONSP (translation_table))
  6910             translation_table = nconc2 (translation_table, list1 (standard));
  6911           else
  6912             translation_table = list2 (translation_table, standard);
  6913         }
  6914     }
  6915 
  6916   if (max_lookup)
  6917     {
             /* Start from 1 and raise it to the largest value found in
                extra slot 1 of the table(s), never beyond
                MAX_LOOKUP_MAX.  */
  6918       *max_lookup = 1;
  6919       if (CHAR_TABLE_P (translation_table)
  6920           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
  6921         {
  6922           val = XCHAR_TABLE (translation_table)->extras[1];
  6923           if (FIXNATP (val) && *max_lookup < XFIXNAT (val))
  6924             *max_lookup = min (XFIXNAT (val), MAX_LOOKUP_MAX);
  6925         }
  6926       else if (CONSP (translation_table))
  6927         {
  6928           Lisp_Object tail;
  6929 
  6930           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
  6931             if (CHAR_TABLE_P (XCAR (tail))
  6932                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
  6933               {
  6934                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
  6935                 if (FIXNATP (tailval) && *max_lookup < XFIXNAT (tailval))
  6936                   *max_lookup = min (XFIXNAT (tailval), MAX_LOOKUP_MAX);
  6937               }
  6938         }
  6939     }
  6940   return translation_table;
  6941 }
  6942 
       /* Look up character C in TABLE and leave any pending translation
          in TRANS.  TABLE is a char-table or a list; in a list, elements
          that are not char-tables are ignored.

          When a lookup yields a character, C itself is overwritten with
          that character (so C must be an lvalue) and TRANS is reset to
          Qnil; with a list, the remaining tables are then consulted
          with the new C.  When a lookup yields any other non-nil value,
          that value is left in TRANS, and with a list the search stops
          there.  If TABLE is neither a char-table nor a list, TRANS is
          Qnil and C is unchanged.

          TABLE, C and TRANS are each expanded more than once, so they
          should be free of side effects.  */
  6943 #define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  6944   do {                                                          \
  6945     trans = Qnil;                                               \
  6946     if (CHAR_TABLE_P (table))                                   \
  6947       {                                                         \
  6948         trans = CHAR_TABLE_REF (table, c);                      \
  6949         if (CHARACTERP (trans))                                 \
  6950           c = XFIXNAT (trans), trans = Qnil;                    \
  6951       }                                                         \
  6952     else if (CONSP (table))                                     \
  6953       {                                                         \
  6954         Lisp_Object tail;                                       \
  6955                                                                 \
  6956         for (tail = table; CONSP (tail); tail = XCDR (tail))    \
  6957           if (CHAR_TABLE_P (XCAR (tail)))                       \
  6958             {                                                   \
  6959               trans = CHAR_TABLE_REF (XCAR (tail), c);          \
  6960               if (CHARACTERP (trans))                           \
  6961                 c = XFIXNAT (trans), trans = Qnil;              \
  6962               else if (! NILP (trans))                          \
  6963                 break;                                          \
  6964             }                                                   \
  6965       }                                                         \
  6966   } while (0)
  6967 
  6968 
  6969 /* Return a translation of character(s) at BUF according to TRANS.
  6970    TRANS is TO-CHAR, [TO-CHAR ...], or ((FROM .  TO) ...) where FROM =
  6971    [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].  The return value
  6972    is TO-CHAR or [TO-CHAR ...] if a translation is found, Qnil if not
  6973    found, or Qt if BUF is too short to lookup characters in FROM.  As
  6974    a side effect, if a translation is found, *NCHARS is set to the
  6975    number of characters being translated.  */
       /* BUF_END points just past the last character available at BUF.
          *NCHARS is left untouched when Qnil or Qt is returned.  */
  6976 
  6977 static Lisp_Object
  6978 get_translation (Lisp_Object trans, int *buf, int *buf_end, ptrdiff_t *nchars)
  6979 {
         /* A single TO-CHAR or a vector of them translates exactly one
            source character.  */
  6980   if (FIXNUMP (trans) || VECTORP (trans))
  6981     {
  6982       *nchars = 1;
  6983       return trans;
  6984     }
         /* Otherwise try each (FROM . TO) entry in order and use the first
            one whose FROM matches the characters starting at BUF.  */
  6985   for (; CONSP (trans); trans = XCDR (trans))
  6986     {
  6987       Lisp_Object val = XCAR (trans);
  6988       Lisp_Object from = XCAR (val);
  6989       ptrdiff_t len = ASIZE (from);
  6990       ptrdiff_t i;
  6991 
  6992       for (i = 0; i < len; i++)
  6993         {
                 /* BUF ran out before FROM could be compared in full.  */
  6994           if (buf + i == buf_end)
  6995             return Qt;
  6996           if (XFIXNUM (AREF (from, i)) != buf[i])
  6997             break;
  6998         }
             /* Every element of FROM matched.  */
  6999       if (i == len)
  7000         {
  7001           *nchars = len;
  7002           return XCDR (val);
  7003         }
  7004     }
  7005   return Qnil;
  7006 }
  7007 
  7008 
       /* Write the text decoded so far for CODING to its destination,
          starting at CODING->destination + CODING->produced.

          If CODING->chars_at_source is zero, the text is taken from the
          first CODING->charbuf_used elements of CODING->charbuf: each
          non-negative element is a character, which is translated
          through TRANSLATION_TABLE and stored, while a negative element
          -N starts an annotation of N elements that is skipped here.
          Otherwise the text is the CODING->consumed bytes at
          CODING->source, which are copied without translation,
          converting between unibyte and multibyte form if
          CODING->src_multibyte and CODING->dst_multibyte differ.

          The destination is enlarged with alloc_destination when it is
          too small.  On exit CODING->produced and CODING->produced_char
          are increased by the bytes and characters written, and
          insert_from_gap is called if the destination is a buffer and
          at least one character was written.

          The return value is the number of elements left unprocessed at
          the end of CODING->charbuf.  The charbuf loop stops early only
          when a translation needs more characters than are available
          (get_translation returned Qt) and LAST_BLOCK is false; in the
          chars_at_source case the value is always 0.  */
  7009 static int
  7010 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
  7011                bool last_block)
  7012 {
  7013   unsigned char *dst = coding->destination + coding->produced;
  7014   unsigned char *dst_end = coding->destination + coding->dst_bytes;
  7015   ptrdiff_t produced;
  7016   ptrdiff_t produced_chars = 0;
  7017   int carryover = 0;
  7018 
  7019   if (! coding->chars_at_source)
  7020     {
  7021       /* Source characters are in coding->charbuf.  */
  7022       int *buf = coding->charbuf;
  7023       int *buf_end = buf + coding->charbuf_used;
  7024 
  7025       if (EQ (coding->src_object, coding->dst_object)
  7026           && ! NILP (coding->dst_object))
  7027         {
                 /* Source and destination are the same object: bound the
                    output by the end of the consumed source rather than
                    by CODING->dst_bytes.  */
  7028           eassert (growable_destination (coding));
  7029           coding_set_source (coding);
  7030           dst_end = ((unsigned char *) coding->source) + coding->consumed;
  7031         }
  7032 
  7033       while (buf < buf_end)
  7034         {
  7035           int c = *buf;
  7036           ptrdiff_t i;
  7037 
  7038           if (c >= 0)
  7039             {
  7040               ptrdiff_t from_nchars = 1, to_nchars = 1;
  7041               Lisp_Object trans = Qnil;
  7042 
                     /* LOOKUP_TRANSLATION_TABLE may replace C itself; a
                        non-nil TRANS still has to be resolved by
                        get_translation.  */
  7043               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
  7044               if (! NILP (trans))
  7045                 {
  7046                   trans = get_translation (trans, buf, buf_end, &from_nchars);
  7047                   if (FIXNUMP (trans))
  7048                     c = XFIXNUM (trans);
  7049                   else if (VECTORP (trans))
  7050                     {
  7051                       to_nchars = ASIZE (trans);
  7052                       c = XFIXNUM (AREF (trans, 0));
  7053                     }
                         /* Too few characters to decide: unless this is
                            the last block, stop and report the rest as
                            carryover.  */
  7054                   else if (EQ (trans, Qt) && ! last_block)
  7055                     break;
  7056                 }
  7057 
                     /* Make sure there is room for TO_NCHARS characters of
                        MAX_MULTIBYTE_LENGTH bytes each.  */
  7058               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
  7059                 {
  7060                   eassert (growable_destination (coding));
                         /* Request TO_NCHARS * MAX_MULTIBYTE_LENGTH bytes
                            plus one byte per remaining charbuf element;
                            call memory_full if that sum overflows.  */
  7061                   ptrdiff_t dst_size;
  7062                   if (ckd_mul (&dst_size, to_nchars, MAX_MULTIBYTE_LENGTH)
  7063                       || ckd_add (&dst_size, dst_size, buf_end - buf))
  7064                     memory_full (SIZE_MAX);
  7065                   dst = alloc_destination (coding, dst_size, dst);
                         /* Recompute DST_END for the enlarged destination,
                            using the same rule as at function entry.  */
  7066                   if (EQ (coding->src_object, coding->dst_object))
  7067                     {
  7068                       coding_set_source (coding);
  7069                       dst_end = (((unsigned char *) coding->source)
  7070                                  + coding->consumed);
  7071                     }
  7072                   else
  7073                     dst_end = coding->destination + coding->dst_bytes;
  7074                 }
  7075 
                     /* C already holds the first output character; the
                        others, if any, come from the vector TRANS.  Each
                        one is stored through
                        CHAR_STRING_ADVANCE_NO_UNIFY, except that for a
                        unibyte destination a character satisfying
                        CHAR_BYTE8_P is stored as the single byte
                        CHAR_TO_BYTE8 gives.  */
  7076               for (i = 0; i < to_nchars; i++)
  7077                 {
  7078                   if (i > 0)
  7079                     c = XFIXNUM (AREF (trans, i));
  7080                   if (coding->dst_multibyte
  7081                       || ! CHAR_BYTE8_P (c))
  7082                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
  7083                   else
  7084                     *dst++ = CHAR_TO_BYTE8 (c);
  7085                 }
  7086               produced_chars += to_nchars;
  7087               buf += from_nchars;
  7088             }
  7089           else
  7090             /* This is an annotation datum.  (-C) is the length.  */
  7091             buf += -c;
  7092         }
  7093       carryover = buf_end - buf;
  7094     }
  7095   else
  7096     {
  7097       /* Source characters are at coding->source.  */
  7098       const unsigned char *src = coding->source;
  7099       const unsigned char *src_end = src + coding->consumed;
  7100 
  7101       if (EQ (coding->dst_object, coding->src_object))
  7102         {
  7103           eassert (growable_destination (coding));
  7104           dst_end = (unsigned char *) src;
  7105         }
  7106       if (coding->src_multibyte != coding->dst_multibyte)
  7107         {
  7108           if (coding->src_multibyte)
  7109             {
                     /* Multibyte source, unibyte destination: one byte is
                        stored for each value obtained from ONE_MORE_BYTE.
                        NOTE(review): ONE_MORE_BYTE is defined outside
                        this chunk; it presumably works through the locals
                        SRC, SRC_END, MULTIBYTEP and CONSUMED_CHARS and
                        jumps to no_more_source at the end of the source,
                        which is the only way out of the loop below --
                        confirm.  */
  7110               bool multibytep = 1;
  7111               ptrdiff_t consumed_chars = 0;
  7112 
  7113               while (1)
  7114                 {
  7115                   const unsigned char *src_base = src;
  7116                   int c;
  7117 
  7118                   ONE_MORE_BYTE (c);
  7119                   if (dst == dst_end)
  7120                     {
  7121                       eassert (growable_destination (coding));
                             /* When converting in place, the source bytes
                                read so far can be overwritten, so first
                                try moving DST_END up to SRC.  */
  7122                       if (EQ (coding->src_object, coding->dst_object))
  7123                         dst_end = (unsigned char *) src;
  7124                       if (dst == dst_end)
  7125                         {
  7126                           ptrdiff_t offset = src - coding->source;
  7127 
                                 /* Enlarge the destination, then rebuild
                                    SRC and SRC_END from CODING->source
                                    and the saved OFFSET (presumably
                                    because the memory may have moved).  */
  7128                           dst = alloc_destination (coding, src_end - src + 1,
  7129                                                    dst);
  7130                           dst_end = coding->destination + coding->dst_bytes;
  7131                           coding_set_source (coding);
  7132                           src = coding->source + offset;
  7133                           src_end = coding->source + coding->consumed;
  7134                           if (EQ (coding->src_object, coding->dst_object))
  7135                             dst_end = (unsigned char *) src;
  7136                         }
  7137                     }
  7138                   *dst++ = c;
  7139                   produced_chars++;
  7140                 }
  7141             no_more_source:
  7142               ;
  7143             }
  7144           else
                   /* Unibyte source, multibyte destination.  At least two
                      free bytes are required before each EMIT_ONE_BYTE
                      (defined outside this chunk; presumably it can store
                      up to two bytes for one source byte and updates
                      PRODUCED_CHARS -- confirm).  */
  7145             while (src < src_end)
  7146               {
  7147                 bool multibytep = 1;
  7148                 int c = *src++;
  7149 
  7150                 if (dst >= dst_end - 1)
  7151                   {
  7152                     eassert (growable_destination (coding));
  7153                     if (EQ (coding->src_object, coding->dst_object))
  7154                       dst_end = (unsigned char *) src;
  7155                     if (dst >= dst_end - 1)
  7156                       {
  7157                         ptrdiff_t offset = src - coding->source;
  7158                         ptrdiff_t more_bytes;
  7159 
                               /* A smaller request is made when converting
                                  in place than when writing to a separate
                                  destination.  */
  7160                         if (EQ (coding->src_object, coding->dst_object))
  7161                           more_bytes = ((src_end - src) / 2) + 2;
  7162                         else
  7163                           more_bytes = src_end - src + 2;
  7164                         dst = alloc_destination (coding, more_bytes, dst);
  7165                         dst_end = coding->destination + coding->dst_bytes;
  7166                         coding_set_source (coding);
  7167                         src = coding->source + offset;
  7168                         src_end = coding->source + coding->consumed;
  7169                         if (EQ (coding->src_object, coding->dst_object))
  7170                           dst_end = (unsigned char *) src;
  7171                       }
  7172                   }
  7173                 EMIT_ONE_BYTE (c);
  7174               }
  7175         }
  7176       else
  7177         {
                 /* Same representation on both sides: a plain byte copy.
                    For a separate destination, first grow it by the
                    amount by which CODING->src_bytes exceeds
                    CODING->dst_bytes.  */
  7178           if (!EQ (coding->src_object, coding->dst_object))
  7179             {
  7180               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
  7181 
  7182               if (require > 0)
  7183                 {
  7184                   ptrdiff_t offset = src - coding->source;
  7185 
  7186                   dst = alloc_destination (coding, require, dst);
  7187                   coding_set_source (coding);
  7188                   src = coding->source + offset;
  7189                   src_end = coding->source + coding->consumed;
  7190                 }
  7191             }
  7192           produced_chars = coding->consumed_char;
  7193           while (src < src_end)
  7194             *dst++ = *src++;
  7195         }
  7196     }
  7197 
         /* Number of bytes written by this call.  */
  7198   produced = dst - (coding->destination + coding->produced);
  7199   if (BUFFERP (coding->dst_object) && produced_chars > 0)
  7200     insert_from_gap (produced_chars, produced, 0);
  7201   coding->produced += produced;
  7202   coding->produced_char += produced_chars;
  7203   return carryover;
  7204 }
  7205 
  7206 /* Compose text in CODING->object according to the annotation data at
  7207    CHARBUF.  CHARBUF is an array:
  7208      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
  7209  */
       /* POS is the character position in CODING->dst_object where the
          composed text starts; the composition covers the NCHARS
          characters from there.  */
  7210 
  7211 static void
  7212 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
  7213 {
  7214   int len;
  7215   ptrdiff_t to;
  7216   enum composition_method method;
  7217   Lisp_Object components;
  7218 
         /* LEN is the number of elements that follow the first
            MAX_ANNOTATION_LENGTH elements of the annotation.  */
  7219   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
  7220   to = pos + charbuf[2];
  7221   method = (enum composition_method) (charbuf[4]);
  7222 
  7223   if (method == COMPOSITION_RELATIVE)
  7224     components = Qnil;
  7225   else
  7226     {
  7227       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
  7228       int i, j;
  7229 
             /* With rules, NCHARS characters are interleaved with
                NCHARS - 1 rules of two elements each (a marker and the
                rule), i.e. NCHARS * 3 - 2 elements.  */
  7230       if (method == COMPOSITION_WITH_RULE)
  7231         len = charbuf[2] * 3 - 2;
  7232       charbuf += MAX_ANNOTATION_LENGTH;
  7233       /* charbuf = [ CHAR ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
             /* Stop after LEN elements or at a -1 element.  A negative
                element other than -1 is a marker: it is skipped and the
                element after it, reduced modulo 0x100, is stored
                instead.  */
  7234       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
  7235         {
  7236           if (charbuf[i] >= 0)
  7237             args[j] = make_fixnum (charbuf[i]);
  7238           else
  7239             {
  7240               i++;
  7241               args[j] = make_fixnum (charbuf[i] % 0x100);
  7242             }
  7243         }
             /* I equals J only when no marker was skipped; Fstring is
                used in that case and Fvector otherwise.  */
  7244       components = (i == j ? Fstring (j, args) : Fvector (j, args));
  7245     }
  7246   compose_text (pos, to, components, Qnil, coding->dst_object);
  7247 }
  7248 
  7249 
  7250 /* Put `charset' property on text in CODING->object according to
  7251    the annotation data at CHARBUF.  CHARBUF is an array:
  7252      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
  7253  */
       /* POS is the character position in CODING->dst_object just after
          the annotated text: the property is put on the NCHARS
          characters that end at POS, and its value is the name of the
          charset whose id is CHARSET-ID.  */
  7254 
  7255 static void
  7256 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
  7257 {
  7258   ptrdiff_t from = pos - charbuf[2];
  7259   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
  7260 
  7261   Fput_text_property (make_fixnum (from), make_fixnum (pos),
  7262                       Qcharset, CHARSET_NAME (charset),
  7263                       coding->dst_object);
  7264 }
  7265 
       /* Largest number of int units that ALLOC_CONVERSION_WORK_AREA
          allocates for CODING->charbuf.  */
  7266 #define MAX_CHARBUF_SIZE 0x4000
  7267 /* How many units decoding functions expect in coding->charbuf at
  7268    most.  Currently, decode_coding_emacs_mule expects the following
  7269    size, and that is the largest value.  */
  7270 #define MAX_CHARBUF_EXTRA_SIZE ((MAX_ANNOTATION_LENGTH * 3) + 1)
  7271 
       /* Allocate CODING->charbuf with SAFE_ALLOCA and record its size, in
          int units, in CODING->charbuf_size.  The size is
          SIZE + MAX_CHARBUF_EXTRA_SIZE units, but no more than
          MAX_CHARBUF_SIZE.  CODING is expanded without parentheses, so it
          should be a plain identifier.  NOTE(review): SAFE_ALLOCA is
          defined elsewhere; decode_coding pairs this macro with
          USE_SAFE_ALLOCA and SAFE_FREE, which other users presumably
          need as well -- confirm.  */
  7272 #define ALLOC_CONVERSION_WORK_AREA(coding, size)                \
  7273   do {                                                          \
  7274     ptrdiff_t units = min ((size) + MAX_CHARBUF_EXTRA_SIZE,     \
  7275                            MAX_CHARBUF_SIZE);                   \
  7276     coding->charbuf = SAFE_ALLOCA (units * sizeof (int));       \
  7277     coding->charbuf_size = units;                               \
  7278   } while (0)
  7279 
       /* Apply the annotations recorded in the first CODING->charbuf_used
          elements of CODING->charbuf to CODING->dst_object.  POS is the
          character position in the destination that corresponds to the
          first element of CODING->charbuf.

          Do nothing if CODING->dst_object is nil.  A non-negative
          element is a character and advances POS by one.  A negative
          element -LEN starts an annotation that occupies LEN elements;
          when LEN is greater than 2, element 1 of the annotation selects
          produce_composition or produce_charset, and any other value is
          ignored.  */
  7280 static void
  7281 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
  7282 {
  7283   int *charbuf = coding->charbuf;
  7284   int *charbuf_end = charbuf + coding->charbuf_used;
  7285 
  7286   if (NILP (coding->dst_object))
  7287     return;
  7288 
  7289   while (charbuf < charbuf_end)
  7290     {
  7291       if (*charbuf >= 0)
  7292         pos++, charbuf++;
  7293       else
  7294         {
  7295           int len = -*charbuf;
  7296 
                 /* Annotations of one or two elements are skipped without
                    being examined.  */
  7297           if (len > 2)
  7298             switch (charbuf[1])
  7299               {
  7300               case CODING_ANNOTATE_COMPOSITION_MASK:
  7301                 produce_composition (coding, charbuf, pos);
  7302                 break;
  7303               case CODING_ANNOTATE_CHARSET_MASK:
  7304                 produce_charset (coding, charbuf, pos);
  7305                 break;
  7306               default:
  7307                 break;
  7308               }
  7309           charbuf += len;
  7310         }
  7311     }
  7312 }
  7313 
  7314 /* Decode the data at CODING->src_object into CODING->dst_object.
  7315    CODING->src_object is a buffer, a string, or nil.
  7316    CODING->dst_object is a buffer.
  7317 
  7318    If CODING->src_object is a buffer, it must be the current buffer.
  7319    In this case, if CODING->src_pos is positive, it is a position of
  7320    the source text in the buffer, otherwise, the source text is in the
  7321    gap area of the buffer, and CODING->src_pos specifies the offset of
  7322    the text from the end of the gap (and GPT must be equal to PT).
  7323 
  7324    When the text is taken from the gap, it can't be at the beginning
  7325    of the gap because the new decoded text is progressively accumulated
  7326    at the beginning of the gap before it gets inserted at PT (this way,
  7327    as the output grows, the input shrinks, so we only need to allocate
  7328    enough space for `max(IN, OUT)` instead of `IN + OUT`).
  7329 
  7330    If CODING->src_object is a string, CODING->src_pos is an index to
  7331    that string.
  7332 
  7333    If CODING->src_object is nil, CODING->source must already point to
  7334    the non-relocatable memory area.  In this case, CODING->src_pos is
  7335    an offset from CODING->source.
  7336 
  7337    The decoded data is inserted at the current point of the buffer
  7338    CODING->dst_object.
  7339 */
  7340 
  7341 static void
  7342 decode_coding (struct coding_system *coding)
  7343 {
  7344   Lisp_Object attrs;
  7345   Lisp_Object undo_list;
  7346   Lisp_Object translation_table;
  7347   struct ccl_spec cclspec;
  7348   int carryover;
  7349   int i;
  7350 
  7351   USE_SAFE_ALLOCA;
  7352 
         /* If the source text in the buffer straddles the gap, move the
            gap to the start of the source (presumably so that the source
            bytes are contiguous).  */
  7353   if (BUFFERP (coding->src_object)
  7354       && coding->src_pos > 0
  7355       && coding->src_pos < GPT
  7356       && coding->src_pos + coding->src_chars > GPT)
  7357     move_gap_both (coding->src_pos, coding->src_pos_byte);
  7358 
         /* Placeholder; replaced by the buffer's real undo list below when
            the destination is a buffer.  */
  7359   undo_list = Qt;
  7360   if (BUFFERP (coding->dst_object))
  7361     {
             /* Make the destination current and put the gap at point.  */
  7362       set_buffer_internal (XBUFFER (coding->dst_object));
  7363       if (GPT != PT)
  7364         move_gap_both (PT, PT_BYTE);
  7365 
  7366       /* We must disable undo_list in order to record the whole insert
  7367          transaction via record_insert at the end.  But doing so also
  7368          disables the recording of the first change to the undo_list.
  7369          Therefore we check for first change here and record it via
  7370          record_first_change if needed.  */
  7371       if (MODIFF <= SAVE_MODIFF)
  7372         record_first_change ();
  7373 
  7374       undo_list = BVAR (current_buffer, undo_list);
  7375       bset_undo_list (current_buffer, Qt);
  7376     }
  7377 
         /* Reset the progress counters and the result for this run.  */
  7378   coding->consumed = coding->consumed_char = 0;
  7379   coding->produced = coding->produced_char = 0;
  7380   coding->chars_at_source = 0;
  7381   record_conversion_result (coding, CODING_RESULT_SUCCESS);
  7382 
  7383   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_bytes);
  7384 
         /* Table for the decoding direction (ENCODEP is 0); the lookup
            limit is not requested.  */
  7385   attrs = CODING_ID_ATTRS (coding->id);
  7386   translation_table = get_translation_table (attrs, 0, NULL);
  7387 
  7388   carryover = 0;
  7389   if (coding->decoder == decode_coding_ccl)
  7390     {
             /* Point CODING->spec.ccl at the local CCLSPEC and set up its
                CCL program from the coding system's CCL decoder.  */
  7391       coding->spec.ccl = &cclspec;
  7392       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
  7393     }
         /* Each iteration runs the decoder once, with CODING->charbuf_used
            preset to the CARRYOVER elements kept from the previous
            iteration, writes the result out with produce_chars, applies
            annotations if the decoder set CODING->annotated, and moves
            the unprocessed tail of CODING->charbuf to its front.  */
  7394   do
  7395     {
             /* Destination character position at which the output of this
                iteration begins.  */
  7396       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
  7397 
  7398       coding_set_source (coding);
  7399       coding->annotated = 0;
  7400       coding->charbuf_used = carryover;
  7401       (*(coding->decoder)) (coding);
  7402       coding_set_destination (coding);
  7403       carryover = produce_chars (coding, translation_table, 0);
  7404       if (coding->annotated)
  7405         produce_annotation (coding, pos);
  7406       for (i = 0; i < carryover; i++)
  7407         coding->charbuf[i]
  7408           = coding->charbuf[coding->charbuf_used - carryover + i];
  7409     }
         /* Repeat while the decoder reported insufficient destination
            space, or while source bytes remain and the last result was
            success or invalid source.  */
  7410   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
  7411          || (coding->consumed < coding->src_bytes
  7412              && (coding->result == CODING_RESULT_SUCCESS
  7413                  || coding->result == CODING_RESULT_INVALID_SRC)));
  7414 
         /* Flush the characters still held back, this time passing 1 as
            LAST_BLOCK so that produce_chars does not stop to wait for
            more input.  */
  7415   if (carryover > 0)
  7416     {
  7417       coding_set_destination (coding);
  7418       coding->charbuf_used = carryover;
  7419       produce_chars (coding, translation_table, 1);
  7420     }
  7421 
  7422   coding->carryover_bytes = 0;
  7423   if (coding->consumed < coding->src_bytes)
  7424     {
             /* NBYTES source bytes were left unconsumed by the decoder.  */
  7425       ptrdiff_t nbytes = coding->src_bytes - coding->consumed;
  7426       const unsigned char *src;
  7427 
  7428       coding_set_source (coding);
  7429       coding_set_destination (coding);
  7430       src = coding->source + coding->consumed;
  7431 
  7432       if (coding->mode & CODING_MODE_LAST_BLOCK)
  7433         {
  7434           /* Flush out unprocessed data as binary chars.  We are sure
  7435              that the number of data is less than the size of
  7436              coding->charbuf.  */
  7437           coding->charbuf_used = 0;
  7438           coding->chars_at_source = 0;
  7439 
  7440           while (nbytes-- > 0)
  7441             {
  7442               int c;
  7443 
  7444               /* Copy raw bytes in their 2-byte forms from multibyte
  7445                  text as single characters.  */
  7446               if (coding->src_multibyte
  7447                   && CHAR_BYTE8_HEAD_P (*src) && nbytes > 0)
  7448                 {
                       /* The 2-byte form used up one more source byte
                          than the loop condition counted.  */
  7449                   c = string_char_advance (&src);
  7450                   nbytes--;
  7451                 }
  7452               else
  7453                 {
  7454                   c = *src++;
  7455 
                       /* A byte with the high bit set is converted with
                          BYTE8_TO_CHAR.  */
  7456                   if (c & 0x80)
  7457                     c = BYTE8_TO_CHAR (c);
  7458                 }
  7459               coding->charbuf[coding->charbuf_used++] = c;
  7460             }
                 /* Write these characters out without any translation
                    table.  */
  7461           produce_chars (coding, Qnil, 1);
  7462         }
  7463       else
  7464         {
  7465           /* Record unprocessed bytes in coding->carryover.  We are
  7466              sure that the number of data is less than the size of
  7467              coding->carryover.  */
  7468           unsigned char *p = coding->carryover;
  7469 
                 /* Even so, never copy more than CODING->carryover can
                    hold.  */
  7470           if (nbytes > sizeof coding->carryover)
  7471             nbytes = sizeof coding->carryover;
  7472           coding->carryover_bytes = nbytes;
  7473           while (nbytes-- > 0)
  7474             *p++ = *src++;
  7475         }
             /* In both cases all source bytes now count as consumed.  */
  7476       coding->consumed = coding->src_bytes;
  7477     }
  7478 
  7479   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
  7480       && !inhibit_eol_conversion)
  7481     decode_eol (coding);
  7482   if (BUFFERP (coding->dst_object))
  7483     {
             /* Restore the undo list saved above and record the whole
                insertion with a single record_insert call.  */
  7484       bset_undo_list (current_buffer, undo_list);
  7485       record_insert (coding->dst_pos, coding->produced_char);
  7486     }
  7487 
  7488   SAFE_FREE ();
  7489 }
  7490 
  7491 
  7492 /* Extract an annotation datum from a composition starting at POS and
  7493    ending before LIMIT of CODING->src_object (buffer or string), store
  7494    the data in BUF, set *STOP to a starting position of the next
  7495    composition (if any) or to LIMIT, and return the address of the
  7496    next element of BUF.
  7497 
  7498    If such an annotation is not found, set *STOP to a starting
  7499    position of a composition after POS (if any) or to LIMIT, and
  7500    return BUF.  */
  7501 
  7502 static int *
  7503 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
  7504                                struct coding_system *coding, int *buf,
  7505                                ptrdiff_t *stop)
  7506 {
  7507   ptrdiff_t start, end;
  7508   Lisp_Object prop;
  7509 
  7510   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
  7511       || end > limit)
  7512     *stop = limit;
  7513   else if (start > pos)
  7514     *stop = start;
  7515   else
  7516     {
  7517       if (start == pos)
  7518         {
  7519           /* We found a composition.  Store the corresponding
  7520              annotation data in BUF.  */
  7521           int *head = buf;
  7522           enum composition_method method = composition_method (prop);
  7523           int nchars = COMPOSITION_LENGTH (prop);
  7524 
  7525           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
  7526           if (method != COMPOSITION_RELATIVE)
  7527             {
  7528               Lisp_Object components;
  7529               ptrdiff_t i, len, i_byte;
  7530 
  7531               components = COMPOSITION_COMPONENTS (prop);
  7532               if (VECTORP (components))
  7533                 {
  7534                   len = ASIZE (components);
  7535                   for (i = 0; i < len; i++)
  7536                     *buf++ = XFIXNUM (AREF (components, i));
  7537                 }
  7538               else if (STRINGP (components))
  7539                 {
  7540                   len = SCHARS (components);
  7541                   i = i_byte = 0;
  7542                   while (i < len)
  7543                     *buf++ = fetch_string_char_advance (components,
  7544                                                         &i, &i_byte);
  7545                 }
  7546               else if (FIXNUMP (components))
  7547                 {
  7548                   len = 1;
  7549                   *buf++ = XFIXNUM (components);
  7550                 }
  7551               else if (CONSP (components))
  7552                 {
  7553                   for (len = 0; CONSP (components);
  7554                        len++, components = XCDR (components))
  7555                     *buf++ = XFIXNUM (XCAR (components));
  7556                 }
  7557               else
  7558                 emacs_abort ();
  7559               *head -= len;
  7560             }
  7561         }
  7562 
  7563       if (find_composition (end, limit, &start, &end, &prop,
  7564                             coding->src_object)
  7565           && end <= limit)
  7566         *stop = start;
  7567       else
  7568         *stop = limit;
  7569     }
  7570   return buf;
  7571 }
  7572 
  7573 
  7574 /* Extract an annotation datum from a text property `charset' at POS of
  7575    CODING->src_object (buffer of string), store the data in BUF, set
  7576    *STOP to the position where the value of `charset' property changes
  7577    (limiting by LIMIT), and return the address of the next element of
  7578    BUF.
  7579 
  7580    If the property value is nil, set *STOP to the position where the
  7581    property value is non-nil (limiting by LIMIT), and return BUF.  */
  7582 
  7583 static int *
  7584 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
  7585                            struct coding_system *coding, int *buf,
  7586                            ptrdiff_t *stop)
  7587 {
  7588   Lisp_Object val, next;
  7589   int id;
  7590 
  7591   val = Fget_text_property (make_fixnum (pos), Qcharset, coding->src_object);
  7592   if (! NILP (val) && CHARSETP (val))
  7593     id = XFIXNUM (CHARSET_SYMBOL_ID (val));
  7594   else
  7595     id = -1;
  7596   ADD_CHARSET_DATA (buf, 0, id);
  7597   next = Fnext_single_property_change (make_fixnum (pos), Qcharset,
  7598                                        coding->src_object,
  7599                                        make_fixnum (limit));
  7600   *stop = XFIXNUM (next);
  7601   return buf;
  7602 }
  7603 
  7604 
  7605 static void
  7606 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
  7607                int max_lookup)
  7608 {
  7609   int *buf = coding->charbuf;
  7610   int *buf_end = coding->charbuf + coding->charbuf_size;
  7611   const unsigned char *src = coding->source + coding->consumed;
  7612   const unsigned char *src_end = coding->source + coding->src_bytes;
  7613   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
  7614   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
  7615   bool multibytep = coding->src_multibyte;
  7616   Lisp_Object eol_type;
  7617   int c;
  7618   ptrdiff_t stop, stop_composition, stop_charset;
  7619   int *lookup_buf = NULL;
  7620 
  7621   if (! NILP (translation_table))
  7622     lookup_buf = alloca (sizeof (int) * max_lookup);
  7623 
  7624   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
  7625   if (VECTORP (eol_type))
  7626     eol_type = Qunix;
  7627 
  7628   /* Note: composition handling is not yet implemented.  */
  7629   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
  7630 
  7631   if (NILP (coding->src_object))
  7632     stop = stop_composition = stop_charset = end_pos;
  7633   else
  7634     {
  7635       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
  7636         stop = stop_composition = pos;
  7637       else
  7638         stop = stop_composition = end_pos;
  7639       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
  7640         stop = stop_charset = pos;
  7641       else
  7642         stop_charset = end_pos;
  7643     }
  7644 
  7645   /* Compensate for CRLF and conversion.  */
  7646   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
  7647   while (buf < buf_end)
  7648     {
  7649       Lisp_Object trans;
  7650 
  7651       if (pos == stop)
  7652         {
  7653           if (pos == end_pos)
  7654             break;
  7655           if (pos == stop_composition)
  7656             buf = handle_composition_annotation (pos, end_pos, coding,
  7657                                                  buf, &stop_composition);
  7658           if (pos == stop_charset)
  7659             buf = handle_charset_annotation (pos, end_pos, coding,
  7660                                              buf, &stop_charset);
  7661           stop = (stop_composition < stop_charset
  7662                   ? stop_composition : stop_charset);
  7663         }
  7664 
  7665       if (! multibytep)
  7666         {
  7667           if (coding->encoder == encode_coding_raw_text
  7668               || coding->encoder == encode_coding_ccl)
  7669             c = *src++, pos++;
  7670           else
  7671             {
  7672               int bytes = multibyte_length (src, src_end, true, true);
  7673               if (0 < bytes)
  7674                 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
  7675               else
  7676                 c = BYTE8_TO_CHAR (*src), src++, pos++;
  7677             }
  7678         }
  7679       else
  7680         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
  7681       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
  7682         c = '\n';
  7683       if (! EQ (eol_type, Qunix))
  7684         {
  7685           if (c == '\n')
  7686             {
  7687               if (EQ (eol_type, Qdos))
  7688                 *buf++ = '\r';
  7689               else
  7690                 c = '\r';
  7691             }
  7692         }
  7693 
  7694       trans = Qnil;
  7695       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
  7696       if (NILP (trans))
  7697         *buf++ = c;
  7698       else
  7699         {
  7700           ptrdiff_t from_nchars = 1, to_nchars = 1;
  7701           int *lookup_buf_end;
  7702           const unsigned char *p = src;
  7703           int i;
  7704 
  7705           lookup_buf[0] = c;
  7706           for (i = 1; i < max_lookup && p < src_end; i++)
  7707             lookup_buf[i] = string_char_advance (&p);
  7708           lookup_buf_end = lookup_buf + i;
  7709           trans = get_translation (trans, lookup_buf, lookup_buf_end,
  7710                                    &from_nchars);
  7711           if (FIXNUMP (trans))
  7712             c = XFIXNUM (trans);
  7713           else if (VECTORP (trans))
  7714             {
  7715               to_nchars = ASIZE (trans);
  7716               if (buf_end - buf < to_nchars)
  7717                 break;
  7718               c = XFIXNUM (AREF (trans, 0));
  7719             }
  7720           else
  7721             break;
  7722           *buf++ = c;
  7723           for (i = 1; i < to_nchars; i++)
  7724             *buf++ = XFIXNUM (AREF (trans, i));
  7725           for (i = 1; i < from_nchars; i++, pos++)
  7726             src += multibyte_length (src, NULL, false, true);
  7727         }
  7728     }
  7729 
  7730   coding->consumed = src - coding->source;
  7731   coding->consumed_char = pos - coding->src_pos;
  7732   coding->charbuf_used = buf - coding->charbuf;
  7733   coding->chars_at_source = 0;
  7734 }
  7735 
  7736 
  7737 /* Encode the text at CODING->src_object into CODING->dst_object.
  7738    CODING->src_object is a buffer or a string.
  7739    CODING->dst_object is a buffer or nil.
  7740 
  7741    If CODING->src_object is a buffer, it must be the current buffer.
  7742    In this case, if CODING->src_pos is positive, it is a position of
  7743    the source text in the buffer, otherwise. the source text is in the
  7744    gap area of the buffer, and coding->src_pos specifies the offset of
  7745    the text from GPT (which must be the same as PT).  If this is the
  7746    same buffer as CODING->dst_object, CODING->src_pos must be
  7747    negative and CODING should not have `pre-write-conversion'.
  7748 
  7749    If CODING->src_object is a string, CODING should not have
  7750    `pre-write-conversion'.
  7751 
  7752    If CODING->dst_object is a buffer, the encoded data is inserted at
  7753    the current point of that buffer.
  7754 
  7755    If CODING->dst_object is nil, the encoded data is placed at the
  7756    memory area specified by CODING->destination.  */
  7757 
  7758 static void
  7759 encode_coding (struct coding_system *coding)
  7760 {
  7761   Lisp_Object attrs;
  7762   Lisp_Object translation_table;
  7763   int max_lookup;
  7764   struct ccl_spec cclspec;
  7765 
  7766   USE_SAFE_ALLOCA;
  7767 
  7768   attrs = CODING_ID_ATTRS (coding->id);
  7769   if (coding->encoder == encode_coding_raw_text)
  7770     translation_table = Qnil, max_lookup = 0;
  7771   else
  7772     translation_table = get_translation_table (attrs, 1, &max_lookup);
  7773 
  7774   if (BUFFERP (coding->dst_object))
  7775     {
  7776       set_buffer_internal (XBUFFER (coding->dst_object));
  7777       coding->dst_multibyte
  7778         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
  7779     }
  7780 
  7781   coding->consumed = coding->consumed_char = 0;
  7782   coding->produced = coding->produced_char = 0;
  7783   record_conversion_result (coding, CODING_RESULT_SUCCESS);
  7784 
  7785   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_chars);
  7786 
  7787   if (coding->encoder == encode_coding_ccl)
  7788     {
  7789       coding->spec.ccl = &cclspec;
  7790       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
  7791     }
  7792   do {
  7793     coding_set_source (coding);
  7794     consume_chars (coding, translation_table, max_lookup);
  7795     coding_set_destination (coding);
  7796     /* The CODING_MODE_LAST_BLOCK flag should be set only for the last
  7797        iteration of the encoding.  */
  7798     unsigned saved_mode = coding->mode;
  7799     if (coding->consumed_char < coding->src_chars)
  7800       coding->mode &= ~CODING_MODE_LAST_BLOCK;
  7801     (*(coding->encoder)) (coding);
  7802     coding->mode = saved_mode;
  7803   } while (coding->consumed_char < coding->src_chars);
  7804 
  7805   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
  7806     insert_from_gap (coding->produced_char, coding->produced, 0);
  7807 
  7808   SAFE_FREE ();
  7809 }
  7810 
  7811 /* Code-conversion operations use internal buffers.  There's a single
  7812    reusable buffer, which is created the first time it is needed, and
  7813    then never killed.  When this reusable buffer is being used, the
  7814    reused_workbuf_in_use flag is set.  If we need another conversion
  7815    buffer while the reusable one is in use (e.g., if code-conversion
  7816    is reentered when another code-conversion is in progress), we
  7817    create temporary buffers using the name of the reusable buffer as
  7818    the base name, see code_conversion_save below.  These temporary
  7819    buffers are killed when the code-conversion operations that use
  7820    them return, see code_conversion_restore below.  */
  7821 
/* A string that serves as name of the reusable work buffer, and as base
   name of temporary work buffers used for code-conversion operations.  */
static Lisp_Object Vcode_conversion_workbuf_name;

/* The reusable working buffer, created once and never killed by the
   code-conversion machinery itself.  If code_conversion_save finds
   that it is not live, it creates it anew.  */
static Lisp_Object Vcode_conversion_reused_workbuf;

/* True iff Vcode_conversion_reused_workbuf is already in use.  Set by
   code_conversion_save when it hands out the reusable buffer, and
   cleared by code_conversion_restore.  */
static bool reused_workbuf_in_use;
  7831 
  7832 static void
  7833 code_conversion_restore (Lisp_Object arg)
  7834 {
  7835   Lisp_Object current, workbuf;
  7836 
  7837   current = XCAR (arg);
  7838   workbuf = XCDR (arg);
  7839   if (! NILP (workbuf))
  7840     {
  7841       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
  7842         reused_workbuf_in_use = false;
  7843       else
  7844         Fkill_buffer (workbuf);
  7845     }
  7846   set_buffer_internal (XBUFFER (current));
  7847 }
  7848 
  7849 Lisp_Object
  7850 code_conversion_save (bool with_work_buf, bool multibyte)
  7851 {
  7852   Lisp_Object workbuf = Qnil;
  7853 
  7854   if (with_work_buf)
  7855     {
  7856       if (reused_workbuf_in_use)
  7857         {
  7858           Lisp_Object name
  7859             = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
  7860           workbuf = Fget_buffer_create (name, Qt);
  7861         }
  7862       else
  7863         {
  7864           if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
  7865             Vcode_conversion_reused_workbuf
  7866               = Fget_buffer_create (Vcode_conversion_workbuf_name, Qt);
  7867           workbuf = Vcode_conversion_reused_workbuf;
  7868         }
  7869     }
  7870   record_unwind_protect (code_conversion_restore,
  7871                          Fcons (Fcurrent_buffer (), workbuf));
  7872   if (!NILP (workbuf))
  7873     {
  7874       struct buffer *current = current_buffer;
  7875       set_buffer_internal (XBUFFER (workbuf));
  7876       /* We can't allow modification hooks to run in the work buffer.  For
  7877          instance, directory_files_internal assumes that file decoding
  7878          doesn't compile new regexps.  */
  7879       Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
  7880       Ferase_buffer ();
  7881       bset_undo_list (current_buffer, Qt);
  7882       bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
  7883       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
  7884         reused_workbuf_in_use = true;
  7885       set_buffer_internal (current);
  7886     }
  7887 
  7888   return workbuf;
  7889 }
  7890 
  7891 static void
  7892 coding_restore_undo_list (Lisp_Object arg)
  7893 {
  7894   Lisp_Object undo_list = XCAR (arg);
  7895   struct buffer *buf = XBUFFER (XCDR (arg));
  7896 
  7897   bset_undo_list (buf, undo_list);
  7898 }
  7899 
/* Decode the *last* BYTES of the gap and insert them at point.

   The current buffer serves both as source and as destination of
   CODING; GPT_BYTE must equal PT_BYTE.

   When the coding system is ASCII compatible, has no
   post-read-conversion function and no decoding translation table
   (and disable_ascii_optimization is not set), text that is pure
   ASCII, or that is valid UTF-8 for a coding system of type utf-8, is
   not run through the decoder: only end-of-line conversion is done,
   in place, and the bytes are inserted with insert_from_gap.

   Otherwise the text is decoded by decode_coding as the last block,
   and the post-read-conversion function, if any, is called.

   On return CODING->produced and CODING->produced_char describe the
   inserted text.  */
void
decode_coding_gap (struct coding_system *coding, ptrdiff_t bytes)
{
  specpdl_ref count = SPECPDL_INDEX ();
  Lisp_Object attrs;

  eassert (GPT_BYTE == PT_BYTE);

  /* The source is unibyte text described by negative positions
     relative to the gap.  */
  coding->src_object = Fcurrent_buffer ();
  coding->src_chars = bytes;
  coding->src_bytes = bytes;
  coding->src_pos = -bytes;
  coding->src_pos_byte = -bytes;
  coding->src_multibyte = false;
  coding->dst_object = coding->src_object;
  coding->dst_pos = PT;
  coding->dst_pos_byte = PT_BYTE;
  eassert (coding->dst_multibyte
           == !NILP (BVAR (current_buffer, enable_multibyte_characters)));

  /* Reset the detection results; -1 means "not known yet".  */
  coding->head_ascii = -1;
  coding->detected_utf8_bytes = coding->detected_utf8_chars = -1;
  coding->eol_seen = EOL_SEEN_NONE;
  if (CODING_REQUIRE_DETECTION (coding))
    detect_coding (coding);
  attrs = CODING_ID_ATTRS (coding->id);
  if (! disable_ascii_optimization
      && ! coding->src_multibyte
      && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
      && NILP (CODING_ATTR_POST_READ (attrs))
      && NILP (get_translation_table (attrs, 0, NULL)))
    {
      /* Fast path.  CHARS ends up as the number of characters in the
         text if it can be inserted without decoding, or -1 if it must
         be decoded.  */
      ptrdiff_t chars = coding->head_ascii;
      if (chars < 0)
        chars = check_ascii (coding);
      if (chars != bytes)
        {
          /* There exists a non-ASCII byte.  */
          if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8)
              && coding->detected_utf8_bytes == coding->src_bytes)
            {
              if (coding->detected_utf8_chars >= 0)
                chars = coding->detected_utf8_chars;
              else
                chars = check_utf_8 (coding);
              /* Unless the coding system is the variant without BOM,
                 a leading UTF-8 BOM is not part of the text: exclude
                 its one character and three bytes.  */
              if (CODING_UTF_8_BOM (coding) != utf_without_bom
                  && coding->head_ascii == 0
                  && coding->source[0] == UTF_8_BOM_1
                  && coding->source[1] == UTF_8_BOM_2
                  && coding->source[2] == UTF_8_BOM_3)
                {
                  chars--;
                  bytes -= 3;
                  coding->src_bytes -= 3;
                }
            }
          else
            chars = -1;
        }
      if (chars >= 0)
        {
          Lisp_Object eol_type;

          eol_type = CODING_ID_EOL_TYPE (coding->id);
          /* A vector means the eol type is not decided yet; settle it
             from what was seen in the text, if anything.  */
          if (VECTORP (eol_type))
            {
              if (coding->eol_seen != EOL_SEEN_NONE)
                eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
            }
          if (EQ (eol_type, Qmac))
            {
              /* Replace every CR by LF in place.  */
              unsigned char *src_end = GAP_END_ADDR;
              unsigned char *src = src_end - coding->src_bytes;

              while (src < src_end)
                {
                  if (*src++ == '\r')
                    src[-1] = '\n';
                }
            }
          else if (EQ (eol_type, Qdos))
            {
              /* Copy the text backward onto itself, from the end of
                 the gap, dropping the CR of each CR LF pair.  */
              unsigned char *src = GAP_END_ADDR;
              unsigned char *src_beg = src - coding->src_bytes;
              unsigned char *dst = src;
              ptrdiff_t diff;

              while (src_beg < src)
                {
                  *--dst = *--src;
                  if (*src == '\n' && src > src_beg && src[-1] == '\r')
                    src--;
                }
              /* SRC has reached SRC_BEG, so DIFF is the number of CRs
                 dropped.  */
              diff = dst - src;
              bytes -= diff;
              chars -= diff;
            }
          coding->produced = bytes;
          coding->produced_char = chars;
          insert_from_gap (chars, bytes, 1);
          /* This function itself has not pushed anything onto the
             specpdl up to this point, so no unbind_to here.  */
          return;
        }
    }
  /* General path: save the current buffer (no work buffer needed) and
     run the decoder.  */
  code_conversion_save (0, 0);

  coding->mode |= CODING_MODE_LAST_BLOCK;
  current_buffer->text->inhibit_shrinking = 1;
  decode_coding (coding);
  current_buffer->text->inhibit_shrinking = 0;

  if (! NILP (CODING_ATTR_POST_READ (attrs)))
    {
      /* Call the post-read-conversion function with point at the
         start of the decoded text and with undo recording disabled;
         the saved undo list is put back by unbind_to below.  */
      ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
      Lisp_Object val;
      Lisp_Object undo_list = BVAR (current_buffer, undo_list);

      record_unwind_protect (coding_restore_undo_list,
                             Fcons (undo_list, Fcurrent_buffer ()));
      bset_undo_list (current_buffer, Qt);
      TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
      val = call1 (CODING_ATTR_POST_READ (attrs),
                   make_fixnum (coding->produced_char));
      CHECK_FIXNAT (val);
      /* Account for any change of the buffer size made by the
         function.  */
      coding->produced_char += Z - prev_Z;
      coding->produced += Z_BYTE - prev_Z_BYTE;
    }

  unbind_to (count, Qnil);
}
  8030 
  8031 
/* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
   SRC_OBJECT into DST_OBJECT by coding context CODING.

   SRC_OBJECT is a buffer, a string, or Qnil.

   If it is a buffer, the text is at point of the buffer.  FROM and TO
   are positions in the buffer.

   If it is a string, the text is at the beginning of the string.
   FROM and TO are indices to the string.

   If it is nil, the text is at coding->source.  FROM and TO are
   indices to coding->source.

   DST_OBJECT is a buffer, Qt, or Qnil.

   If it is a buffer, the decoded text is inserted at point of the
   buffer.  If the buffer is the same as SRC_OBJECT, the source text
   is deleted.

   If it is Qt, a string is made from the decoded text, and
   set in CODING->dst_object.

   If it is Qnil, the decoded text is stored at CODING->destination.
   The caller must allocate CODING->dst_bytes bytes at
   CODING->destination by xmalloc.  If the decoded text is longer than
   CODING->dst_bytes, CODING->destination is relocated by xrealloc.

   The value of Vdeactivate_mark is preserved across the call.  */

void
decode_coding_object (struct coding_system *coding,
                      Lisp_Object src_object,
                      ptrdiff_t from, ptrdiff_t from_byte,
                      ptrdiff_t to, ptrdiff_t to_byte,
                      Lisp_Object dst_object)
{
  specpdl_ref count = SPECPDL_INDEX ();
  unsigned char *destination UNINIT;
  ptrdiff_t dst_bytes UNINIT;
  ptrdiff_t chars = to - from;
  ptrdiff_t bytes = to_byte - from_byte;
  Lisp_Object attrs;
  /* SAVED_PT stays negative unless source and destination are the
     same buffer.  */
  ptrdiff_t saved_pt = -1, saved_pt_byte UNINIT;
  bool need_marker_adjustment = 0;
  Lisp_Object old_deactivate_mark;

  old_deactivate_mark = Vdeactivate_mark;

  /* Remember the caller-supplied memory area; DESTINATION and
     DST_BYTES are used only when DST_OBJECT is nil.  */
  if (NILP (dst_object))
    {
      destination = coding->destination;
      dst_bytes = coding->dst_bytes;
    }

  coding->src_object = src_object;
  coding->src_chars = chars;
  coding->src_bytes = bytes;
  /* More bytes than characters means the source is multibyte.  */
  coding->src_multibyte = chars < bytes;

  if (STRINGP (src_object))
    {
      coding->src_pos = from;
      coding->src_pos_byte = from_byte;
    }
  else if (BUFFERP (src_object))
    {
      set_buffer_internal (XBUFFER (src_object));
      if (from != GPT)
        move_gap_both (from, from_byte);
      if (EQ (src_object, dst_object))
        {
          /* Decoding in place.  Flag the markers sitting at the edge
             of the region (FROM for insertion-type markers, TO for
             the others) so that they can be repositioned afterwards,
             then delete the source text with shrinking inhibited.
             The negative source positions presumably tell the decoder
             that the source now lives in the gap -- confirm against
             coding_set_source.  */
          struct Lisp_Marker *tail;

          for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
            {
              tail->need_adjustment
                = tail->charpos == (tail->insertion_type ? from : to);
              need_marker_adjustment |= tail->need_adjustment;
            }
          saved_pt = PT, saved_pt_byte = PT_BYTE;
          TEMP_SET_PT_BOTH (from, from_byte);
          current_buffer->text->inhibit_shrinking = 1;
          del_range_both (from, from_byte, to, to_byte, 1);
          coding->src_pos = -chars;
          coding->src_pos_byte = -bytes;
        }
      else
        {
          coding->src_pos = from;
          coding->src_pos_byte = from_byte;
        }
    }

  if (CODING_REQUIRE_DETECTION (coding))
    detect_coding (coding);
  attrs = CODING_ID_ATTRS (coding->id);

  /* Choose the destination.  A work buffer is used when a string is
     wanted, and also when the result goes to memory but a
     post-read-conversion function has to run on it first.  */
  if (EQ (dst_object, Qt)
      || (! NILP (CODING_ATTR_POST_READ (attrs))
          && NILP (dst_object)))
    {
      coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
      coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
      coding->dst_pos = BEG;
      coding->dst_pos_byte = BEG_BYTE;
    }
  else if (BUFFERP (dst_object))
    {
      code_conversion_save (0, 0);
      coding->dst_object = dst_object;
      coding->dst_pos = BUF_PT (XBUFFER (dst_object));
      coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
      coding->dst_multibyte
        = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
    }
  else
    {
      code_conversion_save (0, 0);
      coding->dst_object = Qnil;
      /* Most callers presume this will return a multibyte result, and they
         won't use `binary' or `raw-text' anyway, so let's not worry about
         CODING_FOR_UNIBYTE.  */
      coding->dst_multibyte = 1;
    }

  decode_coding (coding);

  if (BUFFERP (coding->dst_object))
    set_buffer_internal (XBUFFER (coding->dst_object));

  if (! NILP (CODING_ATTR_POST_READ (attrs)))
    {
      /* Call the post-read-conversion function with point at the
         start of the decoded text and with undo recording disabled;
         the undo list is restored by the unbind_to at the end of this
         block.  */
      ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
      Lisp_Object val;
      Lisp_Object undo_list = BVAR (current_buffer, undo_list);
      specpdl_ref count1 = SPECPDL_INDEX ();

      record_unwind_protect (coding_restore_undo_list,
                             Fcons (undo_list, Fcurrent_buffer ()));
      bset_undo_list (current_buffer, Qt);
      TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
      val = safe_call1 (CODING_ATTR_POST_READ (attrs),
                        make_fixnum (coding->produced_char));
      CHECK_FIXNAT (val);
      /* Account for any change of the buffer size made by the
         function.  */
      coding->produced_char += Z - prev_Z;
      coding->produced += Z_BYTE - prev_Z_BYTE;
      unbind_to (count1, Qnil);
    }

  if (EQ (dst_object, Qt))
    {
      coding->dst_object = Fbuffer_string ();
    }
  else if (NILP (dst_object) && BUFFERP (coding->dst_object))
    {
      /* The result was produced in a work buffer because of the
         post-read-conversion function; copy it out to memory.
         NOTE(review): the copy is made only when the caller's area is
         too small (DST_BYTES < CODING->produced); confirm that no
         caller relies on the other case.  */
      set_buffer_internal (XBUFFER (coding->dst_object));
      if (dst_bytes < coding->produced)
        {
          eassert (coding->produced > 0);
          destination = xrealloc (destination, coding->produced);
          /* Move the gap out of the way so that the text to copy is
             contiguous.  */
          if (BEGV < GPT && GPT < BEGV + coding->produced_char)
            move_gap_both (BEGV, BEGV_BYTE);
          memcpy (destination, BEGV_ADDR, coding->produced);
          coding->destination = destination;
        }
    }

  if (saved_pt >= 0)
    {
      /* This is the case of:
         (BUFFERP (src_object) && BASE_EQ (src_object, dst_object))
         As we have moved PT while replacing the original buffer
         contents, we must recover it now.  */
      set_buffer_internal (XBUFFER (src_object));
      current_buffer->text->inhibit_shrinking = 0;
      /* Point before the region is unaffected; point inside it goes
         to its start; point after it is shifted by the change in
         size of the region.  */
      if (saved_pt < from)
        TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
      else if (saved_pt < from + chars)
        TEMP_SET_PT_BOTH (from, from_byte);
      else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
        TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
                          saved_pt_byte + (coding->produced - bytes));
      else
        TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
                          saved_pt_byte + (coding->produced - bytes));

      if (need_marker_adjustment)
        {
          /* Put the markers flagged earlier at the start of the
             decoded text (insertion-type markers) or at its end (the
             others).  */
          struct Lisp_Marker *tail;

          for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
            if (tail->need_adjustment)
              {
                tail->need_adjustment = 0;
                if (tail->insertion_type)
                  {
                    tail->bytepos = from_byte;
                    tail->charpos = from;
                  }
                else
                  {
                    tail->bytepos = from_byte + coding->produced;
                    tail->charpos
                      = (NILP (BVAR (current_buffer, enable_multibyte_characters))
                         ? tail->bytepos : from + coding->produced_char);
                  }
              }
        }
    }

  Vdeactivate_mark = old_deactivate_mark;
  unbind_to (count, coding->dst_object);
}
  8245 
  8246 
  8247 /* Encode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
  8248    SRC_OBJECT into DST_OBJECT by coding context CODING.
  8249 
  8250    SRC_OBJECT is a buffer, a string, or Qnil.
  8251 
  8252    If it is a buffer, the text is at point of the buffer.  FROM and TO
  8253    are positions in the buffer.
  8254 
  8255    If it is a string, the text is at the beginning of the string.
  8256    FROM and TO are indices into the string.
  8257 
  8258    If it is nil, the text is at coding->source.  FROM and TO are
  8259    indices into coding->source.
  8260 
  8261    DST_OBJECT is a buffer, Qt, or Qnil.
  8262 
  8263    If it is a buffer, the encoded text is inserted at point of the
  8264    buffer.  If the buffer is the same as SRC_OBJECT, the source text
  8265    is replaced with the encoded text.
  8266 
  8267    If it is Qt, a string is made from the encoded text, and set in
  8268    CODING->dst_object.  However, if CODING->raw_destination is non-zero,
  8269    the encoded text is instead returned in CODING->destination as a C string,
  8270    and the caller is responsible for freeing CODING->destination.  This
  8271    feature is meant to be used when the caller doesn't need the result as
  8272    a Lisp string, and wants to avoid unnecessary consing of large strings.
  8273 
  8274    If it is Qnil, the encoded text is stored at CODING->destination.
  8275    The caller must allocate CODING->dst_bytes bytes at
  8276    CODING->destination by xmalloc.  If the encoded text is longer than
  8277    CODING->dst_bytes, CODING->destination is reallocated by xrealloc
  8278    (and CODING->dst_bytes is enlarged accordingly).
       
          When SRC_OBJECT and DST_OBJECT are the same buffer, point is
          repositioned after the replacement, and so are insertion-type
          markers that were at FROM and other markers that were at TO.
          Vdeactivate_mark is restored to the value it had on entry.  */
  8279 
  8280 void
  8281 encode_coding_object (struct coding_system *coding,
  8282                       Lisp_Object src_object,
  8283                       ptrdiff_t from, ptrdiff_t from_byte,
  8284                       ptrdiff_t to, ptrdiff_t to_byte,
  8285                       Lisp_Object dst_object)
  8286 {
  8287   specpdl_ref count = SPECPDL_INDEX ();
  8288   ptrdiff_t chars = to - from;
  8289   ptrdiff_t bytes = to_byte - from_byte;
  8290   Lisp_Object attrs;
         /* saved_pt stays negative unless point is saved below; a
            non-negative value triggers the point restoration at the end.  */
  8291   ptrdiff_t saved_pt = -1, saved_pt_byte UNINIT;
  8292   bool need_marker_adjustment = 0;
  8293   bool kill_src_buffer = 0;
  8294   Lisp_Object old_deactivate_mark;
  8295 
  8296   old_deactivate_mark = Vdeactivate_mark;
  8297 
  8298   coding->src_object = src_object;
  8299   coding->src_chars = chars;
  8300   coding->src_bytes = bytes;
         /* More bytes than characters means the source is multibyte.  */
  8301   coding->src_multibyte = chars < bytes;
  8302 
  8303   attrs = CODING_ID_ATTRS (coding->id);
  8304 
  8305   bool same_buffer = false;
  8306   if (BASE_EQ (src_object, dst_object) && BUFFERP (src_object))
  8307     {
  8308       struct Lisp_Marker *tail;
  8309 
  8310       same_buffer = true;
  8311 
             /* Flag insertion-type markers located at FROM and all other
                markers located at TO; they are repositioned once the
                encoded text is in place.  */
  8312       for (tail = BUF_MARKERS (XBUFFER (src_object)); tail; tail = tail->next)
  8313         {
  8314           tail->need_adjustment
  8315             = tail->charpos == (tail->insertion_type ? from : to);
  8316           need_marker_adjustment |= tail->need_adjustment;
  8317         }
  8318     }
  8319 
  8320   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
  8321     {
             /* The coding system has a pre-write function: copy the source
                text into the buffer returned by code_conversion_save and
                let that function process the copy.  */
  8322       coding->src_object = code_conversion_save (1, coding->src_multibyte);
  8323       set_buffer_internal (XBUFFER (coding->src_object));
  8324       if (STRINGP (src_object))
  8325         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
  8326       else if (BUFFERP (src_object))
  8327         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
  8328       else
  8329         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
  8330 
  8331       if (same_buffer)
  8332         {
                 /* Replacing in place: remember point, then delete the
                    original text now that it has been copied.  */
  8333           set_buffer_internal (XBUFFER (src_object));
  8334           saved_pt = PT, saved_pt_byte = PT_BYTE;
  8335           del_range_both (from, from_byte, to, to_byte, 1);
  8336           set_buffer_internal (XBUFFER (coding->src_object));
  8337         }
  8338 
  8339       safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
  8340                   make_fixnum (BEG), make_fixnum (Z));
             /* If the pre-write function left a different buffer current,
                that buffer becomes the source and is killed at the end.  */
  8341       if (XBUFFER (coding->src_object) != current_buffer)
  8342         kill_src_buffer = 1;
  8343       coding->src_object = Fcurrent_buffer ();
  8344       if (BEG != GPT)
  8345         move_gap_both (BEG, BEG_BYTE);
             /* The whole of the (now current) buffer is the source.  */
  8346       coding->src_chars = Z - BEG;
  8347       coding->src_bytes = Z_BYTE - BEG_BYTE;
  8348       coding->src_pos = BEG;
  8349       coding->src_pos_byte = BEG_BYTE;
  8350       coding->src_multibyte = Z < Z_BYTE;
  8351     }
  8352   else if (STRINGP (src_object))
  8353     {
  8354       code_conversion_save (0, 0);
  8355       coding->src_pos = from;
  8356       coding->src_pos_byte = from_byte;
  8357     }
  8358   else if (BUFFERP (src_object))
  8359     {
  8360       code_conversion_save (0, 0);
  8361       set_buffer_internal (XBUFFER (src_object));
  8362       if (same_buffer)
  8363         {
                 /* Replacing in place: remember point and remove the text.
                    NOTE(review): the value of del_range_1 is used as the
                    source starting at position 0, so it presumably is the
                    deleted text as a string -- confirm.  */
  8364           saved_pt = PT, saved_pt_byte = PT_BYTE;
  8365           coding->src_object = del_range_1 (from, to, 1, 1);
  8366           coding->src_pos = 0;
  8367           coding->src_pos_byte = 0;
  8368         }
  8369       else
  8370         {
                 /* Move the gap to FROM when the region starts before the
                    gap and ends at or after it.  */
  8371           if (from < GPT && to >= GPT)
  8372             move_gap_both (from, from_byte);
  8373           coding->src_pos = from;
  8374           coding->src_pos_byte = from_byte;
  8375         }
  8376     }
  8377   else
  8378     {
  8379       code_conversion_save (0, 0);
  8380       coding->src_pos = from;
  8381       coding->src_pos_byte = from_byte;
  8382     }
  8383 
         /* Set up the destination fields of CODING.  */
  8384   if (BUFFERP (dst_object))
  8385     {
  8386       coding->dst_object = dst_object;
  8387       if (BASE_EQ (src_object, dst_object))
  8388         {
                 /* In-place replacement: output goes where the source began.  */
  8389           coding->dst_pos = from;
  8390           coding->dst_pos_byte = from_byte;
  8391         }
  8392       else
  8393         {
                 /* Output goes to point of the destination buffer; switch
                    to it only long enough to read point and move its gap
                    there.  */
  8394           struct buffer *current = current_buffer;
  8395 
  8396           set_buffer_temp (XBUFFER (dst_object));
  8397           coding->dst_pos = PT;
  8398           coding->dst_pos_byte = PT_BYTE;
  8399           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
  8400           set_buffer_temp (current);
  8401         }
  8402       coding->dst_multibyte
  8403         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
  8404     }
  8405   else if (EQ (dst_object, Qt))
  8406     {
             /* Start with one byte per source character, but at least one
                byte.  */
  8407       ptrdiff_t dst_bytes = max (1, coding->src_chars);
  8408       coding->dst_object = Qnil;
  8409       coding->destination = xmalloc (dst_bytes);
  8410       coding->dst_bytes = dst_bytes;
  8411       coding->dst_multibyte = 0;
  8412     }
  8413   else
  8414     {
  8415       coding->dst_object = Qnil;
  8416       coding->dst_multibyte = 0;
  8417     }
  8418 
  8419   encode_coding (coding);
  8420 
  8421   if (EQ (dst_object, Qt))
  8422     {
             /* NOTE(review): dst_object was set to Qnil above, so the
                BUFFERP case presumably covers encode_coding having
                switched the destination to a buffer -- confirm.  */
  8423       if (BUFFERP (coding->dst_object))
  8424         coding->dst_object = Fbuffer_string ();
  8425       else if (coding->raw_destination)
  8426         /* This is used to avoid creating huge Lisp string.
  8427            NOTE: caller who sets `raw_destination' is also
  8428            responsible for freeing `destination' buffer.  */
  8429         coding->dst_object = Qnil;
  8430       else
  8431         {
  8432           coding->dst_object
  8433             = make_unibyte_string ((char *) coding->destination,
  8434                                    coding->produced);
  8435           xfree (coding->destination);
  8436         }
  8437     }
  8438 
  8439   if (saved_pt >= 0)
  8440     {
  8441       /* This is the case of:
  8442          (BUFFERP (src_object) && BASE_EQ (src_object, dst_object))
  8443          As we have moved PT while replacing the original buffer
  8444          contents, we must recover it now.  */
  8445       set_buffer_internal (XBUFFER (src_object));
             /* Point before the region is unchanged; point inside the
                region goes to FROM; point after it is shifted by the
                change in size (counted in characters for a multibyte
                buffer, in bytes otherwise).  */
  8446       if (saved_pt < from)
  8447         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
  8448       else if (saved_pt < from + chars)
  8449         TEMP_SET_PT_BOTH (from, from_byte);
  8450       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
  8451         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
  8452                           saved_pt_byte + (coding->produced - bytes));
  8453       else
  8454         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
  8455                           saved_pt_byte + (coding->produced - bytes));
  8456 
  8457       if (need_marker_adjustment)
  8458         {
  8459           struct Lisp_Marker *tail;
  8460 
                 /* Flagged insertion-type markers go to the start of the
                    encoded text, the other flagged markers to its end.  */
  8461           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
  8462             if (tail->need_adjustment)
  8463               {
  8464                 tail->need_adjustment = 0;
  8465                 if (tail->insertion_type)
  8466                   {
  8467                     tail->bytepos = from_byte;
  8468                     tail->charpos = from;
  8469                   }
  8470                 else
  8471                   {
  8472                     tail->bytepos = from_byte + coding->produced;
  8473                     tail->charpos
  8474                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
  8475                          ? tail->bytepos : from + coding->produced_char);
  8476                   }
  8477               }
  8478         }
  8479     }
  8480 
  8481   if (kill_src_buffer)
  8482     Fkill_buffer (coding->src_object);
  8483 
  8484   Vdeactivate_mark = old_deactivate_mark;
         /* Unwind the specpdl to the depth recorded on entry.  */
  8485   unbind_to (count, Qnil);
  8486 }
  8487 
  8488 
  8489 /* Return the name of the coding system of the first-priority category.  */
  8490 Lisp_Object
  8491 preferred_coding_system (void)
  8492 {
  8493   int top_id = coding_categories[coding_priorities[0]].id;
  8494   return CODING_ID_NAME (top_id);
  8495 }
  8496 
  8497 #if defined (WINDOWSNT) || defined (CYGWIN) || defined HAVE_ANDROID
  8498 
  8499 /* Convert STR from UTF-16LE with code_convert_string_norecord.  A
  8500    unibyte STR with an odd number of bytes first loses its last byte.  */
  8501 Lisp_Object
  8502 from_unicode (Lisp_Object str)
  8503 {
  8504   CHECK_STRING (str);
  8505 
  8506   bool odd_unibyte = ! STRING_MULTIBYTE (str) && (SBYTES (str) & 1) != 0;
  8507   if (odd_unibyte)
  8508     str = Fsubstring (str, make_fixnum (0), make_fixnum (-1));
  8509   return code_convert_string_norecord (str, Qutf_16le, 0);
  8510 }
  8511 
  8512 /* Convert the null-terminated wide string WSTR into a Lisp string
  8513    by handing its contents to from_unicode.  */
  8514 Lisp_Object
  8515 from_unicode_buffer (const wchar_t *wstr)
  8516 {
  8517 #if defined WINDOWSNT || defined CYGWIN
  8518   /* One of the two final null bytes comes for free, which is why
  8519      the length is one more than the size of the text itself.  */
  8520   ptrdiff_t nbytes = 1 + sizeof (wchar_t) * wcslen (wstr);
  8521   AUTO_STRING_WITH_LEN (str, (char *) wstr, nbytes);
  8522   return from_unicode (str);
  8523 #else
  8524   /* Only Android reaches this branch; there little endian UTF-16
  8525      strings are extended to 32-bit wchar_t, so each unit is
  8526      narrowed back to 16 bits before conversion.  */
  8527   size_t nchars = wcslen (wstr);
  8528   uint16_t *units;
  8529 
  8530   USE_SAFE_ALLOCA;
  8531   SAFE_NALLOCA (units, sizeof *units, nchars + 1);
  8532 
  8533   for (size_t k = 0; k < nchars; k++)
  8534     units[k] = wstr[k];
  8535   units[nchars] = '\0';
  8536 
  8537   AUTO_STRING_WITH_LEN (str, (char *) units, nchars * sizeof *units);
  8538   return unbind_to (sa_count, from_unicode (str));
  8539 #endif
  8540 }
  8541 
  8542 /* Encode STR as UTF-16LE, store the resulting Lisp string in *BUF,
  8543    and return a pointer to its data as a wide string.  The encoded
  8544    text is copied into a string one byte longer whose extra byte is
  8545    zero: strings are already terminated with a single zero byte, so
  8546    the copy ends in two of them, i.e. one UTF-16LE null character.  */
  8547 wchar_t *
  8548 to_unicode (Lisp_Object str, Lisp_Object *buf)
  8549 {
  8550   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
  8551   ptrdiff_t nbytes = SBYTES (*buf);
  8552   Lisp_Object padded = make_uninit_string (nbytes + 1);
  8553   memcpy (SDATA (padded), SDATA (*buf), nbytes);
  8554   SDATA (padded)[nbytes] = '\0';
  8555   *buf = padded;
  8556   return WCSDATA (padded);
  8557 }
  8558 
  8559 #endif /* WINDOWSNT || CYGWIN || HAVE_ANDROID */
  8560 
  8561 
  8562 /*** 8. Emacs Lisp library functions ***/
  8563 
  8564 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
  8565        doc: /* Return t if OBJECT is nil or a coding-system.
  8566 See the documentation of `define-coding-system' for information
  8567 about coding-system objects.  */)
  8568   (Lisp_Object object)
  8569 {
  8570   bool known = NILP (object) || CODING_SYSTEM_ID (object) >= 0;
  8571   /* Failing that, a symbol whose define-form property is non-nil counts.  */
  8572   bool deferred = (! known
  8573                    && SYMBOLP (object)
  8574                    && ! NILP (Fget (object, Qcoding_system_define_form)));
  8575 
  8576   return (known || deferred) ? Qt : Qnil;
  8577 }
  8578 
  8579 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
  8580        Sread_non_nil_coding_system, 1, 1, 0,
  8581        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
  8582   (Lisp_Object prompt)
  8583 {
  8584   Lisp_Object input;
  8585 
  8586   /* Ask again for as long as the answer is the empty string.  */
  8587   do
  8588     input = Fcompleting_read (prompt, Vcoding_system_alist, Qnil, Qt, Qnil,
  8589                               Qcoding_system_history, Qnil, Qnil);
  8590   while (SCHARS (input) == 0);
  8591   return Fintern (input, Qnil);
  8592 }
  8593 
  8594 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
  8595        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
  8596 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
  8597 Ignores case when completing coding systems (all Emacs coding systems
  8598 are lower-case).  */)
  8599   (Lisp_Object prompt, Lisp_Object default_coding_system)
  8600 {
  8601   specpdl_ref base = SPECPDL_INDEX ();
  8602   Lisp_Object fallback = (SYMBOLP (default_coding_system)
  8603                           ? SYMBOL_NAME (default_coding_system)
  8604                           : default_coding_system);
  8605   /* Ignore case only for the duration of this minibuffer read.  */
  8606   specbind (Qcompletion_ignore_case, Qt);
  8607   Lisp_Object answer
  8608     = Fcompleting_read (prompt, Vcoding_system_alist, Qnil, Qt, Qnil,
  8609                         Qcoding_system_history, fallback, Qnil);
  8610   answer = unbind_to (base, answer);
  8611   return SCHARS (answer) != 0 ? Fintern (answer, Qnil) : Qnil;
  8612 }
  8613 
  8614 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
  8615        1, 1, 0,
  8616        doc: /* Check validity of CODING-SYSTEM.
  8617 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
  8618 It is valid if it is nil or a symbol defined as a coding system by the
  8619 function `define-coding-system'.  */)
  8620   (Lisp_Object coding_system)
  8621 {
  8622   Lisp_Object pending = Fget (coding_system, Qcoding_system_define_form);
  8623   /* A stored define form is cleared from the symbol first and then
  8624      evaluated, before the validity check below.  */
  8625   if (! NILP (pending))
  8626     {
  8627       Fput (coding_system, Qcoding_system_define_form, Qnil);
  8628       safe_eval (pending);
  8629     }
  8630   if (NILP (Fcoding_system_p (coding_system)))
  8631     xsignal1 (Qcoding_system_error, coding_system);
  8632   return coding_system;
  8633 }
  8634 
  8635 
  8636 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
  8637    HIGHEST, return the coding system of the highest
  8638    priority among the detected coding systems.  Otherwise return a
  8639    list of detected coding systems sorted by their priorities.  If
  8640    MULTIBYTEP, it is assumed that the bytes are in correct
  8641    multibyte form but contain only ASCII and eight-bit chars.
  8642    Otherwise, the bytes are raw bytes.
  8643 
  8644    CODING-SYSTEM controls the detection as below:
  8645 
  8646    If it is nil, detect both text-format and eol-format.  If the
  8647    text-format part of CODING-SYSTEM is already specified
  8648    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
  8649    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
  8650    detect only text-format.
       
          With HIGHEST, the value is Qnil when no coding system was put on
          the candidate list.  */
  8651 
  8652 Lisp_Object
  8653 detect_coding_system (const unsigned char *src,
  8654                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
  8655                       bool highest, bool multibytep,
  8656                       Lisp_Object coding_system)
  8657 {
  8658   const unsigned char *src_end = src + src_bytes;
  8659   Lisp_Object attrs, eol_type;
         /* VAL collects coding-system IDs as fixnums; the eol pass at the
            end replaces each ID with a coding system name.  */
  8660   Lisp_Object val = Qnil;
  8661   struct coding_system coding;
  8662   ptrdiff_t id;
  8663   struct coding_detection_info detect_info = {0};
  8664   enum coding_category base_category;
  8665   bool null_byte_found = 0, eight_bit_found = 0;
  8666 
         /* A nil CODING_SYSTEM is treated as `undecided'.  */
  8667   if (NILP (coding_system))
  8668     coding_system = Qundecided;
  8669   setup_coding_system (coding_system, &coding);
  8670   attrs = CODING_ID_ATTRS (coding.id);
  8671   eol_type = CODING_ID_EOL_TYPE (coding.id);
  8672   coding_system = CODING_ATTR_BASE_NAME (attrs);
  8673 
  8674   coding.source = src;
  8675   coding.src_chars = src_chars;
  8676   coding.src_bytes = src_bytes;
  8677   coding.src_multibyte = multibytep;
  8678   coding.consumed = 0;
  8679   coding.mode |= CODING_MODE_LAST_BLOCK;
  8680   coding.head_ascii = 0;
  8681 
  8682   /* At first, detect text-format if necessary.  */
  8683   base_category = XFIXNUM (CODING_ATTR_CATEGORY (attrs));
  8684   if (base_category == coding_category_undecided)
  8685     {
             /* CATEGORY and THIS are assigned inside the priority loops
                below and read afterwards only in the HIGHEST case with a
                non-zero detect_info.found.  */
  8686       enum coding_category category UNINIT;
  8687       struct coding_system *this UNINIT;
  8688       int c, i;
  8689       bool inhibit_nbd = inhibit_flag (coding.spec.undecided.inhibit_nbd,
  8690                                        inhibit_null_byte_detection);
  8691       bool inhibit_ied = inhibit_flag (coding.spec.undecided.inhibit_ied,
  8692                                        inhibit_iso_escape_detection);
  8693       bool prefer_utf_8 = coding.spec.undecided.prefer_utf_8;
  8694 
  8695       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
             /* coding.head_ascii is incremented for each byte scanned
                before the first 8-bit byte.  The scan stops early once
                both a null byte and an 8-bit byte have been seen, or when
                the ISO-2022 detector returns nonzero.  */
  8696       for (; src < src_end; src++)
  8697         {
  8698           c = *src;
  8699           if (c & 0x80)
  8700             {
  8701               eight_bit_found = 1;
  8702               if (null_byte_found)
  8703                 break;
  8704             }
  8705           else if (c < 0x20)
  8706             {
  8707               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
  8708                   && ! inhibit_ied
  8709                   && ! detect_info.checked)
  8710                 {
  8711                   if (detect_coding_iso_2022 (&coding, &detect_info))
  8712                     {
  8713                       /* We have scanned the whole data.  */
  8714                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
  8715                         {
  8716                           /* We didn't find an 8-bit code.  We may
  8717                              have found a null-byte, but it's very
  8718                              rare that a binary file conform to
  8719                              ISO-2022.  */
  8720                           src = src_end;
  8721                           coding.head_ascii = src - coding.source;
  8722                         }
                             /* Reject every category outside the
                                ISO-escape mask.  */
  8723                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
  8724                       break;
  8725                     }
  8726                 }
  8727               else if (! c && !inhibit_nbd)
  8728                 {
  8729                   null_byte_found = 1;
  8730                   if (eight_bit_found)
  8731                     break;
  8732                 }
  8733               if (! eight_bit_found)
  8734                 coding.head_ascii++;
  8735             }
  8736           else if (! eight_bit_found)
  8737             coding.head_ascii++;
  8738         }
  8739 
             /* Run the per-category detectors only if the scan above saw
                something other than plain ASCII.  */
  8740       if (null_byte_found || eight_bit_found
  8741           || coding.head_ascii < coding.src_bytes
  8742           || detect_info.found)
  8743         {
  8744           if (coding.head_ascii == coding.src_bytes)
  8745             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
  8746             for (i = 0; i < coding_category_raw_text; i++)
  8747               {
  8748                 category = coding_priorities[i];
  8749                 this = coding_categories + category;
  8750                 if (detect_info.found & (1 << category))
  8751                   break;
  8752               }
  8753           else
  8754             {
  8755               if (null_byte_found)
  8756                 {
                       /* With a null byte present, mark every category
                          outside the UTF-16 mask as checked and rejected.  */
  8757                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
  8758                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
  8759                 }
  8760               else if (prefer_utf_8
  8761                        && detect_coding_utf_8 (&coding, &detect_info))
  8762                 {
                       /* UTF-8 is preferred and was detected: mark every
                          category outside the UTF-8 mask as checked and
                          rejected.  */
  8763                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
  8764                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
  8765                 }
                   /* Try the remaining categories in priority order.  With
                      HIGHEST, stop at the first category that is found.  */
  8766               for (i = 0; i < coding_category_raw_text; i++)
  8767                 {
  8768                   category = coding_priorities[i];
  8769                   this = coding_categories + category;
  8770 
  8771                   if (this->id < 0)
  8772                     {
  8773                       /* No coding system of this category is defined.  */
  8774                       detect_info.rejected |= (1 << category);
  8775                     }
  8776                   else if (category >= coding_category_raw_text)
  8777                     continue;
  8778                   else if (detect_info.checked & (1 << category))
  8779                     {
  8780                       if (highest
  8781                           && (detect_info.found & (1 << category)))
  8782                         break;
  8783                     }
  8784                   else if ((*(this->detector)) (&coding, &detect_info)
  8785                            && highest
  8786                            && (detect_info.found & (1 << category)))
  8787                     {
                           /* Resolve the automatic UTF-16 category to the
                              little- or big-endian one.  */
  8788                       if (category == coding_category_utf_16_auto)
  8789                         {
  8790                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
  8791                             category = coding_category_utf_16_le;
  8792                           else
  8793                             category = coding_category_utf_16_be;
  8794                         }
  8795                       break;
  8796                     }
  8797                 }
  8798             }
  8799         }
  8800 
             /* Turn the detection masks into the list of candidate IDs.  */
  8801       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
  8802           || null_byte_found)
  8803         {
                 /* Everything was rejected, or a null byte was seen: the
                    only candidate is `no-conversion'.  */
  8804           detect_info.found = CATEGORY_MASK_RAW_TEXT;
  8805           id = CODING_SYSTEM_ID (Qno_conversion);
  8806           val = list1i (id);
  8807         }
  8808       else if (! detect_info.rejected && ! detect_info.found)
  8809         {
                 /* Nothing was rejected or found: use the coding system
                    of the `undecided' category.  */
  8810           detect_info.found = CATEGORY_MASK_ANY;
  8811           id = coding_categories[coding_category_undecided].id;
  8812           val = list1i (id);
  8813         }
  8814       else if (highest)
  8815         {
  8816           if (detect_info.found)
  8817             {
                     /* CATEGORY and THIS are where the priority loop
                        above stopped.  */
  8818               detect_info.found = 1 << category;
  8819               val = list1i (this->id);
  8820             }
  8821           else
                   /* Nothing found: take the first category, in priority
                      order, that was not rejected.  */
  8822             for (i = 0; i < coding_category_raw_text; i++)
  8823               if (! (detect_info.rejected & (1 << coding_priorities[i])))
  8824                 {
  8825                   detect_info.found = 1 << coding_priorities[i];
  8826                   id = coding_categories[coding_priorities[i]].id;
  8827                   val = list1i (id);
  8828                   break;
  8829                 }
  8830         }
  8831       else
  8832         {
  8833           int mask = detect_info.rejected | detect_info.found;
  8834           int found = 0;
  8835 
                 /* Walk the categories from the end of the priority order
                    to its start, looking at those neither rejected nor
                    found.
                    NOTE(review): each hit assigns VAL a fresh one-element
                    list instead of consing onto it, so only the last hit
                    survives -- confirm that this is intended.  */
  8836           for (i = coding_category_raw_text - 1; i >= 0; i--)
  8837             {
  8838               category = coding_priorities[i];
  8839               if (! (mask & (1 << category)))
  8840                 {
  8841                   found |= 1 << category;
  8842                   id = coding_categories[category].id;
  8843                   if (id >= 0)
  8844                     val = list1i (id);
  8845                 }
  8846             }
                 /* Then push the found categories in front; walking in the
                    same reverse direction leaves the first-priority one at
                    the head of VAL.  */
  8847           for (i = coding_category_raw_text - 1; i >= 0; i--)
  8848             {
  8849               category = coding_priorities[i];
  8850               if (detect_info.found & (1 << category))
  8851                 {
  8852                   id = coding_categories[category].id;
  8853                   val = Fcons (make_fixnum (id), val);
  8854                 }
  8855             }
  8856           detect_info.found |= found;
  8857         }
  8858     }
  8859   else if (base_category == coding_category_utf_8_auto)
  8860     {
             /* Only choose between UTF-8 with and without signature.  */
  8861       if (detect_coding_utf_8 (&coding, &detect_info))
  8862         {
  8863           struct coding_system *this;
  8864 
  8865           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
  8866             this = coding_categories + coding_category_utf_8_sig;
  8867           else
  8868             this = coding_categories + coding_category_utf_8_nosig;
  8869           val = list1i (this->id);
  8870         }
  8871     }
  8872   else if (base_category == coding_category_utf_16_auto)
  8873     {
             /* Only choose among the UTF-16 variants.  */
  8874       if (detect_coding_utf_16 (&coding, &detect_info))
  8875         {
  8876           struct coding_system *this;
  8877 
  8878           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
  8879             this = coding_categories + coding_category_utf_16_le;
  8880           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
  8881             this = coding_categories + coding_category_utf_16_be;
  8882           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
  8883             this = coding_categories + coding_category_utf_16_be_nosig;
  8884           else
  8885             this = coding_categories + coding_category_utf_16_le_nosig;
  8886           val = list1i (this->id);
  8887         }
  8888     }
  8889   else
  8890     {
             /* The text format is already specified: keep it.  */
  8891       detect_info.found = 1 << XFIXNUM (CODING_ATTR_CATEGORY (attrs));
  8892       val = list1i (coding.id);
  8893     }
  8894 
  8895   /* Then, detect eol-format if necessary.  */
  8896   {
           /* One eol result for each of the three byte layouts; -1 means
              that layout was not examined.  */
  8897     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
  8898     Lisp_Object tail;
  8899 
           /* A vector eol_type is the case where the eol format is looked
              up in the data; otherwise the specified one is used for all
              three layouts.  */
  8900     if (VECTORP (eol_type))
  8901       {
  8902         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
  8903           {
                   /* When a null byte was seen, LF is assumed without
                      scanning.  */
  8904             if (null_byte_found)
  8905               normal_eol = EOL_SEEN_LF;
  8906             else
  8907               normal_eol = detect_eol (coding.source, src_bytes,
  8908                                        coding_category_raw_text);
  8909           }
  8910         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
  8911                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
  8912           utf_16_be_eol = detect_eol (coding.source, src_bytes,
  8913                                       coding_category_utf_16_be);
  8914         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
  8915                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
  8916           utf_16_le_eol = detect_eol (coding.source, src_bytes,
  8917                                       coding_category_utf_16_le);
  8918       }
  8919     else
  8920       {
  8921         if (EQ (eol_type, Qunix))
  8922           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
  8923         else if (EQ (eol_type, Qdos))
  8924           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
  8925         else
  8926           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
  8927       }
  8928 
           /* Replace each ID in VAL with a coding system name: element 0,
              1 or 2 of that coding system's eol vector for LF, CRLF or CR
              respectively, or the coding system's own name when there is
              no eol vector or no single eol format was recorded.  */
  8929     for (tail = val; CONSP (tail); tail = XCDR (tail))
  8930       {
  8931         enum coding_category category;
  8932         int this_eol;
  8933 
  8934         id = XFIXNUM (XCAR (tail));
  8935         attrs = CODING_ID_ATTRS (id);
  8936         category = XFIXNUM (CODING_ATTR_CATEGORY (attrs));
  8937         eol_type = CODING_ID_EOL_TYPE (id);
  8938         if (VECTORP (eol_type))
  8939           {
  8940             if (category == coding_category_utf_16_be
  8941                 || category == coding_category_utf_16_be_nosig)
  8942               this_eol = utf_16_be_eol;
  8943             else if (category == coding_category_utf_16_le
  8944                      || category == coding_category_utf_16_le_nosig)
  8945               this_eol = utf_16_le_eol;
  8946             else
  8947               this_eol = normal_eol;
  8948 
  8949             if (this_eol == EOL_SEEN_LF)
  8950               XSETCAR (tail, AREF (eol_type, 0));
  8951             else if (this_eol == EOL_SEEN_CRLF)
  8952               XSETCAR (tail, AREF (eol_type, 1));
  8953             else if (this_eol == EOL_SEEN_CR)
  8954               XSETCAR (tail, AREF (eol_type, 2));
  8955             else
  8956               XSETCAR (tail, CODING_ID_NAME (id));
  8957           }
  8958         else
  8959           XSETCAR (tail, CODING_ID_NAME (id));
  8960       }
  8961   }
  8962 
         /* With HIGHEST, return just the head of the list (or Qnil when
            the list is empty); otherwise return the whole list.  */
  8963   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
  8964 }
  8965 
  8966 
  8967 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
  8968        2, 3, 0,
  8969        doc: /* Detect coding system of the text in the region between START and END.
  8970 Return a list of possible coding systems ordered by priority.
  8971 The coding systems to try and their priorities follows what
  8972 the function `coding-system-priority-list' (which see) returns.
  8973 
  8974 If only ASCII characters are found (except for such ISO-2022 control
  8975 characters as ESC), it returns a list of single element `undecided'
  8976 or its subsidiary coding system according to a detected end-of-line
  8977 format.
  8978 
  8979 If optional argument HIGHEST is non-nil, return the coding system of
  8980 highest priority.  */)
  8981   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
  8982 {
  8983   validate_region (&start, &end);
  8984 
  8985   ptrdiff_t from = XFIXNUM (start);
  8986   ptrdiff_t to = XFIXNUM (end);
  8987   ptrdiff_t from_byte = CHAR_TO_BYTE (from);
  8988   ptrdiff_t to_byte = CHAR_TO_BYTE (to);
  8989 
  8990   /* Move the gap to TO when the region starts before the gap and
  8991      ends at or after it; the detector is handed one run of bytes.  */
  8992   bool gap_in_region = from < GPT && GPT <= to;
  8993   if (gap_in_region)
  8994     move_gap_both (to, to_byte);
  8995 
  8996   bool multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
  8997   return detect_coding_system (BYTE_POS_ADDR (from_byte),
  8998                                to - from, to_byte - from_byte,
  8999                                ! NILP (highest), multibyte, Qnil);
  9000 }
  9001 
  9002 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
  9003        1, 2, 0,
  9004        doc: /* Detect coding system of the text in STRING.
  9005 Return a list of possible coding systems ordered by priority.
  9006 The coding systems to try and their priorities follows what
  9007 the function `coding-system-priority-list' (which see) returns.
  9008 
  9009 If only ASCII characters are found (except for such ISO-2022 control
  9010 characters as ESC), it returns a list of single element `undecided'
  9011 or its subsidiary coding system according to a detected end-of-line
  9012 format.
  9013 
  9014 If optional argument HIGHEST is non-nil, return the coding system of
  9015 highest priority.  */)
  9016   (Lisp_Object string, Lisp_Object highest)
  9017 {
  9018   CHECK_STRING (string);
  9019   bool want_highest = ! NILP (highest);
  9020   bool multibyte = STRING_MULTIBYTE (string);
  9021 
  9022   return detect_coding_system (SDATA (string), SCHARS (string),
  9023                                SBYTES (string), want_highest, multibyte, Qnil);
  9024 }
  9025 
  9026 
  9027 static bool
  9028 char_encodable_p (int c, Lisp_Object attrs)
  9029 {
  9030   Lisp_Object tail;
  9031   struct charset *charset;
  9032   Lisp_Object translation_table;
  9033 
  9034   translation_table = CODING_ATTR_TRANS_TBL (attrs);
  9035   if (! NILP (translation_table))
  9036     c = translate_char (translation_table, c);
  9037   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
  9038        CONSP (tail); tail = XCDR (tail))
  9039     {
  9040       charset = CHARSET_FROM_ID (XFIXNUM (XCAR (tail)));
  9041       if (CHAR_CHARSET_P (c, charset))
  9042         break;
  9043     }
  9044   return (! NILP (tail));
  9045 }
  9046 
  9047 
  9048 /* Return a list of coding systems that safely encode the text between
  9049    START and END.  If EXCLUDE is non-nil, it is a list of coding
  9050    systems not to check.  The returned list doesn't contain any such
  9051    coding systems.  In any case, if the text contains only ASCII or is
  9052    unibyte, return t.  */
  9053 
  9054 DEFUN ("find-coding-systems-region-internal",
  9055        Ffind_coding_systems_region_internal,
  9056        Sfind_coding_systems_region_internal, 2, 3, 0,
  9057        doc: /* Internal use only.  */)
  9058   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
  9059 {
  9060   Lisp_Object coding_attrs_list, safe_codings;
  9061   ptrdiff_t start_byte, end_byte;
  9062   const unsigned char *p, *pbeg, *pend;
  9063   int c;
  9064   Lisp_Object tail, elt, work_table;
  9065 
  9066   if (STRINGP (start))
  9067     {
  9068       if (!STRING_MULTIBYTE (start)
  9069           || SCHARS (start) == SBYTES (start))
  9070         return Qt;
  9071       start_byte = 0;
  9072       end_byte = SBYTES (start);
  9073     }
  9074   else
  9075     {
  9076       EMACS_INT s = fix_position (start);
  9077       EMACS_INT e = fix_position (end);
  9078       if (! (BEG <= s && s <= e && e <= Z))
  9079         args_out_of_range (start, end);
  9080       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
  9081         return Qt;
  9082       start_byte = CHAR_TO_BYTE (s);
  9083       end_byte = CHAR_TO_BYTE (e);
  9084       if (e - s == end_byte - start_byte)
  9085         return Qt;
  9086 
  9087       if (s < GPT && GPT < e)
  9088         {
  9089           if (GPT - s < e - GPT)
  9090             move_gap_both (s, start_byte);
  9091           else
  9092             move_gap_both (e, end_byte);
  9093         }
  9094     }
  9095 
  9096   coding_attrs_list = Qnil;
  9097   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
  9098     if (NILP (exclude)
  9099         || NILP (Fmemq (XCAR (tail), exclude)))
  9100       {
  9101         Lisp_Object attrs;
  9102 
  9103         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
  9104         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs)))
  9105           {
  9106             ASET (attrs, coding_attr_trans_tbl,
  9107                   get_translation_table (attrs, 1, NULL));
  9108             coding_attrs_list = Fcons (attrs, coding_attrs_list);
  9109           }
  9110       }
  9111 
  9112   if (STRINGP (start))
  9113     p = pbeg = SDATA (start);
  9114   else
  9115     p = pbeg = BYTE_POS_ADDR (start_byte);
  9116   pend = p + (end_byte - start_byte);
  9117 
  9118   while (p < pend && ASCII_CHAR_P (*p)) p++;
  9119   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
  9120 
  9121   work_table = Fmake_char_table (Qnil, Qnil);
  9122   while (p < pend)
  9123     {
  9124       if (ASCII_CHAR_P (*p))
  9125         p++;
  9126       else
  9127         {
  9128           c = string_char_advance (&p);
  9129           if (!NILP (char_table_ref (work_table, c)))
  9130             /* This character was already checked.  Ignore it.  */
  9131             continue;
  9132 
  9133           charset_map_loaded = 0;
  9134           for (tail = coding_attrs_list; CONSP (tail);)
  9135             {
  9136               elt = XCAR (tail);
  9137               if (NILP (elt))
  9138                 tail = XCDR (tail);
  9139               else if (char_encodable_p (c, elt))
  9140                 tail = XCDR (tail);
  9141               else if (CONSP (XCDR (tail)))
  9142                 {
  9143                   XSETCAR (tail, XCAR (XCDR (tail)));
  9144                   XSETCDR (tail, XCDR (XCDR (tail)));
  9145                 }
  9146               else
  9147                 {
  9148                   XSETCAR (tail, Qnil);
  9149                   tail = XCDR (tail);
  9150                 }
  9151             }
  9152           if (charset_map_loaded)
  9153             {
  9154               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
  9155 
  9156               if (STRINGP (start))
  9157                 pbeg = SDATA (start);
  9158               else
  9159                 pbeg = BYTE_POS_ADDR (start_byte);
  9160               p = pbeg + p_offset;
  9161               pend = pbeg + pend_offset;
  9162             }
  9163           char_table_set (work_table, c, Qt);
  9164         }
  9165     }
  9166 
  9167   safe_codings = list2 (Qraw_text, Qno_conversion);
  9168   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
  9169     if (! NILP (XCAR (tail)))
  9170       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
  9171 
  9172   return safe_codings;
  9173 }
  9174 
  9175 
  9176 DEFUN ("unencodable-char-position", Funencodable_char_position,
  9177        Sunencodable_char_position, 3, 5, 0,
  9178        doc: /* Return position of first un-encodable character in a region.
  9179 START and END specify the region and CODING-SYSTEM specifies the
  9180 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
  9181 
  9182 If optional 4th argument COUNT is non-nil, it specifies at most how
  9183 many un-encodable characters to search.  In this case, the value is a
  9184 list of positions.
  9185 
  9186 If optional 5th argument STRING is non-nil, it is a string to search
  9187 for un-encodable characters.  In that case, START and END are indexes
  9188 to the string and treated as in `substring'.  */)
  9189   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system,
  9190    Lisp_Object count, Lisp_Object string)
  9191 {
  9192   EMACS_INT n;
  9193   struct coding_system coding;
  9194   Lisp_Object attrs, charset_list, translation_table;
  9195   Lisp_Object positions;
  9196   ptrdiff_t from, to;
  9197   const unsigned char *p, *stop, *pend;
  9198   bool ascii_compatible;
  9199 
  9200   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
  9201   attrs = CODING_ID_ATTRS (coding.id);
  9202   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
  9203     return Qnil;
  9204   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
  9205   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
  9206   translation_table = get_translation_table (attrs, 1, NULL);
  9207 
  9208   if (NILP (string))
  9209     {
  9210       validate_region (&start, &end);
  9211       from = XFIXNUM (start);
  9212       to = XFIXNUM (end);
  9213       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
  9214           || (ascii_compatible
  9215               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
  9216         return Qnil;
  9217       p = CHAR_POS_ADDR (from);
  9218       pend = CHAR_POS_ADDR (to);
  9219       if (from < GPT && to >= GPT)
  9220         stop = GPT_ADDR;
  9221       else
  9222         stop = pend;
  9223     }
  9224   else
  9225     {
  9226       CHECK_STRING (string);
  9227       validate_subarray (string, start, end, SCHARS (string), &from, &to);
  9228       if (! STRING_MULTIBYTE (string))
  9229         return Qnil;
  9230       p = SDATA (string) + string_char_to_byte (string, from);
  9231       stop = pend = SDATA (string) + string_char_to_byte (string, to);
  9232       if (ascii_compatible && (to - from) == (pend - p))
  9233         return Qnil;
  9234     }
  9235 
  9236   if (NILP (count))
  9237     n = 1;
  9238   else
  9239     {
  9240       CHECK_FIXNAT (count);
  9241       n = XFIXNUM (count);
  9242     }
  9243 
  9244   positions = Qnil;
  9245   charset_map_loaded = 0;
  9246   while (1)
  9247     {
  9248       int c;
  9249 
  9250       if (ascii_compatible)
  9251         while (p < stop && ASCII_CHAR_P (*p))
  9252           p++, from++;
  9253       if (p >= stop)
  9254         {
  9255           if (p >= pend)
  9256             break;
  9257           stop = pend;
  9258           p = GAP_END_ADDR;
  9259         }
  9260 
  9261       c = string_char_advance (&p);
  9262       if (! (ASCII_CHAR_P (c) && ascii_compatible)
  9263           && ! char_charset (translate_char (translation_table, c),
  9264                              charset_list, NULL))
  9265         {
  9266           positions = Fcons (make_fixnum (from), positions);
  9267           n--;
  9268           if (n == 0)
  9269             break;
  9270         }
  9271 
  9272       from++;
  9273       if (charset_map_loaded && NILP (string))
  9274         {
  9275           p = CHAR_POS_ADDR (from);
  9276           pend = CHAR_POS_ADDR (to);
  9277           if (from < GPT && to >= GPT)
  9278             stop = GPT_ADDR;
  9279           else
  9280             stop = pend;
  9281           charset_map_loaded = 0;
  9282         }
  9283     }
  9284 
  9285   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
  9286 }
  9287 
  9288 
  9289 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
  9290        Scheck_coding_systems_region, 3, 3, 0,
  9291        doc: /* Check if text between START and END is encodable by CODING-SYSTEM-LIST.
  9292 
  9293 START and END are buffer positions specifying the region.
  9294 CODING-SYSTEM-LIST is a list of coding systems to check.
  9295 
  9296 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
  9297 function returns nil.
  9298 
  9299 If some of the coding systems cannot encode the whole region, value is
  9300 an alist, each element of which has the form (CODING-SYSTEM POS1 POS2 ...),
  9301 which means that CODING-SYSTEM cannot encode the text at buffer positions
  9302 POS1, POS2, ...
  9303 
  9304 START may be a string.  In that case, check if the string is
  9305 encodable, and the value contains character indices into the string
  9306 instead of buffer positions.  END is ignored in this case.
  9307 
  9308 If the current buffer (or START if it is a string) is unibyte, the value
  9309 is nil.  */)
  9310   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
  9311 {
  9312   Lisp_Object list;
  9313   ptrdiff_t start_byte, end_byte;
  9314   ptrdiff_t pos;
  9315   const unsigned char *p, *pbeg, *pend;
  9316   int c;
  9317   Lisp_Object tail, elt, attrs;
  9318 
  9319   if (STRINGP (start))
  9320     {
  9321       if (!STRING_MULTIBYTE (start)
  9322           || SCHARS (start) == SBYTES (start))
  9323         return Qnil;
  9324       start_byte = 0;
  9325       end_byte = SBYTES (start);
  9326       pos = 0;
  9327     }
  9328   else
  9329     {
  9330       EMACS_INT s = fix_position (start);
  9331       EMACS_INT e = fix_position (end);
  9332       if (! (BEG <= s && s <= e && e <= Z))
  9333         args_out_of_range (start, end);
  9334       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
  9335         return Qnil;
  9336       start_byte = CHAR_TO_BYTE (s);
  9337       end_byte = CHAR_TO_BYTE (e);
  9338       if (e - s == end_byte - start_byte)
  9339         return Qnil;
  9340 
  9341       if (s < GPT && GPT < e)
  9342         {
  9343           if (GPT - s < e - GPT)
  9344             move_gap_both (s, start_byte);
  9345           else
  9346             move_gap_both (e, end_byte);
  9347         }
  9348       pos = s;
  9349     }
  9350 
  9351   list = Qnil;
  9352   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
  9353     {
  9354       elt = XCAR (tail);
  9355       Lisp_Object spec = CODING_SYSTEM_SPEC (elt);
  9356       if (!VECTORP (spec))
  9357         xsignal1 (Qcoding_system_error, elt);
  9358       attrs = AREF (spec, 0);
  9359       ASET (attrs, coding_attr_trans_tbl,
  9360             get_translation_table (attrs, 1, NULL));
  9361       list = Fcons (list2 (elt, attrs), list);
  9362     }
  9363 
  9364   if (STRINGP (start))
  9365     p = pbeg = SDATA (start);
  9366   else
  9367     p = pbeg = BYTE_POS_ADDR (start_byte);
  9368   pend = p + (end_byte - start_byte);
  9369 
  9370   while (p < pend && ASCII_CHAR_P (*p)) p++, pos++;
  9371   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
  9372 
  9373   while (p < pend)
  9374     {
  9375       if (ASCII_CHAR_P (*p))
  9376         p++;
  9377       else
  9378         {
  9379           c = string_char_advance (&p);
  9380 
  9381           charset_map_loaded = 0;
  9382           for (tail = list; CONSP (tail); tail = XCDR (tail))
  9383             {
  9384               elt = XCDR (XCAR (tail));
  9385               if (! char_encodable_p (c, XCAR (elt)))
  9386                 XSETCDR (elt, Fcons (make_fixnum (pos), XCDR (elt)));
  9387             }
  9388           if (charset_map_loaded)
  9389             {
  9390               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
  9391 
  9392               if (STRINGP (start))
  9393                 pbeg = SDATA (start);
  9394               else
  9395                 pbeg = BYTE_POS_ADDR (start_byte);
  9396               p = pbeg + p_offset;
  9397               pend = pbeg + pend_offset;
  9398             }
  9399         }
  9400       pos++;
  9401     }
  9402 
  9403   tail = list;
  9404   list = Qnil;
  9405   for (; CONSP (tail); tail = XCDR (tail))
  9406     {
  9407       elt = XCAR (tail);
  9408       if (CONSP (XCDR (XCDR (elt))))
  9409         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
  9410                       list);
  9411     }
  9412 
  9413   return list;
  9414 }
  9415 
  9416 
  9417 static Lisp_Object
  9418 code_convert_region (Lisp_Object start, Lisp_Object end,
  9419                      Lisp_Object coding_system, Lisp_Object dst_object,
  9420                      bool encodep, bool norecord)
  9421 {
  9422   struct coding_system coding;
  9423   ptrdiff_t from, from_byte, to, to_byte;
  9424   Lisp_Object src_object;
  9425 
  9426   if (NILP (coding_system))
  9427     coding_system = Qno_conversion;
  9428   else
  9429     CHECK_CODING_SYSTEM (coding_system);
  9430   src_object = Fcurrent_buffer ();
  9431   if (NILP (dst_object))
  9432     dst_object = src_object;
  9433   else if (! EQ (dst_object, Qt))
  9434     CHECK_BUFFER (dst_object);
  9435 
  9436   validate_region (&start, &end);
  9437   from = XFIXNAT (start);
  9438   from_byte = CHAR_TO_BYTE (from);
  9439   to = XFIXNAT (end);
  9440   to_byte = CHAR_TO_BYTE (to);
  9441 
  9442   setup_coding_system (coding_system, &coding);
  9443   coding.mode |= CODING_MODE_LAST_BLOCK;
  9444 
  9445   if (BUFFERP (dst_object) && !BASE_EQ (dst_object, src_object))
  9446     {
  9447       struct buffer *buf = XBUFFER (dst_object);
  9448       ptrdiff_t buf_pt = BUF_PT (buf);
  9449 
  9450       invalidate_buffer_caches (buf, buf_pt, buf_pt);
  9451     }
  9452 
  9453   if (encodep)
  9454     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
  9455                           dst_object);
  9456   else
  9457     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
  9458                           dst_object);
  9459   if (! norecord)
  9460     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
  9461 
  9462   return (BUFFERP (dst_object)
  9463           ? make_fixnum (coding.produced_char)
  9464           : coding.dst_object);
  9465 }
  9466 
  9467 
  9468 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
  9469        3, 4, "r\nzCoding system: ",
  9470        doc: /* Decode the current region using the specified coding system.
  9471 Interactively, prompt for the coding system to decode the region, and
  9472 replace the region with the decoded text.
  9473 
  9474 \"Decoding\" means transforming bytes into readable text (characters).
  9475 If, for instance, you have a region that contains data that represents
  9476 the two bytes #xc2 #xa9, after calling this function with the utf-8
  9477 coding system, the region will contain the single
  9478 character ?\\N{COPYRIGHT SIGN}.
  9479 
  9480 When called from a program, takes four arguments:
  9481         START, END, CODING-SYSTEM, and DESTINATION.
  9482 START and END are buffer positions.
  9483 
  9484 Optional 4th arguments DESTINATION specifies where the decoded text goes.
  9485 If nil, the region between START and END is replaced by the decoded text.
  9486 If buffer, the decoded text is inserted in that buffer after point (point
  9487 does not move).  If that buffer is unibyte, it receives the individual
  9488 bytes of the internal representation of the decoded text.
  9489 In those cases, the length of the decoded text is returned.
  9490 If DESTINATION is t, the decoded text is returned.
  9491 
  9492 This function sets `last-coding-system-used' to the precise coding system
  9493 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
  9494 not fully specified.)  */)
  9495   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
  9496 {
  9497   return code_convert_region (start, end, coding_system, destination, 0, 0);
  9498 }
  9499 
  9500 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
  9501        3, 4, "r\nzCoding system: ",
  9502        doc: /* Encode the current region using th specified coding system.
  9503 Interactively, prompt for the coding system to encode the region, and
  9504 replace the region with the bytes that are the result of the encoding.
  9505 
  9506 What's meant by \"encoding\" is transforming textual data (characters)
  9507 into bytes.  If, for instance, you have a region that contains the
  9508 single character ?\\N{COPYRIGHT SIGN}, after calling this function with
  9509 the utf-8 coding system, the data in the region will represent the two
  9510 bytes #xc2 #xa9.
  9511 
  9512 When called from a program, takes four arguments:
  9513         START, END, CODING-SYSTEM and DESTINATION.
  9514 START and END are buffer positions.
  9515 
  9516 Optional 4th argument DESTINATION specifies where the encoded text goes.
  9517 If nil, the region between START and END is replaced by the encoded text.
  9518 If buffer, the encoded text is inserted in that buffer after point (point
  9519 does not move).
  9520 In those cases, the length of the encoded text is returned.
  9521 If DESTINATION is t, the encoded text is returned.
  9522 
  9523 This function sets `last-coding-system-used' to the precise coding system
  9524 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
  9525 not fully specified.)  */)
  9526   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
  9527 {
  9528   return code_convert_region (start, end, coding_system, destination, 1, 0);
  9529 }
  9530 
  9531 /* Whether STRING only contains chars in the 0..127 range.  */
  9532 bool
  9533 string_ascii_p (Lisp_Object string)
  9534 {
  9535   ptrdiff_t nbytes = SBYTES (string);
  9536   for (ptrdiff_t i = 0; i < nbytes; i++)
  9537     if (SREF (string, i) > 127)
  9538       return false;
  9539   return true;
  9540 }
  9541 
  9542 Lisp_Object
  9543 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
  9544                      Lisp_Object dst_object, bool encodep, bool nocopy,
  9545                      bool norecord)
  9546 {
  9547   struct coding_system coding;
  9548   ptrdiff_t chars, bytes;
  9549 
  9550   CHECK_STRING (string);
  9551   if (NILP (coding_system))
  9552     {
  9553       if (! norecord)
  9554         Vlast_coding_system_used = Qno_conversion;
  9555       if (NILP (dst_object))
  9556         return nocopy ? string : Fcopy_sequence (string);
  9557     }
  9558 
  9559   if (NILP (coding_system))
  9560     coding_system = Qno_conversion;
  9561   else
  9562     CHECK_CODING_SYSTEM (coding_system);
  9563   if (NILP (dst_object))
  9564     dst_object = Qt;
  9565   else if (! EQ (dst_object, Qt))
  9566     CHECK_BUFFER (dst_object);
  9567 
  9568   setup_coding_system (coding_system, &coding);
  9569   coding.mode |= CODING_MODE_LAST_BLOCK;
  9570   chars = SCHARS (string);
  9571   bytes = SBYTES (string);
  9572 
  9573   if (EQ (dst_object, Qt))
  9574     {
  9575       /* Fast path for ASCII-only input and an ASCII-compatible coding:
  9576          act as identity if no EOL conversion is needed.  */
  9577       Lisp_Object attrs = CODING_ID_ATTRS (coding.id);
  9578       if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
  9579           && (STRING_MULTIBYTE (string)
  9580               ? (chars == bytes) : string_ascii_p (string))
  9581           && (EQ (CODING_ID_EOL_TYPE (coding.id), Qunix)
  9582               || inhibit_eol_conversion
  9583               || ! memchr (SDATA (string), encodep ? '\n' : '\r', bytes)))
  9584         {
  9585           if (! norecord)
  9586             Vlast_coding_system_used = coding_system;
  9587           return (nocopy
  9588                   ? string
  9589                   : (encodep
  9590                      ? make_unibyte_string (SSDATA (string), bytes)
  9591                      : make_multibyte_string (SSDATA (string), bytes, bytes)));
  9592         }
  9593     }
  9594   else if (BUFFERP (dst_object))
  9595     {
  9596       struct buffer *buf = XBUFFER (dst_object);
  9597       ptrdiff_t buf_pt = BUF_PT (buf);
  9598 
  9599       invalidate_buffer_caches (buf, buf_pt, buf_pt);
  9600     }
  9601 
  9602   if (encodep)
  9603     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
  9604   else
  9605     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
  9606   if (! norecord)
  9607     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
  9608 
  9609   return (BUFFERP (dst_object)
  9610           ? make_fixnum (coding.produced_char)
  9611           : coding.dst_object);
  9612 }
  9613 
  9614 
  9615 /* Encode or decode STRING according to CODING_SYSTEM.
  9616    Do not set Vlast_coding_system_used.  */
  9617 
  9618 Lisp_Object
  9619 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
  9620                               bool encodep)
  9621 {
  9622   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
  9623 }
  9624 
  9625 
  9626 /* Return the gap address of BUFFER.  If the gap size is less than
  9627    NBYTES, enlarge the gap in advance.  */
  9628 
  9629 static unsigned char *
  9630 get_buffer_gap_address (Lisp_Object buffer, ptrdiff_t nbytes)
  9631 {
  9632   struct buffer *buf = XBUFFER (buffer);
  9633 
  9634   if (BUF_GPT (buf) != BUF_PT (buf))
  9635     {
  9636       struct buffer *oldb = current_buffer;
  9637 
  9638       current_buffer = buf;
  9639       move_gap_both (PT, PT_BYTE);
  9640       current_buffer = oldb;
  9641     }
  9642   if (BUF_GAP_SIZE (buf) < nbytes)
  9643     make_gap_1 (buf, nbytes);
  9644   return BUF_GPT_ADDR (buf);
  9645 }
  9646 
  9647 /* Return a pointer to the byte sequence for C, and its byte length in
  9648    LEN.  This function is used to get a byte sequence for HANDLE_8_BIT
  9649    and HANDLE_OVER_UNI arguments of encode_string_utf_8 and
  9650    decode_string_utf_8 when those arguments are given by
  9651    characters.  */
  9652 
  9653 static unsigned char *
  9654 get_char_bytes (int c, int *len)
  9655 {
  9656   /* Use two caches, since encode/decode_string_utf_8 are called
  9657      repeatedly with the same values for HANDLE_8_BIT and
  9658      HANDLE_OVER_UNI arguments.  */
  9659   static int chars[2];
  9660   static unsigned char bytes[2][6];
  9661   static int nbytes[2];
  9662   static int last_index;
  9663 
  9664   if (chars[last_index] == c)
  9665     {
  9666       *len = nbytes[last_index];
  9667       return bytes[last_index];
  9668     }
  9669   if (chars[1 - last_index] == c)
  9670     {
  9671       *len = nbytes[1 - last_index];
  9672       return bytes[1 - last_index];
  9673     }
  9674   last_index = 1 - last_index;
  9675   chars[last_index] = c;
  9676   *len = nbytes[last_index] = CHAR_STRING (c, bytes[last_index]);
  9677   return bytes[last_index];
  9678 }
  9679 
  9680 /* Encode STRING by the coding system utf-8-unix.
  9681 
  9682    This function is optimized for speed when the input string is
  9683    already a valid sequence of Unicode codepoints in the internal
  9684    representation, i.e. there are neither 8-bit raw bytes nor
  9685    characters beyond the Unicode range in the string's contents.
  9686 
  9687    Ignore any :pre-write-conversion and :encode-translation-table
  9688    properties.
  9689 
  9690    Assume that arguments have values as described below.
  9691    The validity must be enforced and ensured by the caller.
  9692 
  9693    STRING is a multibyte string or an ASCII-only unibyte string.
  9694 
  9695    BUFFER is a unibyte buffer or Qnil.
  9696 
  9697    If BUFFER is a unibyte buffer, insert the encoded result
  9698    after point of the buffer, and return the number of
  9699    inserted characters.  The caller should have made BUFFER ready for
  9700    modifying in advance (e.g., by calling invalidate_buffer_caches).
  9701 
  9702    If BUFFER is nil, return a unibyte string from the encoded result.
  9703 
  9704    If NOCOPY is non-zero, and if STRING contains only Unicode
  9705    characters (i.e., the encoding does not change the byte sequence),
  9706    return STRING even if it is multibyte.  WARNING: This will return a
  9707    _multibyte_ string, something that callers might not expect, especially
  9708    if STRING is not pure-ASCII; only use NOCOPY non-zero if the caller
  9709    will only use the byte sequence of the encoded result accessed by
  9710    SDATA or SSDATA, and the original STRING will _not_ be modified after
  9711    the encoding.  When in doubt, always pass NOCOPY as zero.  You _have_
  9712    been warned!
  9713 
  9714    HANDLE-8-BIT and HANDLE-OVER-UNI specify how to handle a non-Unicode
  9715    character in STRING.  The former is for an eight-bit character (represented
  9716    by a 2-byte overlong sequence in a multibyte STRING).  The latter is
  9717    for a codepoint beyond the end of the Unicode range (a character whose
  9718    code is greater than the maximum Unicode character 0x10FFFF, represented
  9719    by a 4 or 5-byte sequence in a multibyte STRING).
  9720 
  9721    If these two arguments are unibyte strings (typically
  9722    "\357\277\275", the UTF-8 sequence for the Unicode REPLACEMENT
  9723    CHARACTER #xFFFD), encode a non-Unicode character into that
  9724    unibyte sequence.
  9725 
  9726    If the two arguments are characters, encode a non-Unicode
  9727    character as the respective argument characters.
  9728 
  9729    If they are Qignored, skip a non-Unicode character.
  9730 
  9731    If HANDLE-8-BIT is Qt, encode eight-bit characters into single bytes
  9732    of the same value, like the usual Emacs encoding does.
  9733 
  9734    If HANDLE-OVER-UNI is Qt, encode characters beyond the Unicode
  9735    range into the same 4 or 5-byte sequence as used by Emacs
  9736    internally, like the usual Emacs encoding does.
  9737 
  9738    If the two arguments are Qnil, return Qnil if STRING has a
  9739    non-Unicode character.  This allows the caller to signal an error
  9740    if such input strings are not allowed.  */
  9741 
  9742 Lisp_Object
  9743 encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
  9744                      bool nocopy, Lisp_Object handle_8_bit,
  9745                      Lisp_Object handle_over_uni)
  9746 {
  9747   ptrdiff_t nchars = SCHARS (string), nbytes = SBYTES (string);
  9748   if (NILP (buffer) && nchars == nbytes && nocopy)
  9749     /* STRING contains only ASCII characters.  */
  9750     return string;
  9751 
  9752   ptrdiff_t num_8_bit = 0;   /* number of eight-bit chars in STRING */
  9753   /* The following two vars are counted only if handle_over_uni is not Qt.  */
  9754   ptrdiff_t num_over_4 = 0; /* number of 4-byte non-Unicode chars in STRING */
  9755   ptrdiff_t num_over_5 = 0; /* number of 5-byte non-Unicode chars in STRING */
  9756   ptrdiff_t outbytes;        /* number of bytes of decoding result */
  9757   unsigned char *p = SDATA (string);
  9758   unsigned char *pend = p + nbytes;
  9759   unsigned char *src = NULL, *dst = NULL;
  9760   unsigned char *replace_8_bit = NULL, *replace_over_uni = NULL;
  9761   int replace_8_bit_len = 0, replace_over_uni_len = 0;
  9762   Lisp_Object val;              /* the return value */
  9763 
  9764   /* Scan bytes in STRING twice.  The first scan is to count non-Unicode
  9765      characters, and the second scan is to encode STRING.  If the
  9766      encoding is trivial (no need of changing the byte sequence),
  9767      the second scan is avoided.  */
  9768   for (int scan_count = 0; scan_count < 2; scan_count++)
  9769     {
  9770       while (p < pend)
  9771         {
  9772           if (nchars == pend - p)
  9773             /* There is no multibyte character remaining.  */
  9774             break;
  9775 
  9776           int c = *p;
  9777           int len = BYTES_BY_CHAR_HEAD (c);
  9778 
  9779           nchars--;
  9780           if (len == 1
  9781               || len == 3
  9782               || (len == 2 ? ! CHAR_BYTE8_HEAD_P (c)
  9783                   : (EQ (handle_over_uni, Qt)
  9784                      || (len == 4
  9785                          && STRING_CHAR (p) <= MAX_UNICODE_CHAR))))
  9786             {
  9787               p += len;
  9788               continue;
  9789             }
  9790 
  9791           /* A character to change the byte sequence on encoding was
  9792              found.  A rare case.  */
  9793           if (len == 2)
  9794             {
  9795               /* Handle an eight-bit character by handle_8_bit.  */
  9796               if (scan_count == 0)
  9797                 {
  9798                   if (NILP (handle_8_bit))
  9799                     return Qnil;
  9800                   num_8_bit++;
  9801                 }
  9802               else
  9803                 {
  9804                   if (src < p)
  9805                     {
  9806                       memcpy (dst, src, p - src);
  9807                       dst += p - src;
  9808                     }
  9809                   if (replace_8_bit_len > 0)
  9810                     {
  9811                       memcpy (dst, replace_8_bit, replace_8_bit_len);
  9812                       dst += replace_8_bit_len;
  9813                     }
  9814                   else if (EQ (handle_8_bit, Qt))
  9815                     {
  9816                       int char8 = STRING_CHAR (p);
  9817                       *dst++ = CHAR_TO_BYTE8 (char8);
  9818                     }
  9819                 }
  9820             }
  9821           else                  /* len == 4 or 5 */
  9822             {
  9823               /* Handle an over-unicode character by handle_over_uni.  */
  9824               if (scan_count == 0)
  9825                 {
  9826                   if (NILP (handle_over_uni))
  9827                     return Qnil;
  9828                   if (len == 4)
  9829                     num_over_4++;
  9830                   else
  9831                     num_over_5++;
  9832                 }
  9833               else
  9834                 {
  9835                   if (src < p)
  9836                     {
  9837                       memcpy (dst, src, p - src);
  9838                       dst += p - src;
  9839                     }
  9840                   if (replace_over_uni_len > 0)
  9841                     {
  9842                       memcpy (dst, replace_over_uni, replace_over_uni_len);
  9843                       dst += replace_over_uni_len;
  9844                     }
  9845                 }
  9846             }
  9847           p += len;
  9848           src = p;
  9849         }
  9850 
  9851       if (scan_count == 0)
  9852         {
  9853           /* End of the first scan.  */
  9854           outbytes = nbytes;
  9855           if (num_8_bit == 0
  9856               && (num_over_4 + num_over_5 == 0 || EQ (handle_over_uni, Qt)))
  9857             {
  9858               /* We can break the loop because there is no need of
  9859                  changing the byte sequence.  This is the typical
  9860                  case.  */
  9861               scan_count = 1;
  9862             }
  9863           else
  9864             {
  9865               /* Prepare for handling non-Unicode characters during
  9866                  the next scan.  */
  9867               if (num_8_bit > 0)
  9868                 {
  9869                   if (CHARACTERP (handle_8_bit))
  9870                     replace_8_bit = get_char_bytes (XFIXNUM (handle_8_bit),
  9871                                                     &replace_8_bit_len);
  9872                   else if (STRINGP (handle_8_bit))
  9873                     {
  9874                       replace_8_bit = SDATA (handle_8_bit);
  9875                       replace_8_bit_len = SBYTES (handle_8_bit);
  9876                     }
  9877                   if (replace_8_bit)
  9878                     outbytes += (replace_8_bit_len - 2) * num_8_bit;
  9879                   else if (EQ (handle_8_bit, Qignored))
  9880                     outbytes -= 2 * num_8_bit;
  9881                   else if (EQ (handle_8_bit, Qt))
  9882                     outbytes -= num_8_bit;
  9883                   else
  9884                     return Qnil;
  9885                 }
  9886               if (num_over_4 + num_over_5 > 0)
  9887                 {
  9888                   if (CHARACTERP (handle_over_uni))
  9889                     replace_over_uni = get_char_bytes (XFIXNUM (handle_over_uni),
  9890                                                        &replace_over_uni_len);
  9891                   else if (STRINGP (handle_over_uni))
  9892                     {
  9893                       replace_over_uni = SDATA (handle_over_uni);
  9894                       replace_over_uni_len = SBYTES (handle_over_uni);
  9895                     }
  9896                   if (num_over_4 > 0)
  9897                     {
  9898                       if (replace_over_uni)
  9899                         outbytes += (replace_over_uni_len - 4) * num_over_4;
  9900                       else if (EQ (handle_over_uni, Qignored))
  9901                         outbytes -= 4 * num_over_4;
  9902                       else if (! EQ (handle_over_uni, Qt))
  9903                         return Qnil;
  9904                     }
  9905                   if (num_over_5 > 0)
  9906                     {
  9907                       if (replace_over_uni)
  9908                         outbytes += (replace_over_uni_len - 5) * num_over_5;
  9909                       else if (EQ (handle_over_uni, Qignored))
  9910                         outbytes -= 5 * num_over_5;
  9911                       else if (! EQ (handle_over_uni, Qt))
  9912                         return Qnil;
  9913                     }
  9914                 }
  9915             }
  9916 
  9917           /* Prepare return value and space to store the encoded bytes.  */
  9918           if (BUFFERP (buffer))
  9919             {
  9920               val = make_fixnum (outbytes);
  9921               dst = get_buffer_gap_address (buffer, nbytes);
  9922             }
  9923           else
  9924             {
  9925               if (nocopy && (num_8_bit + num_over_4 + num_over_5) == 0)
  9926                 return string;
  9927               val = make_uninit_string (outbytes);
  9928               dst = SDATA (val);
  9929             }
  9930           p = src = SDATA (string);
  9931         }
  9932     }
  9933 
  9934   if (src < pend)
  9935     memcpy (dst, src, pend - src);
  9936   if (BUFFERP (buffer))
  9937     {
  9938       struct buffer *oldb = current_buffer;
  9939 
  9940       current_buffer = XBUFFER (buffer);
  9941       insert_from_gap (outbytes, outbytes, false);
  9942       current_buffer = oldb;
  9943     }
  9944   return val;
  9945 }
  9946 
  9947 /* Decode input string by the coding system utf-8-unix.
  9948 
  9949    This function is optimized for speed when the input string is
  9950    already a valid UTF-8 sequence, i.e. there are neither 8-bit raw
  9951    bytes nor any UTF-8 sequences longer than 4 bytes in the string's
  9952    contents.
  9953 
  9954    Ignore any :post-read-conversion and :decode-translation-table
  9955    properties.
  9956 
  9957    Assume that arguments have values as described below.
  9958    The validity must be enforced and ensured by the caller.
  9959 
  9960    STRING is a unibyte string, an ASCII-only multibyte string, or Qnil.
  9961    If STRING is Qnil, the input is a C string pointed by STR whose
  9962    length in bytes is in STR_LEN.
  9963 
  9964    BUFFER is a multibyte buffer or Qnil.
  9965    If BUFFER is a multibyte buffer, insert the decoding result of
  9966    Unicode characters after point of the buffer, and return the number
  9967    of inserted characters.  The caller should have made BUFFER ready
  9968    for modifying in advance (e.g., by calling invalidate_buffer_caches).
  9969 
  9970    If BUFFER is Qnil, return a multibyte string from the decoded result.
  9971 
  9972    NOCOPY non-zero means it is OK to return the input STRING if it
  9973    contains only ASCII characters or only valid UTF-8 sequences of 2
  9974    to 4 bytes.  WARNING: This will return a _unibyte_ string, something
  9975    that callers might not expect, especially if STRING is not
  9976    pure-ASCII; only use NOCOPY non-zero if the caller will only use
  9977    the byte sequence of the decoded result accessed via SDATA or
  9978    SSDATA, and if the original STRING will _not_ be modified after the
  9979    decoding.  When in doubt, always pass NOCOPY as zero.  You _have_
  9980    been warned!
  9981 
  9982    If STRING is Qnil, and the original string is passed via STR, NOCOPY
  9983    is ignored.
  9984 
  9985    HANDLE-8-BIT and HANDLE-OVER-UNI specify how to handle a invalid
  9986    byte sequence.  The former is for a 1-byte invalid sequence that
  9987    violates the fundamental UTF-8 encoding rules.  The latter is for a
  9988    4 or 5-byte overlong sequences that Emacs internally uses to
  9989    represent characters beyond the Unicode range (characters whose
  9990    codepoints are greater than #x10FFFF).  Note that this function does
  9991    not in general treat such overlong UTF-8 sequences as invalid.
  9992 
  9993    If these two arguments are strings (typically a 1-char string of
  9994    the Unicode REPLACEMENT CHARACTER #xFFFD), decode an invalid byte
  9995    sequence into that string.  They must be multibyte strings if they
  9996    contain a non-ASCII character.
  9997 
  9998    If the two arguments are characters, decode an invalid byte
  9999    sequence into the corresponding multibyte representation of the
 10000    respective character.
 10001 
 10002    If they are Qignored, skip an invalid byte sequence without
 10003    producing anything in the decoded string.
 10004 
 10005    If HANDLE-8-BIT is Qt, decode a 1-byte invalid sequence into the
 10006    corresponding eight-bit multibyte representation, like the usual
 10007    Emacs decoding does.
 10008 
 10009    If HANDLE-OVER-UNI is Qt, decode a 4 or 5-byte overlong sequence
 10010    that follows Emacs' internal representation for a character beyond
 10011    Unicode range into the corresponding character, like the usual
 10012    Emacs decoding does.
 10013 
 10014    If the two arguments are Qnil, return Qnil if the input string has
 10015    raw bytes or overlong sequences.  This allows the caller to signal
 10016    an error if such inputs are not allowed.  */
 10017 
 10018 Lisp_Object
 10019 decode_string_utf_8 (Lisp_Object string, const char *str, ptrdiff_t str_len,
 10020                      Lisp_Object buffer, bool nocopy,
 10021                      Lisp_Object handle_8_bit, Lisp_Object handle_over_uni)
 10022 {
 10023   /* This is like BYTES_BY_CHAR_HEAD, but it is assured that C >= 0x80
 10024      and it returns 0 for an invalid sequence.  */
 10025 #define UTF_8_SEQUENCE_LENGTH(c)        \
 10026   ((c) < 0xC2 ? 0                       \
 10027    : (c) < 0xE0 ? 2                     \
 10028    : (c) < 0xF0 ? 3                     \
 10029    : (c) < 0xF8 ? 4                     \
 10030    : (c) == 0xF8 ? 5                    \
 10031    : 0)
 10032 
 10033   ptrdiff_t nbytes = STRINGP (string) ? SBYTES (string) : str_len;
 10034   unsigned char *p = STRINGP (string) ? SDATA (string) : (unsigned char *) str;
 10035   unsigned char *str_orig = p;
 10036   unsigned char *pend = p + nbytes;
 10037   ptrdiff_t num_8_bit = 0;   /* number of invalid 1-byte sequences */
 10038   ptrdiff_t num_over_4 = 0;  /* number of invalid 4-byte sequences */
 10039   ptrdiff_t num_over_5 = 0;  /* number of invalid 5-byte sequences */
 10040   ptrdiff_t outbytes = nbytes;  /* number of decoded bytes */
 10041   ptrdiff_t outchars = 0;    /* number of decoded characters */
 10042   unsigned char *src = NULL, *dst = NULL;
 10043   bool change_byte_sequence = false;
 10044 
 10045   /* Scan input bytes twice.  The first scan is to count invalid
 10046      sequences, and the second scan is to decode input.  If the
 10047      decoding is trivial (no need of changing the byte sequence),
 10048      the second scan is avoided.  */
 10049   while (p < pend)
 10050     {
 10051       src = p;
 10052       /* Try short cut for an ASCII-only case.  */
 10053       while (p < pend && *p < 0x80) p++;
 10054       outchars += (p - src);
 10055       if (p == pend)
 10056         break;
 10057       int c = *p;
 10058       outchars++;
 10059       int len = UTF_8_SEQUENCE_LENGTH (c);
 10060       /* len == 0, 2, 3, 4, 5.  */
 10061       if (UTF_8_EXTRA_OCTET_P (p[1])
 10062           && (len == 2
 10063               || (UTF_8_EXTRA_OCTET_P (p[2])
 10064                   && (len == 3
 10065                       || (UTF_8_EXTRA_OCTET_P (p[3])
 10066                           && len == 4
 10067                           && STRING_CHAR (p) <= MAX_UNICODE_CHAR)))))
 10068         {
 10069           p += len;
 10070           continue;
 10071         }
 10072 
 10073       /* A sequence to change on decoding was found.  A rare case.  */
 10074       if (len == 0)
 10075         {
 10076           if (NILP (handle_8_bit))
 10077             return Qnil;
 10078           num_8_bit++;
 10079           len = 1;
 10080         }
 10081       else                      /* len == 4 or 5 */
 10082         {
 10083           if (NILP (handle_over_uni))
 10084             return Qnil;
 10085           if (len == 4)
 10086             num_over_4++;
 10087           else
 10088             num_over_5++;
 10089         }
 10090       change_byte_sequence = true;
 10091       p += len;
 10092     }
 10093 
 10094   Lisp_Object val;           /* the return value */
 10095 
 10096   if (! change_byte_sequence
 10097       && NILP (buffer))
 10098     {
 10099       if (nocopy && STRINGP (string))
 10100         return string;
 10101       val = make_uninit_multibyte_string (outchars, outbytes);
 10102       memcpy (SDATA (val), str_orig, pend - str_orig);
 10103       return val;
 10104     }
 10105 
 10106   /* Count the number of resulting chars and bytes.  */
 10107   unsigned char *replace_8_bit = NULL, *replace_over_uni = NULL;
 10108   int replace_8_bit_len = 0, replace_over_uni_len = 0;
 10109 
 10110   if (change_byte_sequence)
 10111     {
 10112       if (num_8_bit > 0)
 10113         {
 10114           if (CHARACTERP (handle_8_bit))
 10115             replace_8_bit = get_char_bytes (XFIXNUM (handle_8_bit),
 10116                                             &replace_8_bit_len);
 10117           else if (STRINGP (handle_8_bit))
 10118             {
 10119               replace_8_bit = SDATA (handle_8_bit);
 10120               replace_8_bit_len = SBYTES (handle_8_bit);
 10121             }
 10122           if (replace_8_bit)
 10123             outbytes += (replace_8_bit_len - 1) * num_8_bit;
 10124           else if (EQ (handle_8_bit, Qignored))
 10125             {
 10126               outbytes -= num_8_bit;
 10127               outchars -= num_8_bit;
 10128             }
 10129           else /* EQ (handle_8_bit, Qt)) */
 10130             outbytes += num_8_bit;
 10131         }
 10132       else if (num_over_4 + num_over_5 > 0)
 10133         {
 10134           if (CHARACTERP (handle_over_uni))
 10135             replace_over_uni = get_char_bytes (XFIXNUM (handle_over_uni),
 10136                                                &replace_over_uni_len);
 10137           else if (STRINGP (handle_over_uni))
 10138             {
 10139               replace_over_uni = SDATA (handle_over_uni);
 10140               replace_over_uni_len = SBYTES (handle_over_uni);
 10141             }
 10142           if (num_over_4 > 0)
 10143             {
 10144               if (replace_over_uni)
 10145                 outbytes += (replace_over_uni_len - 4) * num_over_4;
 10146               else if (EQ (handle_over_uni, Qignored))
 10147                 {
 10148                   outbytes -= 4 * num_over_4;
 10149                   outchars -= num_over_4;
 10150                 }
 10151             }
 10152           if (num_over_5 > 0)
 10153             {
 10154               if (replace_over_uni)
 10155                 outbytes += (replace_over_uni_len - 5) * num_over_5;
 10156               else if (EQ (handle_over_uni, Qignored))
 10157                 {
 10158                   outbytes -= 5 * num_over_5;
 10159                   outchars -= num_over_5;
 10160                 }
 10161             }
 10162         }
 10163     }
 10164 
 10165   /* Prepare return value and  space to store the decoded bytes.  */
 10166   if (BUFFERP (buffer))
 10167     {
 10168       val = make_fixnum (outchars);
 10169       dst = get_buffer_gap_address (buffer, outbytes);
 10170     }
 10171   else
 10172     {
 10173       if (nocopy && (num_8_bit + num_over_4 + num_over_5) == 0
 10174           && STRINGP (string))
 10175         return string;
 10176       val = make_uninit_multibyte_string (outchars, outbytes);
 10177       dst = SDATA (val);
 10178     }
 10179 
 10180   src = str_orig;
 10181   if (change_byte_sequence)
 10182     {
 10183       p = src;
 10184       while (p < pend)
 10185         {
 10186           /* Try short cut for an ASCII-only case.  */
 10187           /* while (p < pend && *p < 0x80) p++; */
 10188           /* if (p == pend) */
 10189           /*   break; */
 10190           int c = *p;
 10191           if (c < 0x80)
 10192             {
 10193               p++;
 10194               continue;
 10195             }
 10196           int len = UTF_8_SEQUENCE_LENGTH (c);
 10197           if (len > 1)
 10198             {
 10199               int mlen;
 10200               for (mlen = 1; mlen < len && UTF_8_EXTRA_OCTET_P (p[mlen]);
 10201                    mlen++);
 10202               if (mlen == len
 10203                   && (len <= 3
 10204                       || (len == 4 && STRING_CHAR (p) <= MAX_UNICODE_CHAR)
 10205                       || EQ (handle_over_uni, Qt)))
 10206                 {
 10207                   p += len;
 10208                   continue;
 10209                 }
 10210             }
 10211 
 10212           if (src < p)
 10213             {
 10214               memcpy (dst, src, p - src);
 10215               dst += p - src;
 10216             }
 10217           if (len == 0)
 10218             {
 10219               if (replace_8_bit)
 10220                 {
 10221                   memcpy (dst, replace_8_bit, replace_8_bit_len);
 10222                   dst += replace_8_bit_len;
 10223                 }
 10224               else if (EQ (handle_8_bit, Qt))
 10225                 {
 10226                   dst += BYTE8_STRING (c, dst);
 10227                 }
 10228               len = 1;
 10229             }
 10230           else                  /* len == 4 or 5 */
 10231             {
 10232               /* Handle p[0]... by handle_over_uni.  */
 10233               if (replace_over_uni)
 10234                 {
 10235                   memcpy (dst, replace_over_uni, replace_over_uni_len);
 10236                   dst += replace_over_uni_len;
 10237                 }
 10238             }
 10239           p += len;
 10240           src = p;
 10241         }
 10242     }
 10243 
 10244   if (src < pend)
 10245     memcpy (dst, src, pend - src);
 10246   if (BUFFERP (buffer))
 10247     {
 10248       struct buffer *oldb = current_buffer;
 10249 
 10250       current_buffer = XBUFFER (buffer);
 10251       insert_from_gap (outchars, outbytes, false);
 10252       current_buffer = oldb;
 10253     }
 10254   return val;
 10255 }
 10256 
 10257 /* #define ENABLE_UTF_8_CONVERTER_TEST */
 10258 
 10259 #ifdef ENABLE_UTF_8_CONVERTER_TEST
 10260 
 10261 /* These functions are useful for testing and benchmarking
 10262    encode_string_utf_8 and decode_string_utf_8.  */
 10263 
 10264 /* ENCODE_METHOD specifies which internal decoder to use.
 10265    If it is Qnil, use encode_string_utf_8.
 10266    Otherwise, use code_convert_string.
 10267 
 10268    COUNT, if integer, specifies how many times to call those functions
 10269    with the same arguments (for benchmarking). */
 10270 
 10271 DEFUN ("internal-encode-string-utf-8", Finternal_encode_string_utf_8,
 10272        Sinternal_encode_string_utf_8, 7, 7, 0,
 10273        doc: /* Internal use only.*/)
 10274   (Lisp_Object string, Lisp_Object buffer, Lisp_Object nocopy,
 10275    Lisp_Object handle_8_bit, Lisp_Object handle_over_uni,
 10276    Lisp_Object encode_method, Lisp_Object count)
 10277 {
 10278   int repeat_count;
 10279   Lisp_Object val;
 10280 
 10281   /* Check arguments.  Return Qnil when an argument is invalid.  */
 10282   if (! STRINGP (string))
 10283     return Qnil;
 10284   if (! NILP (buffer)
 10285       && (! BUFFERP (buffer)
 10286           || ! NILP (BVAR (XBUFFER (buffer), enable_multibyte_characters))))
 10287     return Qnil;
 10288   if (! NILP (handle_8_bit) && ! EQ (handle_8_bit, Qt)
 10289       && ! EQ (handle_8_bit, Qignored)
 10290       && ! CHARACTERP (handle_8_bit)
 10291       && (! STRINGP (handle_8_bit) || STRING_MULTIBYTE (handle_8_bit)))
 10292     return Qnil;
 10293   if (! NILP (handle_over_uni) && ! EQ (handle_over_uni, Qt)
 10294       && ! EQ (handle_over_uni, Qignored)
 10295       && ! CHARACTERP (handle_over_uni)
 10296       && (! STRINGP (handle_over_uni) || STRING_MULTIBYTE (handle_over_uni)))
 10297     return Qnil;
 10298 
 10299   CHECK_FIXNUM (count);
 10300   repeat_count = XFIXNUM (count);
 10301 
 10302   val = Qnil;
 10303   /* Run an encoder according to ENCODE_METHOD.  */
 10304   if (NILP (encode_method))
 10305     {
 10306       for (int i = 0; i < repeat_count; i++)
 10307         val = encode_string_utf_8 (string, buffer, ! NILP (nocopy),
 10308                                    handle_8_bit, handle_over_uni);
 10309     }
 10310   else
 10311     {
 10312       for (int i = 0; i < repeat_count; i++)
 10313         val = code_convert_string (string, Qutf_8_unix, Qnil, true,
 10314                                    ! NILP (nocopy), true);
 10315     }
 10316   return val;
 10317 }
 10318 
 10319 /* DECODE_METHOD specifies which internal decoder to use.
 10320    If it is Qnil, use decode_string_utf_8.
 10321    If it is Qt, use code_convert_string.
 10322    Otherwise, use make_string_from_utf8.
 10323 
 10324    COUNT, if integer, specifies how many times to call those functions
 10325    with the same arguments (for benchmarking).  */
 10326 
 10327 DEFUN ("internal-decode-string-utf-8", Finternal_decode_string_utf_8,
 10328        Sinternal_decode_string_utf_8, 7, 7, 0,
 10329        doc: /* Internal use only.*/)
 10330   (Lisp_Object string, Lisp_Object buffer, Lisp_Object nocopy,
 10331    Lisp_Object handle_8_bit, Lisp_Object handle_over_uni,
 10332    Lisp_Object decode_method, Lisp_Object count)
 10333 {
 10334   int repeat_count;
 10335   Lisp_Object val;
 10336 
 10337   /* Check arguments.  Return Qnil when an argument is invalid.  */
 10338   if (! STRINGP (string))
 10339     return Qnil;
 10340   if (! NILP (buffer)
 10341       && (! BUFFERP (buffer)
 10342           || NILP (BVAR (XBUFFER (buffer), enable_multibyte_characters))))
 10343     return Qnil;
 10344   if (! NILP (handle_8_bit) && ! EQ (handle_8_bit, Qt)
 10345       && ! EQ (handle_8_bit, Qignored)
 10346       && ! CHARACTERP (handle_8_bit)
 10347       && (! STRINGP (handle_8_bit) || ! STRING_MULTIBYTE (handle_8_bit)))
 10348     return Qnil;
 10349   if (! NILP (handle_over_uni) && ! EQ (handle_over_uni, Qt)
 10350       && ! EQ (handle_over_uni, Qignored)
 10351       && ! CHARACTERP (handle_over_uni)
 10352       && (! STRINGP (handle_over_uni) || ! STRING_MULTIBYTE (handle_over_uni)))
 10353     return Qnil;
 10354 
 10355   CHECK_FIXNUM (count);
 10356   repeat_count = XFIXNUM (count);
 10357 
 10358   val = Qnil;
 10359   /* Run a decoder according to DECODE_METHOD.  */
 10360   if (NILP (decode_method))
 10361     {
 10362       for (int i = 0; i < repeat_count; i++)
 10363         val = decode_string_utf_8 (string, buffer, ! NILP (nocopy),
 10364                                    handle_8_bit, handle_over_uni);
 10365     }
 10366   else if (EQ (decode_method, Qt))
 10367     {
 10368       if (! BUFFERP (buffer))
 10369         buffer = Qt;
 10370       for (int i = 0; i < repeat_count; i++)
 10371         val = code_convert_string (string, Qutf_8_unix, buffer, false,
 10372                                    ! NILP (nocopy), true);
 10373     }
 10374   else if (! NILP (decode_method))
 10375     {
 10376       for (int i = 0; i < repeat_count; i++)
 10377         val = make_string_from_utf8 ((char *) SDATA (string), SBYTES (string));
 10378     }
 10379   return val;
 10380 }
 10381 
 10382 #endif  /* ENABLE_UTF_8_CONVERTER_TEST */
 10383 
 10384 /* Encode or decode STRING using CODING_SYSTEM, with the possibility of
 10385    returning STRING itself if it equals the result.
 10386    Do not set Vlast_coding_system_used.  */
 10387 static Lisp_Object
 10388 convert_string_nocopy (Lisp_Object string, Lisp_Object coding_system,
 10389                        bool encodep)
 10390 {
 10391   return code_convert_string (string, coding_system, Qt, encodep, 1, 1);
 10392 }
 10393 
 10394 /* Encode or decode a file name, to or from a unibyte string suitable
 10395    for passing to C library functions.  */
 10396 Lisp_Object
 10397 decode_file_name (Lisp_Object fname)
 10398 {
 10399 #ifdef WINDOWSNT
 10400   /* The w32 build pretends to use UTF-8 for file-name encoding, and
 10401      converts the file names either to UTF-16LE or to the system ANSI
 10402      codepage internally, depending on the underlying OS; see w32.c.  */
 10403   if (! NILP (Fcoding_system_p (Qutf_8)))
 10404     return convert_string_nocopy (fname, Qutf_8, 0);
 10405   return fname;
 10406 #else  /* !WINDOWSNT */
 10407   if (! NILP (Vfile_name_coding_system))
 10408     return convert_string_nocopy (fname, Vfile_name_coding_system, 0);
 10409   else if (! NILP (Vdefault_file_name_coding_system))
 10410     return convert_string_nocopy (fname, Vdefault_file_name_coding_system, 0);
 10411   else
 10412     return fname;
 10413 #endif
 10414 }
 10415 
 10416 static Lisp_Object
 10417 encode_file_name_1 (Lisp_Object fname)
 10418 {
 10419   /* This is especially important during bootstrap and dumping, when
 10420      file-name encoding is not yet known, and therefore any non-ASCII
 10421      file names are unibyte strings, and could only be thrashed if we
 10422      try to encode them.  */
 10423   if (!STRING_MULTIBYTE (fname))
 10424     return fname;
 10425 #ifdef WINDOWSNT
 10426   /* The w32 build pretends to use UTF-8 for file-name encoding, and
 10427      converts the file names either to UTF-16LE or to the system ANSI
 10428      codepage internally, depending on the underlying OS; see w32.c.  */
 10429   if (! NILP (Fcoding_system_p (Qutf_8)))
 10430     return convert_string_nocopy (fname, Qutf_8, 1);
 10431   return fname;
 10432 #else  /* !WINDOWSNT */
 10433   if (! NILP (Vfile_name_coding_system))
 10434     return convert_string_nocopy (fname, Vfile_name_coding_system, 1);
 10435   else if (! NILP (Vdefault_file_name_coding_system))
 10436     return convert_string_nocopy (fname, Vdefault_file_name_coding_system, 1);
 10437   else
 10438     return fname;
 10439 #endif
 10440 }
 10441 
 10442 Lisp_Object
 10443 encode_file_name (Lisp_Object fname)
 10444 {
 10445   Lisp_Object encoded = encode_file_name_1 (fname);
 10446   /* No system accepts NUL bytes in filenames.  Allowing them can
 10447      cause subtle bugs because the system would silently use a
 10448      different filename than expected.  Perform this check after
 10449      encoding to not miss NUL bytes introduced through encoding.  */
 10450   CHECK_STRING_NULL_BYTES (encoded);
 10451   return encoded;
 10452 }
 10453 
 10454 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
 10455        2, 4, 0,
 10456        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
 10457 
 10458 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
 10459 if the decoding operation is trivial.
 10460 
 10461 Optional fourth arg BUFFER non-nil means that the decoded text is
 10462 inserted in that buffer after point (point does not move).  In this
 10463 case, the return value is the length of the decoded text.  If that
 10464 buffer is unibyte, it receives the individual bytes of the internal
 10465 representation of the decoded text.
 10466 
 10467 This function sets `last-coding-system-used' to the precise coding system
 10468 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
 10469 not fully specified.)  The function does not change the match data.  */)
 10470   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
 10471 {
 10472   return code_convert_string (string, coding_system, buffer,
 10473                               0, ! NILP (nocopy), 0);
 10474 }
 10475 
 10476 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
 10477        2, 4, 0,
 10478        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
 10479 
 10480 Optional third arg NOCOPY non-nil means it is OK to return STRING
 10481 itself if the encoding operation is trivial.
 10482 
 10483 Optional fourth arg BUFFER non-nil means that the encoded text is
 10484 inserted in that buffer after point (point does not move).  In this
 10485 case, the return value is the length of the encoded text.
 10486 
 10487 This function sets `last-coding-system-used' to the precise coding system
 10488 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
 10489 not fully specified.)  The function does not change the match data.  */)
 10490   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
 10491 {
 10492   return code_convert_string (string, coding_system, buffer,
 10493                               1, ! NILP (nocopy), 0);
 10494 }
 10495 
 10496 
 10497 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
 10498        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
 10499 Return the corresponding character.  */)
 10500   (Lisp_Object code)
 10501 {
 10502   Lisp_Object spec, attrs, val;
 10503   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
 10504   EMACS_INT ch;
 10505   int c;
 10506 
 10507   CHECK_FIXNAT (code);
 10508   ch = XFIXNAT (code);
 10509   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
 10510   attrs = AREF (spec, 0);
 10511 
 10512   if (ASCII_CHAR_P (ch)
 10513       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
 10514     return code;
 10515 
 10516   val = CODING_ATTR_CHARSET_LIST (attrs);
 10517   charset_roman = CHARSET_FROM_ID (XFIXNUM (XCAR (val))), val = XCDR (val);
 10518   charset_kana = CHARSET_FROM_ID (XFIXNUM (XCAR (val))), val = XCDR (val);
 10519   charset_kanji = CHARSET_FROM_ID (XFIXNUM (XCAR (val)));
 10520 
 10521   if (ch <= 0x7F)
 10522     {
 10523       c = ch;
 10524       charset = charset_roman;
 10525     }
 10526   else if (ch >= 0xA0 && ch < 0xDF)
 10527     {
 10528       c = ch - 0x80;
 10529       charset = charset_kana;
 10530     }
 10531   else
 10532     {
 10533       EMACS_INT c1 = ch >> 8;
 10534       int c2 = ch & 0xFF;
 10535 
 10536       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
 10537           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
 10538         error ("Invalid code: %"pI"d", ch);
 10539       c = ch;
 10540       SJIS_TO_JIS (c);
 10541       charset = charset_kanji;
 10542     }
 10543   c = DECODE_CHAR (charset, c);
 10544   if (c < 0)
 10545     error ("Invalid code: %"pI"d", ch);
 10546   return make_fixnum (c);
 10547 }
 10548 
 10549 
 10550 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
 10551        doc: /* Encode a Japanese character CH to shift_jis encoding.
 10552 Return the corresponding code in SJIS.  */)
 10553   (Lisp_Object ch)
 10554 {
 10555   Lisp_Object spec, attrs, charset_list;
 10556   int c;
 10557   struct charset *charset;
 10558   unsigned code;
 10559 
 10560   CHECK_CHARACTER (ch);
 10561   c = XFIXNAT (ch);
 10562   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
 10563   attrs = AREF (spec, 0);
 10564 
 10565   if (ASCII_CHAR_P (c)
 10566       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
 10567     return ch;
 10568 
 10569   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
 10570   charset = char_charset (c, charset_list, &code);
 10571   if (code == CHARSET_INVALID_CODE (charset))
 10572     error ("Can't encode by shift_jis encoding: %c", c);
 10573   JIS_TO_SJIS (code);
 10574 
 10575   return make_fixnum (code);
 10576 }
 10577 
 10578 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
 10579        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
 10580 Return the corresponding character.  */)
 10581   (Lisp_Object code)
 10582 {
 10583   Lisp_Object spec, attrs, val;
 10584   struct charset *charset_roman, *charset_big5, *charset;
 10585   EMACS_INT ch;
 10586   int c;
 10587 
 10588   CHECK_FIXNAT (code);
 10589   ch = XFIXNAT (code);
 10590   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
 10591   attrs = AREF (spec, 0);
 10592 
 10593   if (ASCII_CHAR_P (ch)
 10594       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
 10595     return code;
 10596 
 10597   val = CODING_ATTR_CHARSET_LIST (attrs);
 10598   charset_roman = CHARSET_FROM_ID (XFIXNUM (XCAR (val))), val = XCDR (val);
 10599   charset_big5 = CHARSET_FROM_ID (XFIXNUM (XCAR (val)));
 10600 
 10601   if (ch <= 0x7F)
 10602     {
 10603       c = ch;
 10604       charset = charset_roman;
 10605     }
 10606   else
 10607     {
 10608       EMACS_INT b1 = ch >> 8;
 10609       int b2 = ch & 0x7F;
 10610       if (b1 < 0xA1 || b1 > 0xFE
 10611           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
 10612         error ("Invalid code: %"pI"d", ch);
 10613       c = ch;
 10614       charset = charset_big5;
 10615     }
 10616   c = DECODE_CHAR (charset, c);
 10617   if (c < 0)
 10618     error ("Invalid code: %"pI"d", ch);
 10619   return make_fixnum (c);
 10620 }
 10621 
 10622 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
 10623        doc: /* Encode the Big5 character CH to BIG5 coding system.
 10624 Return the corresponding character code in Big5.  */)
 10625   (Lisp_Object ch)
 10626 {
 10627   Lisp_Object spec, attrs, charset_list;
 10628   struct charset *charset;
 10629   int c;
 10630   unsigned code;
 10631 
 10632   CHECK_CHARACTER (ch);
 10633   c = XFIXNAT (ch);
 10634   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
 10635   attrs = AREF (spec, 0);
 10636   if (ASCII_CHAR_P (c)
 10637       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
 10638     return ch;
 10639 
 10640   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
 10641   charset = char_charset (c, charset_list, &code);
 10642   if (code == CHARSET_INVALID_CODE (charset))
 10643     error ("Can't encode by Big5 encoding: %c", c);
 10644 
 10645   return make_fixnum (code);
 10646 }
 10647 
 10648 
 10649 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
 10650        Sset_terminal_coding_system_internal, 1, 2, 0,
 10651        doc: /* Internal use only.  */)
 10652   (Lisp_Object coding_system, Lisp_Object terminal)
 10653 {
 10654   struct terminal *term = decode_live_terminal (terminal);
 10655   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
 10656   CHECK_SYMBOL (coding_system);
 10657   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
 10658   /* We had better not send unsafe characters to terminal.  */
 10659   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
 10660   /* Character composition should be disabled.  */
 10661   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
 10662   terminal_coding->src_multibyte = 1;
 10663   terminal_coding->dst_multibyte = 0;
 10664   tset_charset_list
 10665     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
 10666             ? coding_charset_list (terminal_coding)
 10667             : list1i (charset_ascii)));
 10668   return Qnil;
 10669 }
 10670 
 10671 DEFUN ("set-safe-terminal-coding-system-internal",
 10672        Fset_safe_terminal_coding_system_internal,
 10673        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
 10674        doc: /* Internal use only.  */)
 10675   (Lisp_Object coding_system)
 10676 {
 10677   CHECK_SYMBOL (coding_system);
 10678   setup_coding_system (Fcheck_coding_system (coding_system),
 10679                        &safe_terminal_coding);
 10680   /* Character composition should be disabled.  */
 10681   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
 10682   safe_terminal_coding.src_multibyte = 1;
 10683   safe_terminal_coding.dst_multibyte = 0;
 10684   return Qnil;
 10685 }
 10686 
 10687 DEFUN ("terminal-coding-system", Fterminal_coding_system,
 10688        Sterminal_coding_system, 0, 1, 0,
 10689        doc: /* Return coding system specified for terminal output on the given terminal.
 10690 TERMINAL may be a terminal object, a frame, or nil for the selected
 10691 frame's terminal device.  */)
 10692   (Lisp_Object terminal)
 10693 {
 10694   struct coding_system *terminal_coding
 10695     = TERMINAL_TERMINAL_CODING (decode_live_terminal (terminal));
 10696   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
 10697 
 10698   /* For backward compatibility, return nil if it is `undecided'.  */
 10699   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
 10700 }
 10701 
 10702 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
 10703        Sset_keyboard_coding_system_internal, 1, 2, 0,
 10704        doc: /* Internal use only.  */)
 10705   (Lisp_Object coding_system, Lisp_Object terminal)
 10706 {
 10707   struct terminal *t = decode_live_terminal (terminal);
 10708   CHECK_SYMBOL (coding_system);
 10709   if (NILP (coding_system))
 10710     coding_system = Qno_conversion;
 10711   else
 10712     Fcheck_coding_system (coding_system);
 10713   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
 10714   /* Character composition should be disabled.  */
 10715   TERMINAL_KEYBOARD_CODING (t)->common_flags
 10716     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
 10717   return Qnil;
 10718 }
 10719 
 10720 DEFUN ("keyboard-coding-system",
 10721        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
 10722        doc: /* Return coding system specified for decoding keyboard input.  */)
 10723   (Lisp_Object terminal)
 10724 {
 10725   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
 10726                          (decode_live_terminal (terminal))->id);
 10727 }
 10728 
 10729 
 10730 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
 10731        Sfind_operation_coding_system,  1, MANY, 0,
 10732        doc: /* Choose a coding system for an operation based on the target name.
 10733 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
 10734 DECODING-SYSTEM is the coding system to use for decoding
 10735 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
 10736 for encoding (in case OPERATION does encoding).
 10737 
 10738 The first argument OPERATION specifies an I/O primitive:
 10739   For file I/O, `insert-file-contents' or `write-region'.
 10740   For process I/O, `call-process', `call-process-region', or `start-process'.
 10741   For network I/O, `open-network-stream'.
 10742 
 10743 The remaining arguments should be the same arguments that were passed
 10744 to the primitive.  Depending on which primitive, one of those arguments
 10745 is selected as the TARGET.  For example, if OPERATION does file I/O,
 10746 whichever argument specifies the file name is TARGET.
 10747 
 10748 TARGET has a meaning which depends on OPERATION:
 10749   For file I/O, TARGET is a file name (except for the special case below).
 10750   For process I/O, TARGET is a process name.
 10751   For network I/O, TARGET is a service name or a port number.
 10752 
 10753 This function looks up what is specified for TARGET in
 10754 `file-coding-system-alist', `process-coding-system-alist',
 10755 or `network-coding-system-alist' depending on OPERATION.
 10756 They may specify a coding system, a cons of coding systems,
 10757 or a function symbol to call.
 10758 In the last case, we call the function with one argument,
 10759 which is a list of all the arguments given to this function.
 10760 If the function can't decide a coding system, it can return
 10761 `undecided' so that the normal code-detection is performed.
 10762 
 10763 If OPERATION is `insert-file-contents', the argument corresponding to
 10764 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
 10765 file name to look up, and BUFFER is a buffer that contains the file's
 10766 contents (not yet decoded).  If `file-coding-system-alist' specifies a
 10767 function to call for FILENAME, that function should examine the
 10768 contents of BUFFER instead of reading the file.
 10769 
 10770 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
 10771   (ptrdiff_t nargs, Lisp_Object *args)
 10772 {
 10773   Lisp_Object operation, target_idx, target, val;
 10774   register Lisp_Object chain;
 10775 
 10776   if (nargs < 2)
 10777     error ("Too few arguments");
 10778   operation = args[0];
 10779   if (!SYMBOLP (operation)
 10780       || (target_idx = Fget (operation, Qtarget_idx), !FIXNATP (target_idx)))
 10781     error ("Invalid first argument");
 10782   if (nargs <= 1 + XFIXNAT (target_idx))
 10783     error ("Too few arguments for operation `%s'",
 10784            SDATA (SYMBOL_NAME (operation)));
 10785   target = args[XFIXNAT (target_idx) + 1];
 10786   if (!(STRINGP (target)
 10787         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
 10788             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
 10789         || (EQ (operation, Qopen_network_stream)
 10790             && (FIXNUMP (target) || EQ (target, Qt)))))
 10791     error ("Invalid argument %"pI"d of operation `%s'",
 10792            XFIXNAT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
 10793   if (CONSP (target))
 10794     target = XCAR (target);
 10795 
 10796   chain = ((EQ (operation, Qinsert_file_contents)
 10797             || EQ (operation, Qwrite_region))
 10798            ? Vfile_coding_system_alist
 10799            : (EQ (operation, Qopen_network_stream)
 10800               ? Vnetwork_coding_system_alist
 10801               : Vprocess_coding_system_alist));
 10802   if (NILP (chain))
 10803     return Qnil;
 10804 
 10805   for (; CONSP (chain); chain = XCDR (chain))
 10806     {
 10807       Lisp_Object elt;
 10808 
 10809       elt = XCAR (chain);
 10810       if (CONSP (elt)
 10811           && ((STRINGP (target)
 10812                && STRINGP (XCAR (elt))
 10813                && fast_string_match (XCAR (elt), target) >= 0)
 10814               || (FIXNUMP (target) && BASE_EQ (target, XCAR (elt)))))
 10815         {
 10816           val = XCDR (elt);
 10817           /* Here, if VAL is both a valid coding system and a valid
 10818              function symbol, we return VAL as a coding system.  */
 10819           if (CONSP (val))
 10820             return val;
 10821           if (! SYMBOLP (val))
 10822             return Qnil;
 10823           if (! NILP (Fcoding_system_p (val)))
 10824             return Fcons (val, val);
 10825           if (! NILP (Ffboundp (val)))
 10826             {
 10827               /* We use call1 rather than safe_call1
 10828                  so as to get bug reports about functions called here
 10829                  which don't handle the current interface.  */
 10830               val = call1 (val, Flist (nargs, args));
 10831               if (CONSP (val))
 10832                 return val;
 10833               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
 10834                 return Fcons (val, val);
 10835             }
 10836           return Qnil;
 10837         }
 10838     }
 10839   return Qnil;
 10840 }
 10841 
 10842 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
 10843        Sset_coding_system_priority, 0, MANY, 0,
 10844        doc: /* Assign higher priority to the coding systems given as arguments.
 10845 If multiple coding systems belong to the same category,
 10846 all but the first one are ignored.
 10847 
 10848 usage: (set-coding-system-priority &rest coding-systems)  */)
 10849   (ptrdiff_t nargs, Lisp_Object *args)
 10850 {
 10851   ptrdiff_t i, j;
 10852   bool changed[coding_category_max];
 10853   enum coding_category priorities[coding_category_max];
 10854 
 10855   memset (changed, 0, sizeof changed);
 10856 
 10857   for (i = j = 0; i < nargs; i++)
 10858     {
 10859       enum coding_category category;
 10860       Lisp_Object spec, attrs;
 10861 
 10862       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
 10863       attrs = AREF (spec, 0);
 10864       category = XFIXNUM (CODING_ATTR_CATEGORY (attrs));
 10865       if (changed[category])
 10866         /* Ignore this coding system because a coding system of the
 10867            same category already had a higher priority.  */
 10868         continue;
 10869       changed[category] = 1;
 10870       priorities[j++] = category;
 10871       if (coding_categories[category].id >= 0
 10872           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
 10873         setup_coding_system (args[i], &coding_categories[category]);
 10874       Fset (AREF (Vcoding_category_table, category), args[i]);
 10875     }
 10876 
 10877   /* Now we have decided top J priorities.  Reflect the order of the
 10878      original priorities to the remaining priorities.  */
 10879 
 10880   for (i = j, j = 0; i < coding_category_max; i++, j++)
 10881     {
 10882       while (j < coding_category_max
 10883              && changed[coding_priorities[j]])
 10884         j++;
 10885       if (j == coding_category_max)
 10886         emacs_abort ();
 10887       priorities[i] = coding_priorities[j];
 10888     }
 10889 
 10890   memcpy (coding_priorities, priorities, sizeof priorities);
 10891 
 10892   /* Update `coding-category-list'.  */
 10893   Vcoding_category_list = Qnil;
 10894   for (i = coding_category_max; i-- > 0; )
 10895     Vcoding_category_list
 10896       = Fcons (AREF (Vcoding_category_table, priorities[i]),
 10897                Vcoding_category_list);
 10898 
 10899   return Qnil;
 10900 }
 10901 
 10902 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
 10903        Scoding_system_priority_list, 0, 1, 0,
 10904        doc: /* Return a list of coding systems ordered by their priorities.
 10905 The list contains a subset of coding systems; i.e. coding systems
 10906 assigned to each coding category (see `coding-category-list').
 10907 
 10908 HIGHESTP non-nil means just return the highest priority one.  */)
 10909   (Lisp_Object highestp)
 10910 {
 10911   int i;
 10912   Lisp_Object val;
 10913 
 10914   for (i = 0, val = Qnil; i < coding_category_max; i++)
 10915     {
 10916       enum coding_category category = coding_priorities[i];
 10917       int id = coding_categories[category].id;
 10918       Lisp_Object attrs;
 10919 
 10920       if (id < 0)
 10921         continue;
 10922       attrs = CODING_ID_ATTRS (id);
 10923       if (! NILP (highestp))
 10924         return CODING_ATTR_BASE_NAME (attrs);
 10925       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
 10926     }
 10927   return Fnreverse (val);
 10928 }
 10929 
 10930 static Lisp_Object
 10931 make_subsidiaries (Lisp_Object base)
 10932 {
 10933   static char const suffixes[][8] = { "-unix", "-dos", "-mac" };
 10934   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
 10935   USE_SAFE_ALLOCA;
 10936   char *buf = SAFE_ALLOCA (base_name_len + 6);
 10937 
 10938   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
 10939   Lisp_Object subsidiaries = make_nil_vector (3);
 10940   for (int i = 0; i < 3; i++)
 10941     {
 10942       strcpy (buf + base_name_len, suffixes[i]);
 10943       ASET (subsidiaries, i, intern (buf));
 10944     }
 10945   SAFE_FREE ();
 10946   return subsidiaries;
 10947 }
 10948 
 10949 
 10950 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
 10951        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
 10952        doc: /* For internal use only.
 10953 usage: (define-coding-system-internal ...)  */)
 10954   (ptrdiff_t nargs, Lisp_Object *args)
 10955 {
 10956   enum coding_category category;
 10957   int max_charset_id = 0;
 10958 
 10959   if (nargs < coding_arg_max)
 10960     goto short_args;
 10961 
 10962   Lisp_Object attrs = make_nil_vector (coding_attr_last_index);
 10963 
 10964   Lisp_Object name = args[coding_arg_name];
 10965   CHECK_SYMBOL (name);
 10966   ASET (attrs, coding_attr_base_name, name);
 10967 
 10968   Lisp_Object val = args[coding_arg_mnemonic];
 10969   /* decode_mode_spec_coding assumes the mnemonic is a single character.  */
 10970   if (STRINGP (val))
 10971     val = make_fixnum (STRING_CHAR (SDATA (val)));
 10972   else
 10973     CHECK_CHARACTER (val);
 10974   ASET (attrs, coding_attr_mnemonic, val);
 10975 
 10976   Lisp_Object coding_type = args[coding_arg_coding_type];
 10977   CHECK_SYMBOL (coding_type);
 10978   ASET (attrs, coding_attr_type, coding_type);
 10979 
 10980   Lisp_Object charset_list = args[coding_arg_charset_list];
 10981   if (SYMBOLP (charset_list))
 10982     {
 10983       if (EQ (charset_list, Qiso_2022))
 10984         {
 10985           if (! EQ (coding_type, Qiso_2022))
 10986             error ("Invalid charset-list");
 10987           charset_list = Viso_2022_charset_list;
 10988         }
 10989       else if (EQ (charset_list, Qemacs_mule))
 10990         {
 10991           if (! EQ (coding_type, Qemacs_mule))
 10992             error ("Invalid charset-list");
 10993           charset_list = Vemacs_mule_charset_list;
 10994         }
 10995       for (Lisp_Object tail = charset_list; CONSP (tail); tail = XCDR (tail))
 10996         {
 10997           if (! RANGED_FIXNUMP (0, XCAR (tail), INT_MAX - 1))
 10998             error ("Invalid charset-list");
 10999           if (max_charset_id < XFIXNAT (XCAR (tail)))
 11000             max_charset_id = XFIXNAT (XCAR (tail));
 11001         }
 11002     }
 11003   else
 11004     {
 11005       charset_list = Fcopy_sequence (charset_list);
 11006       for (Lisp_Object tail = charset_list; CONSP (tail); tail = XCDR (tail))
 11007         {
 11008           struct charset *charset;
 11009 
 11010           val = XCAR (tail);
 11011           CHECK_CHARSET_GET_CHARSET (val, charset);
 11012           if (EQ (coding_type, Qiso_2022)
 11013               ? CHARSET_ISO_FINAL (charset) < 0
 11014               : EQ (coding_type, Qemacs_mule)
 11015               ? CHARSET_EMACS_MULE_ID (charset) < 0
 11016               : 0)
 11017             error ("Can't handle charset `%s'",
 11018                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
 11019 
 11020           XSETCAR (tail, make_fixnum (charset->id));
 11021           if (max_charset_id < charset->id)
 11022             max_charset_id = charset->id;
 11023         }
 11024     }
 11025   ASET (attrs, coding_attr_charset_list, charset_list);
 11026 
 11027   Lisp_Object safe_charsets = make_uninit_string (max_charset_id + 1);
 11028   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
 11029   for (Lisp_Object tail = charset_list; CONSP (tail); tail = XCDR (tail))
 11030     SSET (safe_charsets, XFIXNAT (XCAR (tail)), 0);
 11031   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
 11032 
 11033   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
 11034 
 11035   val = args[coding_arg_decode_translation_table];
 11036   if (! CHAR_TABLE_P (val) && ! CONSP (val))
 11037     CHECK_SYMBOL (val);
 11038   ASET (attrs, coding_attr_decode_tbl, val);
 11039 
 11040   val = args[coding_arg_encode_translation_table];
 11041   if (! CHAR_TABLE_P (val) && ! CONSP (val))
 11042     CHECK_SYMBOL (val);
 11043   ASET (attrs, coding_attr_encode_tbl, val);
 11044 
 11045   val = args[coding_arg_post_read_conversion];
 11046   CHECK_SYMBOL (val);
 11047   ASET (attrs, coding_attr_post_read, val);
 11048 
 11049   val = args[coding_arg_pre_write_conversion];
 11050   CHECK_SYMBOL (val);
 11051   ASET (attrs, coding_attr_pre_write, val);
 11052 
 11053   val = args[coding_arg_default_char];
 11054   if (NILP (val))
 11055     ASET (attrs, coding_attr_default_char, make_fixnum (' '));
 11056   else
 11057     {
 11058       CHECK_CHARACTER (val);
 11059       ASET (attrs, coding_attr_default_char, val);
 11060     }
 11061 
 11062   val = args[coding_arg_for_unibyte];
 11063   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
 11064 
 11065   val = args[coding_arg_plist];
 11066   CHECK_LIST (val);
 11067   ASET (attrs, coding_attr_plist, val);
 11068 
 11069   if (EQ (coding_type, Qcharset))
 11070     {
 11071       /* Generate a lisp vector of 256 elements.  Each element is nil,
 11072          integer, or a list of charset IDs.
 11073 
 11074          If Nth element is nil, the byte code N is invalid in this
 11075          coding system.
 11076 
 11077          If Nth element is a number NUM, N is the first byte of a
 11078          charset whose ID is NUM.
 11079 
 11080          If Nth element is a list of charset IDs, N is the first byte
 11081          of one of them.  The list is sorted by dimensions of the
 11082          charsets.  A charset of smaller dimension comes first. */
 11083       val = make_nil_vector (256);
 11084 
 11085       for (Lisp_Object tail = charset_list; CONSP (tail); tail = XCDR (tail))
 11086         {
 11087           struct charset *charset = CHARSET_FROM_ID (XFIXNAT (XCAR (tail)));
 11088           int dim = CHARSET_DIMENSION (charset);
 11089           int idx = (dim - 1) * 4;
 11090 
 11091           if (CHARSET_ASCII_COMPATIBLE_P (charset))
 11092             ASET (attrs, coding_attr_ascii_compat, Qt);
 11093 
 11094           for (int i = charset->code_space[idx];
 11095                i <= charset->code_space[idx + 1]; i++)
 11096             {
 11097               Lisp_Object tmp, tmp2;
 11098               int dim2;
 11099 
 11100               tmp = AREF (val, i);
 11101               if (NILP (tmp))
 11102                 tmp = XCAR (tail);
 11103               else if (FIXNATP (tmp))
 11104                 {
 11105                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFIXNAT (tmp)));
 11106                   if (dim < dim2)
 11107                     tmp = list2 (XCAR (tail), tmp);
 11108                   else
 11109                     tmp = list2 (tmp, XCAR (tail));
 11110                 }
 11111               else
 11112                 {
 11113                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
 11114                     {
 11115                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFIXNAT (XCAR (tmp2))));
 11116                       if (dim < dim2)
 11117                         break;
 11118                     }
 11119                   if (NILP (tmp2))
 11120                     tmp = nconc2 (tmp, list1 (XCAR (tail)));
 11121                   else
 11122                     {
 11123                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
 11124                       XSETCAR (tmp2, XCAR (tail));
 11125                     }
 11126                 }
 11127               ASET (val, i, tmp);
 11128             }
 11129         }
 11130       ASET (attrs, coding_attr_charset_valids, val);
 11131       category = coding_category_charset;
 11132     }
 11133   else if (EQ (coding_type, Qccl))
 11134     {
 11135       Lisp_Object valids;
 11136 
 11137       if (nargs < coding_arg_ccl_max)
 11138         goto short_args;
 11139 
 11140       val = args[coding_arg_ccl_decoder];
 11141       CHECK_CCL_PROGRAM (val);
 11142       if (VECTORP (val))
 11143         val = Fcopy_sequence (val);
 11144       ASET (attrs, coding_attr_ccl_decoder, val);
 11145 
 11146       val = args[coding_arg_ccl_encoder];
 11147       CHECK_CCL_PROGRAM (val);
 11148       if (VECTORP (val))
 11149         val = Fcopy_sequence (val);
 11150       ASET (attrs, coding_attr_ccl_encoder, val);
 11151 
 11152       val = args[coding_arg_ccl_valids];
 11153       valids = Fmake_string (make_fixnum (256), make_fixnum (0), Qnil);
 11154       for (Lisp_Object tail = val; CONSP (tail); tail = XCDR (tail))
 11155         {
 11156           int from, to;
 11157 
 11158           val = XCAR (tail);
 11159           if (FIXNUMP (val))
 11160             {
 11161               if (! (0 <= XFIXNUM (val) && XFIXNUM (val) <= 255))
 11162                 args_out_of_range_3 (val, make_fixnum (0), make_fixnum (255));
 11163               from = to = XFIXNUM (val);
 11164             }
 11165           else
 11166             {
 11167               CHECK_CONS (val);
 11168               from = check_integer_range (XCAR (val), 0, 255);
 11169               to = check_integer_range (XCDR (val), from, 255);
 11170             }
 11171           for (int i = from; i <= to; i++)
 11172             SSET (valids, i, 1);
 11173         }
 11174       ASET (attrs, coding_attr_ccl_valids, valids);
 11175 
 11176       category = coding_category_ccl;
 11177     }
 11178   else if (EQ (coding_type, Qutf_16))
 11179     {
 11180       Lisp_Object bom, endian;
 11181 
 11182       ASET (attrs, coding_attr_ascii_compat, Qnil);
 11183 
 11184       if (nargs < coding_arg_utf16_max)
 11185         goto short_args;
 11186 
 11187       bom = args[coding_arg_utf16_bom];
 11188       if (! NILP (bom) && ! EQ (bom, Qt))
 11189         {
 11190           CHECK_CONS (bom);
 11191           val = XCAR (bom);
 11192           CHECK_CODING_SYSTEM (val);
 11193           val = XCDR (bom);
 11194           CHECK_CODING_SYSTEM (val);
 11195         }
 11196       ASET (attrs, coding_attr_utf_bom, bom);
 11197 
 11198       endian = args[coding_arg_utf16_endian];
 11199       CHECK_SYMBOL (endian);
 11200       if (NILP (endian))
 11201         endian = Qbig;
 11202       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
 11203         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
 11204       ASET (attrs, coding_attr_utf_16_endian, endian);
 11205 
 11206       category = (CONSP (bom)
 11207                   ? coding_category_utf_16_auto
 11208                   : NILP (bom)
 11209                   ? (EQ (endian, Qbig)
 11210                      ? coding_category_utf_16_be_nosig
 11211                      : coding_category_utf_16_le_nosig)
 11212                   : (EQ (endian, Qbig)
 11213                      ? coding_category_utf_16_be
 11214                      : coding_category_utf_16_le));
 11215     }
 11216   else if (EQ (coding_type, Qiso_2022))
 11217     {
 11218       Lisp_Object initial, reg_usage, request, flags;
 11219 
 11220       if (nargs < coding_arg_iso2022_max)
 11221         goto short_args;
 11222 
 11223       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
 11224       CHECK_VECTOR (initial);
 11225       for (int i = 0; i < 4; i++)
 11226         {
 11227           val = AREF (initial, i);
 11228           if (! NILP (val))
 11229             {
 11230               struct charset *charset;
 11231 
 11232               CHECK_CHARSET_GET_CHARSET (val, charset);
 11233               ASET (initial, i, make_fixnum (CHARSET_ID (charset)));
 11234               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
 11235                 ASET (attrs, coding_attr_ascii_compat, Qt);
 11236             }
 11237           else
 11238             ASET (initial, i, make_fixnum (-1));
 11239         }
 11240 
 11241       reg_usage = args[coding_arg_iso2022_reg_usage];
 11242       CHECK_CONS (reg_usage);
 11243       CHECK_FIXNUM (XCAR (reg_usage));
 11244       CHECK_FIXNUM (XCDR (reg_usage));
 11245 
 11246       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
 11247       for (Lisp_Object tail = request; CONSP (tail); tail = XCDR (tail))
 11248         {
 11249           int id;
 11250 
 11251           val = XCAR (tail);
 11252           CHECK_CONS (val);
 11253           CHECK_CHARSET_GET_ID (XCAR (val), id);
 11254           check_integer_range (XCDR (val), 0, 3);
 11255           XSETCAR (val, make_fixnum (id));
 11256         }
 11257 
 11258       flags = args[coding_arg_iso2022_flags];
 11259       CHECK_FIXNAT (flags);
 11260       int i = XFIXNUM (flags) & INT_MAX;
 11261       if (EQ (args[coding_arg_charset_list], Qiso_2022))
 11262         i |= CODING_ISO_FLAG_FULL_SUPPORT;
 11263       flags = make_fixnum (i);
 11264 
 11265       ASET (attrs, coding_attr_iso_initial, initial);
 11266       ASET (attrs, coding_attr_iso_usage, reg_usage);
 11267       ASET (attrs, coding_attr_iso_request, request);
 11268       ASET (attrs, coding_attr_iso_flags, flags);
 11269       setup_iso_safe_charsets (attrs);
 11270 
 11271       if (i & CODING_ISO_FLAG_SEVEN_BITS)
 11272         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
 11273                           | CODING_ISO_FLAG_SINGLE_SHIFT))
 11274                     ? coding_category_iso_7_else
 11275                     : EQ (args[coding_arg_charset_list], Qiso_2022)
 11276                     ? coding_category_iso_7
 11277                     : coding_category_iso_7_tight);
 11278       else
 11279         {
 11280           int id = XFIXNUM (AREF (initial, 1));
 11281 
 11282           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
 11283                        || EQ (args[coding_arg_charset_list], Qiso_2022)
 11284                        || id < 0)
 11285                       ? coding_category_iso_8_else
 11286                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
 11287                       ? coding_category_iso_8_1
 11288                       : coding_category_iso_8_2);
 11289         }
 11290       if (category != coding_category_iso_8_1
 11291           && category != coding_category_iso_8_2)
 11292         ASET (attrs, coding_attr_ascii_compat, Qnil);
 11293     }
 11294   else if (EQ (coding_type, Qemacs_mule))
 11295     {
 11296       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
 11297         ASET (attrs, coding_attr_emacs_mule_full, Qt);
 11298       ASET (attrs, coding_attr_ascii_compat, Qt);
 11299       category = coding_category_emacs_mule;
 11300     }
 11301   else if (EQ (coding_type, Qshift_jis))
 11302     {
 11303       ptrdiff_t charset_list_len = list_length (charset_list);
 11304       if (charset_list_len != 3 && charset_list_len != 4)
 11305         error ("There should be three or four charsets");
 11306 
 11307       struct charset *charset = CHARSET_FROM_ID (XFIXNUM (XCAR (charset_list)));
 11308       if (CHARSET_DIMENSION (charset) != 1)
 11309         error ("Dimension of charset %s is not one",
 11310                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
 11311       if (CHARSET_ASCII_COMPATIBLE_P (charset))
 11312         ASET (attrs, coding_attr_ascii_compat, Qt);
 11313 
 11314       charset_list = XCDR (charset_list);
 11315       charset = CHARSET_FROM_ID (XFIXNUM (XCAR (charset_list)));
 11316       if (CHARSET_DIMENSION (charset) != 1)
 11317         error ("Dimension of charset %s is not one",
 11318                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
 11319 
 11320       charset_list = XCDR (charset_list);
 11321       charset = CHARSET_FROM_ID (XFIXNUM (XCAR (charset_list)));
 11322       if (CHARSET_DIMENSION (charset) != 2)
 11323         error ("Dimension of charset %s is not two",
 11324                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
 11325 
 11326       charset_list = XCDR (charset_list);
 11327       if (! NILP (charset_list))
 11328         {
 11329           charset = CHARSET_FROM_ID (XFIXNUM (XCAR (charset_list)));
 11330           if (CHARSET_DIMENSION (charset) != 2)
 11331             error ("Dimension of charset %s is not two",
 11332                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
 11333         }
 11334 
 11335       category = coding_category_sjis;
 11336       Vsjis_coding_system = name;
 11337     }
 11338   else if (EQ (coding_type, Qbig5))
 11339     {
 11340       struct charset *charset;
 11341 
 11342       if (list_length (charset_list) != 2)
 11343         error ("There should be just two charsets");
 11344 
 11345       charset = CHARSET_FROM_ID (XFIXNUM (XCAR (charset_list)));
 11346       if (CHARSET_DIMENSION (charset) != 1)
 11347         error ("Dimension of charset %s is not one",
 11348                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
 11349       if (CHARSET_ASCII_COMPATIBLE_P (charset))
 11350         ASET (attrs, coding_attr_ascii_compat, Qt);
 11351 
 11352       charset_list = XCDR (charset_list);
 11353       charset = CHARSET_FROM_ID (XFIXNUM (XCAR (charset_list)));
 11354       if (CHARSET_DIMENSION (charset) != 2)
 11355         error ("Dimension of charset %s is not two",
 11356                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
 11357 
 11358       category = coding_category_big5;
 11359       Vbig5_coding_system = name;
 11360     }
 11361   else if (EQ (coding_type, Qraw_text))
 11362     {
 11363       category = coding_category_raw_text;
 11364       ASET (attrs, coding_attr_ascii_compat, Qt);
 11365     }
 11366   else if (EQ (coding_type, Qutf_8))
 11367     {
 11368       Lisp_Object bom;
 11369 
 11370       if (nargs < coding_arg_utf8_max)
 11371         goto short_args;
 11372 
 11373       bom = args[coding_arg_utf8_bom];
 11374       if (! NILP (bom) && ! EQ (bom, Qt))
 11375         {
 11376           CHECK_CONS (bom);
 11377           val = XCAR (bom);
 11378           CHECK_CODING_SYSTEM (val);
 11379           val = XCDR (bom);
 11380           CHECK_CODING_SYSTEM (val);
 11381         }
 11382       ASET (attrs, coding_attr_utf_bom, bom);
 11383       if (NILP (bom))
 11384         ASET (attrs, coding_attr_ascii_compat, Qt);
 11385 
 11386       category = (CONSP (bom) ? coding_category_utf_8_auto
 11387                   : NILP (bom) ? coding_category_utf_8_nosig
 11388                   : coding_category_utf_8_sig);
 11389     }
 11390   else if (EQ (coding_type, Qundecided))
 11391     {
 11392       if (nargs < coding_arg_undecided_max)
 11393         goto short_args;
 11394       ASET (attrs, coding_attr_undecided_inhibit_null_byte_detection,
 11395             args[coding_arg_undecided_inhibit_null_byte_detection]);
 11396       ASET (attrs, coding_attr_undecided_inhibit_iso_escape_detection,
 11397             args[coding_arg_undecided_inhibit_iso_escape_detection]);
 11398       ASET (attrs, coding_attr_undecided_prefer_utf_8,
 11399             args[coding_arg_undecided_prefer_utf_8]);
 11400       category = coding_category_undecided;
 11401     }
 11402   else
 11403     error ("Invalid coding system type: %s",
 11404            SDATA (SYMBOL_NAME (coding_type)));
 11405 
 11406   ASET (attrs, coding_attr_category, make_fixnum (category));
 11407   ASET (attrs, coding_attr_plist,
 11408         Fcons (QCcategory,
 11409                Fcons (AREF (Vcoding_category_table, category),
 11410                       CODING_ATTR_PLIST (attrs))));
 11411   ASET (attrs, coding_attr_plist,
 11412         Fcons (QCascii_compatible_p,
 11413                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
 11414                       CODING_ATTR_PLIST (attrs))));
 11415 
 11416   Lisp_Object eol_type = args[coding_arg_eol_type];
 11417   if (! NILP (eol_type)
 11418       && ! EQ (eol_type, Qunix)
 11419       && ! EQ (eol_type, Qdos)
 11420       && ! EQ (eol_type, Qmac))
 11421     error ("Invalid eol-type");
 11422 
 11423   Lisp_Object aliases = list1 (name);
 11424 
 11425   if (NILP (eol_type))
 11426     {
 11427       eol_type = make_subsidiaries (name);
 11428       for (int i = 0; i < 3; i++)
 11429         {
 11430           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
 11431 
 11432           this_name = AREF (eol_type, i);
 11433           this_aliases = list1 (this_name);
 11434           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
 11435           this_spec = make_uninit_vector (3);
 11436           ASET (this_spec, 0, attrs);
 11437           ASET (this_spec, 1, this_aliases);
 11438           ASET (this_spec, 2, this_eol_type);
 11439           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
 11440           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
 11441           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist, Qnil);
 11442           if (NILP (val))
 11443             Vcoding_system_alist
 11444               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
 11445                        Vcoding_system_alist);
 11446         }
 11447     }
 11448 
 11449   Lisp_Object spec_vec = make_uninit_vector (3);
 11450   ASET (spec_vec, 0, attrs);
 11451   ASET (spec_vec, 1, aliases);
 11452   ASET (spec_vec, 2, eol_type);
 11453 
 11454   Fputhash (name, spec_vec, Vcoding_system_hash_table);
 11455   Vcoding_system_list = Fcons (name, Vcoding_system_list);
 11456   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist, Qnil);
 11457   if (NILP (val))
 11458     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
 11459                                   Vcoding_system_alist);
 11460 
 11461   int id = coding_categories[category].id;
 11462   if (id < 0 || EQ (name, CODING_ID_NAME (id)))
 11463       setup_coding_system (name, &coding_categories[category]);
 11464 
 11465   return Qnil;
 11466 
 11467  short_args:
 11468   Fsignal (Qwrong_number_of_arguments,
 11469            Fcons (intern ("define-coding-system-internal"),
 11470                   make_fixnum (nargs)));
 11471 }
 11472 
 11473 
 11474 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
 11475        3, 3, 0,
 11476        doc: /* Change value of CODING-SYSTEM's property PROP to VAL.
 11477 
 11478 The following properties, if set by this function, override the values
 11479 of the corresponding attributes set by `define-coding-system':
 11480 
 11481   `:mnemonic', `:default-char', `:ascii-compatible-p'
 11482   `:decode-translation-table', `:encode-translation-table',
 11483   `:post-read-conversion', `:pre-write-conversion'
 11484 
 11485 See `define-coding-system' for the description of these properties.
 11486 See `coding-system-get' and `coding-system-plist' for accessing the
 11487 property list of a coding-system.  */)
 11488   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
 11489 {
 11490   Lisp_Object spec, attrs;
 11491 
 11492   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
 11493   attrs = AREF (spec, 0);
 11494   if (EQ (prop, QCmnemonic))
 11495     {
 11496       /* decode_mode_spec_coding assumes the mnemonic is a single character.  */
 11497       if (STRINGP (val))
 11498         val = make_fixnum (STRING_CHAR (SDATA (val)));
 11499       else
 11500         CHECK_CHARACTER (val);
 11501       ASET (attrs, coding_attr_mnemonic, val);
 11502     }
 11503   else if (EQ (prop, QCdefault_char))
 11504     {
 11505       if (NILP (val))
 11506         val = make_fixnum (' ');
 11507       else
 11508         CHECK_CHARACTER (val);
 11509       ASET (attrs, coding_attr_default_char, val);
 11510     }
 11511   else if (EQ (prop, QCdecode_translation_table))
 11512     {
 11513       if (! CHAR_TABLE_P (val) && ! CONSP (val))
 11514         CHECK_SYMBOL (val);
 11515       ASET (attrs, coding_attr_decode_tbl, val);
 11516     }
 11517   else if (EQ (prop, QCencode_translation_table))
 11518     {
 11519       if (! CHAR_TABLE_P (val) && ! CONSP (val))
 11520         CHECK_SYMBOL (val);
 11521       ASET (attrs, coding_attr_encode_tbl, val);
 11522     }
 11523   else if (EQ (prop, QCpost_read_conversion))
 11524     {
 11525       CHECK_SYMBOL (val);
 11526       ASET (attrs, coding_attr_post_read, val);
 11527     }
 11528   else if (EQ (prop, QCpre_write_conversion))
 11529     {
 11530       CHECK_SYMBOL (val);
 11531       ASET (attrs, coding_attr_pre_write, val);
 11532     }
 11533   else if (EQ (prop, QCascii_compatible_p))
 11534     {
 11535       ASET (attrs, coding_attr_ascii_compat, val);
 11536     }
 11537 
 11538   ASET (attrs, coding_attr_plist,
 11539         plist_put (CODING_ATTR_PLIST (attrs), prop, val));
 11540   return val;
 11541 }
 11542 
 11543 
 11544 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
 11545        Sdefine_coding_system_alias, 2, 2, 0,
 11546        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
 11547   (Lisp_Object alias, Lisp_Object coding_system)
 11548 {
 11549   Lisp_Object spec, aliases, eol_type, val;
 11550 
 11551   CHECK_SYMBOL (alias);
 11552   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
 11553   aliases = AREF (spec, 1);
 11554   /* ALIASES should be a list of length more than zero, and the first
 11555      element is a base coding system.  Append ALIAS at the tail of the
 11556      list.  */
 11557   while (!NILP (XCDR (aliases)))
 11558     aliases = XCDR (aliases);
 11559   XSETCDR (aliases, list1 (alias));
 11560 
 11561   eol_type = AREF (spec, 2);
 11562   if (VECTORP (eol_type))
 11563     {
 11564       Lisp_Object subsidiaries;
 11565       int i;
 11566 
 11567       subsidiaries = make_subsidiaries (alias);
 11568       for (i = 0; i < 3; i++)
 11569         Fdefine_coding_system_alias (AREF (subsidiaries, i),
 11570                                      AREF (eol_type, i));
 11571     }
 11572 
 11573   Fputhash (alias, spec, Vcoding_system_hash_table);
 11574   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
 11575   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist, Qnil);
 11576   if (NILP (val))
 11577     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
 11578                                   Vcoding_system_alist);
 11579 
 11580   return Qnil;
 11581 }
 11582 
 11583 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
 11584        1, 1, 0,
 11585        doc: /* Return the base of CODING-SYSTEM.
 11586 Any alias or subsidiary coding system is not a base coding system.  */)
 11587   (Lisp_Object coding_system)
 11588 {
 11589   Lisp_Object spec, attrs;
 11590 
 11591   if (NILP (coding_system))
 11592     return (Qno_conversion);
 11593   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
 11594   attrs = AREF (spec, 0);
 11595   return CODING_ATTR_BASE_NAME (attrs);
 11596 }
 11597 
 11598 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
 11599        1, 1, 0,
 11600        doc: /* Return the property list of CODING-SYSTEM.  */)
 11601   (Lisp_Object coding_system)
 11602 {
 11603   Lisp_Object spec, attrs;
 11604 
 11605   if (NILP (coding_system))
 11606     coding_system = Qno_conversion;
 11607   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
 11608   attrs = AREF (spec, 0);
 11609   return CODING_ATTR_PLIST (attrs);
 11610 }
 11611 
 11612 
 11613 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
 11614        1, 1, 0,
 11615        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
 11616   (Lisp_Object coding_system)
 11617 {
 11618   Lisp_Object spec;
 11619 
 11620   if (NILP (coding_system))
 11621     coding_system = Qno_conversion;
 11622   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
 11623   return AREF (spec, 1);
 11624 }
 11625 
 11626 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
 11627        Scoding_system_eol_type, 1, 1, 0,
 11628        doc: /* Return eol-type of CODING-SYSTEM.
 11629 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
 11630 
 11631 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
 11632 and CR respectively.
 11633 
 11634 A vector value indicates that a format of end-of-line should be
 11635 detected automatically.  Nth element of the vector is the subsidiary
 11636 coding system whose eol-type is N.  */)
 11637   (Lisp_Object coding_system)
 11638 {
 11639   Lisp_Object spec, eol_type;
 11640   int n;
 11641 
 11642   if (NILP (coding_system))
 11643     coding_system = Qno_conversion;
 11644   if (! CODING_SYSTEM_P (coding_system))
 11645     return Qnil;
 11646   spec = CODING_SYSTEM_SPEC (coding_system);
 11647   eol_type = AREF (spec, 2);
 11648   if (VECTORP (eol_type))
 11649     return Fcopy_sequence (eol_type);
 11650   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
 11651   return make_fixnum (n);
 11652 }
 11653 
 11654 
 11655 /*** 9. Post-amble ***/
 11656 
 11657 void
 11658 init_coding_once (void)
 11659 {
 11660   int i;
 11661 
 11662   for (i = 0; i < coding_category_max; i++)
 11663     {
 11664       coding_categories[i].id = -1;
 11665       coding_priorities[i] = i;
 11666     }
 11667 
 11668   PDUMPER_REMEMBER_SCALAR (coding_categories);
 11669   PDUMPER_REMEMBER_SCALAR (coding_priorities);
 11670 
 11671   /* ISO2022 specific initialize routine.  */
 11672   for (i = 0; i < 0x20; i++)
 11673     iso_code_class[i] = ISO_control_0;
 11674   for (i = 0x21; i < 0x7F; i++)
 11675     iso_code_class[i] = ISO_graphic_plane_0;
 11676   for (i = 0x80; i < 0xA0; i++)
 11677     iso_code_class[i] = ISO_control_1;
 11678   for (i = 0xA1; i < 0xFF; i++)
 11679     iso_code_class[i] = ISO_graphic_plane_1;
 11680   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
 11681   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
 11682   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
 11683   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
 11684   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
 11685   iso_code_class[ISO_CODE_ESC] = ISO_escape;
 11686   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
 11687   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
 11688   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
 11689 
 11690   PDUMPER_REMEMBER_SCALAR (iso_code_class);
 11691 
 11692   for (i = 0; i < 256; i++)
 11693     {
 11694       emacs_mule_bytes[i] = 1;
 11695     }
 11696   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
 11697   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
 11698   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
 11699   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
 11700 
 11701   PDUMPER_REMEMBER_SCALAR (emacs_mule_bytes);
 11702 }
 11703 
 11704 static void reset_coding_after_pdumper_load (void);
 11705 
 11706 void
 11707 syms_of_coding (void)
 11708 {
 11709   staticpro (&Vcoding_system_hash_table);
 11710   Vcoding_system_hash_table = CALLN (Fmake_hash_table, QCtest, Qeq);
 11711 
 11712   staticpro (&Vsjis_coding_system);
 11713   Vsjis_coding_system = Qnil;
 11714 
 11715   staticpro (&Vbig5_coding_system);
 11716   Vbig5_coding_system = Qnil;
 11717 
 11718   staticpro (&Vcode_conversion_reused_workbuf);
 11719   Vcode_conversion_reused_workbuf = Qnil;
 11720 
 11721   staticpro (&Vcode_conversion_workbuf_name);
 11722   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
 11723 
 11724   reused_workbuf_in_use = false;
 11725   PDUMPER_REMEMBER_SCALAR (reused_workbuf_in_use);
 11726 
 11727   DEFSYM (Qcharset, "charset");
 11728   DEFSYM (Qtarget_idx, "target-idx");
 11729   DEFSYM (Qcoding_system_history, "coding-system-history");
 11730   Fset (Qcoding_system_history, Qnil);
 11731 
 11732   /* Target FILENAME is the first argument.  */
 11733   Fput (Qinsert_file_contents, Qtarget_idx, make_fixnum (0));
 11734   /* Target FILENAME is the third argument.  */
 11735   Fput (Qwrite_region, Qtarget_idx, make_fixnum (2));
 11736 
 11737   DEFSYM (Qcall_process, "call-process");
 11738   /* Target PROGRAM is the first argument.  */
 11739   Fput (Qcall_process, Qtarget_idx, make_fixnum (0));
 11740 
 11741   DEFSYM (Qcall_process_region, "call-process-region");
 11742   /* Target PROGRAM is the third argument.  */
 11743   Fput (Qcall_process_region, Qtarget_idx, make_fixnum (2));
 11744 
 11745   DEFSYM (Qstart_process, "start-process");
 11746   /* Target PROGRAM is the third argument.  */
 11747   Fput (Qstart_process, Qtarget_idx, make_fixnum (2));
 11748 
 11749   DEFSYM (Qopen_network_stream, "open-network-stream");
 11750   /* Target SERVICE is the fourth argument.  */
 11751   Fput (Qopen_network_stream, Qtarget_idx, make_fixnum (3));
 11752 
 11753   DEFSYM (Qunix, "unix");
 11754   DEFSYM (Qdos, "dos");
 11755   DEFSYM (Qmac, "mac");
 11756 
 11757   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
 11758   DEFSYM (Qundecided, "undecided");
 11759   DEFSYM (Qno_conversion, "no-conversion");
 11760   DEFSYM (Qraw_text, "raw-text");
 11761   DEFSYM (Qus_ascii, "us-ascii");
 11762 
 11763   DEFSYM (Qiso_2022, "iso-2022");
 11764 
 11765   DEFSYM (Qutf_8, "utf-8");
 11766   DEFSYM (Qutf_8_unix, "utf-8-unix");
 11767   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
 11768 
 11769 #if defined (WINDOWSNT) || defined (CYGWIN) || defined HAVE_ANDROID
 11770   /* No, not utf-16-le: that one has a BOM.  */
 11771   DEFSYM (Qutf_16le, "utf-16le");
 11772 #endif
 11773 
 11774   DEFSYM (Qutf_16, "utf-16");
 11775   DEFSYM (Qbig, "big");
 11776   DEFSYM (Qlittle, "little");
 11777 
 11778   DEFSYM (Qshift_jis, "shift-jis");
 11779   DEFSYM (Qbig5, "big5");
 11780 
 11781   DEFSYM (Qcoding_system_p, "coding-system-p");
 11782 
 11783   /* Error signaled when there's a problem with detecting a coding system.  */
 11784   DEFSYM (Qcoding_system_error, "coding-system-error");
 11785   Fput (Qcoding_system_error, Qerror_conditions,
 11786         pure_list (Qcoding_system_error, Qerror));
 11787   Fput (Qcoding_system_error, Qerror_message,
 11788         build_pure_c_string ("Invalid coding system"));
 11789 
 11790   DEFSYM (Qtranslation_table, "translation-table");
 11791   Fput (Qtranslation_table, Qchar_table_extra_slots, make_fixnum (2));
 11792   DEFSYM (Qtranslation_table_id, "translation-table-id");
 11793 
 11794   /* Coding system emacs-mule and raw-text are for converting only
 11795      end-of-line format.  */
 11796   DEFSYM (Qemacs_mule, "emacs-mule");
 11797 
 11798   DEFSYM (QCcategory, ":category");
 11799   DEFSYM (QCmnemonic, ":mnemonic");
 11800   DEFSYM (QCdefault_char, ":default-char");
 11801   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
 11802   DEFSYM (QCencode_translation_table, ":encode-translation-table");
 11803   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
 11804   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
 11805   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
 11806 
 11807   Vcoding_category_table = make_nil_vector (coding_category_max);
 11808   staticpro (&Vcoding_category_table);
 11809   /* Followings are target of code detection.  */
 11810   ASET (Vcoding_category_table, coding_category_iso_7,
 11811         intern_c_string ("coding-category-iso-7"));
 11812   ASET (Vcoding_category_table, coding_category_iso_7_tight,
 11813         intern_c_string ("coding-category-iso-7-tight"));
 11814   ASET (Vcoding_category_table, coding_category_iso_8_1,
 11815         intern_c_string ("coding-category-iso-8-1"));
 11816   ASET (Vcoding_category_table, coding_category_iso_8_2,
 11817         intern_c_string ("coding-category-iso-8-2"));
 11818   ASET (Vcoding_category_table, coding_category_iso_7_else,
 11819         intern_c_string ("coding-category-iso-7-else"));
 11820   ASET (Vcoding_category_table, coding_category_iso_8_else,
 11821         intern_c_string ("coding-category-iso-8-else"));
 11822   ASET (Vcoding_category_table, coding_category_utf_8_auto,
 11823         intern_c_string ("coding-category-utf-8-auto"));
 11824   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
 11825         intern_c_string ("coding-category-utf-8"));
 11826   ASET (Vcoding_category_table, coding_category_utf_8_sig,
 11827         intern_c_string ("coding-category-utf-8-sig"));
 11828   ASET (Vcoding_category_table, coding_category_utf_16_be,
 11829         intern_c_string ("coding-category-utf-16-be"));
 11830   ASET (Vcoding_category_table, coding_category_utf_16_auto,
 11831         intern_c_string ("coding-category-utf-16-auto"));
 11832   ASET (Vcoding_category_table, coding_category_utf_16_le,
 11833         intern_c_string ("coding-category-utf-16-le"));
 11834   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
 11835         intern_c_string ("coding-category-utf-16-be-nosig"));
 11836   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
 11837         intern_c_string ("coding-category-utf-16-le-nosig"));
 11838   ASET (Vcoding_category_table, coding_category_charset,
 11839         intern_c_string ("coding-category-charset"));
 11840   ASET (Vcoding_category_table, coding_category_sjis,
 11841         intern_c_string ("coding-category-sjis"));
 11842   ASET (Vcoding_category_table, coding_category_big5,
 11843         intern_c_string ("coding-category-big5"));
 11844   ASET (Vcoding_category_table, coding_category_ccl,
 11845         intern_c_string ("coding-category-ccl"));
 11846   ASET (Vcoding_category_table, coding_category_emacs_mule,
 11847         intern_c_string ("coding-category-emacs-mule"));
 11848   /* Followings are NOT target of code detection.  */
 11849   ASET (Vcoding_category_table, coding_category_raw_text,
 11850         intern_c_string ("coding-category-raw-text"));
 11851   ASET (Vcoding_category_table, coding_category_undecided,
 11852         intern_c_string ("coding-category-undecided"));
 11853 
 11854   DEFSYM (Qinsufficient_source, "insufficient-source");
 11855   DEFSYM (Qinvalid_source, "invalid-source");
 11856   DEFSYM (Qinterrupted, "interrupted");
 11857 
 11858   /* If a symbol has this property, evaluate the value to define the
 11859      symbol as a coding system.  */
 11860   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
 11861 
 11862   DEFSYM (Qignored, "ignored");
 11863 
 11864   DEFSYM (Qutf_8_string_p, "utf-8-string-p");
 11865   DEFSYM (Qfilenamep, "filenamep");
 11866 
 11867   defsubr (&Scoding_system_p);
 11868   defsubr (&Sread_coding_system);
 11869   defsubr (&Sread_non_nil_coding_system);
 11870   defsubr (&Scheck_coding_system);
 11871   defsubr (&Sdetect_coding_region);
 11872   defsubr (&Sdetect_coding_string);
 11873   defsubr (&Sfind_coding_systems_region_internal);
 11874   defsubr (&Sunencodable_char_position);
 11875   defsubr (&Scheck_coding_systems_region);
 11876   defsubr (&Sdecode_coding_region);
 11877   defsubr (&Sencode_coding_region);
 11878   defsubr (&Sdecode_coding_string);
 11879   defsubr (&Sencode_coding_string);
 11880 #ifdef ENABLE_UTF_8_CONVERTER_TEST
 11881   defsubr (&Sinternal_encode_string_utf_8);
 11882   defsubr (&Sinternal_decode_string_utf_8);
 11883 #endif  /* ENABLE_UTF_8_CONVERTER_TEST */
 11884   defsubr (&Sdecode_sjis_char);
 11885   defsubr (&Sencode_sjis_char);
 11886   defsubr (&Sdecode_big5_char);
 11887   defsubr (&Sencode_big5_char);
 11888   defsubr (&Sset_terminal_coding_system_internal);
 11889   defsubr (&Sset_safe_terminal_coding_system_internal);
 11890   defsubr (&Sterminal_coding_system);
 11891   defsubr (&Sset_keyboard_coding_system_internal);
 11892   defsubr (&Skeyboard_coding_system);
 11893   defsubr (&Sfind_operation_coding_system);
 11894   defsubr (&Sset_coding_system_priority);
 11895   defsubr (&Sdefine_coding_system_internal);
 11896   defsubr (&Sdefine_coding_system_alias);
 11897   defsubr (&Scoding_system_put);
 11898   defsubr (&Scoding_system_base);
 11899   defsubr (&Scoding_system_plist);
 11900   defsubr (&Scoding_system_aliases);
 11901   defsubr (&Scoding_system_eol_type);
 11902   defsubr (&Scoding_system_priority_list);
 11903 
 11904   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
 11905                doc: /* List of coding systems.
 11906 
 11907 Do not alter the value of this variable manually.  This variable should be
 11908 updated by the functions `define-coding-system' and
 11909 `define-coding-system-alias'.  */);
 11910   Vcoding_system_list = Qnil;
 11911 
 11912   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
 11913                doc: /* Alist of coding system names.
 11914 Each element is one element list of coding system name.
 11915 This variable is given to `completing-read' as COLLECTION argument.
 11916 
 11917 Do not alter the value of this variable manually.  This variable should be
 11918 updated by `define-coding-system-alias'.  */);
 11919   Vcoding_system_alist = Qnil;
 11920 
 11921   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
 11922                doc: /* List of coding-categories (symbols) ordered by priority.
 11923 
 11924 On detecting a coding system, Emacs tries code detection algorithms
 11925 associated with each coding-category one by one in this order.  When
 11926 one algorithm agrees with a byte sequence of source text, the coding
 11927 system bound to the corresponding coding-category is selected.
 11928 
 11929 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
 11930   {
 11931     int i;
 11932 
 11933     Vcoding_category_list = Qnil;
 11934     for (i = coding_category_max - 1; i >= 0; i--)
 11935       Vcoding_category_list
 11936         = Fcons (AREF (Vcoding_category_table, i),
 11937                  Vcoding_category_list);
 11938   }
 11939 
 11940   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
 11941                doc: /* Specify the coding system for read operations.
 11942 It is useful to bind this variable with `let', but do not set it globally.
 11943 If the value is a coding system, it is used for decoding on read operation.
 11944 If not, an appropriate element is used from one of the coding system alists.
 11945 There are three such tables: `file-coding-system-alist',
 11946 `process-coding-system-alist', and `network-coding-system-alist'.  */);
 11947   Vcoding_system_for_read = Qnil;
 11948 
 11949   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
 11950                doc: /* Specify the coding system for write operations.
 11951 Programs bind this variable with `let', but you should not set it globally.
 11952 If the value is a coding system, it is used for encoding of output,
 11953 when writing it to a file and when sending it to a file or subprocess.
 11954 
 11955 If this does not specify a coding system, an appropriate element
 11956 is used from one of the coding system alists.
 11957 There are three such tables: `file-coding-system-alist',
 11958 `process-coding-system-alist', and `network-coding-system-alist'.
 11959 For output to files, if the above procedure does not specify a coding system,
 11960 the value of `buffer-file-coding-system' is used.  */);
 11961   Vcoding_system_for_write = Qnil;
 11962 
 11963   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
 11964                doc: /*
 11965 Coding system used in the latest file or process I/O.  */);
 11966   Vlast_coding_system_used = Qnil;
 11967 
 11968   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
 11969                doc: /*
 11970 Error status of the last code conversion.
 11971 
 11972 When an error was detected in the last code conversion, this variable
 11973 is set to one of the following symbols.
 11974   `insufficient-source'
 11975   `inconsistent-eol'
 11976   `invalid-source'
 11977   `interrupted'
 11978   `insufficient-memory'
 11979 When no error was detected, the value doesn't change.  So, to check
 11980 the error status of a code conversion by this variable, you must
 11981 explicitly set this variable to nil before performing code
 11982 conversion.  */);
 11983   Vlast_code_conversion_error = Qnil;
 11984 
 11985   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
 11986                doc: /*
 11987 Non-nil means always inhibit code conversion of end-of-line format.
 11988 See info node `Coding Systems' and info node `Text and Binary' concerning
 11989 such conversion.  */);
 11990   inhibit_eol_conversion = 0;
 11991 
 11992   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
 11993                doc: /*
 11994 Non-nil means process buffer inherits coding system of process output.
 11995 Bind it to t if the process output is to be treated as if it were a file
 11996 read from some filesystem.  */);
 11997   inherit_process_coding_system = 0;
 11998 
 11999   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
 12000                doc: /*
 12001 Alist to decide a coding system to use for a file I/O operation.
 12002 The format is ((PATTERN . VAL) ...),
 12003 where PATTERN is a regular expression matching a file name,
 12004 VAL is a coding system, a cons of coding systems, or a function symbol.
 12005 If VAL is a coding system, it is used for both decoding and encoding
 12006 the file contents.
 12007 If VAL is a cons of coding systems, the car part is used for decoding,
 12008 and the cdr part is used for encoding.
 12009 If VAL is a function symbol, the function must return a coding system
 12010 or a cons of coding systems which are used as above.  The function is
 12011 called with an argument that is a list of the arguments with which
 12012 `find-operation-coding-system' was called.  If the function can't decide
 12013 a coding system, it can return `undecided' so that the normal
 12014 code-detection is performed.
 12015 
 12016 See also the function `find-operation-coding-system'
 12017 and the variable `auto-coding-alist'.  */);
 12018   Vfile_coding_system_alist = Qnil;
 12019 
 12020   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
 12021                doc: /*
 12022 Alist to decide a coding system to use for a process I/O operation.
 12023 The format is ((PATTERN . VAL) ...),
 12024 where PATTERN is a regular expression matching a program name,
 12025 VAL is a coding system, a cons of coding systems, or a function symbol.
 12026 If VAL is a coding system, it is used for both decoding what received
 12027 from the program and encoding what sent to the program.
 12028 If VAL is a cons of coding systems, the car part is used for decoding,
 12029 and the cdr part is used for encoding.
 12030 If VAL is a function symbol, the function must return a coding system
 12031 or a cons of coding systems which are used as above.
 12032 
 12033 See also the function `find-operation-coding-system'.  */);
 12034   Vprocess_coding_system_alist = Qnil;
 12035 
 12036   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
 12037                doc: /*
 12038 Alist to decide a coding system to use for a network I/O operation.
 12039 The format is ((PATTERN . VAL) ...),
 12040 where PATTERN is a regular expression matching a network service name
 12041 or is a port number to connect to,
 12042 VAL is a coding system, a cons of coding systems, or a function symbol.
 12043 If VAL is a coding system, it is used for both decoding what received
 12044 from the network stream and encoding what sent to the network stream.
 12045 If VAL is a cons of coding systems, the car part is used for decoding,
 12046 and the cdr part is used for encoding.
 12047 If VAL is a function symbol, the function must return a coding system
 12048 or a cons of coding systems which are used as above.
 12049 
 12050 See also the function `find-operation-coding-system'.  */);
 12051   Vnetwork_coding_system_alist = Qnil;
 12052 
 12053   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
 12054     doc: /* Coding system to use with system messages.
 12055 Potentially also used for decoding keyboard input on X Windows, and is
 12056 used for encoding standard output and error streams.  */);
 12057   Vlocale_coding_system = Qnil;
 12058 
 12059   /* The eol mnemonics are reset in startup.el system-dependently.  */
 12060   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
 12061                doc: /*
 12062 String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
 12063   eol_mnemonic_unix = build_pure_c_string (":");
 12064 
 12065   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
 12066                doc: /*
 12067 String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
 12068   eol_mnemonic_dos = build_pure_c_string ("\\");
 12069 
 12070   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
 12071                doc: /*
 12072 String displayed in mode line for MAC-like (CR) end-of-line format.  */);
 12073   eol_mnemonic_mac = build_pure_c_string ("/");
 12074 
 12075   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
 12076                doc: /*
 12077 String displayed in mode line when end-of-line format is not yet determined.  */);
 12078   eol_mnemonic_undecided = build_pure_c_string (":");
 12079 
 12080   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
 12081                doc: /*
 12082 Non-nil enables character translation while encoding and decoding.  */);
 12083   Venable_character_translation = Qt;
 12084 
 12085   DEFVAR_LISP ("standard-translation-table-for-decode",
 12086                Vstandard_translation_table_for_decode,
 12087                doc: /* Table for translating characters while decoding.  */);
 12088   Vstandard_translation_table_for_decode = Qnil;
 12089 
 12090   DEFVAR_LISP ("standard-translation-table-for-encode",
 12091                Vstandard_translation_table_for_encode,
 12092                doc: /* Table for translating characters while encoding.  */);
 12093   Vstandard_translation_table_for_encode = Qnil;
 12094 
 12095   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
 12096                doc: /* Alist of charsets vs revision numbers.
 12097 While encoding, if a charset (car part of an element) is found,
 12098 designate it with the escape sequence identifying revision (cdr part
 12099 of the element).  */);
 12100   Vcharset_revision_table = Qnil;
 12101 
 12102   DEFVAR_LISP ("default-process-coding-system",
 12103                Vdefault_process_coding_system,
 12104                doc: /* Cons of coding systems used for process I/O by default.
 12105 The car part is used for decoding a process output,
 12106 the cdr part is used for encoding a text to be sent to a process.  */);
 12107   Vdefault_process_coding_system = Qnil;
 12108 
 12109   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
 12110                doc: /*
 12111 Table of extra Latin codes in the range 128..159 (inclusive).
 12112 This is a vector of length 256.
 12113 If Nth element is non-nil, the existence of code N in a file
 12114 \(or output of subprocess) doesn't prevent it to be detected as
 12115 a coding system of ISO 2022 variant which has a flag
 12116 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
 12117 or reading output of a subprocess.
 12118 Only 128th through 159th elements have a meaning.  */);
 12119   Vlatin_extra_code_table = make_nil_vector (256);
 12120 
 12121   DEFVAR_LISP ("select-safe-coding-system-function",
 12122                Vselect_safe_coding_system_function,
 12123                doc: /*
 12124 Function to call to select safe coding system for encoding a text.
 12125 
 12126 If set, this function is called to force a user to select a proper
 12127 coding system which can encode the text in the case that a default
 12128 coding system used in each operation can't encode the text.  The
 12129 function should take care that the buffer is not modified while
 12130 the coding system is being selected.
 12131 
 12132 The default value is `select-safe-coding-system' (which see).  */);
 12133   Vselect_safe_coding_system_function = Qnil;
 12134 
 12135   DEFVAR_BOOL ("coding-system-require-warning",
 12136                coding_system_require_warning,
 12137                doc: /* Internal use only.
 12138 If non-nil, on writing a file, `select-safe-coding-system-function' is
 12139 called even if `coding-system-for-write' is non-nil.  The command
 12140 `universal-coding-system-argument' binds this variable to t temporarily.  */);
 12141   coding_system_require_warning = 0;
 12142 
 12143 
 12144   DEFVAR_BOOL ("inhibit-iso-escape-detection",
 12145                inhibit_iso_escape_detection,
 12146                doc: /*
 12147 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
 12148 
 12149 When Emacs reads text, it tries to detect how the text is encoded.
 12150 This code detection is sensitive to escape sequences.  If Emacs sees
 12151 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
 12152 of the ISO2022 encodings, and decodes text by the corresponding coding
 12153 system (e.g. `iso-2022-7bit').
 12154 
 12155 However, there may be a case that you want to read escape sequences in
 12156 a file as is.  In such a case, you can set this variable to non-nil.
 12157 Then the code detection will ignore any escape sequences, and no text is
 12158 detected as encoded in some ISO-2022 encoding.  The result is that all
 12159 escape sequences become visible in a buffer.
 12160 
 12161 The default value is nil, and it is strongly recommended not to change
 12162 it.  That is because many Emacs Lisp source files that contain
 12163 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
 12164 in Emacs's distribution, and they won't be decoded correctly on
 12165 reading if you suppress escape sequence detection.
 12166 
 12167 The other way to read escape sequences in a file without decoding is
 12168 to explicitly specify some coding system that doesn't use ISO-2022
 12169 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument].  */);
 12170   inhibit_iso_escape_detection = 0;
 12171 
 12172   DEFVAR_BOOL ("inhibit-null-byte-detection",
 12173                inhibit_null_byte_detection,
 12174                doc: /* If non-nil, Emacs ignores null bytes on code detection.
 12175 By default, Emacs treats it as binary data, and does not attempt to
 12176 decode it.  The effect is as if you specified `no-conversion' for
 12177 reading that text.
 12178 
 12179 Set this to non-nil when a regular text happens to include null bytes.
 12180 Examples are Index nodes of Info files and null-byte delimited output
 12181 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
 12182 decode text as usual.  */);
 12183   inhibit_null_byte_detection = 0;
 12184 
 12185   DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
 12186                doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
 12187 Internal use only.  Remove after the experimental optimizer becomes stable.  */);
 12188   disable_ascii_optimization = 0;
 12189 
 12190   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
 12191                doc: /* Char table for translating self-inserting characters.
 12192 This is applied to the result of input methods, not their input.
 12193 See also `keyboard-translate-table'.
 12194 
 12195 Use of this variable for character code unification was rendered
 12196 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
 12197 internal character representation.  */);
 12198   Vtranslation_table_for_input = Qnil;
 12199 
 12200   Lisp_Object args[coding_arg_undecided_max];
 12201   memclear (args, sizeof args);
 12202 
 12203   Lisp_Object plist[] =
 12204     {
 12205       QCname,
 12206       args[coding_arg_name] = Qno_conversion,
 12207       QCmnemonic,
 12208       args[coding_arg_mnemonic] = make_fixnum ('='),
 12209       intern_c_string (":coding-type"),
 12210       args[coding_arg_coding_type] = Qraw_text,
 12211       QCascii_compatible_p,
 12212       args[coding_arg_ascii_compatible_p] = Qt,
 12213       QCdefault_char,
 12214       args[coding_arg_default_char] = make_fixnum (0),
 12215       intern_c_string (":for-unibyte"),
 12216       args[coding_arg_for_unibyte] = Qt,
 12217       intern_c_string (":docstring"),
 12218       (build_pure_c_string
 12219        ("Do no conversion.\n"
 12220         "\n"
 12221         "When you visit a file with this coding, the file is read into a\n"
 12222         "unibyte buffer as is, thus each byte of a file is treated as a\n"
 12223         "character.")),
 12224       intern_c_string (":eol-type"),
 12225       args[coding_arg_eol_type] = Qunix,
 12226     };
 12227   args[coding_arg_plist] = CALLMANY (Flist, plist);
 12228   Fdefine_coding_system_internal (coding_arg_max, args);
 12229 
 12230   plist[1] = args[coding_arg_name] = Qundecided;
 12231   plist[3] = args[coding_arg_mnemonic] = make_fixnum ('-');
 12232   plist[5] = args[coding_arg_coding_type] = Qundecided;
 12233   /* This is already set.
 12234      plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
 12235   plist[8] = intern_c_string (":charset-list");
 12236   plist[9] = args[coding_arg_charset_list] = list1 (Qascii);
 12237   plist[11] = args[coding_arg_for_unibyte] = Qnil;
 12238   plist[13] = build_pure_c_string ("No conversion on encoding, "
 12239                                    "automatic conversion on decoding.");
 12240   plist[15] = args[coding_arg_eol_type] = Qnil;
 12241   args[coding_arg_plist] = CALLMANY (Flist, plist);
 12242   args[coding_arg_undecided_inhibit_null_byte_detection] = make_fixnum (0);
 12243   args[coding_arg_undecided_inhibit_iso_escape_detection] = make_fixnum (0);
 12244   Fdefine_coding_system_internal (coding_arg_undecided_max, args);
 12245 
 12246   setup_coding_system (Qno_conversion, &safe_terminal_coding);
 12247 
 12248   for (int i = 0; i < coding_category_max; i++)
 12249     Fset (AREF (Vcoding_category_table, i), Qno_conversion);
 12250 
 12251   pdumper_do_now_and_after_load (reset_coding_after_pdumper_load);
 12252 }
 12253 
 12254 static void
 12255 reset_coding_after_pdumper_load (void)
 12256 {
 12257   if (!dumped_with_pdumper_p ())
 12258     return;
 12259   for (struct coding_system *this = &coding_categories[0];
 12260        this < &coding_categories[coding_category_max];
 12261        ++this)
 12262     {
 12263       int id = this->id;
 12264       if (id >= 0)
 12265         {
 12266           /* Need to rebuild the coding system object because we
 12267              persisted it as a scalar and it's full of gunk that's now
 12268              invalid.  */
 12269           memset (this, 0, sizeof (*this));
 12270           setup_coding_system (CODING_ID_NAME (id), this);
 12271         }
 12272     }
 12273   /* In temacs the below is done by mule-conf.el, because we need to
 12274      define us-ascii first.  But in dumped Emacs us-ascii is restored
 12275      by the above loop, and mule-conf.el will not be loaded, so we set
 12276      it up now; otherwise safe_terminal_coding will remain zeroed.  */
 12277   Fset_safe_terminal_coding_system_internal (Qus_ascii);
 12278 }

/* [<][>][^][v][top][bottom][index][help] */