src/coding.h

/* [<][>][^][v][top][bottom][index][help] */
This source file includes the following definitions.
surrogates_to_codepoint
build_string_from_utf8
     1 /* Header for coding system handler.
     2    Copyright (C) 2001-2023 Free Software Foundation, Inc.
     3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
     4      2005, 2006, 2007, 2008, 2009, 2010, 2011
     5      National Institute of Advanced Industrial Science and Technology (AIST)
     6      Registration Number H14PRO021
     7    Copyright (C) 2003
     8      National Institute of Advanced Industrial Science and Technology (AIST)
     9      Registration Number H13PRO009
    10 
    11 This file is part of GNU Emacs.
    12 
    13 GNU Emacs is free software: you can redistribute it and/or modify
    14 it under the terms of the GNU General Public License as published by
    15 the Free Software Foundation, either version 3 of the License, or (at
    16 your option) any later version.
    17 
    18 GNU Emacs is distributed in the hope that it will be useful,
    19 but WITHOUT ANY WARRANTY; without even the implied warranty of
    20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    21 GNU General Public License for more details.
    22 
    23 You should have received a copy of the GNU General Public License
    24 along with GNU Emacs.  If not, see <https://www.gnu.org/licenses/>.  */
    25 
    26 #ifndef EMACS_CODING_H
    27 #define EMACS_CODING_H
    28 
    29 #include "lisp.h"
    30 
    31 INLINE_HEADER_BEGIN
    32 
    33 /* Index to arguments of Fdefine_coding_system_internal.  */
    34 
    35 enum define_coding_system_arg_index
    36   {
    37     coding_arg_name,
    38     coding_arg_mnemonic,
    39     coding_arg_coding_type,
    40     coding_arg_charset_list,
    41     coding_arg_ascii_compatible_p,
    42     coding_arg_decode_translation_table,
    43     coding_arg_encode_translation_table,
    44     coding_arg_post_read_conversion,
    45     coding_arg_pre_write_conversion,
    46     coding_arg_default_char,
    47     coding_arg_for_unibyte,
    48     coding_arg_plist,
    49     coding_arg_eol_type,
    50     coding_arg_max
    51   };
    52 
    53 enum define_coding_iso2022_arg_index
    54   {
    55     coding_arg_iso2022_initial = coding_arg_max,
    56     coding_arg_iso2022_reg_usage,
    57     coding_arg_iso2022_request,
    58     coding_arg_iso2022_flags,
    59     coding_arg_iso2022_max
    60   };
    61 
    62 enum define_coding_utf8_arg_index
    63   {
    64     coding_arg_utf8_bom = coding_arg_max,
    65     coding_arg_utf8_max
    66   };
    67 
    68 enum define_coding_utf16_arg_index
    69   {
    70     coding_arg_utf16_bom = coding_arg_max,
    71     coding_arg_utf16_endian,
    72     coding_arg_utf16_max
    73   };
    74 
    75 enum define_coding_ccl_arg_index
    76   {
    77     coding_arg_ccl_decoder = coding_arg_max,
    78     coding_arg_ccl_encoder,
    79     coding_arg_ccl_valids,
    80     coding_arg_ccl_max
    81   };
    82 
    83 enum define_coding_undecided_arg_index
    84   {
    85     coding_arg_undecided_inhibit_null_byte_detection = coding_arg_max,
    86     coding_arg_undecided_inhibit_iso_escape_detection,
    87     coding_arg_undecided_prefer_utf_8,
    88     coding_arg_undecided_max
    89   };
    90 
    91 /* Hash table for all coding systems.  Keys are coding system symbols
    92    and values are spec vectors of the corresponding coding system.  A
    93    spec vector has the form [ ATTRS ALIASES EOL-TYPE ].  ATTRS is a
    94    vector of attributes of the coding system.  ALIASES is a list of
    95    aliases (symbols) of the coding system.  EOL-TYPE is `unix', `dos',
    96    `mac' or a vector of coding systems (symbols).  */
    97 
    98 extern Lisp_Object Vcoding_system_hash_table;
    99 
   100 /* Enumeration of index to an attribute vector of a coding system.  */
   101 
   102 enum coding_attr_index
   103   {
   104     coding_attr_base_name,
   105     coding_attr_docstring,
   106     coding_attr_mnemonic,
   107     coding_attr_type,
   108     coding_attr_charset_list,
   109     coding_attr_ascii_compat,
   110     coding_attr_decode_tbl,
   111     coding_attr_encode_tbl,
   112     coding_attr_trans_tbl,
   113     coding_attr_post_read,
   114     coding_attr_pre_write,
   115     coding_attr_default_char,
   116     coding_attr_for_unibyte,
   117     coding_attr_plist,
   118 
   119     coding_attr_category,
   120     coding_attr_safe_charsets,
   121 
   122     /* The followings are extra attributes for each type.  */
   123     coding_attr_charset_valids,
   124 
   125     coding_attr_ccl_decoder,
   126     coding_attr_ccl_encoder,
   127     coding_attr_ccl_valids,
   128 
   129     coding_attr_iso_initial,
   130     coding_attr_iso_usage,
   131     coding_attr_iso_request,
   132     coding_attr_iso_flags,
   133 
   134     coding_attr_utf_bom,
   135     coding_attr_utf_16_endian,
   136 
   137     coding_attr_emacs_mule_full,
   138 
   139     coding_attr_undecided_inhibit_null_byte_detection,
   140     coding_attr_undecided_inhibit_iso_escape_detection,
   141     coding_attr_undecided_prefer_utf_8,
   142 
   143     coding_attr_last_index
   144   };
   145 
   146 
   147 /* Macros to access an element of an attribute vector.  */
   148 
   149 #define CODING_ATTR_BASE_NAME(attrs)    AREF (attrs, coding_attr_base_name)
   150 #define CODING_ATTR_TYPE(attrs)         AREF (attrs, coding_attr_type)
   151 #define CODING_ATTR_CHARSET_LIST(attrs) AREF (attrs, coding_attr_charset_list)
   152 #define CODING_ATTR_MNEMONIC(attrs)     AREF (attrs, coding_attr_mnemonic)
   153 #define CODING_ATTR_DOCSTRING(attrs)    AREF (attrs, coding_attr_docstring)
   154 #define CODING_ATTR_ASCII_COMPAT(attrs) AREF (attrs, coding_attr_ascii_compat)
   155 #define CODING_ATTR_DECODE_TBL(attrs)   AREF (attrs, coding_attr_decode_tbl)
   156 #define CODING_ATTR_ENCODE_TBL(attrs)   AREF (attrs, coding_attr_encode_tbl)
   157 #define CODING_ATTR_TRANS_TBL(attrs)    AREF (attrs, coding_attr_trans_tbl)
   158 #define CODING_ATTR_POST_READ(attrs)    AREF (attrs, coding_attr_post_read)
   159 #define CODING_ATTR_PRE_WRITE(attrs)    AREF (attrs, coding_attr_pre_write)
   160 #define CODING_ATTR_DEFAULT_CHAR(attrs) AREF (attrs, coding_attr_default_char)
   161 #define CODING_ATTR_FOR_UNIBYTE(attrs)  AREF (attrs, coding_attr_for_unibyte)
   162 #define CODING_ATTR_PLIST(attrs)        AREF (attrs, coding_attr_plist)
   163 #define CODING_ATTR_CATEGORY(attrs)     AREF (attrs, coding_attr_category)
   164 #define CODING_ATTR_SAFE_CHARSETS(attrs)AREF (attrs, coding_attr_safe_charsets)
   165 
   166 
   167 /* Return the name of a coding system specified by ID.  */
   168 #define CODING_ID_NAME(id) \
   169   (HASH_KEY (XHASH_TABLE (Vcoding_system_hash_table), id))
   170 
   171 /* Return the attribute vector of a coding system specified by ID.  */
   172 
   173 #define CODING_ID_ATTRS(id)     \
   174   (AREF (HASH_VALUE (XHASH_TABLE (Vcoding_system_hash_table), id), 0))
   175 
   176 /* Return the list of aliases of a coding system specified by ID.  */
   177 
   178 #define CODING_ID_ALIASES(id)   \
   179   (AREF (HASH_VALUE (XHASH_TABLE (Vcoding_system_hash_table), id), 1))
   180 
   181 /* Return the eol-type of a coding system specified by ID.  */
   182 
   183 #define CODING_ID_EOL_TYPE(id)  \
   184   (AREF (HASH_VALUE (XHASH_TABLE (Vcoding_system_hash_table), id), 2))
   185 
   186 
   187 /* Return the spec vector of CODING_SYSTEM_SYMBOL.  */
   188 
   189 #define CODING_SYSTEM_SPEC(coding_system_symbol)        \
   190   (Fgethash (coding_system_symbol, Vcoding_system_hash_table, Qnil))
   191 
   192 
   193 /* Return the ID of CODING_SYSTEM_SYMBOL.  */
   194 
   195 #define CODING_SYSTEM_ID(coding_system_symbol)                  \
   196   hash_lookup (XHASH_TABLE (Vcoding_system_hash_table),         \
   197                coding_system_symbol, NULL)
   198 
   199 /* Return true if CODING_SYSTEM_SYMBOL is a coding system.  */
   200 
   201 #define CODING_SYSTEM_P(coding_system_symbol)           \
   202   (CODING_SYSTEM_ID (coding_system_symbol) >= 0         \
   203    || (! NILP (coding_system_symbol)                    \
   204        && ! NILP (Fcoding_system_p (coding_system_symbol))))
   205 
   206 /* Check if X is a coding system or not.  */
   207 
   208 #define CHECK_CODING_SYSTEM(x)                          \
   209   do {                                                  \
   210     if (CODING_SYSTEM_ID (x) < 0                        \
   211         && NILP (Fcheck_coding_system (x)))             \
   212       wrong_type_argument (Qcoding_system_p, (x));      \
   213   } while (false)
   214 
   215 
   216 /* Check if X is a coding system or not.  If it is, set SEPC to the
   217    spec vector of the coding system.  */
   218 
   219 #define CHECK_CODING_SYSTEM_GET_SPEC(x, spec)           \
   220   do {                                                  \
   221     spec = CODING_SYSTEM_SPEC (x);                      \
   222     if (NILP (spec))                                    \
   223       {                                                 \
   224         Fcheck_coding_system (x);                       \
   225         spec = CODING_SYSTEM_SPEC (x);                  \
   226       }                                                 \
   227     if (NILP (spec))                                    \
   228       wrong_type_argument (Qcoding_system_p, (x));      \
   229   } while (false)
   230 
   231 
   232 /* Check if X is a coding system or not.  If it is, set ID to the
   233    ID of the coding system.  */
   234 
   235 #define CHECK_CODING_SYSTEM_GET_ID(x, id)                       \
   236   do                                                            \
   237     {                                                           \
   238       id = CODING_SYSTEM_ID (x);                                \
   239       if (id < 0)                                               \
   240         {                                                       \
   241           Fcheck_coding_system (x);                             \
   242           id = CODING_SYSTEM_ID (x);                            \
   243         }                                                       \
   244       if (id < 0)                                               \
   245         wrong_type_argument (Qcoding_system_p, (x));    \
   246     } while (false)
   247 
   248 
   249 /*** GENERAL section ***/
   250 
   251 /* Enumeration of result code of code conversion.  */
   252 enum coding_result_code
   253   {
   254     CODING_RESULT_SUCCESS,
   255     CODING_RESULT_INSUFFICIENT_SRC,
   256     CODING_RESULT_INSUFFICIENT_DST,
   257     CODING_RESULT_INVALID_SRC,
   258     CODING_RESULT_INTERRUPT
   259   };
   260 
   261 
   262 /* Macros used for the member `mode' of the struct coding_system.  */
   263 
   264 /* If set, the decoding/encoding routines treat the current data as
   265    the last block of the whole text to be converted, and do the
   266    appropriate finishing job.  */
   267 #define CODING_MODE_LAST_BLOCK                  0x01
   268 
   269 /* If set, it means that the current source text is in a buffer which
   270    enables selective display.  */
   271 #define CODING_MODE_SELECTIVE_DISPLAY           0x02
   272 
   273 /* This flag is used by the decoding/encoding routines on the fly.  If
   274    set, it means that right-to-left text is being processed.  */
   275 #define CODING_MODE_DIRECTION                   0x04
   276 
   277 #define CODING_MODE_FIXED_DESTINATION           0x08
   278 
   279 /* If set, it means that the encoding routines produces some safe
   280    ASCII characters (usually '?') for unsupported characters.  */
   281 #define CODING_MODE_SAFE_ENCODING               0x10
   282 
   283   /* For handling composition sequence.  */
   284 #include "composite.h"
   285 
   286 enum composition_state
   287   {
   288     COMPOSING_NO,
   289     COMPOSING_CHAR,
   290     COMPOSING_RULE,
   291     COMPOSING_COMPONENT_CHAR,
   292     COMPOSING_COMPONENT_RULE
   293   };
   294 
   295 /* Structure for the current composition status.  */
   296 struct composition_status
   297 {
   298   enum composition_state state;
   299   enum composition_method method;
   300   bool old_form;          /* true if pre-21 form */
   301   int length;             /* number of elements produced in charbuf */
   302   int nchars;             /* number of characters composed */
   303   int ncomps;             /* number of composition components */
   304   /* Maximum carryover is for the case of COMPOSITION_WITH_RULE_ALTCHARS.
   305      See the comment in coding.c.  */
   306   int carryover[4               /* annotation header */
   307                 + MAX_COMPOSITION_COMPONENTS * 3 - 2 /* ALTs and RULEs */
   308                 + 2                                  /* intermediate -1 -1 */
   309                 + MAX_COMPOSITION_COMPONENTS         /* CHARs */
   310                 ];
   311 };
   312 
   313 
   314 /* Structure of the field `spec.iso_2022' in the structure
   315    `coding_system'.  */
   316 struct iso_2022_spec
   317 {
   318   /* Bit-wise-or of CODING_ISO_FLAG_XXX.  */
   319   unsigned flags;
   320 
   321   /* The current graphic register invoked to each graphic plane.  */
   322   int current_invocation[2];
   323 
   324   /* The current charset designated to each graphic register.  The
   325      value -1 means that not charset is designated, -2 means that
   326      there was an invalid designation previously.  */
   327   int current_designation[4];
   328 
   329   /* If positive, we are now scanning CTEXT extended segment.  */
   330   int ctext_extended_segment_len;
   331 
   332   /* True temporarily only when graphic register 2 or 3 is invoked by
   333      single-shift while encoding.  */
   334   bool_bf single_shifting : 1;
   335 
   336   /* True temporarily only when processing at beginning of line.  */
   337   bool_bf bol : 1;
   338 
   339   /* If true, we are now scanning embedded UTF-8 sequence.  */
   340   bool_bf embedded_utf_8 : 1;
   341 
   342   /* The current composition.  */
   343   struct composition_status cmp_status;
   344 };
   345 
   346 struct emacs_mule_spec
   347 {
   348   struct composition_status cmp_status;
   349 };
   350 
   351 struct undecided_spec
   352 {
   353   /* Inhibit null byte detection.  1 means always inhibit,
   354      -1 means do not inhibit, 0 means rely on user variable.  */
   355   int inhibit_nbd;
   356 
   357   /* Inhibit ISO escape detection.  -1, 0, 1 as above.  */
   358   int inhibit_ied;
   359 
   360   /* Prefer UTF-8 when the input could be other encodings.  */
   361   bool prefer_utf_8;
   362 };
   363 
   364 enum utf_bom_type
   365   {
   366     utf_detect_bom,
   367     utf_without_bom,
   368     utf_with_bom
   369   };
   370 
   371 enum utf_16_endian_type
   372   {
   373     utf_16_big_endian,
   374     utf_16_little_endian
   375   };
   376 
   377 struct utf_16_spec
   378 {
   379   enum utf_bom_type bom;
   380   enum utf_16_endian_type endian;
   381   int surrogate;
   382 };
   383 
   384 struct coding_detection_info
   385 {
   386   /* Values of these members are bitwise-OR of CATEGORY_MASK_XXXs.  */
   387   /* Which categories are already checked.  */
   388   int checked;
   389   /* Which categories are strongly found.  */
   390   int found;
   391   /* Which categories are rejected.  */
   392   int rejected;
   393 };
   394 
   395 
   396 struct coding_system
   397 {
   398   /* ID number of the coding system.  This is an index to
   399      Vcoding_system_hash_table.  This value is set by
   400      setup_coding_system.  At the early stage of building time, this
   401      value is -1 in the array coding_categories to indicate that no
   402      coding-system of that category is yet defined.  */
   403   ptrdiff_t id;
   404 
   405   /* Flag bits of the coding system.  The meaning of each bit is common
   406      to all types of coding systems.  */
   407   unsigned common_flags : 14;
   408 
   409   /* Mode bits of the coding system.  See the comments of the macros
   410      CODING_MODE_XXX.  */
   411   unsigned mode : 5;
   412 
   413   /* The following two members specify how binary 8-bit code 128..255
   414      are represented in source and destination text respectively.  True
   415      means they are represented by 2-byte sequence, false means they are
   416      represented by 1-byte as is (see the comment in character.h).  */
   417   bool_bf src_multibyte : 1;
   418   bool_bf dst_multibyte : 1;
   419 
   420   /* True if the source of conversion is not in the member
   421      `charbuf', but at `src_object'.  */
   422   bool_bf chars_at_source : 1;
   423 
   424   /* Nonzero if the result of conversion is in `destination'
   425      buffer rather than in `dst_object'.  */
   426   bool_bf raw_destination : 1;
   427 
   428   /* Set to true if charbuf contains an annotation.  */
   429   bool_bf annotated : 1;
   430 
   431   /* Used internally in coding.c.  See the comment of detect_ascii.  */
   432   unsigned eol_seen : 3;
   433 
   434   /* Finish status of code conversion.  */
   435   ENUM_BF (coding_result_code) result : 3;
   436 
   437   int max_charset_id;
   438 
   439   /* Detailed information specific to each type of coding system.  */
   440   union
   441     {
   442       struct iso_2022_spec iso_2022;
   443       struct ccl_spec *ccl;     /* Defined in ccl.h.  */
   444       struct utf_16_spec utf_16;
   445       enum utf_bom_type utf_8_bom;
   446       struct emacs_mule_spec emacs_mule;
   447       struct undecided_spec undecided;
   448     } spec;
   449 
   450   unsigned char *safe_charsets;
   451 
   452   /* How many heading bytes we can skip for decoding.  This is set to
   453      -1 in setup_coding_system, and updated by detect_coding.  So,
   454      when this is equal to the byte length of the text being
   455      converted, we can skip the actual conversion process except for
   456      the eol format.  */
   457   ptrdiff_t head_ascii;
   458 
   459   /* How many bytes/chars at the source are detected as valid utf-8
   460      sequence.  Set by detect_coding_utf_8.  */
   461   ptrdiff_t detected_utf8_bytes, detected_utf8_chars;
   462 
   463   /* The following members are set by encoding/decoding routine.  */
   464   ptrdiff_t produced, produced_char, consumed, consumed_char;
   465 
   466   ptrdiff_t src_pos, src_pos_byte, src_chars, src_bytes;
   467   Lisp_Object src_object;
   468   const unsigned char *source;
   469 
   470   ptrdiff_t dst_pos, dst_pos_byte, dst_bytes;
   471   Lisp_Object dst_object;
   472   unsigned char *destination;
   473 
   474   /* If an element is non-negative, it is a character code.
   475 
   476      If it is in the range -128..-1, it is a 8-bit character code
   477      minus 256.
   478 
   479      If it is less than -128, it specifies the start of an annotation
   480      chunk.  The length of the chunk is -128 minus the value of the
   481      element.  The following elements are OFFSET, ANNOTATION-TYPE, and
   482      a sequence of actual data for the annotation.  OFFSET is a
   483      character position offset from dst_pos or src_pos,
   484      ANNOTATION-TYPE specifies the meaning of the annotation and how to
   485      handle the following data..  */
   486   int *charbuf;
   487   int charbuf_size, charbuf_used;
   488 
   489   unsigned char carryover[64];
   490   int carryover_bytes;
   491 
   492   int default_char;
   493 
   494   bool (*detector) (struct coding_system *, struct coding_detection_info *);
   495   void (*decoder) (struct coding_system *);
   496   bool (*encoder) (struct coding_system *);
   497 };
   498 
   499 /* Meanings of bits in the member `common_flags' of the structure
   500    coding_system.  The lowest 8 bits are reserved for various kind of
   501    annotations (currently two of them are used).  */
   502 #define CODING_ANNOTATION_MASK                  0x00FF
   503 #define CODING_ANNOTATE_COMPOSITION_MASK        0x0001
   504 #define CODING_ANNOTATE_DIRECTION_MASK          0x0002
   505 #define CODING_ANNOTATE_CHARSET_MASK            0x0003
   506 #define CODING_FOR_UNIBYTE_MASK                 0x0100
   507 #define CODING_REQUIRE_FLUSHING_MASK            0x0200
   508 #define CODING_REQUIRE_DECODING_MASK            0x0400
   509 #define CODING_REQUIRE_ENCODING_MASK            0x0800
   510 #define CODING_REQUIRE_DETECTION_MASK           0x1000
   511 #define CODING_RESET_AT_BOL_MASK                0x2000
   512 
   513 /* Return nonzero if the coding context CODING requires annotation
   514    handling.  */
   515 #define CODING_REQUIRE_ANNOTATION(coding) \
   516   ((coding)->common_flags & CODING_ANNOTATION_MASK)
   517 
   518 /* Return nonzero if the coding context CODING prefers decoding into
   519    unibyte.  */
   520 #define CODING_FOR_UNIBYTE(coding) \
   521   ((coding)->common_flags & CODING_FOR_UNIBYTE_MASK)
   522 
   523 /* Return nonzero if the coding context CODING requires specific code to be
   524    attached at the tail of converted text.  */
   525 #define CODING_REQUIRE_FLUSHING(coding) \
   526   ((coding)->common_flags & CODING_REQUIRE_FLUSHING_MASK)
   527 
   528 /* Return nonzero if the coding context CODING requires code conversion on
   529    decoding.  */
   530 #define CODING_REQUIRE_DECODING(coding) \
   531   ((coding)->dst_multibyte              \
   532    || (coding)->common_flags & CODING_REQUIRE_DECODING_MASK)
   533 
   534 
   535 /* Return nonzero if the coding context CODING requires code conversion on
   536    encoding.
   537    The non-multibyte part of the condition is to support encoding of
   538    unibyte strings/buffers generated by string-as-unibyte or
   539    (set-buffer-multibyte nil) from multibyte strings/buffers.  */
   540 #define CODING_REQUIRE_ENCODING(coding)                         \
   541   ((coding)->src_multibyte                                      \
   542    || (coding)->common_flags & CODING_REQUIRE_ENCODING_MASK     \
   543    || (coding)->mode & CODING_MODE_SELECTIVE_DISPLAY)
   544 
   545 
   546 /* Return nonzero if the coding context CODING requires some kind of code
   547    detection.  */
   548 #define CODING_REQUIRE_DETECTION(coding) \
   549   ((coding)->common_flags & CODING_REQUIRE_DETECTION_MASK)
   550 
   551 /* Return nonzero if the coding context CODING requires code conversion on
   552    decoding or some kind of code detection.  */
   553 #define CODING_MAY_REQUIRE_DECODING(coding)     \
   554   (CODING_REQUIRE_DECODING (coding)             \
   555    || CODING_REQUIRE_DETECTION (coding))
   556 
   557 /* Macros to decode or encode a character of JISX0208 in SJIS.  S1 and
   558    S2 are the 1st and 2nd position-codes of JISX0208 in SJIS coding
   559    system.  C1 and C2 are the 1st and 2nd position codes of Emacs'
   560    internal format.  */
   561 
   562 #define SJIS_TO_JIS(code)                               \
   563   do {                                                  \
   564     int s1, s2, j1, j2;                                 \
   565                                                         \
   566     s1 = (code) >> 8, s2 = (code) & 0xFF;               \
   567                                                         \
   568     if (s2 >= 0x9F)                                     \
   569       (j1 = s1 * 2 - (s1 >= 0xE0 ? 0x160 : 0xE0),       \
   570        j2 = s2 - 0x7E);                                 \
   571     else                                                \
   572       (j1 = s1 * 2 - ((s1 >= 0xE0) ? 0x161 : 0xE1),     \
   573        j2 = s2 - ((s2 >= 0x7F) ? 0x20 : 0x1F));         \
   574     (code) = (j1 << 8) | j2;                            \
   575   } while (false)
   576 
   577 #define SJIS_TO_JIS2(code)                              \
   578   do {                                                  \
   579     int s1, s2, j1, j2;                                 \
   580                                                         \
   581     s1 = (code) >> 8, s2 = (code) & 0xFF;               \
   582                                                         \
   583     if (s2 >= 0x9F)                                     \
   584       {                                                 \
   585         j1 = (s1 == 0xF0 ? 0x28                         \
   586               : s1 == 0xF1 ? 0x24                       \
   587               : s1 == 0xF2 ? 0x2C                       \
   588               : s1 == 0xF3 ? 0x2E                       \
   589               : 0x6E + (s1 - 0xF4) * 2);                \
   590         j2 = s2 - 0x7E;                                 \
   591       }                                                 \
   592     else                                                \
   593       {                                                 \
   594         j1 = (s1 <= 0xF2 ? 0x21 + (s1 - 0xF0) * 2       \
   595               : s1 <= 0xF4 ? 0x2D + (s1 - 0xF3) * 2     \
   596               : 0x6F + (s1 - 0xF5) * 2);                \
   597         j2 = s2 - ((s2 >= 0x7F ? 0x20 : 0x1F));         \
   598       }                                                 \
   599     (code) = (j1 << 8) | j2;                            \
   600   } while (false)
   601 
   602 
   603 #define JIS_TO_SJIS(code)                               \
   604   do {                                                  \
   605     int s1, s2, j1, j2;                                 \
   606                                                         \
   607     j1 = (code) >> 8, j2 = (code) & 0xFF;               \
   608     if (j1 & 1)                                         \
   609       (s1 = j1 / 2 + ((j1 < 0x5F) ? 0x71 : 0xB1),       \
   610        s2 = j2 + ((j2 >= 0x60) ? 0x20 : 0x1F));         \
   611     else                                                \
   612       (s1 = j1 / 2 + ((j1 < 0x5F) ? 0x70 : 0xB0),       \
   613        s2 = j2 + 0x7E);                                 \
   614     (code) = (s1 << 8) | s2;                            \
   615   } while (false)
   616 
   617 #define JIS_TO_SJIS2(code)                              \
   618   do {                                                  \
   619     int s1, s2, j1, j2;                                 \
   620                                                         \
   621     j1 = (code) >> 8, j2 = (code) & 0xFF;               \
   622     if (j1 & 1)                                         \
   623       {                                                 \
   624         s1 = (j1 <= 0x25 ? 0xF0 + (j1 - 0x21) / 2       \
   625               : j1 <= 0x2F ? 0xF3 + (j1 - 0x2D) / 2     \
   626               : 0xF5 + (j1 - 0x6F) / 2);                \
   627         s2 = j2 + ((j2 >= 0x60) ? 0x20 : 0x1F);         \
   628       }                                                 \
   629     else                                                \
   630       {                                                 \
   631         s1 = (j1 == 0x28 ? 0xF0                         \
   632               : j1 == 0x24 ? 0xF1                       \
   633               : j1 == 0x2C ? 0xF2                       \
   634               : j1 == 0x2E ? 0xF3                       \
   635               : 0xF4 + (j1 - 0x6E) / 2);                \
   636         s2 = j2 + 0x7E;                                 \
   637       }                                                 \
   638     (code) = (s1 << 8) | s2;                            \
   639   } while (false)
   640 
   641 /* Encode the file name NAME using the specified coding system
   642    for file names, if any.  May return NAME itself.  */
   643 #define ENCODE_FILE(NAME)  encode_file_name (NAME)
   644 
   645 /* Decode the file name NAME using the specified coding system
   646    for file names, if any.  May return NAME itself.  */
   647 #define DECODE_FILE(NAME)  decode_file_name (NAME)
   648 
   649 /* Encode the string STR using the specified coding system
   650    for system functions, if any.  */
   651 #define ENCODE_SYSTEM(str)                                                 \
   652   (! NILP (Vlocale_coding_system)                                          \
   653    ? code_convert_string_norecord (str, Vlocale_coding_system, true)       \
   654    : str)
   655 
   656 /* Decode the string STR using the specified coding system
   657    for system functions, if any.  */
   658 #define DECODE_SYSTEM(str)                                                 \
   659   (! NILP (Vlocale_coding_system)                                          \
   660    ? code_convert_string_norecord (str, Vlocale_coding_system, false)      \
   661    : str)
   662 
   663 /* Note that this encodes utf-8, not utf-8-emacs, so it's not a no-op.  */
   664 #define ENCODE_UTF_8(str) code_convert_string_norecord (str, Qutf_8, true)
   665 
   666 /* Return true if VAL is a high surrogate.  VAL must be a 16-bit code
   667    unit.  */
   668 
   669 #define UTF_16_HIGH_SURROGATE_P(val) \
   670   (((val) & 0xFC00) == 0xD800)
   671 
   672 /* Return true if VAL is a low surrogate.  VAL must be a 16-bit code
   673    unit.  */
   674 
   675 #define UTF_16_LOW_SURROGATE_P(val) \
   676   (((val) & 0xFC00) == 0xDC00)
   677 
   678 /* Extern declarations.  */
   679 extern Lisp_Object code_conversion_save (bool, bool);
   680 extern bool encode_coding_utf_8 (struct coding_system *);
   681 extern bool utf8_string_p (Lisp_Object);
   682 extern void setup_coding_system (Lisp_Object, struct coding_system *);
   683 extern Lisp_Object coding_charset_list (struct coding_system *);
   684 extern Lisp_Object coding_system_charset_list (Lisp_Object);
   685 extern Lisp_Object code_convert_string (Lisp_Object, Lisp_Object,
   686                                         Lisp_Object, bool, bool, bool);
   687 extern Lisp_Object code_convert_string_norecord (Lisp_Object, Lisp_Object,
   688                                                  bool);
   689 extern Lisp_Object encode_string_utf_8 (Lisp_Object, Lisp_Object, bool,
   690                                         Lisp_Object, Lisp_Object);
   691 extern Lisp_Object decode_string_utf_8 (Lisp_Object, const char *, ptrdiff_t,
   692                                         Lisp_Object, bool,
   693                                         Lisp_Object, Lisp_Object);
   694 extern Lisp_Object encode_file_name (Lisp_Object);
   695 extern Lisp_Object decode_file_name (Lisp_Object);
   696 extern Lisp_Object raw_text_coding_system (Lisp_Object);
   697 extern bool raw_text_coding_system_p (struct coding_system *);
   698 extern Lisp_Object coding_inherit_eol_type (Lisp_Object, Lisp_Object);
   699 extern Lisp_Object complement_process_encoding_system (Lisp_Object);
   700 extern Lisp_Object make_string_from_utf8 (const char *, ptrdiff_t);
   701 
   702 extern void decode_coding_gap (struct coding_system *, ptrdiff_t);
   703 extern void decode_coding_object (struct coding_system *,
   704                                   Lisp_Object, ptrdiff_t, ptrdiff_t,
   705                                   ptrdiff_t, ptrdiff_t, Lisp_Object);
   706 extern void encode_coding_object (struct coding_system *,
   707                                   Lisp_Object, ptrdiff_t, ptrdiff_t,
   708                                   ptrdiff_t, ptrdiff_t, Lisp_Object);
   709 /* Defined in this file.  */
   710 INLINE int surrogates_to_codepoint (int, int);
   711 
   712 #if defined (WINDOWSNT) || defined (CYGWIN) || defined HAVE_ANDROID
   713 
   714 /* These functions use Lisp string objects to store the UTF-16LE
   715    strings that modern versions of Windows expect.  These strings are
   716    not particularly useful to Lisp, and all Lisp strings should be
   717    native Emacs multibyte.  */
   718 
   719 /* Access the wide-character string stored in a Lisp string object.  */
   720 #define WCSDATA(x) ((wchar_t *) SDATA (x))
   721 
   722 /* Convert the multi-byte string in STR to UTF-16LE encoded unibyte
   723    string, and store it in *BUF.  BUF may safely point to STR on entry.  */
   724 extern wchar_t *to_unicode (Lisp_Object str, Lisp_Object *buf);
   725 
   726 /* Convert STR, a UTF-16LE encoded string embedded in a unibyte string
   727    object, to a multi-byte Emacs string and return it.  This function
   728    calls code_convert_string_norecord internally and has all its
   729    failure modes.  STR itself is not modified.  */
   730 extern Lisp_Object from_unicode (Lisp_Object str);
   731 
   732 /* Convert WSTR to an Emacs string.  */
   733 extern Lisp_Object from_unicode_buffer (const wchar_t *wstr);
   734 
   735 #endif /* WINDOWSNT || CYGWIN || HAVE_ANDROID */
   736 
   737 /* Macros for backward compatibility.  */
   738 
   739 #define encode_coding_string(coding, string, nocopy)                    \
   740   (STRING_MULTIBYTE(string) ?                                           \
   741     (encode_coding_object (coding, string, 0, 0, SCHARS (string),       \
   742                            SBYTES (string), Qt),                        \
   743      (coding)->dst_object) : (string))
   744 
   745 
   746 #define decode_coding_c_string(coding, src, bytes, dst_object)          \
   747   do {                                                                  \
   748     (coding)->source = (src);                                           \
   749     (coding)->src_chars = (coding)->src_bytes = (bytes);                \
   750     decode_coding_object ((coding), Qnil, 0, 0, (bytes), (bytes),       \
   751                           (dst_object));                                \
   752   } while (false)
   753 
   754 
   755 /* Return the Unicode code point for the given UTF-16 surrogates.  */
   756 
   757 INLINE int
   758 surrogates_to_codepoint (int low, int high)
   759 {
   760   eassert (0 <= low && low <= 0xFFFF);
   761   eassert (0 <= high && high <= 0xFFFF);
   762   eassert (UTF_16_LOW_SURROGATE_P (low));
   763   eassert (UTF_16_HIGH_SURROGATE_P (high));
   764   return 0x10000 + (low - 0xDC00) + ((high - 0xD800) * 0x400);
   765 }
   766 
   767 /* Like build_string, but always returns a multibyte string, and is
   768    optimized for speed when STR is a UTF-8 encoded text string.  */
   769 
   770 INLINE Lisp_Object
   771 build_string_from_utf8 (const char *str)
   772 {
   773   return make_string_from_utf8 (str, strlen (str));
   774 }
   775 
   776 
   777 extern Lisp_Object preferred_coding_system (void);
   778 
   779 /* Coding system to be used to encode text for terminal display when
   780    terminal coding system is nil.  */
   781 extern struct coding_system safe_terminal_coding;
   782 
   783 extern char emacs_mule_bytes[256];
   784 
   785 INLINE_HEADER_END
   786 
   787 #endif /* EMACS_CODING_H */
/* [<][>][^][v][top][bottom][index][help] */
root/src/coding.h

INCLUDED FROM

DEFINITIONS