src/character.c

/* [<][>][^][v][top][bottom][index][help] */
This source file includes following definitions.
char_resolve_modifier_mask
char_string
translate_char
DEFUN
DEFUN
DEFUN
char_width
DEFUN
c_string_width
strwidth
lisp_string_width
chars_in_text
multibyte_chars_in_text
parse_str_as_multibyte
str_as_multibyte
count_size_as_multibyte
str_to_multibyte
str_as_unibyte
string_count_byte8
string_escape_byte8
DEFUN
alphabeticp
alphanumericp
graphicp
printablep
graphic_base_p
blankp
syms_of_character
     1 /* Basic character support.
     2 
     3 Copyright (C) 2001-2023 Free Software Foundation, Inc.
     4 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
     5   Licensed to the Free Software Foundation.
     6 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
     7   National Institute of Advanced Industrial Science and Technology (AIST)
     8   Registration Number H13PRO009
     9 
    10 This file is part of GNU Emacs.
    11 
    12 GNU Emacs is free software: you can redistribute it and/or modify
    13 it under the terms of the GNU General Public License as published by
    14 the Free Software Foundation, either version 3 of the License, or (at
    15 your option) any later version.
    16 
    17 GNU Emacs is distributed in the hope that it will be useful,
    18 but WITHOUT ANY WARRANTY; without even the implied warranty of
    19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    20 GNU General Public License for more details.
    21 
    22 You should have received a copy of the GNU General Public License
    23 along with GNU Emacs.  If not, see <https://www.gnu.org/licenses/>.  */
    24 
    25 /* At first, see the document in `character.h' to understand the code
    26    in this file.  */
    27 
    28 #include <config.h>
    29 
    30 #include <stdio.h>
    31 
    32 #include <sys/types.h>
    33 #include <intprops.h>
    34 #include "lisp.h"
    35 #include "character.h"
    36 #include "buffer.h"
    37 #include "frame.h"
    38 #include "dispextern.h"
    39 #include "composite.h"
    40 #include "disptab.h"
    41 
    42 /* Char-table of information about which character to unify to which
    43    Unicode character.  Mainly used by the macro MAYBE_UNIFY_CHAR.  */
    44 Lisp_Object Vchar_unify_table;
    45 
    46 
    47 
    48 /* If character code C has modifier masks, reflect them to the
    49    character code if possible.  Return the resulting code.  */
    50 
    51 EMACS_INT
    52 char_resolve_modifier_mask (EMACS_INT c)
    53 {
    54   /* A non-ASCII character can't reflect modifier bits to the code.  */
    55   if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
    56     return c;
    57 
    58   /* For Meta, Shift, and Control modifiers, we need special care.  */
    59   if (c & CHAR_SHIFT)
    60     {
    61       /* Shift modifier is valid only with [A-Za-z].  */
    62       if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
    63         c &= ~CHAR_SHIFT;
    64       else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
    65         c = (c & ~CHAR_SHIFT) - ('a' - 'A');
    66       /* Shift modifier for control characters and SPC is ignored.  */
    67       else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
    68         c &= ~CHAR_SHIFT;
    69     }
    70   if (c & CHAR_CTL)
    71     {
    72       /* Simulate the code in lread.c.  */
    73       /* Allow `\C- ' and `\C-?'.  */
    74       if ((c & 0377) == ' ')
    75         c &= ~0177 & ~ CHAR_CTL;
    76       else if ((c & 0377) == '?')
    77         c = 0177 | (c & ~0177 & ~CHAR_CTL);
    78       /* ASCII control chars are made from letters (both cases),
    79          as well as the non-letters within 0100...0137.  */
    80       else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
    81         c &= (037 | (~0177 & ~CHAR_CTL));
    82       else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
    83         c &= (037 | (~0177 & ~CHAR_CTL));
    84     }
    85 #if 0   /* This is outside the scope of this function.  (bug#4751)  */
    86   if (c & CHAR_META)
    87     {
    88       /* Move the meta bit to the right place for a string.  */
    89       c = (c & ~CHAR_META) | 0x80;
    90     }
    91 #endif
    92 
    93   return c;
    94 }
    95 
    96 
    97 /* Store multibyte form of character C at P.  If C has modifier bits,
    98    handle them appropriately.  */
    99 
   100 int
   101 char_string (unsigned int c, unsigned char *p)
   102 {
   103   int bytes;
   104 
   105   if (c & CHAR_MODIFIER_MASK)
   106     {
   107       c = char_resolve_modifier_mask (c);
   108       /* If C still has any modifier bits, just ignore it.  */
   109       c &= ~CHAR_MODIFIER_MASK;
   110     }
   111 
   112   if (c <= MAX_3_BYTE_CHAR)
   113     {
   114       bytes = CHAR_STRING (c, p);
   115     }
   116   else if (c <= MAX_4_BYTE_CHAR)
   117     {
   118       p[0] = (0xF0 | (c >> 18));
   119       p[1] = (0x80 | ((c >> 12) & 0x3F));
   120       p[2] = (0x80 | ((c >> 6) & 0x3F));
   121       p[3] = (0x80 | (c & 0x3F));
   122       bytes = 4;
   123     }
   124   else if (c <= MAX_5_BYTE_CHAR)
   125     {
   126       p[0] = 0xF8;
   127       p[1] = (0x80 | ((c >> 18) & 0x0F));
   128       p[2] = (0x80 | ((c >> 12) & 0x3F));
   129       p[3] = (0x80 | ((c >> 6) & 0x3F));
   130       p[4] = (0x80 | (c & 0x3F));
   131       bytes = 5;
   132     }
   133   else if (c <= MAX_CHAR)
   134     {
   135       c = CHAR_TO_BYTE8 (c);
   136       bytes = BYTE8_STRING (c, p);
   137     }
   138   else
   139     error ("Invalid character: %x", c);
   140 
   141   return bytes;
   142 }
   143 
   144 
   145 /* Translate character C by translation table TABLE.  If no translation is
   146    found in TABLE, return the untranslated character.  If TABLE is a list,
   147    elements are char tables.  In that case, recursively translate C by all the
   148    tables in the list.  */
   149 
   150 int
   151 translate_char (Lisp_Object table, int c)
   152 {
   153   if (CHAR_TABLE_P (table))
   154     {
   155       Lisp_Object ch;
   156 
   157       ch = CHAR_TABLE_REF (table, c);
   158       if (CHARACTERP (ch))
   159         c = XFIXNUM (ch);
   160     }
   161   else
   162     {
   163       for (; CONSP (table); table = XCDR (table))
   164         c = translate_char (XCAR (table), c);
   165     }
   166   return c;
   167 }
   168 
   169 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
   170        doc: /* Return non-nil if OBJECT is a character.
   171 In Emacs Lisp, characters are represented by character codes, which
   172 are non-negative integers.  The function `max-char' returns the
   173 maximum character code.
   174 usage: (characterp OBJECT)  */
   175        attributes: const)
   176   (Lisp_Object object, Lisp_Object ignore)
   177 {
   178   return (CHARACTERP (object) ? Qt : Qnil);
   179 }
   180 
   181 DEFUN ("max-char", Fmax_char, Smax_char, 0, 1, 0,
   182        doc: /* Return the maximum character code.
   183 If UNICODE is non-nil, return the maximum character code defined
   184 by the Unicode Standard.  */
   185        attributes: const)
   186   (Lisp_Object unicode)
   187 {
   188   return (!NILP (unicode)
   189           ? make_fixnum (MAX_UNICODE_CHAR)
   190           : make_fixnum (MAX_CHAR));
   191 }
   192 
   193 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
   194        Sunibyte_char_to_multibyte, 1, 1, 0,
   195        doc: /* Convert the byte CH to multibyte character.  */)
   196   (Lisp_Object ch)
   197 {
   198   int c;
   199 
   200   CHECK_CHARACTER (ch);
   201   c = XFIXNAT (ch);
   202   if (c >= 0x100)
   203     error ("Not a unibyte character: %d", c);
   204   return make_fixnum (make_char_multibyte (c));
   205 }
   206 
   207 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
   208        Smultibyte_char_to_unibyte, 1, 1, 0,
   209        doc: /* Convert the multibyte character CH to a byte.
   210 If the multibyte character does not represent a byte, return -1.  */)
   211   (Lisp_Object ch)
   212 {
   213   int cm;
   214 
   215   CHECK_CHARACTER (ch);
   216   cm = XFIXNAT (ch);
   217   if (cm < 256)
   218     /* Can't distinguish a byte read from a unibyte buffer from
   219        a latin1 char, so let's let it slide.  */
   220     return ch;
   221   else
   222     {
   223       int cu = CHAR_TO_BYTE_SAFE (cm);
   224       return make_fixnum (cu);
   225     }
   226 }
   227 
   228 
   229 /* Return width (columns) of C considering the buffer display table DP. */
   230 
   231 static ptrdiff_t
   232 char_width (int c, struct Lisp_Char_Table *dp)
   233 {
   234   ptrdiff_t width = CHARACTER_WIDTH (c);
   235 
   236   if (dp)
   237     {
   238       Lisp_Object disp = DISP_CHAR_VECTOR (dp, c), ch;
   239       int i;
   240 
   241       if (VECTORP (disp))
   242         for (i = 0, width = 0; i < ASIZE (disp); i++)
   243           {
   244             int c = -1;
   245             ch = AREF (disp, i);
   246             if (GLYPH_CODE_P (ch))
   247               c = GLYPH_CODE_CHAR (ch);
   248             else if (CHARACTERP (ch))
   249               c = XFIXNUM (ch);
   250             if (c >= 0)
   251               {
   252                 int w = CHARACTER_WIDTH (c);
   253                 if (ckd_add (&width, width, w))
   254                   string_overflow ();
   255               }
   256           }
   257     }
   258   return width;
   259 }
   260 
   261 
   262 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
   263        doc: /* Return width of CHAR in columns when displayed in the current buffer.
   264 The width of CHAR is measured by how many columns it will occupy on the screen.
   265 This is based on data in `char-width-table', and ignores the actual
   266 metrics of the character's glyph as determined by its font.
   267 If the display table in effect replaces CHAR on display with
   268 something else, the function returns the width of the replacement.
   269 Tab is taken to occupy `tab-width' columns.
   270 usage: (char-width CHAR)  */)
   271   (Lisp_Object ch)
   272 {
   273   int c;
   274   ptrdiff_t width;
   275 
   276   CHECK_CHARACTER (ch);
   277   c = XFIXNUM (ch);
   278   width = char_width (c, buffer_display_table ());
   279   return make_fixnum (width);
   280 }
   281 
   282 /* Return width of string STR of length LEN when displayed in the
   283    current buffer.  The width is measured by how many columns it
   284    occupies on the screen.  If PRECISION > 0, return the width of
   285    longest substring that doesn't exceed PRECISION, and set number of
   286    characters and bytes of the substring in *NCHARS and *NBYTES
   287    respectively.  */
   288 
   289 ptrdiff_t
   290 c_string_width (const unsigned char *str, ptrdiff_t len, int precision,
   291                 ptrdiff_t *nchars, ptrdiff_t *nbytes)
   292 {
   293   ptrdiff_t i = 0, i_byte = 0;
   294   ptrdiff_t width = 0;
   295   struct Lisp_Char_Table *dp = buffer_display_table ();
   296 
   297   while (i_byte < len)
   298     {
   299       int bytes, c = string_char_and_length (str + i_byte, &bytes);
   300       ptrdiff_t thiswidth = char_width (c, dp);
   301 
   302       if (0 < precision && precision - width < thiswidth)
   303         {
   304           *nchars = i;
   305           *nbytes = i_byte;
   306           return width;
   307         }
   308       if (ckd_add (&width, width, thiswidth))
   309         string_overflow ();
   310       i++;
   311       i_byte += bytes;
   312   }
   313 
   314   if (precision > 0)
   315     {
   316       *nchars = i;
   317       *nbytes = i_byte;
   318     }
   319 
   320   return width;
   321 }
   322 
   323 /* Return width of string STR of length LEN when displayed in the
   324    current buffer.  The width is measured by how many columns it
   325    occupies on the screen.  */
   326 
   327 ptrdiff_t
   328 strwidth (const char *str, ptrdiff_t len)
   329 {
   330   return c_string_width ((const unsigned char *) str, len, -1, NULL, NULL);
   331 }
   332 
   333 /* Return width of a (substring of a) Lisp string STRING when
   334    displayed in the current buffer.  The width is measured by how many
   335    columns it occupies on the screen while paying attention to
   336    compositions.  If PRECISION > 0, return the width of longest
   337    substring that doesn't exceed PRECISION, and set number of
   338    characters and bytes of the substring in *NCHARS and *NBYTES
   339    respectively.  FROM and TO are zero-based character indices that
   340    define the substring of STRING to consider.  If AUTO_COMP is
   341    non-zero, account for automatic compositions in STRING.  */
   342 
   343 ptrdiff_t
   344 lisp_string_width (Lisp_Object string, ptrdiff_t from, ptrdiff_t to,
   345                    ptrdiff_t precision, ptrdiff_t *nchars, ptrdiff_t *nbytes,
   346                    bool auto_comp)
   347 {
   348   /* This set multibyte to 0 even if STRING is multibyte when it
   349      contains only ascii and eight-bit-graphic, but that's
   350      intentional.  */
   351   bool multibyte = SCHARS (string) < SBYTES (string);
   352   ptrdiff_t i = from, i_byte = from ? string_char_to_byte (string, from) : 0;
   353   ptrdiff_t from_byte = i_byte;
   354   ptrdiff_t width = 0;
   355   struct Lisp_Char_Table *dp = buffer_display_table ();
   356 #ifdef HAVE_WINDOW_SYSTEM
   357   struct frame *f =
   358     (FRAMEP (selected_frame) && FRAME_LIVE_P (XFRAME (selected_frame)))
   359     ? XFRAME (selected_frame)
   360     : NULL;
   361   int font_width = -1;
   362   Lisp_Object default_font, frame_font;
   363 #endif
   364 
   365   eassert (precision <= 0 || (nchars && nbytes));
   366 
   367   while (i < to)
   368     {
   369       ptrdiff_t chars, bytes, thiswidth;
   370       Lisp_Object val;
   371       ptrdiff_t cmp_id;
   372       ptrdiff_t ignore, end;
   373 
   374       if (find_composition (i, -1, &ignore, &end, &val, string)
   375           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
   376               >= 0))
   377         {
   378           thiswidth = composition_table[cmp_id]->width;
   379           chars = end - i;
   380           bytes = string_char_to_byte (string, end) - i_byte;
   381         }
   382 #ifdef HAVE_WINDOW_SYSTEM
   383       else if (auto_comp
   384                && f && FRAME_WINDOW_P (f)
   385                && multibyte
   386                && find_automatic_composition (i, -1, i, &ignore,
   387                                               &end, &val, string)
   388                && end > i)
   389         {
   390           int j;
   391           for (j = 0; j < LGSTRING_GLYPH_LEN (val); j++)
   392             if (NILP (LGSTRING_GLYPH (val, j)))
   393               break;
   394 
   395           int pixelwidth = composition_gstring_width (val, 0, j, NULL);
   396 
   397           /* The below is somewhat expensive, so compute it only once
   398              for the entire loop, and only if needed.  */
   399           if (font_width < 0)
   400             {
   401               font_width = FRAME_COLUMN_WIDTH (f);
   402               default_font = Fface_font (Qdefault, Qnil, Qnil);
   403               frame_font = Fframe_parameter (Qnil, Qfont);
   404 
   405               if (STRINGP (default_font) && STRINGP (frame_font)
   406                   && (SCHARS (default_font) != SCHARS (frame_font)
   407                       || SBYTES (default_font) != SBYTES (frame_font)
   408                       || memcmp (SDATA (default_font), SDATA (frame_font),
   409                                  SBYTES (default_font))))
   410                 {
   411                   Lisp_Object font_info = Ffont_info (default_font, Qnil);
   412                   if (VECTORP (font_info))
   413                     {
   414                       font_width = XFIXNUM (AREF (font_info, 11));
   415                       if (font_width <= 0)
   416                         font_width = XFIXNUM (AREF (font_info, 10));
   417                     }
   418                 }
   419             }
   420           thiswidth = (double) pixelwidth / font_width + 0.5;
   421           chars = end - i;
   422           bytes = string_char_to_byte (string, end) - i_byte;
   423         }
   424 #endif  /* HAVE_WINDOW_SYSTEM */
   425       else
   426         {
   427           int c;
   428           unsigned char *str = SDATA (string);
   429 
   430           if (multibyte)
   431             {
   432               int cbytes;
   433               c = string_char_and_length (str + i_byte, &cbytes);
   434               bytes = cbytes;
   435             }
   436           else
   437             c = str[i_byte], bytes = 1;
   438           chars = 1;
   439           thiswidth = char_width (c, dp);
   440         }
   441 
   442       if (0 < precision && precision - width < thiswidth)
   443         {
   444           *nchars = i - from;
   445           *nbytes = i_byte - from_byte;
   446           return width;
   447         }
   448       if (ckd_add (&width, width, thiswidth))
   449         string_overflow ();
   450       i += chars;
   451       i_byte += bytes;
   452     }
   453 
   454   if (precision > 0)
   455     {
   456       *nchars = i - from;
   457       *nbytes = i_byte - from_byte;
   458     }
   459 
   460   return width;
   461 }
   462 
   463 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 3, 0,
   464        doc: /* Return width of STRING in columns when displayed in the current buffer.
   465 Width of STRING is measured by how many columns it will occupy on the screen.
   466 
   467 Optional arguments FROM and TO specify the substring of STRING to
   468 consider, and are interpreted as in `substring'.
   469 
   470 Width of each character in STRING is generally taken according to
   471 `char-width', but character compositions and the display table in
   472 effect are taken into consideration.
   473 Tabs in STRING are always assumed to occupy `tab-width' columns,
   474 although they might take fewer columns depending on the column where
   475 they begin on display.
   476 The effect of faces and fonts, including fonts used for non-Latin and
   477 other unusual characters, such as emoji, is ignored, as are display
   478 properties and invisible text.
   479 
   480 For these reasons, the results are just an approximation, especially
   481 on GUI frames; for accurate dimensions of text as it will be
   482 displayed, use `string-pixel-width' or `window-text-pixel-size'
   483 instead.
   484 usage: (string-width STRING &optional FROM TO)  */)
   485   (Lisp_Object str, Lisp_Object from, Lisp_Object to)
   486 {
   487   Lisp_Object val;
   488   ptrdiff_t ifrom, ito;
   489 
   490   CHECK_STRING (str);
   491   validate_subarray (str, from, to, SCHARS (str), &ifrom, &ito);
   492   XSETFASTINT (val, lisp_string_width (str, ifrom, ito, -1, NULL, NULL, true));
   493   return val;
   494 }
   495 
   496 /* Return the number of characters in the NBYTES bytes at PTR.
   497    This works by looking at the contents and checking for multibyte
   498    sequences while assuming that there's no invalid sequence.
   499    However, if the current buffer has enable-multibyte-characters =
   500    nil, we treat each byte as a character.  */
   501 
   502 ptrdiff_t
   503 chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
   504 {
   505   /* current_buffer is null at early stages of Emacs initialization.  */
   506   if (current_buffer == 0
   507       || NILP (BVAR (current_buffer, enable_multibyte_characters)))
   508     return nbytes;
   509 
   510   return multibyte_chars_in_text (ptr, nbytes);
   511 }
   512 
   513 /* Return the number of characters in the NBYTES bytes at PTR.
   514    This works by looking at the contents and checking for multibyte
   515    sequences while assuming that there's no invalid sequence.  It
   516    ignores enable-multibyte-characters.  */
   517 
   518 ptrdiff_t
   519 multibyte_chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
   520 {
   521   const unsigned char *endp = ptr + nbytes;
   522   ptrdiff_t chars = 0;
   523 
   524   while (ptr < endp)
   525     {
   526       int len = multibyte_length (ptr, endp, true, true);
   527 
   528       if (len == 0)
   529         emacs_abort ();
   530       ptr += len;
   531       chars++;
   532     }
   533 
   534   return chars;
   535 }
   536 
   537 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
   538    characters and bytes in it, and store them in *NCHARS and *NBYTES
   539    respectively.  On counting bytes, pay attention to that 8-bit
   540    characters not constructing a valid multibyte sequence are
   541    represented by 2-byte in a multibyte text.  */
   542 
   543 void
   544 parse_str_as_multibyte (const unsigned char *str, ptrdiff_t len,
   545                         ptrdiff_t *nchars, ptrdiff_t *nbytes)
   546 {
   547   const unsigned char *endp = str + len;
   548   ptrdiff_t chars = 0, bytes = 0;
   549 
   550   if (len >= MAX_MULTIBYTE_LENGTH)
   551     {
   552       const unsigned char *adjusted_endp = endp - (MAX_MULTIBYTE_LENGTH - 1);
   553       while (str < adjusted_endp)
   554         {
   555           int n = multibyte_length (str, NULL, false, false);
   556           if (0 < n)
   557             str += n, bytes += n;
   558           else
   559             str++, bytes += 2;
   560           chars++;
   561         }
   562     }
   563   while (str < endp)
   564     {
   565       int n = multibyte_length (str, endp, true, false);
   566       if (0 < n)
   567         str += n, bytes += n;
   568       else
   569         str++, bytes += 2;
   570       chars++;
   571     }
   572 
   573   *nchars = chars;
   574   *nbytes = bytes;
   575   return;
   576 }
   577 
   578 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
   579    It actually converts only such 8-bit characters that don't construct
   580    a multibyte sequence to multibyte forms of raw bytes.  If NCHARS
   581    is nonzero, set *NCHARS to the number of characters in the text.
   582    It is assured that we can use LEN bytes at STR as a work
   583    area and that is enough.  Return the number of bytes of the
   584    resulting text.  */
   585 
   586 ptrdiff_t
   587 str_as_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t nbytes,
   588                   ptrdiff_t *nchars)
   589 {
   590   unsigned char *p = str, *endp = str + nbytes;
   591   unsigned char *to;
   592   ptrdiff_t chars = 0;
   593 
   594   if (nbytes >= MAX_MULTIBYTE_LENGTH)
   595     {
   596       unsigned char *adjusted_endp = endp - (MAX_MULTIBYTE_LENGTH - 1);
   597       while (p < adjusted_endp)
   598         {
   599           int n = multibyte_length (p, NULL, false, false);
   600           if (n <= 0)
   601             break;
   602           p += n, chars++;
   603         }
   604     }
   605   while (true)
   606     {
   607       int n = multibyte_length (p, endp, true, false);
   608       if (n <= 0)
   609         break;
   610       p += n, chars++;
   611     }
   612   if (nchars)
   613     *nchars = chars;
   614   if (p == endp)
   615     return nbytes;
   616 
   617   to = p;
   618   nbytes = endp - p;
   619   endp = str + len;
   620   memmove (endp - nbytes, p, nbytes);
   621   p = endp - nbytes;
   622 
   623   if (nbytes >= MAX_MULTIBYTE_LENGTH)
   624     {
   625       unsigned char *adjusted_endp = endp - (MAX_MULTIBYTE_LENGTH - 1);
   626       while (p < adjusted_endp)
   627         {
   628           int n = multibyte_length (p, NULL, false, false);
   629           if (0 < n)
   630             {
   631               while (n--)
   632                 *to++ = *p++;
   633             }
   634           else
   635             {
   636               int c = *p++;
   637               c = BYTE8_TO_CHAR (c);
   638               to += CHAR_STRING (c, to);
   639             }
   640         }
   641       chars++;
   642     }
   643   while (p < endp)
   644     {
   645       int n = multibyte_length (p, endp, true, false);
   646       if (0 < n)
   647         {
   648           while (n--)
   649             *to++ = *p++;
   650         }
   651       else
   652         {
   653           int c = *p++;
   654           c = BYTE8_TO_CHAR (c);
   655           to += CHAR_STRING (c, to);
   656         }
   657       chars++;
   658     }
   659   if (nchars)
   660     *nchars = chars;
   661   return (to - str);
   662 }
   663 
   664 /* Parse unibyte string at STR of LEN bytes, and return the number of
   665    bytes it may occupy when converted to multibyte string by
   666    `str_to_multibyte'.  */
   667 
   668 ptrdiff_t
   669 count_size_as_multibyte (const unsigned char *str, ptrdiff_t len)
   670 {
   671   /* Count the number of non-ASCII (raw) bytes, since they will occupy
   672      two bytes in a multibyte string.  */
   673   ptrdiff_t nonascii = 0;
   674   for (ptrdiff_t i = 0; i < len; i++)
   675     nonascii += str[i] >> 7;
   676   ptrdiff_t bytes;
   677   if (ckd_add (&bytes, len, nonascii))
   678     string_overflow ();
   679   return bytes;
   680 }
   681 
   682 
   683 /* Convert unibyte text at SRC of NCHARS chars to a multibyte text
   684    at DST, that contains the same single-byte characters.
   685    Return the number of bytes written at DST.  */
   686 ptrdiff_t
   687 str_to_multibyte (unsigned char *dst, const unsigned char *src,
   688                   ptrdiff_t nchars)
   689 {
   690   unsigned char *d = dst;
   691   for (ptrdiff_t i = 0; i < nchars; i++)
   692     {
   693       unsigned char c = src[i];
   694       if (c <= 0x7f)
   695         *d++ = c;
   696       else
   697         {
   698           *d++ = 0xc0 + ((c >> 6) & 1);
   699           *d++ = 0x80 + (c & 0x3f);
   700         }
   701     }
   702   return d - dst;
   703 }
   704 
   705 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
   706    actually converts characters in the range 0x80..0xFF to
   707    unibyte.  */
   708 
   709 ptrdiff_t
   710 str_as_unibyte (unsigned char *str, ptrdiff_t bytes)
   711 {
   712   const unsigned char *p = str, *endp = str + bytes;
   713   unsigned char *to;
   714   int c, len;
   715 
   716   while (p < endp)
   717     {
   718       c = *p;
   719       len = BYTES_BY_CHAR_HEAD (c);
   720       if (CHAR_BYTE8_HEAD_P (c))
   721         break;
   722       p += len;
   723     }
   724   to = str + (p - str);
   725   while (p < endp)
   726     {
   727       c = *p;
   728       len = BYTES_BY_CHAR_HEAD (c);
   729       if (CHAR_BYTE8_HEAD_P (c))
   730         {
   731           c = string_char_advance (&p);
   732           *to++ = CHAR_TO_BYTE8 (c);
   733         }
   734       else
   735         {
   736           while (len--) *to++ = *p++;
   737         }
   738     }
   739   return (to - str);
   740 }
   741 
   742 static ptrdiff_t
   743 string_count_byte8 (Lisp_Object string)
   744 {
   745   bool multibyte = STRING_MULTIBYTE (string);
   746   ptrdiff_t nbytes = SBYTES (string);
   747   unsigned char *p = SDATA (string);
   748   unsigned char *pend = p + nbytes;
   749   ptrdiff_t count = 0;
   750   int c, len;
   751 
   752   if (multibyte)
   753     while (p < pend)
   754       {
   755         c = *p;
   756         len = BYTES_BY_CHAR_HEAD (c);
   757 
   758         if (CHAR_BYTE8_HEAD_P (c))
   759           count++;
   760         p += len;
   761       }
   762   else
   763     while (p < pend)
   764       {
   765         if (*p++ >= 0x80)
   766           count++;
   767       }
   768   return count;
   769 }
   770 
   771 
   772 Lisp_Object
   773 string_escape_byte8 (Lisp_Object string)
   774 {
   775   ptrdiff_t nchars = SCHARS (string);
   776   ptrdiff_t nbytes = SBYTES (string);
   777   bool multibyte = STRING_MULTIBYTE (string);
   778   ptrdiff_t byte8_count;
   779   ptrdiff_t thrice_byte8_count, uninit_nchars, uninit_nbytes;
   780   const unsigned char *src, *src_end;
   781   unsigned char *dst;
   782   Lisp_Object val;
   783   int c, len;
   784 
   785   if (multibyte && nchars == nbytes)
   786     return string;
   787 
   788   byte8_count = string_count_byte8 (string);
   789 
   790   if (byte8_count == 0)
   791     return string;
   792 
   793   if (ckd_mul (&thrice_byte8_count, byte8_count, 3))
   794     string_overflow ();
   795 
   796   if (multibyte)
   797     {
   798       /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
   799       if (ckd_add (&uninit_nchars, nchars, thrice_byte8_count)
   800           || ckd_add (&uninit_nbytes, nbytes, 2 * byte8_count))
   801         string_overflow ();
   802       val = make_uninit_multibyte_string (uninit_nchars, uninit_nbytes);
   803     }
   804   else
   805     {
   806       /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
   807       if (ckd_add (&uninit_nbytes, thrice_byte8_count, nbytes))
   808         string_overflow ();
   809       val = make_uninit_string (uninit_nbytes);
   810     }
   811 
   812   src = SDATA (string);
   813   src_end = src + nbytes;
   814   dst = SDATA (val);
   815   if (multibyte)
   816     while (src < src_end)
   817       {
   818         c = *src;
   819         len = BYTES_BY_CHAR_HEAD (c);
   820 
   821         if (CHAR_BYTE8_HEAD_P (c))
   822           {
   823             c = string_char_advance (&src);
   824             c = CHAR_TO_BYTE8 (c);
   825             dst += sprintf ((char *) dst, "\\%03o", c + 0u);
   826           }
   827         else
   828           while (len--) *dst++ = *src++;
   829       }
   830   else
   831     while (src < src_end)
   832       {
   833         c = *src++;
   834         if (c >= 0x80)
   835           dst += sprintf ((char *) dst, "\\%03o", c + 0u);
   836         else
   837           *dst++ = c;
   838       }
   839   return val;
   840 }
   841 
   842 
   843 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
   844        doc: /*
   845 Concatenate all the argument characters and make the result a string.
   846 usage: (string &rest CHARACTERS)  */)
   847   (ptrdiff_t n, Lisp_Object *args)
   848 {
   849   ptrdiff_t nbytes = 0;
   850   for (ptrdiff_t i = 0; i < n; i++)
   851     {
   852       CHECK_CHARACTER (args[i]);
   853       nbytes += CHAR_BYTES (XFIXNUM (args[i]));
   854     }
   855   if (nbytes == n)
   856     return Funibyte_string (n, args);
   857   Lisp_Object str = make_uninit_multibyte_string (n, nbytes);
   858   unsigned char *p = SDATA (str);
   859   for (ptrdiff_t i = 0; i < n; i++)
   860     {
   861       eassume (CHARACTERP (args[i]));
   862       int c = XFIXNUM (args[i]);
   863       p += CHAR_STRING (c, p);
   864     }
   865   return str;
   866 }
   867 
   868 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
   869        doc: /* Concatenate all the argument bytes and make the result a unibyte string.
   870 usage: (unibyte-string &rest BYTES)  */)
   871   (ptrdiff_t n, Lisp_Object *args)
   872 {
   873   Lisp_Object str = make_uninit_string (n);
   874   unsigned char *p = SDATA (str);
   875   for (ptrdiff_t i = 0; i < n; i++)
   876     *p++ = check_integer_range (args[i], 0, 255);
   877   return str;
   878 }
   879 
   880 DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
   881        Schar_resolve_modifiers, 1, 1, 0,
   882        doc: /* Resolve modifiers in the character CHAR.
   883 The value is a character with modifiers resolved into the character
   884 code.  Unresolved modifiers are kept in the value.
   885 usage: (char-resolve-modifiers CHAR)  */)
   886   (Lisp_Object character)
   887 {
   888   EMACS_INT c;
   889 
   890   CHECK_FIXNUM (character);
   891   c = XFIXNUM (character);
   892   return make_fixnum (char_resolve_modifier_mask (c));
   893 }
   894 
   895 DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
   896        doc: /* Return a byte value of a character at point.
   897 Optional 1st arg POSITION, if non-nil, is a position of a character to get
   898 a byte value.
   899 Optional 2nd arg STRING, if non-nil, is a string of which first
   900 character is a target to get a byte value.  In this case, POSITION, if
   901 non-nil, is an index of a target character in the string.
   902 
   903 If the current buffer (or STRING) is multibyte, and the target
   904 character is not ASCII nor 8-bit character, an error is signaled.  */)
   905   (Lisp_Object position, Lisp_Object string)
   906 {
   907   int c;
   908   ptrdiff_t pos;
   909   unsigned char *p;
   910 
   911   if (NILP (string))
   912     {
   913       if (NILP (position))
   914         {
   915           p = PT_ADDR;
   916         }
   917       else
   918         {
   919           EMACS_INT fixed_pos = fix_position (position);
   920           if (! (BEGV <= fixed_pos && fixed_pos < ZV))
   921             args_out_of_range_3 (position, make_fixnum (BEGV), make_fixnum (ZV));
   922           pos = fixed_pos;
   923           p = CHAR_POS_ADDR (pos);
   924         }
   925       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
   926         return make_fixnum (*p);
   927     }
   928   else
   929     {
   930       CHECK_STRING (string);
   931       if (NILP (position))
   932         {
   933           p = SDATA (string);
   934         }
   935       else
   936         {
   937           CHECK_FIXNAT (position);
   938           if (XFIXNUM (position) >= SCHARS (string))
   939             args_out_of_range (string, position);
   940           pos = XFIXNAT (position);
   941           p = SDATA (string) + string_char_to_byte (string, pos);
   942         }
   943       if (! STRING_MULTIBYTE (string))
   944         return make_fixnum (*p);
   945     }
   946   c = STRING_CHAR (p);
   947   if (CHAR_BYTE8_P (c))
   948     c = CHAR_TO_BYTE8 (c);
   949   else if (! ASCII_CHAR_P (c))
   950     error ("Not an ASCII nor an 8-bit character: %d", c);
   951   return make_fixnum (c);
   952 }
   953 
   954 /* Return true if C is an alphabetic character.  */
   955 bool
   956 alphabeticp (int c)
   957 {
   958   Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
   959   if (! FIXNUMP (category))
   960     return false;
   961   EMACS_INT gen_cat = XFIXNUM (category);
   962 
   963   /* See UTS #18.  There are additional characters that should be
   964      here, those designated as Other_uppercase, Other_lowercase,
   965      and Other_alphabetic; FIXME.  */
   966   return (gen_cat == UNICODE_CATEGORY_Lu
   967           || gen_cat == UNICODE_CATEGORY_Ll
   968           || gen_cat == UNICODE_CATEGORY_Lt
   969           || gen_cat == UNICODE_CATEGORY_Lm
   970           || gen_cat == UNICODE_CATEGORY_Lo
   971           || gen_cat == UNICODE_CATEGORY_Mn
   972           || gen_cat == UNICODE_CATEGORY_Mc
   973           || gen_cat == UNICODE_CATEGORY_Me
   974           || gen_cat == UNICODE_CATEGORY_Nl);
   975 }
   976 
   977 /* Return true if C is an alphabetic or decimal-number character.  */
   978 bool
   979 alphanumericp (int c)
   980 {
   981   Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
   982   if (! FIXNUMP (category))
   983     return false;
   984   EMACS_INT gen_cat = XFIXNUM (category);
   985 
   986   /* See UTS #18.  Same comment as for alphabeticp applies.  FIXME. */
   987   return (gen_cat == UNICODE_CATEGORY_Lu
   988           || gen_cat == UNICODE_CATEGORY_Ll
   989           || gen_cat == UNICODE_CATEGORY_Lt
   990           || gen_cat == UNICODE_CATEGORY_Lm
   991           || gen_cat == UNICODE_CATEGORY_Lo
   992           || gen_cat == UNICODE_CATEGORY_Mn
   993           || gen_cat == UNICODE_CATEGORY_Mc
   994           || gen_cat == UNICODE_CATEGORY_Me
   995           || gen_cat == UNICODE_CATEGORY_Nl
   996           || gen_cat == UNICODE_CATEGORY_Nd);
   997 }
   998 
   999 /* Return true if C is a graphic character.  */
  1000 bool
  1001 graphicp (int c)
  1002 {
  1003   Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
  1004   if (! FIXNUMP (category))
  1005     return false;
  1006   EMACS_INT gen_cat = XFIXNUM (category);
  1007 
  1008   /* See UTS #18.  */
  1009   return (!(gen_cat == UNICODE_CATEGORY_Zs /* space separator */
  1010             || gen_cat == UNICODE_CATEGORY_Zl /* line separator */
  1011             || gen_cat == UNICODE_CATEGORY_Zp /* paragraph separator */
  1012             || gen_cat == UNICODE_CATEGORY_Cc /* control */
  1013             || gen_cat == UNICODE_CATEGORY_Cs /* surrogate */
  1014             || gen_cat == UNICODE_CATEGORY_Cn)); /* unassigned */
  1015 }
  1016 
  1017 /* Return true if C is a printable character.  */
  1018 bool
  1019 printablep (int c)
  1020 {
  1021   Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
  1022   if (! FIXNUMP (category))
  1023     return false;
  1024   EMACS_INT gen_cat = XFIXNUM (category);
  1025 
  1026   /* See UTS #18.  */
  1027   return (!(gen_cat == UNICODE_CATEGORY_Cc /* control */
  1028             || gen_cat == UNICODE_CATEGORY_Cs /* surrogate */
  1029             || gen_cat == UNICODE_CATEGORY_Cn)); /* unassigned */
  1030 }
  1031 
  1032 /* Return true if C is graphic character that can be printed independently.  */
  1033 bool
  1034 graphic_base_p (int c)
  1035 {
  1036   Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
  1037   if (! FIXNUMP (category))
  1038     return false;
  1039   EMACS_INT gen_cat = XFIXNUM (category);
  1040 
  1041   return (!(gen_cat == UNICODE_CATEGORY_Mn       /* mark, nonspacing */
  1042             || gen_cat == UNICODE_CATEGORY_Mc    /* mark, combining */
  1043             || gen_cat == UNICODE_CATEGORY_Me    /* mark, enclosing */
  1044             || gen_cat == UNICODE_CATEGORY_Zs    /* separator, space */
  1045             || gen_cat == UNICODE_CATEGORY_Zl    /* separator, line */
  1046             || gen_cat == UNICODE_CATEGORY_Zp    /* separator, paragraph */
  1047             || gen_cat == UNICODE_CATEGORY_Cc    /* other, control */
  1048             || gen_cat == UNICODE_CATEGORY_Cs    /* other, surrogate */
  1049             || gen_cat == UNICODE_CATEGORY_Cf    /* other, format */
  1050             || gen_cat == UNICODE_CATEGORY_Cn)); /* other, unassigned */
  1051 }
  1052 
  1053 /* Return true if C is a horizontal whitespace character, as defined
  1054    by https://www.unicode.org/reports/tr18/tr18-19.html#blank.  */
  1055 bool
  1056 blankp (int c)
  1057 {
  1058   Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
  1059   if (! FIXNUMP (category))
  1060     return false;
  1061 
  1062   return XFIXNUM (category) == UNICODE_CATEGORY_Zs; /* separator, space */
  1063 }
  1064 
  1065 /* hexdigit[C] is one greater than C's numeric value if C is a
  1066    hexadecimal digit, zero otherwise.  */
  1067 signed char const hexdigit[UCHAR_MAX + 1] =
  1068   {
  1069     ['0'] = 1 + 0, ['1'] = 1 + 1, ['2'] = 1 + 2, ['3'] = 1 + 3, ['4'] = 1 + 4,
  1070     ['5'] = 1 + 5, ['6'] = 1 + 6, ['7'] = 1 + 7, ['8'] = 1 + 8, ['9'] = 1 + 9,
  1071     ['A'] = 1 + 10, ['B'] = 1 + 11, ['C'] = 1 + 12,
  1072     ['D'] = 1 + 13, ['E'] = 1 + 14, ['F'] = 1 + 15,
  1073     ['a'] = 1 + 10, ['b'] = 1 + 11, ['c'] = 1 + 12,
  1074     ['d'] = 1 + 13, ['e'] = 1 + 14, ['f'] = 1 + 15
  1075   };
  1076 
  1077 void
  1078 syms_of_character (void)
  1079 {
  1080   DEFSYM (Qcharacterp, "characterp");
  1081   DEFSYM (Qauto_fill_chars, "auto-fill-chars");
  1082 
  1083   staticpro (&Vchar_unify_table);
  1084   Vchar_unify_table = Qnil;
  1085 
  1086   defsubr (&Smax_char);
  1087   defsubr (&Scharacterp);
  1088   defsubr (&Sunibyte_char_to_multibyte);
  1089   defsubr (&Smultibyte_char_to_unibyte);
  1090   defsubr (&Schar_width);
  1091   defsubr (&Sstring_width);
  1092   defsubr (&Sstring);
  1093   defsubr (&Sunibyte_string);
  1094   defsubr (&Schar_resolve_modifiers);
  1095   defsubr (&Sget_byte);
  1096 
  1097   DEFVAR_LISP ("translation-table-vector",  Vtranslation_table_vector,
  1098                doc: /*
  1099 Vector recording all translation tables ever defined.
  1100 Each element is a pair (SYMBOL . TABLE) relating the table to the
  1101 symbol naming it.  The ID of a translation table is an index into this vector.  */);
  1102   Vtranslation_table_vector = make_nil_vector (16);
  1103 
  1104   DEFVAR_LISP ("auto-fill-chars", Vauto_fill_chars,
  1105                doc: /*
  1106 A char-table for characters which invoke auto-filling.
  1107 Such characters have value t in this table.  */);
  1108   Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
  1109   CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
  1110   CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
  1111 
  1112   DEFVAR_LISP ("char-width-table", Vchar_width_table,
  1113                doc: /*
  1114 A char-table for width (columns) of each character.  */);
  1115   Vchar_width_table = Fmake_char_table (Qnil, make_fixnum (1));
  1116   char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_fixnum (4));
  1117   char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
  1118                         make_fixnum (4));
  1119 
  1120   DEFVAR_LISP ("ambiguous-width-chars", Vambiguous_width_chars,
  1121                doc: /*
  1122 A char-table for characters whose width (columns) can be 1 or 2.
  1123 
  1124 The actual width depends on the language-environment and on the
  1125 value of `cjk-ambiguous-chars-are-wide'.  */);
  1126   Vambiguous_width_chars = Fmake_char_table (Qnil, Qnil);
  1127 
  1128   DEFVAR_LISP ("printable-chars", Vprintable_chars,
  1129                doc: /* A char-table for each printable character.  */);
  1130   Vprintable_chars = Fmake_char_table (Qnil, Qnil);
  1131   Fset_char_table_range (Vprintable_chars,
  1132                          Fcons (make_fixnum (32), make_fixnum (126)), Qt);
  1133   Fset_char_table_range (Vprintable_chars,
  1134                          Fcons (make_fixnum (160),
  1135                                 make_fixnum (MAX_5_BYTE_CHAR)), Qt);
  1136 
  1137   DEFVAR_LISP ("char-script-table", Vchar_script_table,
  1138                doc: /* Char table of script symbols.
  1139 It has one extra slot whose value is a list of script symbols.  */);
  1140 
  1141   DEFSYM (Qchar_script_table, "char-script-table");
  1142   Fput (Qchar_script_table, Qchar_table_extra_slots, make_fixnum (1));
  1143   Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
  1144 
  1145   DEFVAR_LISP ("script-representative-chars", Vscript_representative_chars,
  1146                doc: /* Alist of scripts vs the representative characters.
  1147 Each element is a cons (SCRIPT . CHARS).
  1148 SCRIPT is a symbol representing a script or a subgroup of a script.
  1149 CHARS is a list or a vector of characters.
  1150 If it is a list, all characters in the list are necessary for supporting SCRIPT.
  1151 If it is a vector, one of the characters in the vector is necessary.
  1152 This variable is used to find a font for a specific script.  */);
  1153   Vscript_representative_chars = Qnil;
  1154 
  1155   DEFVAR_LISP ("unicode-category-table", Vunicode_category_table,
  1156                doc: /* Char table of Unicode's "General Category".
  1157 All Unicode characters have one of the following values (symbol):
  1158   Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
  1159   Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
  1160 See The Unicode Standard for the meaning of those values.  */);
  1161   /* The correct char-table is setup in characters.el.  */
  1162   Vunicode_category_table = Qnil;
  1163 }
/* [<][>][^][v][top][bottom][index][help] */
root/src/character.c

DEFINITIONS