chars.c (15735B)
1 /************************************************************************** 2 * chars.c -- This file is part of GNU nano. * 3 * * 4 * Copyright (C) 2001-2011, 2013-2025 Free Software Foundation, Inc. * 5 * Copyright (C) 2016-2021 Benno Schulenberg * 6 * * 7 * GNU nano is free software: you can redistribute it and/or modify * 8 * it under the terms of the GNU General Public License as published * 9 * by the Free Software Foundation, either version 3 of the License, * 10 * or (at your option) any later version. * 11 * * 12 * GNU nano is distributed in the hope that it will be useful, * 13 * but WITHOUT ANY WARRANTY; without even the implied warranty * 14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * 15 * See the GNU General Public License for more details. * 16 * * 17 * You should have received a copy of the GNU General Public License * 18 * along with this program. If not, see https://gnu.org/licenses/. * 19 * * 20 **************************************************************************/ 21 22 #include "prototypes.h" 23 24 #include <ctype.h> 25 #include <string.h> 26 #ifdef ENABLE_UTF8 27 #include <wchar.h> 28 #include <wctype.h> 29 #endif 30 31 #ifdef ENABLE_SPELLER 32 /* Return TRUE when the given character is some kind of letter. */ 33 bool is_alpha_char(const char *c) 34 { 35 #ifdef ENABLE_UTF8 36 wchar_t wc; 37 38 if (mbtowide(&wc, c) < 0) 39 return FALSE; 40 41 return iswalpha(wc); 42 #else 43 return isalpha((unsigned char)*c); 44 #endif 45 } 46 #endif /* ENABLE_SPELLER */ 47 48 /* Return TRUE when the given character is some kind of letter or a digit. */ 49 bool is_alnum_char(const char *c) 50 { 51 #ifdef ENABLE_UTF8 52 wchar_t wc; 53 54 if (mbtowide(&wc, c) < 0) 55 return FALSE; 56 57 return iswalnum(wc); 58 #else 59 return isalnum((unsigned char)*c); 60 #endif 61 } 62 63 /* Return TRUE when the given character is space or tab or other whitespace. */ 64 bool is_blank_char(const char *c) 65 { 66 #ifdef ENABLE_UTF8 67 wchar_t wc; 68 69 if ((signed char)*c >= 0) 70 return (*c == ' ' || *c == '\t'); 71 72 if (mbtowide(&wc, c) < 0) 73 return FALSE; 74 75 return iswblank(wc); 76 #else 77 return isblank((unsigned char)*c); 78 #endif 79 } 80 81 /* Return TRUE when the given character is a control character. */ 82 bool is_cntrl_char(const char *c) 83 { 84 #ifdef ENABLE_UTF8 85 if (using_utf8) 86 return ((c[0] & 0xE0) == 0 || c[0] == DEL_CODE || 87 ((signed char)c[0] == -62 && (signed char)c[1] < -96)); 88 else 89 #endif 90 return ((*c & 0x60) == 0 || *c == DEL_CODE); 91 } 92 93 /* Return TRUE when the given character is a punctuation character. */ 94 bool is_punct_char(const char *c) 95 { 96 #ifdef ENABLE_UTF8 97 wchar_t wc; 98 99 if (mbtowide(&wc, c) < 0) 100 return FALSE; 101 102 return iswpunct(wc); 103 #else 104 return ispunct((unsigned char)*c); 105 #endif 106 } 107 108 /* Return TRUE when the given character is word-forming (it is alphanumeric or 109 * specified in 'wordchars', or it is punctuation when allow_punct is TRUE). */ 110 bool is_word_char(const char *c, bool allow_punct) 111 { 112 if (*c == '\0') 113 return FALSE; 114 115 if (is_alnum_char(c)) 116 return TRUE; 117 118 if (allow_punct && is_punct_char(c)) 119 return TRUE; 120 121 if (word_chars != NULL && *word_chars != '\0') { 122 char symbol[MAXCHARLEN + 1]; 123 int symlen = collect_char(c, symbol); 124 125 symbol[symlen] = '\0'; 126 return (strstr(word_chars, symbol) != NULL); 127 } else 128 return FALSE; 129 } 130 131 /* Return the visible representation of control character c. */ 132 char control_rep(const signed char c) 133 { 134 if (c == DEL_CODE) 135 return '?'; 136 else if (c == -97) 137 return '='; 138 else if (c < 0) 139 return c + 224; 140 else 141 return c + 64; 142 } 143 144 /* Return the visible representation of multibyte control character c. */ 145 char control_mbrep(const char *c, bool isdata) 146 { 147 /* An embedded newline is an encoded NUL if it is data. */ 148 if (*c == '\n' && (isdata || as_an_at)) 149 return '@'; 150 151 #ifdef ENABLE_UTF8 152 if (using_utf8) { 153 if ((unsigned char)c[0] < 128) 154 return control_rep(c[0]); 155 else 156 return control_rep(c[1]); 157 } else 158 #endif 159 return control_rep(*c); 160 } 161 162 #ifdef ENABLE_UTF8 163 /* Convert the given multibyte sequence c to wide character wc, and return 164 * the number of bytes in the sequence, or -1 for an invalid sequence. */ 165 int mbtowide(wchar_t *wc, const char *c) 166 { 167 if ((signed char)*c < 0 && using_utf8) { 168 unsigned char v1 = c[0]; 169 unsigned char v2 = c[1] ^ 0x80; 170 171 if (v2 > 0x3F || v1 < 0xC2) 172 return -1; 173 174 if (v1 < 0xE0) { 175 *wc = (((unsigned int)(v1 & 0x1F) << 6) | (unsigned int)v2); 176 return 2; 177 } 178 179 unsigned char v3 = c[2] ^ 0x80; 180 181 if (v3 > 0x3F) 182 return -1; 183 184 if (v1 < 0xF0) { 185 if ((v1 > 0xE0 || v2 >= 0x20) && (v1 != 0xED || v2 < 0x20)) { 186 *wc = (((unsigned int)(v1 & 0x0F) << 12) | 187 ((unsigned int)v2 << 6) | (unsigned int)v3); 188 return 3; 189 } else 190 return -1; 191 } 192 193 unsigned char v4 = c[3] ^ 0x80; 194 195 if (v4 > 0x3F || v1 > 0xF4) 196 return -1; 197 198 if ((v1 > 0xF0 || v2 >= 0x10) && (v1 != 0xF4 || v2 < 0x10)) { 199 *wc = (((unsigned int)(v1 & 0x07) << 18) | ((unsigned int)v2 << 12) | 200 ((unsigned int)v3 << 6) | (unsigned int)v4); 201 return 4; 202 } else 203 return -1; 204 } 205 206 *wc = (unsigned int)*c; 207 return 1; 208 } 209 210 /* Return TRUE when the given character occupies two cells. */ 211 bool is_doublewidth(const char *ch) 212 { 213 wchar_t wc; 214 215 /* Only from U+1100 can code points have double width. */ 216 if ((unsigned char)*ch < 0xE1 || !using_utf8) 217 return FALSE; 218 219 if (mbtowide(&wc, ch) < 0) 220 return FALSE; 221 222 return (wcwidth(wc) == 2); 223 } 224 225 /* Return TRUE when the given character occupies zero cells. */ 226 bool is_zerowidth(const char *ch) 227 { 228 wchar_t wc; 229 230 /* Only from U+0300 can code points have zero width. */ 231 if ((unsigned char)*ch < 0xCC || !using_utf8) 232 return FALSE; 233 234 if (mbtowide(&wc, ch) < 0) 235 return FALSE; 236 237 #if defined(__OpenBSD__) 238 /* Work around an OpenBSD bug -- see https://sv.gnu.org/bugs/?60393. */ 239 if (wc >= 0xF0000) 240 return FALSE; 241 #endif 242 243 return (wcwidth(wc) == 0); 244 } 245 #endif /* ENABLE_UTF8 */ 246 247 /* Return the number of bytes in the character that starts at *pointer. */ 248 int char_length(const char *pointer) 249 { 250 #ifdef ENABLE_UTF8 251 if ((unsigned char)*pointer > 0xC1 && using_utf8) { 252 unsigned char c1 = pointer[0]; 253 unsigned char c2 = pointer[1]; 254 255 if ((c2 ^ 0x80) > 0x3F) 256 return 1; 257 258 if (c1 < 0xE0) 259 return 2; 260 261 if (((unsigned char)pointer[2] ^ 0x80) > 0x3F) 262 return 1; 263 264 if (c1 < 0xF0) { 265 if ((c1 > 0xE0 || c2 >= 0xA0) && (c1 != 0xED || c2 < 0xA0)) 266 return 3; 267 else 268 return 1; 269 } 270 271 if (((unsigned char)pointer[3] ^ 0x80) > 0x3F) 272 return 1; 273 274 if (c1 > 0xF4) 275 return 1; 276 277 if ((c1 > 0xF0 || c2 >= 0x90) && (c1 != 0xF4 || c2 < 0x90)) 278 return 4; 279 } 280 #endif 281 return 1; 282 } 283 284 /* Return the number of (multibyte) characters in the given string. */ 285 size_t mbstrlen(const char *pointer) 286 { 287 size_t count = 0; 288 289 while (*pointer != '\0') { 290 pointer += char_length(pointer); 291 count++; 292 } 293 294 return count; 295 } 296 297 /* Return the length (in bytes) of the character at the start of the 298 * given string, and return a copy of this character in *thechar. */ 299 int collect_char(const char *string, char *thechar) 300 { 301 int charlen = char_length(string); 302 303 for (int i = 0; i < charlen; i++) 304 thechar[i] = string[i]; 305 306 return charlen; 307 } 308 309 /* Return the length (in bytes) of the character at the start of 310 * the given string, and add this character's width to *column. */ 311 int advance_over(const char *string, size_t *column) 312 { 313 #ifdef ENABLE_UTF8 314 if ((signed char)*string < 0 && using_utf8) { 315 /* A UTF-8 upper control code has two bytes and takes two columns. */ 316 if (((unsigned char)string[0] == 0xC2 && (signed char)string[1] < -96)) { 317 *column += 2; 318 return 2; 319 } else { 320 wchar_t wc; 321 int charlen = mbtowide(&wc, string); 322 323 if (charlen < 0) { 324 *column += 1; 325 return 1; 326 } 327 328 int width = wcwidth(wc); 329 330 #if defined(__OpenBSD__) 331 *column += (width < 0 || wc >= 0xF0000) ? 1 : width; 332 #else 333 *column += (width < 0) ? 1 : width; 334 #endif 335 return charlen; 336 } 337 } 338 #endif 339 340 if ((unsigned char)*string < 0x20) { 341 if (*string == '\t') 342 *column += tabsize - *column % tabsize; 343 else 344 *column += 2; 345 } else if (0x7E < (unsigned char)*string && (unsigned char)*string < 0xA0) 346 *column += 2; 347 else 348 *column += 1; 349 350 return 1; 351 } 352 353 /* Return the index in buf of the beginning of the multibyte character 354 * before the one at pos. */ 355 size_t step_left(const char *buf, size_t pos) 356 { 357 #ifdef ENABLE_UTF8 358 if (using_utf8) { 359 size_t before, charlen = 0; 360 361 if (pos < 4) 362 before = 0; 363 else { 364 const char *ptr = buf + pos; 365 366 /* Probe for a valid starter byte in the preceding four bytes. */ 367 if ((signed char)*(--ptr) > -65) 368 before = pos - 1; 369 else if ((signed char)*(--ptr) > -65) 370 before = pos - 2; 371 else if ((signed char)*(--ptr) > -65) 372 before = pos - 3; 373 else if ((signed char)*(--ptr) > -65) 374 before = pos - 4; 375 else 376 before = pos - 1; 377 } 378 379 /* Move forward again until we reach the original character, 380 * so we know the length of its preceding character. */ 381 while (before < pos) { 382 charlen = char_length(buf + before); 383 before += charlen; 384 } 385 386 return before - charlen; 387 } else 388 #endif 389 return (pos == 0 ? 0 : pos - 1); 390 } 391 392 /* Return the index in buf of the beginning of the multibyte character 393 * after the one at pos. */ 394 size_t step_right(const char *buf, size_t pos) 395 { 396 return pos + char_length(buf + pos); 397 } 398 399 /* This function is equivalent to strcasecmp() for multibyte strings. */ 400 int mbstrcasecmp(const char *s1, const char *s2) 401 { 402 return mbstrncasecmp(s1, s2, HIGHEST_POSITIVE); 403 } 404 405 /* This function is equivalent to strncasecmp() for multibyte strings. */ 406 int mbstrncasecmp(const char *s1, const char *s2, size_t n) 407 { 408 #ifdef ENABLE_UTF8 409 if (using_utf8) { 410 wchar_t wc1, wc2; 411 412 while (*s1 != '\0' && *s2 != '\0' && n > 0) { 413 if ((signed char)*s1 >= 0 && (signed char)*s2 >= 0) { 414 if ('A' <= (*s1 & 0x5F) && (*s1 & 0x5F) <= 'Z') { 415 if ('A' <= (*s2 & 0x5F) && (*s2 & 0x5F) <= 'Z') { 416 if ((*s1 & 0x5F) != (*s2 & 0x5F)) 417 return ((*s1 & 0x5F) - (*s2 & 0x5F)); 418 } else 419 return ((*s1 | 0x20) - *s2); 420 } else if ('A' <= (*s2 & 0x5F) && (*s2 & 0x5F) <= 'Z') 421 return (*s1 - (*s2 | 0x20)); 422 else if (*s1 != *s2) 423 return (*s1 - *s2); 424 425 s1++; s2++; n--; 426 continue; 427 } 428 429 bool bad1 = (mbtowide(&wc1, s1) < 0); 430 bool bad2 = (mbtowide(&wc2, s2) < 0); 431 432 if (bad1 || bad2) { 433 if (*s1 != *s2) 434 return (unsigned char)*s1 - (unsigned char)*s2; 435 436 if (bad1 != bad2) 437 return (bad1 ? 1 : -1); 438 } else { 439 int difference = towlower(wc1) - towlower(wc2); 440 441 if (difference != 0) 442 return difference; 443 } 444 445 s1 += char_length(s1); 446 s2 += char_length(s2); 447 n--; 448 } 449 450 return (n > 0) ? ((unsigned char)*s1 - (unsigned char)*s2) : 0; 451 } else 452 #endif 453 return strncasecmp(s1, s2, n); 454 } 455 456 /* This function is equivalent to strcasestr() for multibyte strings. */ 457 char *mbstrcasestr(const char *haystack, const char *needle) 458 { 459 #ifdef ENABLE_UTF8 460 if (using_utf8) { 461 size_t needle_len = mbstrlen(needle); 462 463 while (*haystack != '\0') { 464 if (mbstrncasecmp(haystack, needle, needle_len) == 0) 465 return (char *)haystack; 466 467 haystack += char_length(haystack); 468 } 469 470 return NULL; 471 } else 472 #endif 473 return (char *)strcasestr(haystack, needle); 474 } 475 476 /* This function is equivalent to strstr(), except in that it scans the 477 * string in reverse, starting at pointer. */ 478 char *revstrstr(const char *haystack, const char *needle, 479 const char *pointer) 480 { 481 size_t needle_len = strlen(needle); 482 size_t tail_len = strlen(pointer); 483 484 if (tail_len < needle_len) 485 pointer -= (needle_len - tail_len); 486 487 while (pointer >= haystack) { 488 if (strncmp(pointer, needle, needle_len) == 0) 489 return (char *)pointer; 490 pointer--; 491 } 492 493 return NULL; 494 } 495 496 /* This function is equivalent to strcasestr(), except in that it scans 497 * the string in reverse, starting at pointer. */ 498 char *revstrcasestr(const char *haystack, const char *needle, 499 const char *pointer) 500 { 501 size_t needle_len = strlen(needle); 502 size_t tail_len = strlen(pointer); 503 504 if (tail_len < needle_len) 505 pointer -= (needle_len - tail_len); 506 507 while (pointer >= haystack) { 508 if (strncasecmp(pointer, needle, needle_len) == 0) 509 return (char *)pointer; 510 pointer--; 511 } 512 513 return NULL; 514 } 515 516 /* This function is equivalent to strcasestr() for multibyte strings, 517 * except in that it scans the string in reverse, starting at pointer. */ 518 char *mbrevstrcasestr(const char *haystack, const char *needle, 519 const char *pointer) 520 { 521 #ifdef ENABLE_UTF8 522 if (using_utf8) { 523 size_t needle_len = mbstrlen(needle); 524 size_t tail_len = mbstrlen(pointer); 525 526 if (tail_len < needle_len) 527 pointer -= (needle_len - tail_len); 528 529 if (pointer < haystack) 530 return NULL; 531 532 while (TRUE) { 533 if (mbstrncasecmp(pointer, needle, needle_len) == 0) 534 return (char *)pointer; 535 536 if (pointer == haystack) 537 return NULL; 538 539 pointer = haystack + step_left(haystack, pointer - haystack); 540 } 541 } else 542 #endif 543 return revstrcasestr(haystack, needle, pointer); 544 } 545 546 #if !defined(NANO_TINY) || defined(ENABLE_JUSTIFY) 547 /* This function is equivalent to strchr() for multibyte strings. */ 548 char *mbstrchr(const char *string, const char *chr) 549 { 550 #ifdef ENABLE_UTF8 551 if (using_utf8) { 552 bool bad_s = FALSE, bad_c = FALSE; 553 wchar_t ws, wc; 554 555 if (mbtowide(&wc, chr) < 0) { 556 wc = (unsigned char)*chr; 557 bad_c = TRUE; 558 } 559 560 while (*string != '\0') { 561 int symlen = mbtowide(&ws, string); 562 563 if (symlen < 0) { 564 ws = (unsigned char)*string; 565 bad_s = TRUE; 566 } 567 568 if (ws == wc && bad_s == bad_c) 569 break; 570 571 string += symlen; 572 } 573 574 if (*string == '\0') 575 return NULL; 576 577 return (char *)string; 578 } else 579 #endif 580 return strchr(string, *chr); 581 } 582 #endif /* !NANO_TINY || ENABLE_JUSTIFY */ 583 584 #ifndef NANO_TINY 585 /* Locate, in the given string, the first occurrence of any of 586 * the characters in accept, searching forward. */ 587 char *mbstrpbrk(const char *string, const char *accept) 588 { 589 while (*string != '\0') { 590 if (mbstrchr(accept, string) != NULL) 591 return (char *)string; 592 593 string += char_length(string); 594 } 595 596 return NULL; 597 } 598 599 /* Locate, in the string that starts at head, the first occurrence of any of 600 * the characters in accept, starting from pointer and searching backwards. */ 601 char *mbrevstrpbrk(const char *head, const char *accept, const char *pointer) 602 { 603 if (*pointer == '\0') { 604 if (pointer == head) 605 return NULL; 606 pointer = head + step_left(head, pointer - head); 607 } 608 609 while (TRUE) { 610 if (mbstrchr(accept, pointer) != NULL) 611 return (char *)pointer; 612 613 /* If we've reached the head of the string, we found nothing. */ 614 if (pointer == head) 615 return NULL; 616 617 pointer = head + step_left(head, pointer - head); 618 } 619 } 620 #endif /* !NANO_TINY */ 621 622 #if defined(ENABLE_NANORC) && (!defined(NANO_TINY) || defined(ENABLE_JUSTIFY)) 623 /* Return TRUE if the given string contains at least one blank character. */ 624 bool has_blank_char(const char *string) 625 { 626 while (*string != '\0' && !is_blank_char(string)) 627 string += char_length(string); 628 629 return *string; 630 } 631 #endif 632 633 /* Return TRUE when the given string is empty or consists of only blanks. */ 634 bool white_string(const char *string) 635 { 636 while (*string != '\0' && (is_blank_char(string) || *string == '\r')) 637 string += char_length(string); 638 639 return !*string; 640 } 641 642 #if defined(ENABLE_SPELLER) || defined(ENABLE_COLOR) 643 /* Remove leading whitespace from the given string. */ 644 void strip_leading_blanks_from(char *string) 645 { 646 while (string && (*string == ' ' || *string == '\t')) 647 memmove(string, string + 1, strlen(string)); 648 } 649 #endif