nano

nano with my custom patches
git clone git://bsandro.tech/nano
Log | Files | Refs | README | LICENSE

chars.c (15735B)


      1 /**************************************************************************
      2  *   chars.c  --  This file is part of GNU nano.                          *
      3  *                                                                        *
      4  *   Copyright (C) 2001-2011, 2013-2025 Free Software Foundation, Inc.    *
      5  *   Copyright (C) 2016-2021 Benno Schulenberg                            *
      6  *                                                                        *
      7  *   GNU nano is free software: you can redistribute it and/or modify     *
      8  *   it under the terms of the GNU General Public License as published    *
      9  *   by the Free Software Foundation, either version 3 of the License,    *
     10  *   or (at your option) any later version.                               *
     11  *                                                                        *
     12  *   GNU nano is distributed in the hope that it will be useful,          *
     13  *   but WITHOUT ANY WARRANTY; without even the implied warranty          *
     14  *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.              *
     15  *   See the GNU General Public License for more details.                 *
     16  *                                                                        *
     17  *   You should have received a copy of the GNU General Public License    *
     18  *   along with this program.  If not, see https://gnu.org/licenses/.     *
     19  *                                                                        *
     20  **************************************************************************/
     21 
     22 #include "prototypes.h"
     23 
     24 #include <ctype.h>
     25 #include <string.h>
     26 #ifdef ENABLE_UTF8
     27 #include <wchar.h>
     28 #include <wctype.h>
     29 #endif
     30 
     31 #ifdef ENABLE_SPELLER
     32 /* Return TRUE when the given character is some kind of letter. */
     33 bool is_alpha_char(const char *c)
     34 {
     35 #ifdef ENABLE_UTF8
     36 	wchar_t wc;
     37 
     38 	if (mbtowide(&wc, c) < 0)
     39 		return FALSE;
     40 
     41 	return iswalpha(wc);
     42 #else
     43 	return isalpha((unsigned char)*c);
     44 #endif
     45 }
     46 #endif /* ENABLE_SPELLER */
     47 
     48 /* Return TRUE when the given character is some kind of letter or a digit. */
     49 bool is_alnum_char(const char *c)
     50 {
     51 #ifdef ENABLE_UTF8
     52 	wchar_t wc;
     53 
     54 	if (mbtowide(&wc, c) < 0)
     55 		return FALSE;
     56 
     57 	return iswalnum(wc);
     58 #else
     59 	return isalnum((unsigned char)*c);
     60 #endif
     61 }
     62 
     63 /* Return TRUE when the given character is space or tab or other whitespace. */
     64 bool is_blank_char(const char *c)
     65 {
     66 #ifdef ENABLE_UTF8
     67 	wchar_t wc;
     68 
     69 	if ((signed char)*c >= 0)
     70 		return (*c == ' ' || *c == '\t');
     71 
     72 	if (mbtowide(&wc, c) < 0)
     73 		return FALSE;
     74 
     75 	return iswblank(wc);
     76 #else
     77 	return isblank((unsigned char)*c);
     78 #endif
     79 }
     80 
     81 /* Return TRUE when the given character is a control character. */
     82 bool is_cntrl_char(const char *c)
     83 {
     84 #ifdef ENABLE_UTF8
     85 	if (using_utf8)
     86 		return ((c[0] & 0xE0) == 0 || c[0] == DEL_CODE ||
     87 				((signed char)c[0] == -62 && (signed char)c[1] < -96));
     88 	else
     89 #endif
     90 		return ((*c & 0x60) == 0 || *c == DEL_CODE);
     91 }
     92 
     93 /* Return TRUE when the given character is a punctuation character. */
     94 bool is_punct_char(const char *c)
     95 {
     96 #ifdef ENABLE_UTF8
     97 	wchar_t wc;
     98 
     99 	if (mbtowide(&wc, c) < 0)
    100 		return FALSE;
    101 
    102 	return iswpunct(wc);
    103 #else
    104 	return ispunct((unsigned char)*c);
    105 #endif
    106 }
    107 
    108 /* Return TRUE when the given character is word-forming (it is alphanumeric or
    109  * specified in 'wordchars', or it is punctuation when allow_punct is TRUE). */
    110 bool is_word_char(const char *c, bool allow_punct)
    111 {
    112 	if (*c == '\0')
    113 		return FALSE;
    114 
    115 	if (is_alnum_char(c))
    116 		return TRUE;
    117 
    118 	if (allow_punct && is_punct_char(c))
    119 		return TRUE;
    120 
    121 	if (word_chars != NULL && *word_chars != '\0') {
    122 		char symbol[MAXCHARLEN + 1];
    123 		int symlen = collect_char(c, symbol);
    124 
    125 		symbol[symlen] = '\0';
    126 		return (strstr(word_chars, symbol) != NULL);
    127 	} else
    128 		return FALSE;
    129 }
    130 
    131 /* Return the visible representation of control character c. */
    132 char control_rep(const signed char c)
    133 {
    134 	if (c == DEL_CODE)
    135 		return '?';
    136 	else if (c == -97)
    137 		return '=';
    138 	else if (c < 0)
    139 		return c + 224;
    140 	else
    141 		return c + 64;
    142 }
    143 
    144 /* Return the visible representation of multibyte control character c. */
    145 char control_mbrep(const char *c, bool isdata)
    146 {
    147 	/* An embedded newline is an encoded NUL if it is data. */
    148 	if (*c == '\n' && (isdata || as_an_at))
    149 		return '@';
    150 
    151 #ifdef ENABLE_UTF8
    152 	if (using_utf8) {
    153 		if ((unsigned char)c[0] < 128)
    154 			return control_rep(c[0]);
    155 		else
    156 			return control_rep(c[1]);
    157 	} else
    158 #endif
    159 		return control_rep(*c);
    160 }
    161 
    162 #ifdef ENABLE_UTF8
    163 /* Convert the given multibyte sequence c to wide character wc, and return
    164  * the number of bytes in the sequence, or -1 for an invalid sequence. */
    165 int mbtowide(wchar_t *wc, const char *c)
    166 {
    167 	if ((signed char)*c < 0 && using_utf8) {
    168 		unsigned char v1 = c[0];
    169 		unsigned char v2 = c[1] ^ 0x80;
    170 
    171 		if (v2 > 0x3F || v1 < 0xC2)
    172 			return -1;
    173 
    174 		if (v1 < 0xE0) {
    175 			*wc = (((unsigned int)(v1 & 0x1F) << 6) | (unsigned int)v2);
    176 			return 2;
    177 		}
    178 
    179 		unsigned char v3 = c[2] ^ 0x80;
    180 
    181 		if (v3 > 0x3F)
    182 			return -1;
    183 
    184 		if (v1 < 0xF0) {
    185 			if ((v1 > 0xE0 || v2 >= 0x20) && (v1 != 0xED || v2 < 0x20)) {
    186 				*wc = (((unsigned int)(v1 & 0x0F) << 12) |
    187 							((unsigned int)v2 << 6) | (unsigned int)v3);
    188 				return 3;
    189 			} else
    190 				return -1;
    191 		}
    192 
    193 		unsigned char v4 = c[3] ^ 0x80;
    194 
    195 		if (v4 > 0x3F || v1 > 0xF4)
    196 			return -1;
    197 
    198 		if ((v1 > 0xF0 || v2 >= 0x10) && (v1 != 0xF4 || v2 < 0x10)) {
    199 			*wc = (((unsigned int)(v1 & 0x07) << 18) | ((unsigned int)v2 << 12) |
    200 							((unsigned int)v3 << 6) | (unsigned int)v4);
    201 			return 4;
    202 		} else
    203 			return -1;
    204 	}
    205 
    206 	*wc = (unsigned int)*c;
    207 	return 1;
    208 }
    209 
    210 /* Return TRUE when the given character occupies two cells. */
    211 bool is_doublewidth(const char *ch)
    212 {
    213 	wchar_t wc;
    214 
    215 	/* Only from U+1100 can code points have double width. */
    216 	if ((unsigned char)*ch < 0xE1 || !using_utf8)
    217 		return FALSE;
    218 
    219 	if (mbtowide(&wc, ch) < 0)
    220 		return FALSE;
    221 
    222 	return (wcwidth(wc) == 2);
    223 }
    224 
    225 /* Return TRUE when the given character occupies zero cells. */
    226 bool is_zerowidth(const char *ch)
    227 {
    228 	wchar_t wc;
    229 
    230 	/* Only from U+0300 can code points have zero width. */
    231 	if ((unsigned char)*ch < 0xCC || !using_utf8)
    232 		return FALSE;
    233 
    234 	if (mbtowide(&wc, ch) < 0)
    235 		return FALSE;
    236 
    237 #if defined(__OpenBSD__)
    238 	/* Work around an OpenBSD bug -- see https://sv.gnu.org/bugs/?60393. */
    239 	if (wc >= 0xF0000)
    240 		return FALSE;
    241 #endif
    242 
    243 	return (wcwidth(wc) == 0);
    244 }
    245 #endif /* ENABLE_UTF8 */
    246 
    247 /* Return the number of bytes in the character that starts at *pointer. */
    248 int char_length(const char *pointer)
    249 {
    250 #ifdef ENABLE_UTF8
    251 	if ((unsigned char)*pointer > 0xC1 && using_utf8) {
    252 		unsigned char c1 = pointer[0];
    253 		unsigned char c2 = pointer[1];
    254 
    255 		if ((c2 ^ 0x80) > 0x3F)
    256 			return 1;
    257 
    258 		if (c1 < 0xE0)
    259 			return 2;
    260 
    261 		if (((unsigned char)pointer[2] ^ 0x80) > 0x3F)
    262 			return 1;
    263 
    264 		if (c1 < 0xF0) {
    265 			if ((c1 > 0xE0 || c2 >= 0xA0) && (c1 != 0xED || c2 < 0xA0))
    266 				return 3;
    267 			else
    268 				return 1;
    269 		}
    270 
    271 		if (((unsigned char)pointer[3] ^ 0x80) > 0x3F)
    272 			return 1;
    273 
    274 		if (c1 > 0xF4)
    275 			return 1;
    276 
    277 		if ((c1 > 0xF0 || c2 >= 0x90) && (c1 != 0xF4 || c2 < 0x90))
    278 			return 4;
    279 	}
    280 #endif
    281 		return 1;
    282 }
    283 
    284 /* Return the number of (multibyte) characters in the given string. */
    285 size_t mbstrlen(const char *pointer)
    286 {
    287 	size_t count = 0;
    288 
    289 	while (*pointer != '\0') {
    290 		pointer += char_length(pointer);
    291 		count++;
    292 	}
    293 
    294 	return count;
    295 }
    296 
    297 /* Return the length (in bytes) of the character at the start of the
    298  * given string, and return a copy of this character in *thechar. */
    299 int collect_char(const char *string, char *thechar)
    300 {
    301 	int charlen = char_length(string);
    302 
    303 	for (int i = 0; i < charlen; i++)
    304 		thechar[i] = string[i];
    305 
    306 	return charlen;
    307 }
    308 
    309 /* Return the length (in bytes) of the character at the start of
    310  * the given string, and add this character's width to *column. */
    311 int advance_over(const char *string, size_t *column)
    312 {
    313 #ifdef ENABLE_UTF8
    314 	if ((signed char)*string < 0 && using_utf8) {
    315 		/* A UTF-8 upper control code has two bytes and takes two columns. */
    316 		if (((unsigned char)string[0] == 0xC2 && (signed char)string[1] < -96)) {
    317 			*column += 2;
    318 			return 2;
    319 		} else {
    320 			wchar_t wc;
    321 			int charlen = mbtowide(&wc, string);
    322 
    323 			if (charlen < 0) {
    324 				*column += 1;
    325 				return 1;
    326 			}
    327 
    328 			int width = wcwidth(wc);
    329 
    330 #if defined(__OpenBSD__)
    331 			*column += (width < 0 || wc >= 0xF0000) ? 1 : width;
    332 #else
    333 			*column += (width < 0) ? 1 : width;
    334 #endif
    335 			return charlen;
    336 		}
    337 	}
    338 #endif
    339 
    340 	if ((unsigned char)*string < 0x20) {
    341 		if (*string == '\t')
    342 			*column += tabsize - *column % tabsize;
    343 		else
    344 			*column += 2;
    345 	} else if (0x7E < (unsigned char)*string && (unsigned char)*string < 0xA0)
    346 		*column += 2;
    347 	else
    348 		*column += 1;
    349 
    350 	return 1;
    351 }
    352 
    353 /* Return the index in buf of the beginning of the multibyte character
    354  * before the one at pos. */
    355 size_t step_left(const char *buf, size_t pos)
    356 {
    357 #ifdef ENABLE_UTF8
    358 	if (using_utf8) {
    359 		size_t before, charlen = 0;
    360 
    361 		if (pos < 4)
    362 			before = 0;
    363 		else {
    364 			const char *ptr = buf + pos;
    365 
    366 			/* Probe for a valid starter byte in the preceding four bytes. */
    367 			if ((signed char)*(--ptr) > -65)
    368 				before = pos - 1;
    369 			else if ((signed char)*(--ptr) > -65)
    370 				before = pos - 2;
    371 			else if ((signed char)*(--ptr) > -65)
    372 				before = pos - 3;
    373 			else if ((signed char)*(--ptr) > -65)
    374 				before = pos - 4;
    375 			else
    376 				before = pos - 1;
    377 		}
    378 
    379 		/* Move forward again until we reach the original character,
    380 		 * so we know the length of its preceding character. */
    381 		while (before < pos) {
    382 			charlen = char_length(buf + before);
    383 			before += charlen;
    384 		}
    385 
    386 		return before - charlen;
    387 	} else
    388 #endif
    389 		return (pos == 0 ? 0 : pos - 1);
    390 }
    391 
    392 /* Return the index in buf of the beginning of the multibyte character
    393  * after the one at pos. */
    394 size_t step_right(const char *buf, size_t pos)
    395 {
    396 	return pos + char_length(buf + pos);
    397 }
    398 
    399 /* This function is equivalent to strcasecmp() for multibyte strings. */
    400 int mbstrcasecmp(const char *s1, const char *s2)
    401 {
    402 	return mbstrncasecmp(s1, s2, HIGHEST_POSITIVE);
    403 }
    404 
    405 /* This function is equivalent to strncasecmp() for multibyte strings. */
    406 int mbstrncasecmp(const char *s1, const char *s2, size_t n)
    407 {
    408 #ifdef ENABLE_UTF8
    409 	if (using_utf8) {
    410 		wchar_t wc1, wc2;
    411 
    412 		while (*s1 != '\0' && *s2 != '\0' && n > 0) {
    413 			if ((signed char)*s1 >= 0 && (signed char)*s2 >= 0) {
    414 				if ('A' <= (*s1 & 0x5F) && (*s1 & 0x5F) <= 'Z') {
    415 					if ('A' <= (*s2 & 0x5F) && (*s2 & 0x5F) <= 'Z') {
    416 						if ((*s1 & 0x5F) != (*s2 & 0x5F))
    417 							return ((*s1 & 0x5F) - (*s2 & 0x5F));
    418 					} else
    419 						return ((*s1 | 0x20) - *s2);
    420 				} else if ('A' <= (*s2 & 0x5F) && (*s2 & 0x5F) <= 'Z')
    421 					return (*s1 - (*s2 | 0x20));
    422 				else if (*s1 != *s2)
    423 					return (*s1 - *s2);
    424 
    425 				s1++; s2++; n--;
    426 				continue;
    427 			}
    428 
    429 			bool bad1 = (mbtowide(&wc1, s1) < 0);
    430 			bool bad2 = (mbtowide(&wc2, s2) < 0);
    431 
    432 			if (bad1 || bad2) {
    433 				if (*s1 != *s2)
    434 					return (unsigned char)*s1 - (unsigned char)*s2;
    435 
    436 				if (bad1 != bad2)
    437 					return (bad1 ? 1 : -1);
    438 			} else {
    439 				int difference = towlower(wc1) - towlower(wc2);
    440 
    441 				if (difference != 0)
    442 					return difference;
    443 			}
    444 
    445 			s1 += char_length(s1);
    446 			s2 += char_length(s2);
    447 			n--;
    448 		}
    449 
    450 		return (n > 0) ? ((unsigned char)*s1 - (unsigned char)*s2) : 0;
    451 	} else
    452 #endif
    453 		return strncasecmp(s1, s2, n);
    454 }
    455 
    456 /* This function is equivalent to strcasestr() for multibyte strings. */
    457 char *mbstrcasestr(const char *haystack, const char *needle)
    458 {
    459 #ifdef ENABLE_UTF8
    460 	if (using_utf8) {
    461 		size_t needle_len = mbstrlen(needle);
    462 
    463 		while (*haystack != '\0') {
    464 			if (mbstrncasecmp(haystack, needle, needle_len) == 0)
    465 				return (char *)haystack;
    466 
    467 			haystack += char_length(haystack);
    468 		}
    469 
    470 		return NULL;
    471 	} else
    472 #endif
    473 		return (char *)strcasestr(haystack, needle);
    474 }
    475 
    476 /* This function is equivalent to strstr(), except in that it scans the
    477  * string in reverse, starting at pointer. */
    478 char *revstrstr(const char *haystack, const char *needle,
    479 		const char *pointer)
    480 {
    481 	size_t needle_len = strlen(needle);
    482 	size_t tail_len = strlen(pointer);
    483 
    484 	if (tail_len < needle_len)
    485 		pointer -= (needle_len - tail_len);
    486 
    487 	while (pointer >= haystack) {
    488 		if (strncmp(pointer, needle, needle_len) == 0)
    489 			return (char *)pointer;
    490 		pointer--;
    491 	}
    492 
    493 	return NULL;
    494 }
    495 
    496 /* This function is equivalent to strcasestr(), except in that it scans
    497  * the string in reverse, starting at pointer. */
    498 char *revstrcasestr(const char *haystack, const char *needle,
    499 		const char *pointer)
    500 {
    501 	size_t needle_len = strlen(needle);
    502 	size_t tail_len = strlen(pointer);
    503 
    504 	if (tail_len < needle_len)
    505 		pointer -= (needle_len - tail_len);
    506 
    507 	while (pointer >= haystack) {
    508 		if (strncasecmp(pointer, needle, needle_len) == 0)
    509 			return (char *)pointer;
    510 		pointer--;
    511 	}
    512 
    513 	return NULL;
    514 }
    515 
    516 /* This function is equivalent to strcasestr() for multibyte strings,
    517  * except in that it scans the string in reverse, starting at pointer. */
    518 char *mbrevstrcasestr(const char *haystack, const char *needle,
    519 		const char *pointer)
    520 {
    521 #ifdef ENABLE_UTF8
    522 	if (using_utf8) {
    523 		size_t needle_len = mbstrlen(needle);
    524 		size_t tail_len = mbstrlen(pointer);
    525 
    526 		if (tail_len < needle_len)
    527 			pointer -= (needle_len - tail_len);
    528 
    529 		if (pointer < haystack)
    530 			return NULL;
    531 
    532 		while (TRUE) {
    533 			if (mbstrncasecmp(pointer, needle, needle_len) == 0)
    534 				return (char *)pointer;
    535 
    536 			if (pointer == haystack)
    537 				return NULL;
    538 
    539 			pointer = haystack + step_left(haystack, pointer - haystack);
    540 		}
    541 	} else
    542 #endif
    543 		return revstrcasestr(haystack, needle, pointer);
    544 }
    545 
    546 #if !defined(NANO_TINY) || defined(ENABLE_JUSTIFY)
    547 /* This function is equivalent to strchr() for multibyte strings. */
    548 char *mbstrchr(const char *string, const char *chr)
    549 {
    550 #ifdef ENABLE_UTF8
    551 	if (using_utf8) {
    552 		bool bad_s = FALSE, bad_c = FALSE;
    553 		wchar_t ws, wc;
    554 
    555 		if (mbtowide(&wc, chr) < 0) {
    556 			wc = (unsigned char)*chr;
    557 			bad_c = TRUE;
    558 		}
    559 
    560 		while (*string != '\0') {
    561 			int symlen = mbtowide(&ws, string);
    562 
    563 			if (symlen < 0) {
    564 				ws = (unsigned char)*string;
    565 				bad_s = TRUE;
    566 			}
    567 
    568 			if (ws == wc && bad_s == bad_c)
    569 				break;
    570 
    571 			string += symlen;
    572 		}
    573 
    574 		if (*string == '\0')
    575 			return NULL;
    576 
    577 		return (char *)string;
    578 	} else
    579 #endif
    580 		return strchr(string, *chr);
    581 }
    582 #endif /* !NANO_TINY || ENABLE_JUSTIFY */
    583 
    584 #ifndef NANO_TINY
    585 /* Locate, in the given string, the first occurrence of any of
    586  * the characters in accept, searching forward. */
    587 char *mbstrpbrk(const char *string, const char *accept)
    588 {
    589 	while (*string != '\0') {
    590 		if (mbstrchr(accept, string) != NULL)
    591 			return (char *)string;
    592 
    593 		string += char_length(string);
    594 	}
    595 
    596 	return NULL;
    597 }
    598 
    599 /* Locate, in the string that starts at head, the first occurrence of any of
    600  * the characters in accept, starting from pointer and searching backwards. */
    601 char *mbrevstrpbrk(const char *head, const char *accept, const char *pointer)
    602 {
    603 	if (*pointer == '\0') {
    604 		if (pointer == head)
    605 			return NULL;
    606 		pointer = head + step_left(head, pointer - head);
    607 	}
    608 
    609 	while (TRUE) {
    610 		if (mbstrchr(accept, pointer) != NULL)
    611 			return (char *)pointer;
    612 
    613 		/* If we've reached the head of the string, we found nothing. */
    614 		if (pointer == head)
    615 			return NULL;
    616 
    617 		pointer = head + step_left(head, pointer - head);
    618 	}
    619 }
    620 #endif /* !NANO_TINY */
    621 
    622 #if defined(ENABLE_NANORC) && (!defined(NANO_TINY) || defined(ENABLE_JUSTIFY))
    623 /* Return TRUE if the given string contains at least one blank character. */
    624 bool has_blank_char(const char *string)
    625 {
    626 	while (*string != '\0' && !is_blank_char(string))
    627 		string += char_length(string);
    628 
    629 	return *string;
    630 }
    631 #endif
    632 
    633 /* Return TRUE when the given string is empty or consists of only blanks. */
    634 bool white_string(const char *string)
    635 {
    636 	while (*string != '\0' && (is_blank_char(string) || *string == '\r'))
    637 		string += char_length(string);
    638 
    639 	return !*string;
    640 }
    641 
    642 #if defined(ENABLE_SPELLER) || defined(ENABLE_COLOR)
    643 /* Remove leading whitespace from the given string. */
    644 void strip_leading_blanks_from(char *string)
    645 {
    646 	while (string && (*string == ' ' || *string == '\t'))
    647 		memmove(string, string + 1, strlen(string));
    648 }
    649 #endif