chars: work around a UTF-8 bug in glibc, to display invalid codes right - nano

commit 929770191eb0f71125c2bb9dcee2665e6d947a94
parent 66d9d6c6d236ef1ed9bfc7545b227c8fbd80671b
Author: Benno Schulenberg <bensberg@telfort.nl>
Date:   Thu, 25 Mar 2021 17:28:51 +0100

chars: work around a UTF-8 bug in glibc, to display invalid codes right

The mblen() and mbtowc() functions will happily return 4 or 5 or 6
for byte sequences that start with 0xF4 0x90 or higher.  But those
sequences encode for U+110000 or higher, which are not valid Unicode
code points.  The libc of FreeBSD and OpenBSD and Alpine correctly
return -1 for such sequences.  Make nano behave correctly also when
linked against glibc, so that invalid sequences are always presented
as a series of invalid bytes and never as a single invalid code.

This fixes https://savannah.gnu.org/bugs/?60262.

Bug existed since before version 2.0.0.

Diffstat:
M src/chars.c  | 56 ++++++++++++++++++++++++++++++--------------------------
M src/prototypes.h  | 1 +
M src/winio.c  | 12 ++++++------

3 files changed, 37 insertions(+), 32 deletions(-)
diff --git a/src/chars.c b/src/chars.c
@@ -51,7 +51,7 @@ bool is_alpha_char(const char *c)
 #ifdef ENABLE_UTF8
 	wchar_t wc;
 
-	if (mbtowc(&wc, c, MAXCHARLEN) < 0)
+	if (mbtowide(&wc, c) < 0)
 		return FALSE;
 
 	return iswalpha(wc);
@@ -67,7 +67,7 @@ bool is_alnum_char(const char *c)
 #ifdef ENABLE_UTF8
 	wchar_t wc;
 
-	if (mbtowc(&wc, c, MAXCHARLEN) < 0)
+	if (mbtowide(&wc, c) < 0)
 		return FALSE;
 
 	return iswalnum(wc);
@@ -85,7 +85,7 @@ bool is_blank_char(const char *c)
 	if ((signed char)*c >= 0)
 		return (*c == ' ' || *c == '\t');
 
-	if (mbtowc(&wc, c, MAXCHARLEN) < 0)
+	if (mbtowide(&wc, c) < 0)
 		return FALSE;
 
 	return iswblank(wc);
@@ -112,7 +112,7 @@ bool is_punct_char(const char *c)
 #ifdef ENABLE_UTF8
 	wchar_t wc;
 
-	if (mbtowc(&wc, c, MAXCHARLEN) < 0)
+	if (mbtowide(&wc, c) < 0)
 		return FALSE;
 
 	return iswpunct(wc);
@@ -176,6 +176,18 @@ char control_mbrep(const char *c, bool isdata)
 }
 
 #ifdef ENABLE_UTF8
+/* Convert the given multibyte sequence c to wide character wc, and return
+ * the number of bytes in the sequence, or -1 for an invalid sequence. */
+int mbtowide(wchar_t *wc, const char *c)
+{
+	int count = mbtowc(wc, c, MAXCHARLEN);
+
+	if (count < 0 || *wc > 0x10FFFF)
+		return -1;
+	else
+		return count;
+}
+
 /* Return the width in columns of the given (multibyte) character. */
 int mbwidth(const char *c)
 {
@@ -184,7 +196,7 @@ int mbwidth(const char *c)
 		wchar_t wc;
 		int width;
 
-		if (mbtowc(&wc, c, MAXCHARLEN) < 0)
+		if (mbtowide(&wc, c) < 0)
 			return 1;
 
 		width = wcwidth(wc);
@@ -226,9 +238,14 @@ int char_length(const char *pointer)
 {
 #ifdef ENABLE_UTF8
 	/* If possibly a multibyte character, get its length; otherwise, it's 1. */
-	if ((unsigned char)*pointer > 0xC1) {
+	if ((unsigned char)*pointer > 0xC1 && use_utf8) {
 		int length = mblen(pointer, MAXCHARLEN);
 
+		/* Codes beyond U+10FFFF are invalid, even when glibc thinks otherwise. */
+		if ((unsigned char)*pointer > 0xF4 || ((unsigned char)*pointer == 0xF4 &&
+											(unsigned char)*(pointer + 1) > 0x8F))
+			return 1;
+
 		return (length < 0 ? 1 : length);
 	} else
 #endif
@@ -243,9 +260,7 @@ size_t mbstrlen(const char *pointer)
 	while (*pointer != '\0') {
 #ifdef ENABLE_UTF8
 		if ((unsigned char)*pointer > 0xC1) {
-			int length = mblen(pointer, MAXCHARLEN);
-
-			pointer += (length < 0 ? 1 : length);
+			pointer += char_length(pointer);
 		} else
 #endif
 			pointer++;
@@ -265,11 +280,7 @@ int collect_char(const char *string, char *thechar)
 #ifdef ENABLE_UTF8
 	/* If this is a UTF-8 starter byte, get the number of bytes of the character. */
 	if ((unsigned char)*string > 0xC1) {
-		charlen = mblen(string, MAXCHARLEN);
-
-		/* When the multibyte sequence is invalid, only take the first byte. */
-		if (charlen <= 0)
-			charlen = 1;
+		charlen = char_length(string);
 	} else
 #endif
 		charlen = 1;
@@ -286,19 +297,12 @@ int advance_over(const char *string, size_t *column)
 {
 #ifdef ENABLE_UTF8
 	if ((signed char)*string < 0) {
-		int charlen = mblen(string, MAXCHARLEN);
-
-		if (charlen > 0) {
 			if (is_cntrl_char(string))
 				*column += 2;
 			else
 				*column += mbwidth(string);
-		} else {
-			charlen = 1;
-			*column += 1;
-		}
 
-		return charlen;
+		return char_length(string);
 	}
 #endif
 
@@ -395,8 +399,8 @@ int mbstrncasecmp(const char *s1, const char *s2, size_t n)
 				continue;
 			}
 
-			bool bad1 = (mbtowc(&wc1, s1, MAXCHARLEN) < 0);
-			bool bad2 = (mbtowc(&wc2, s2, MAXCHARLEN) < 0);
+			bool bad1 = (mbtowide(&wc1, s1) < 0);
+			bool bad2 = (mbtowide(&wc2, s2) < 0);
 
 			if (bad1 || bad2) {
 				if (*s1 != *s2)
@@ -521,13 +525,13 @@ char *mbstrchr(const char *string, const char *chr)
 		bool bad_s = FALSE, bad_c = FALSE;
 		wchar_t ws, wc;
 
-		if (mbtowc(&wc, chr, MAXCHARLEN) < 0) {
+		if (mbtowide(&wc, chr) < 0) {
 			wc = (unsigned char)*chr;
 			bad_c = TRUE;
 		}
 
 		while (*string != '\0') {
-			int symlen = mbtowc(&ws, string, MAXCHARLEN);
+			int symlen = mbtowide(&ws, string);
 
 			if (symlen < 0) {
 				ws = (unsigned char)*string;
diff --git a/src/prototypes.h b/src/prototypes.h
@@ -204,6 +204,7 @@ bool is_cntrl_char(const char *c);
 bool is_word_char(const char *c, bool allow_punct);
 char control_mbrep(const char *c, bool isdata);
 #ifdef ENABLE_UTF8
+int mbtowide(wchar_t *wc, const char *c);
 int mbwidth(const char *c);
 bool is_zerowidth(const char *ch);
 char *make_mbchar(long code, int *length);
diff --git a/src/winio.c b/src/winio.c
@@ -1820,14 +1820,14 @@ char *display_string(const char *buf, size_t column, size_t span,
 		wchar_t wc;
 
 		/* Convert a multibyte character to a single code. */
-		charlength = mbtowc(&wc, buf, MAXCHARLEN);
+		charlength = mbtowide(&wc, buf);
 
 		/* Represent an invalid character with the Replacement Character. */
-		if (charlength < 0 || wc > 0x10FFFF) {
+		if (charlength < 0) {
 			converted[index++] = '\xEF';
 			converted[index++] = '\xBF';
 			converted[index++] = '\xBD';
-			buf += (charlength > 0 ? charlength : 1);
+			buf++;
 			column++;
 			continue;
 		}
@@ -2151,7 +2151,7 @@ void minibar(void)
 #ifdef ENABLE_UTF8
 		else if ((unsigned char)*this_position < 0x80 && using_utf8())
 			sprintf(hexadecimal, "U+%04X", (unsigned char)*this_position);
-		else if (using_utf8() && mbtowc(&widecode, this_position, MAXCHARLEN) >= 0)
+		else if (using_utf8() && mbtowide(&widecode, this_position) > 0)
 			sprintf(hexadecimal, "U+%04X", (int)widecode);
 #endif
 		else
@@ -2163,13 +2163,13 @@ void minibar(void)
 		successor = this_position + char_length(this_position);
 
 		if (*this_position && *successor && is_zerowidth(successor) &&
-					mbtowc(&widecode, successor, MAXCHARLEN) >= 0) {
+								mbtowide(&widecode, successor) > 0) {
 			sprintf(hexadecimal, "|%04X", (int)widecode);
 			waddstr(bottomwin, hexadecimal);
 
 			successor += char_length(successor);
 
-			if (is_zerowidth(successor) && mbtowc(&widecode, successor, MAXCHARLEN) >= 0) {
+			if (is_zerowidth(successor) && mbtowide(&widecode, successor) > 0) {
 				sprintf(hexadecimal, "|%04X", (int)widecode);
 				waddstr(bottomwin, hexadecimal);
 			}

	nano nano with my custom patches
	git clone git://bsandro.tech/nano
	Log \| Files \| Refs \| README \| LICENSE

M	src/chars.c	\|	56	++++++++++++++++++++++++++++++--------------------------
M	src/prototypes.h	\|	1	+
M	src/winio.c	\|	12	++++++------