commit 78f92e044a1dcaf2a74f7379fa3307e0a4525523
parent f11931a0dd4cfc777b74dd59773ab994f3561bd0
Author: Benno Schulenberg <bensberg@telfort.nl>
Date: Fri, 9 Apr 2021 10:52:29 +0200
tweaks: avoid parsing a multibyte character twice
The number of bytes in the character was determined twice: first in
mbwidth() and then in char_length(). Do it just once, in mbtowide().
Also, avoid calling is_cntrl_char(), because it does unneeded checks
when we already know that the high bit is set.
This duplicates some code, but advance_over() is called a lot, so it
is important that it is as fast as possible.
This shouldn't slow down plain ASCII, as the extra checks (use_utf8
and *string < 0xA0) are done only for non-ASCII (apart from DEL).
Diffstat:
1 file changed, 19 insertions(+), 10 deletions(-)
diff --git a/src/chars.c b/src/chars.c
@@ -334,13 +334,26 @@ int collect_char(const char *string, char *thechar)
int advance_over(const char *string, size_t *column)
{
#ifdef ENABLE_UTF8
- if ((signed char)*string < 0) {
- if (is_cntrl_char(string))
+ if ((signed char)*string < 0 && use_utf8) {
+ /* A UTF-8 upper control code has two bytes and takes two columns. */
+ if (((unsigned char)string[0] == 0xC2 && (signed char)string[1] < -96)) {
*column += 2;
- else
- *column += mbwidth(string);
+ return 2;
+ } else {
+ wchar_t wc;
+ int charlen = mbtowide(&wc, string);
+
+ if (charlen < 0) {
+ *column += 1;
+ return 1;
+ }
- return char_length(string);
+ int width = wcwidth(wc);
+
+ *column += (width < 0) ? 1 : width;
+
+ return charlen;
+ }
}
#endif
@@ -349,12 +362,8 @@ int advance_over(const char *string, size_t *column)
*column += tabsize - *column % tabsize;
else
*column += 2;
- } else if (*string == 0x7F)
+ } else if (0x7E < (unsigned char)*string && (unsigned char)*string < 0xA0)
*column += 2;
-#ifndef ENABLE_UTF8
- else if (0x7F < (unsigned char)*string && (unsigned char)*string < 0xA0)
- *column += 2;
-#endif
else
*column += 1;