commit 9a7ba5db7949e441dcdfa7f445aa2a132d41428a
parent cc2b19c8fd8ba832a339c53ee719f0d5ab09b8c4
Author: Benno Schulenberg <bensberg@telfort.nl>
Date: Mon, 4 Jun 2018 13:39:54 +0200
chars: speed up the parsing of a character for the plain ASCII case
Again, if the most significant bit of a UTF-8 byte is zero, it means
the character is a single byte and we can skip the call to mblen(),
*and* if the character is one byte it also occupies just one column,
because all ASCII characters are single-column characters -- apart
from control codes.
This partially addresses https://savannah.gnu.org/bugs/?51491.
Diffstat:
1 file changed, 12 insertions(+), 7 deletions(-)
diff --git a/src/chars.c b/src/chars.c
@@ -294,14 +294,17 @@ int parse_mbchar(const char *buf, char *chr, size_t *col)
{
#ifdef ENABLE_UTF8
if (use_utf8) {
- /* Get the number of bytes in the multibyte character. */
- int length = mblen(buf, MAXCHARLEN);
+ int length;
- /* When the multibyte sequence is invalid, only take the first byte. */
- if (length <= 0) {
- IGNORE_CALL_RESULT(mblen(NULL, 0));
+ /* If the high bit is set, the byte is not ASCII: get the number of bytes of the character. */
+ if ((signed char)*buf < 0) {
+ length = mblen(buf, MAXCHARLEN);
+
+ /* When the multibyte sequence is invalid, only take the first byte. */
+ if (length <= 0)
+ length = 1;
+ } else
length = 1;
- }
/* When requested, store the multibyte character in chr. */
if (chr != NULL) {
@@ -322,7 +325,9 @@ int parse_mbchar(const char *buf, char *chr, size_t *col)
else if (is_cntrl_mbchar(buf)) {
*col += 2;
/* If we have a normal character, get its width normally. */
- } else
+ } else if (length == 1)
+ *col += 1;
+ else
*col += mbwidth(buf);
}