patch 9.2.0145: UTF-8 decoding and length calculation can be improved

Problem:  Vim often calls utf_ptr2char() and utf_ptr2len() separately.
Solution: Refactor UTF-8 hot paths into utf_ptr2char_and_len() to
          decode the codepoint and byte length in a single pass.
          Fold combining character logic into the same optimized flow.
          Improves redraw performance by ~8-10% in UTF-8 heavy
          scenarios (Yasuhiro Matsumoto).

closes: #19649

Signed-off-by: Yasuhiro Matsumoto <mattn.jp@gmail.com>
Signed-off-by: Christian Brabandt <cb@256bit.org>
This commit is contained in:
Yasuhiro Matsumoto
2026-03-12 19:15:28 +00:00
committed by Christian Brabandt
parent 2d14d62c50
commit 55464b5d18
2 changed files with 287 additions and 128 deletions
+285 -128
View File
@@ -129,6 +129,9 @@ static int dbcs_char2cells(int c);
static int dbcs_ptr2cells_len(char_u *p, int size);
static int dbcs_ptr2char(char_u *p);
static int dbcs_head_off(char_u *base, char_u *p);
static inline int utf_ptr2char_and_len(char_u *p, int *lenp);
static inline int utf_ptr2char_and_len_len(char_u *p, int size, int *lenp);
static inline int utf_iscomposinglike_char(int c1, int c2);
#ifdef FEAT_EVAL
static int cw_value(int c);
#endif
@@ -1629,13 +1632,14 @@ utf_ptr2cells(
char_u *p)
{
int c;
int len;
// Need to convert to a character number.
if (*p >= 0x80)
{
c = utf_ptr2char(p);
c = utf_ptr2char_and_len(p, &len);
// An illegal byte is displayed as <xx>.
if (utf_ptr2len(p) == 1 || c == NUL)
if (len == 1 || c == NUL)
return 4;
// If the char is ASCII it must be an overlong sequence.
if (c < 0x80)
@@ -1670,15 +1674,16 @@ latin_ptr2cells_len(char_u *p UNUSED, int size UNUSED)
utf_ptr2cells_len(char_u *p, int size)
{
int c;
int len;
// Need to convert to a wide character.
if (size > 0 && *p >= 0x80)
{
if (utf_ptr2len_len(p, size) < utf8len_tab[*p])
c = utf_ptr2char_and_len_len(p, size, &len);
if (len > size)
return 1; // truncated
c = utf_ptr2char(p);
// An illegal byte is displayed as <xx>.
if (utf_ptr2len(p) == 1 || c == NUL)
if (len == 1 || c == NUL)
return 4;
// If the char is ASCII it must be an overlong sequence.
if (c < 0x80)
@@ -1784,6 +1789,208 @@ dbcs_ptr2char(char_u *p)
return *p;
}
/*
* Convert a UTF-8 byte sequence to a character number.
* Returns the character and sets "*lenp" to its byte length.
* Illegal bytes are returned as-is with a length of one.
*/
static inline int
utf_ptr2char_and_len(char_u *p, int *lenp)
{
int len;
int c;
if (p[0] < 0x80)
{
*lenp = p[0] == NUL ? 0 : 1;
return p[0];
}
len = utf8len_tab_zero[p[0]];
if (len <= 1 || (p[1] & 0xc0) != 0x80)
{
*lenp = 1;
return p[0];
}
c = ((p[0] & 0x1f) << 6) + (p[1] & 0x3f);
if (len == 2)
{
*lenp = 2;
return c;
}
if ((p[2] & 0xc0) != 0x80)
{
*lenp = 1;
return p[0];
}
c = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
if (len == 3)
{
*lenp = 3;
return c;
}
if ((p[3] & 0xc0) != 0x80)
{
*lenp = 1;
return p[0];
}
c = ((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12)
+ ((p[2] & 0x3f) << 6) + (p[3] & 0x3f);
if (len == 4)
{
*lenp = 4;
return c;
}
if ((p[4] & 0xc0) != 0x80)
{
*lenp = 1;
return p[0];
}
c = ((p[0] & 0x03) << 24) + ((p[1] & 0x3f) << 18)
+ ((p[2] & 0x3f) << 12)
+ ((p[3] & 0x3f) << 6) + (p[4] & 0x3f);
if (len == 5)
{
*lenp = 5;
return c;
}
if ((p[5] & 0xc0) != 0x80)
{
*lenp = 1;
return p[0];
}
*lenp = 6;
return ((p[0] & 0x01) << 30) + ((p[1] & 0x3f) << 24)
+ ((p[2] & 0x3f) << 18)
+ ((p[3] & 0x3f) << 12)
+ ((p[4] & 0x3f) << 6) + (p[5] & 0x3f);
}
/*
* Like utf_ptr2char_and_len(), but never reads beyond "size" bytes.
* For an incomplete sequence "*lenp" is set to the expected length.
*/
static inline int
utf_ptr2char_and_len_len(char_u *p, int size, int *lenp)
{
int len;
int c;
if (size < 1)
{
*lenp = 1;
return NUL;
}
if (p[0] < 0x80)
{
*lenp = 1;
return p[0];
}
len = utf8len_tab_zero[p[0]];
if (len <= 1)
{
*lenp = 1;
return p[0];
}
if (len > size)
{
int i;
// Incomplete sequence: validate continuation bytes within range.
for (i = 1; i < size; ++i)
if ((p[i] & 0xc0) != 0x80)
{
*lenp = 1;
return p[0];
}
*lenp = len;
return p[0];
}
if ((p[1] & 0xc0) != 0x80)
{
*lenp = 1;
return p[0];
}
c = ((p[0] & 0x1f) << 6) + (p[1] & 0x3f);
if (len == 2)
{
*lenp = 2;
return c;
}
if ((p[2] & 0xc0) != 0x80)
{
*lenp = 1;
return p[0];
}
c = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
if (len == 3)
{
*lenp = 3;
return c;
}
if ((p[3] & 0xc0) != 0x80)
{
*lenp = 1;
return p[0];
}
c = ((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12)
+ ((p[2] & 0x3f) << 6) + (p[3] & 0x3f);
if (len == 4)
{
*lenp = 4;
return c;
}
if ((p[4] & 0xc0) != 0x80)
{
*lenp = 1;
return p[0];
}
c = ((p[0] & 0x03) << 24) + ((p[1] & 0x3f) << 18)
+ ((p[2] & 0x3f) << 12)
+ ((p[3] & 0x3f) << 6) + (p[4] & 0x3f);
if (len == 5)
{
*lenp = 5;
return c;
}
if ((p[5] & 0xc0) != 0x80)
{
*lenp = 1;
return p[0];
}
*lenp = 6;
return ((p[0] & 0x01) << 30) + ((p[1] & 0x3f) << 24)
+ ((p[2] & 0x3f) << 18)
+ ((p[3] & 0x3f) << 12)
+ ((p[4] & 0x3f) << 6) + (p[5] & 0x3f);
}
static inline int
utf_iscomposinglike_char(int c1, int c2)
{
if (utf_iscomposing(c2))
return TRUE;
#ifdef FEAT_ARABIC
if (!arabic_maycombine(c2))
return FALSE;
return arabic_combine(c1, c2);
#else
(void)c1;
return FALSE;
#endif
}
/*
* Convert a UTF-8 byte sequence to a character number.
* If the sequence is illegal or truncated by a NUL the first byte is
@@ -1796,40 +2003,7 @@ utf_ptr2char(char_u *p)
{
int len;
if (p[0] < 0x80) // be quick for ASCII
return p[0];
len = utf8len_tab_zero[p[0]];
if (len > 1 && (p[1] & 0xc0) == 0x80)
{
if (len == 2)
return ((p[0] & 0x1f) << 6) + (p[1] & 0x3f);
if ((p[2] & 0xc0) == 0x80)
{
if (len == 3)
return ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6)
+ (p[2] & 0x3f);
if ((p[3] & 0xc0) == 0x80)
{
if (len == 4)
return ((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12)
+ ((p[2] & 0x3f) << 6) + (p[3] & 0x3f);
if ((p[4] & 0xc0) == 0x80)
{
if (len == 5)
return ((p[0] & 0x03) << 24) + ((p[1] & 0x3f) << 18)
+ ((p[2] & 0x3f) << 12) + ((p[3] & 0x3f) << 6)
+ (p[4] & 0x3f);
if ((p[5] & 0xc0) == 0x80 && len == 6)
return ((p[0] & 0x01) << 30) + ((p[1] & 0x3f) << 24)
+ ((p[2] & 0x3f) << 18) + ((p[3] & 0x3f) << 12)
+ ((p[4] & 0x3f) << 6) + (p[5] & 0x3f);
}
}
}
}
// Illegal value, just return the first byte
return p[0];
return utf_ptr2char_and_len(p, &len);
}
/*
@@ -1953,25 +2127,29 @@ utfc_ptr2char(
int len;
int c;
int cc;
int cc_len;
int i = 0;
c = utf_ptr2char(p);
len = utf_ptr2len(p);
c = utf_ptr2char_and_len(p, &len);
// Only accept a composing char when the first char isn't illegal.
if ((len > 1 || *p < 0x80)
&& p[len] >= 0x80
&& UTF_COMPOSINGLIKE(p, p + len))
if ((len > 1 || *p < 0x80) && p[len] >= 0x80)
{
cc = utf_ptr2char(p + len);
for (;;)
cc = utf_ptr2char_and_len(p + len, &cc_len);
if (utf_iscomposinglike_char(c, cc))
{
pcc[i++] = cc;
if (i == MAX_MCO)
break;
len += utf_ptr2len(p + len);
if (p[len] < 0x80 || !utf_iscomposing(cc = utf_ptr2char(p + len)))
break;
for (;;)
{
pcc[i++] = cc;
if (i == MAX_MCO)
break;
len += cc_len;
if (p[len] < 0x80)
break;
cc = utf_ptr2char_and_len(p + len, &cc_len);
if (!utf_iscomposing(cc))
break;
}
}
}
@@ -1994,27 +2172,28 @@ utfc_ptr2char_len(
int len;
int c;
int cc;
int cc_len;
int i = 0;
c = utf_ptr2char(p);
len = utf_ptr2len_len(p, maxlen);
c = utf_ptr2char_and_len_len(p, maxlen, &len);
// Only accept a composing char when the first char isn't illegal.
if ((len > 1 || *p < 0x80)
&& len < maxlen
&& p[len] >= 0x80
&& UTF_COMPOSINGLIKE(p, p + len))
if ((len > 1 || *p < 0x80) && len < maxlen && p[len] >= 0x80)
{
cc = utf_ptr2char(p + len);
for (;;)
cc = utf_ptr2char_and_len_len(p + len, maxlen - len, &cc_len);
if (cc_len <= maxlen - len && utf_iscomposinglike_char(c, cc))
{
pcc[i++] = cc;
if (i == MAX_MCO)
break;
len += utf_ptr2len_len(p + len, maxlen - len);
if (len >= maxlen
|| p[len] < 0x80
|| !utf_iscomposing(cc = utf_ptr2char(p + len)))
break;
for (;;)
{
pcc[i++] = cc;
if (i == MAX_MCO)
break;
len += cc_len;
if (len >= maxlen || p[len] < 0x80)
break;
cc = utf_ptr2char_and_len_len(p + len, maxlen - len, &cc_len);
if (cc_len > maxlen - len || !utf_iscomposing(cc))
break;
}
}
}
@@ -2057,14 +2236,8 @@ utfc_char2bytes(int off, char_u *buf)
utf_ptr2len(char_u *p)
{
int len;
int i;
if (*p == NUL)
return 0;
len = utf8len_tab[*p];
for (i = 1; i < len; ++i)
if ((p[i] & 0xc0) != 0x80)
return 1;
utf_ptr2char_and_len(p, &len);
return len;
}
@@ -2102,19 +2275,8 @@ utf_byte2len_zero(int b)
utf_ptr2len_len(char_u *p, int size)
{
int len;
int i;
int m;
len = utf8len_tab[*p];
if (len == 1)
return 1; // NUL, ascii or illegal lead byte
if (len > size)
m = size; // incomplete byte sequence.
else
m = len;
for (i = 1; i < m; ++i)
if ((p[i] & 0xc0) != 0x80)
return 1;
utf_ptr2char_and_len_len(p, size, &len);
return len;
}
@@ -2127,10 +2289,10 @@ utf_ptr2len_len(char_u *p, int size)
utfc_ptr2len(char_u *p)
{
int len;
int c;
int cc;
int cc_len;
int b0 = *p;
#ifdef FEAT_ARABIC
int prevlen;
#endif
if (b0 == NUL)
return 0;
@@ -2138,7 +2300,7 @@ utfc_ptr2len(char_u *p)
return 1;
// Skip over first UTF-8 char, stopping at a NUL byte.
len = utf_ptr2len(p);
c = utf_ptr2char_and_len(p, &len);
// Check for illegal byte.
if (len == 1 && b0 >= 0x80)
@@ -2148,19 +2310,22 @@ utfc_ptr2len(char_u *p)
* Check for composing characters. We can handle only the first six, but
* skip all of them (otherwise the cursor would get stuck).
*/
#ifdef FEAT_ARABIC
prevlen = 0;
#endif
if (p[len] < 0x80)
return len;
cc = utf_ptr2char_and_len(p + len, &cc_len);
if (!utf_iscomposinglike_char(c, cc))
return len;
for (;;)
{
if (p[len] < 0x80 || !UTF_COMPOSINGLIKE(p + prevlen, p + len))
return len;
// Skip over composing char
#ifdef FEAT_ARABIC
prevlen = len;
#endif
len += utf_ptr2len(p + len);
len += cc_len;
if (p[len] < 0x80)
return len;
cc = utf_ptr2char_and_len(p + len, &cc_len);
if (!utf_iscomposing(cc))
return len;
}
}
@@ -2174,9 +2339,9 @@ utfc_ptr2len(char_u *p)
utfc_ptr2len_len(char_u *p, int size)
{
int len;
#ifdef FEAT_ARABIC
int prevlen;
#endif
int c;
int cc;
int cc_len;
if (size < 1 || *p == NUL)
return 0;
@@ -2184,7 +2349,7 @@ utfc_ptr2len_len(char_u *p, int size)
return 1;
// Skip over first UTF-8 char, stopping at a NUL byte.
len = utf_ptr2len_len(p, size);
c = utf_ptr2char_and_len_len(p, size, &len);
// Check for illegal byte and incomplete byte sequence.
if ((len == 1 && p[0] >= 0x80) || len > size)
@@ -2194,32 +2359,24 @@ utfc_ptr2len_len(char_u *p, int size)
* Check for composing characters. We can handle only the first six, but
* skip all of them (otherwise the cursor would get stuck).
*/
#ifdef FEAT_ARABIC
prevlen = 0;
#endif
if (len >= size || p[len] < 0x80)
return len;
cc = utf_ptr2char_and_len_len(p + len, size - len, &cc_len);
if (cc_len > size - len || !utf_iscomposinglike_char(c, cc))
return len;
while (len < size)
{
int len_next_char;
if (p[len] < 0x80)
break;
/*
* Next character length should not go beyond size to ensure that
* UTF_COMPOSINGLIKE(...) does not read beyond size.
*/
len_next_char = utf_ptr2len_len(p + len, size - len);
if (len_next_char > size - len)
break;
if (!UTF_COMPOSINGLIKE(p + prevlen, p + len))
break;
// Skip over composing char
#ifdef FEAT_ARABIC
prevlen = len;
#endif
len += len_next_char;
len += cc_len;
if (len >= size || p[len] < 0x80)
break;
cc = utf_ptr2char_and_len_len(p + len, size - len, &cc_len);
if (cc_len > size - len)
break;
if (!utf_iscomposing(cc))
break;
}
return len;
}
+2
View File
@@ -734,6 +734,8 @@ static char *(features[]) =
static int included_patches[] =
{ /* Add new patch number below this line */
/**/
145,
/**/
144,
/**/