mirror of
https://github.com/macvim-dev/macvim.git
synced 2026-05-28 00:21:57 +02:00
patch 9.1.2124: blob2str() does not handle UTF-16 encoding
Problem: blob2str() does not handle UTF-16 encoding
(Hirohito Higashi)
Solution: Refactor the code and fix remaining issues, see below
(Yasuhiro Matsumoto).
blob2str() function did not properly handle UTF-16/UCS-2/UTF-32/UCS-4
encodings with endianness suffixes (e.g., utf-16le, utf-16be, ucs-2le).
The encoding name was canonicalized too aggressively, losing the
endianness information needed by iconv.
This change includes a few fixes:
- Preserve the raw encoding name with endianness suffix for iconv calls
- Normalize encoding names properly: "ucs2be" → "ucs-2be", "utf16le" →
"utf-16le"
- For multi-byte encodings (UTF-16/32, UCS-2/4), convert the entire blob
first, then split by newlines
convert_string() cannot handle UTF-16 because it uses string_convert()
which expects NUL-terminated strings. UTF-16 contains 0x00 bytes within
characters (e.g., "H" = 0x48 0x00), causing premature termination.
Therefore, for UTF-16/32 encodings, the fix uses string_convert_ext()
with an explicit input length to convert the entire blob at once.
The code appends two NUL bytes (ga_append(&blob_ga, NUL) twice) because
UTF-16 requires a 2-byte NUL terminator (0x00 0x00), not a single-byte
NUL.
- src/strings.c: Add from_encoding_raw to preserve endianness, special
handling for UTF-16/32 and UCS-2/4
- src/mbyte.c: Fix convert_setup_ext() to use == ENC_UNICODE instead of
& ENC_UNICODE. The bitwise AND was incorrectly treating UTF-16/UCS-2
(which have ENC_UNICODE + ENC_2BYTE etc.) as UTF-8, causing iconv
setup to be skipped.
fixes: #19198
closes: #19246
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Signed-off-by: Yasuhiro Matsumoto <mattn.jp@gmail.com>
Signed-off-by: Christian Brabandt <cb@256bit.org>
This commit is contained in:
committed by
Christian Brabandt
parent
ecc3faef61
commit
2b184d4b97
@@ -1,4 +1,4 @@
|
||||
*builtin.txt* For Vim version 9.1. Last change: 2026 Jan 17
|
||||
*builtin.txt* For Vim version 9.1. Last change: 2026 Jan 31
|
||||
|
||||
|
||||
VIM REFERENCE MANUAL by Bram Moolenaar
|
||||
@@ -1389,6 +1389,9 @@ blob2str({blob} [, {options}]) *blob2str()*
|
||||
Can also be used as a |method|: >
|
||||
GetBlob()->blob2str()
|
||||
<
|
||||
If `iconv` is not available and the encoding cannot be converted
|
||||
using built-in conversion rules, an error will be reported.
|
||||
|
||||
Return type: list<string>
|
||||
|
||||
|
||||
|
||||
+185
-29
@@ -1275,10 +1275,123 @@ string_from_blob(blob_T *blob, long *start_idx)
|
||||
return ret_str;
|
||||
}
|
||||
|
||||
/*
|
||||
* Normalize encoding name for iconv by adding hyphens.
|
||||
* For example: "ucs2be" -> "ucs-2be", "utf16le" -> "utf-16le"
|
||||
* Returns allocated string or NULL on allocation failure.
|
||||
*/
|
||||
static char_u *
|
||||
normalize_encoding_name(char_u *enc_skipped)
|
||||
{
|
||||
char_u *from_encoding_raw = alloc(STRLEN(enc_skipped) + 3);
|
||||
if (from_encoding_raw == NULL)
|
||||
return NULL;
|
||||
|
||||
char_u *s = enc_skipped;
|
||||
char_u *pe = from_encoding_raw;
|
||||
|
||||
// Convert to lowercase and replace '_' with '-'
|
||||
while (*s != NUL)
|
||||
{
|
||||
if (*s == '_')
|
||||
*pe++ = '-';
|
||||
else
|
||||
*pe++ = TOLOWER_ASC(*s);
|
||||
++s;
|
||||
}
|
||||
*pe = NUL;
|
||||
|
||||
// Add hyphen before digit: "ucs2be" -> "ucs-2be", "utf16le" -> "utf-16le"
|
||||
char_u *p = from_encoding_raw;
|
||||
if ((STRNCMP(p, "ucs", 3) == 0 && VIM_ISDIGIT(p[3]) && p[3] != NUL && p[4] != '-') ||
|
||||
(STRNCMP(p, "utf", 3) == 0 && VIM_ISDIGIT(p[3]) && p[3] != NUL && p[4] != '-'))
|
||||
{
|
||||
// Insert hyphen after "ucs" or "utf": "ucs2" -> "ucs-2"
|
||||
mch_memmove(p + 4, p + 3, STRLEN(p + 3) + 1);
|
||||
p[3] = '-';
|
||||
}
|
||||
|
||||
return from_encoding_raw;
|
||||
}
|
||||
|
||||
/*
|
||||
* "blob2str()" function
|
||||
* Converts a blob to a string, ensuring valid UTF-8 encoding.
|
||||
*/
|
||||
static void
|
||||
append_converted_string_to_list(
|
||||
char_u *converted,
|
||||
int validate_utf8,
|
||||
list_T *list,
|
||||
char_u *from_encoding)
|
||||
{
|
||||
if (converted != NULL)
|
||||
{
|
||||
// After conversion, the output is a valid UTF-8 string (NUL-terminated)
|
||||
int converted_len = (int)STRLEN(converted);
|
||||
|
||||
// Split by newlines and add to list
|
||||
char_u *p = converted;
|
||||
char_u *end = converted + converted_len;
|
||||
while (p < end)
|
||||
{
|
||||
char_u *line_start = p;
|
||||
while (p < end && *p != NL)
|
||||
p++;
|
||||
|
||||
// Add this line to the result list
|
||||
char_u *line = vim_strnsave(line_start, p - line_start);
|
||||
if (line != NULL)
|
||||
{
|
||||
if (validate_utf8 && !utf_valid_string(line, NULL))
|
||||
{
|
||||
vim_free(line);
|
||||
semsg(_(e_str_encoding_from_failed), p_enc);
|
||||
vim_free(converted);
|
||||
return; // Stop processing
|
||||
}
|
||||
if (list_append_string(list, line, -1) == FAIL)
|
||||
{
|
||||
vim_free(line);
|
||||
vim_free(converted);
|
||||
return; // Stop processing on append failure
|
||||
}
|
||||
vim_free(line);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Allocation failure: report error and stop processing
|
||||
emsg(_(e_out_of_memory));
|
||||
vim_free(converted);
|
||||
return;
|
||||
}
|
||||
|
||||
if (*p == NL)
|
||||
p++;
|
||||
}
|
||||
vim_free(converted);
|
||||
}
|
||||
else
|
||||
{
|
||||
semsg(_(e_str_encoding_from_failed), from_encoding);
|
||||
}
|
||||
}
|
||||
|
||||
static int
|
||||
append_validated_line_to_list(char_u *line, int validate_utf8, list_T *list)
|
||||
{
|
||||
if (validate_utf8 && !utf_valid_string(line, NULL))
|
||||
{
|
||||
semsg(_(e_str_encoding_from_failed), p_enc);
|
||||
vim_free(line);
|
||||
return FAIL;
|
||||
}
|
||||
|
||||
int ret = list_append_string(list, line, -1);
|
||||
vim_free(line);
|
||||
return ret;
|
||||
}
|
||||
|
||||
void
|
||||
f_blob2str(typval_T *argvars, typval_T *rettv)
|
||||
{
|
||||
@@ -1300,6 +1413,7 @@ f_blob2str(typval_T *argvars, typval_T *rettv)
|
||||
blen = blob_len(blob);
|
||||
|
||||
char_u *from_encoding = NULL;
|
||||
char_u *from_encoding_raw = NULL; // Encoding name with endianness preserved for iconv
|
||||
if (argvars[1].v_type != VAR_UNKNOWN)
|
||||
{
|
||||
dict_T *d = argvars[1].vval.v_dict;
|
||||
@@ -1307,7 +1421,20 @@ f_blob2str(typval_T *argvars, typval_T *rettv)
|
||||
{
|
||||
char_u *enc = dict_get_string(d, "encoding", FALSE);
|
||||
if (enc != NULL)
|
||||
from_encoding = enc_canonize(enc_skip(enc));
|
||||
{
|
||||
char_u *enc_skipped = enc_skip(enc);
|
||||
from_encoding = enc_canonize(enc_skipped);
|
||||
|
||||
// For iconv, preserve the endianness suffix by creating a normalized
|
||||
// version with hyphens: "ucs2be" -> "ucs-2be", "utf16le" -> "utf-16le"
|
||||
from_encoding_raw = normalize_encoding_name(enc_skipped);
|
||||
if (from_encoding_raw == NULL)
|
||||
{
|
||||
emsg(_(e_out_of_memory));
|
||||
VIM_CLEAR(from_encoding);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1317,46 +1444,74 @@ f_blob2str(typval_T *argvars, typval_T *rettv)
|
||||
if (from_encoding != NULL && STRCMP(from_encoding, "none") == 0)
|
||||
{
|
||||
validate_utf8 = FALSE;
|
||||
vim_free(from_encoding);
|
||||
from_encoding = NULL;
|
||||
VIM_CLEAR(from_encoding);
|
||||
VIM_CLEAR(from_encoding_raw);
|
||||
}
|
||||
|
||||
idx = 0;
|
||||
while (idx < blen)
|
||||
// Special handling for UTF-16/UCS-2/UTF-32/UCS-4 encodings: convert entire blob before splitting by newlines
|
||||
int from_prop = 0;
|
||||
if (from_encoding != NULL)
|
||||
from_prop = enc_canon_props(from_encoding);
|
||||
if (from_encoding != NULL && (from_prop & (ENC_2BYTE | ENC_4BYTE | ENC_2WORD)))
|
||||
{
|
||||
char_u *str;
|
||||
char_u *converted_str;
|
||||
// Build a temporary buffer from the blob as a whole
|
||||
// Don't use string_from_blob() because it treats NUL as line separator
|
||||
garray_T blob_ga;
|
||||
int nul_size = (from_prop & ENC_4BYTE) ? 4 : 2;
|
||||
ga_init2(&blob_ga, 1, blen + nul_size);
|
||||
for (long i = 0; i < blen; i++)
|
||||
ga_append(&blob_ga, (int)(unsigned char)blob_get(blob, i));
|
||||
// Add NUL terminator (2 bytes for UTF-16/UCS-2, 4 bytes for UTF-32/UCS-4)
|
||||
for (int i = 0; i < nul_size; i++)
|
||||
ga_append(&blob_ga, NUL);
|
||||
|
||||
str = string_from_blob(blob, &idx);
|
||||
if (str == NULL)
|
||||
break;
|
||||
|
||||
converted_str = str;
|
||||
if (from_encoding != NULL)
|
||||
// Convert the entire blob at once
|
||||
vimconv_T vimconv;
|
||||
vimconv.vc_type = CONV_NONE;
|
||||
// Use raw encoding name for iconv to preserve endianness (utf-16be vs utf-16)
|
||||
if (convert_setup_ext(&vimconv, from_encoding_raw ? from_encoding_raw : from_encoding, FALSE, p_enc, FALSE) == FAIL)
|
||||
{
|
||||
converted_str = convert_string(str, from_encoding, p_enc);
|
||||
vim_free(str);
|
||||
if (converted_str == NULL)
|
||||
ga_clear(&blob_ga);
|
||||
semsg(_(e_str_encoding_from_failed), from_encoding);
|
||||
goto done;
|
||||
}
|
||||
vimconv.vc_fail = TRUE;
|
||||
// Use string_convert_ext with explicit input length
|
||||
int inlen = blen;
|
||||
char_u *converted = string_convert_ext(&vimconv, (char_u *)blob_ga.ga_data, &inlen, NULL);
|
||||
convert_setup(&vimconv, NULL, NULL);
|
||||
ga_clear(&blob_ga);
|
||||
append_converted_string_to_list(converted, validate_utf8, rettv->vval.v_list, from_encoding);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Original logic for non-UTF-16 encodings
|
||||
idx = 0;
|
||||
while (idx < blen)
|
||||
{
|
||||
char_u *str;
|
||||
|
||||
str = string_from_blob(blob, &idx);
|
||||
if (str == NULL)
|
||||
break;
|
||||
|
||||
if (from_encoding != NULL)
|
||||
{
|
||||
char_u *converted = convert_string(str,
|
||||
from_encoding_raw ? from_encoding_raw : from_encoding, p_enc);
|
||||
vim_free(str);
|
||||
str = converted;
|
||||
}
|
||||
|
||||
if (str == NULL)
|
||||
{
|
||||
semsg(_(e_str_encoding_from_failed), from_encoding);
|
||||
goto done;
|
||||
}
|
||||
}
|
||||
|
||||
if (validate_utf8)
|
||||
{
|
||||
if (!utf_valid_string(converted_str, NULL))
|
||||
{
|
||||
semsg(_(e_str_encoding_from_failed), p_enc);
|
||||
vim_free(converted_str);
|
||||
if (append_validated_line_to_list(str, validate_utf8, rettv->vval.v_list) == FAIL)
|
||||
goto done;
|
||||
}
|
||||
}
|
||||
|
||||
int ret = list_append_string(rettv->vval.v_list, converted_str, -1);
|
||||
vim_free(converted_str);
|
||||
if (ret == FAIL)
|
||||
break;
|
||||
}
|
||||
|
||||
// If the blob ends with a newline, we need to add another empty string.
|
||||
@@ -1365,6 +1520,7 @@ f_blob2str(typval_T *argvars, typval_T *rettv)
|
||||
|
||||
done:
|
||||
vim_free(from_encoding);
|
||||
vim_free(from_encoding_raw);
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
@@ -898,4 +898,44 @@ func Test_blob2str_empty_line()
|
||||
call assert_equal(['Hello', '', 'World!'], blob2str(b))
|
||||
endfunc
|
||||
|
||||
" Check that blob2str() decodes UTF-16/UCS-2/UTF-32/UCS-4 blobs, both with
" the hyphenated encoding name and with the hyphen left out.
func Test_blob2str_multi_byte_encodings()
  " UTF-16LE: "Hello" = 48 00 65 00 6C 00 6C 00 6F 00
  for enc in ['utf-16le', 'utf16le']
    call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': enc}))
  endfor

  " UTF-16BE: "Hello" = 00 48 00 65 00 6C 00 6C 00 6F
  for enc in ['utf-16be', 'utf16be']
    call assert_equal(['Hello'], blob2str(0z00480065006C006C006F, {'encoding': enc}))
  endfor

  " UCS-2LE: "Hello" = 48 00 65 00 6C 00 6C 00 6F 00
  for enc in ['ucs-2le', 'ucs2le']
    call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': enc}))
  endfor

  " UCS-2BE: "Hello" = 00 48 00 65 00 6C 00 6C 00 6F
  for enc in ['ucs-2be', 'ucs2be']
    call assert_equal(['Hello'], blob2str(0z00480065006C006C006F, {'encoding': enc}))
  endfor

  " UTF-32LE: "Hi" = 48 00 00 00 69 00 00 00
  for enc in ['utf-32le', 'utf32le']
    call assert_equal(['Hi'], blob2str(0z4800000069000000, {'encoding': enc}))
  endfor

  " UTF-32BE: "Hi" = 00 00 00 48 00 00 00 69
  for enc in ['utf-32be', 'utf32be']
    call assert_equal(['Hi'], blob2str(0z0000004800000069, {'encoding': enc}))
  endfor

  " UCS-4LE: "Hi" = 48 00 00 00 69 00 00 00
  for enc in ['ucs-4le', 'ucs4le']
    call assert_equal(['Hi'], blob2str(0z4800000069000000, {'encoding': enc}))
  endfor

  " UCS-4BE: "Hi" = 00 00 00 48 00 00 00 69
  for enc in ['ucs-4be', 'ucs4be']
    call assert_equal(['Hi'], blob2str(0z0000004800000069, {'encoding': enc}))
  endfor

  " UTF-16LE with newlines: "Hi\nBye" = 48 00 69 00 0A 00 42 00 79 00 65 00
  call assert_equal(['Hi', 'Bye'], blob2str(0z48006900.0A004200.79006500, {'encoding': 'utf-16le'}))

  " UTF-32LE with newlines: "A\nB" = 41 00 00 00 0A 00 00 00 42 00 00 00
  call assert_equal(['A', 'B'], blob2str(0z41000000.0A000000.42000000, {'encoding': 'utf-32le'}))
endfunc
|
||||
|
||||
" vim: shiftwidth=2 sts=2 expandtab
|
||||
|
||||
@@ -4557,6 +4557,13 @@ func Test_blob2str()
|
||||
call assert_fails("call blob2str(0z6162, [])", 'E1206: Dictionary required for argument 2')
|
||||
call assert_fails("call blob2str(0z6162, {'encoding': []})", 'E730: Using a List as a String')
|
||||
call assert_fails("call blob2str(0z6162, {'encoding': 'ab12xy'})", 'E1515: Unable to convert from ''ab12xy'' encoding')
|
||||
|
||||
#" UTF-16LE encoding
|
||||
call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 'utf-16le'}))
|
||||
call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 'utf16le'}))
|
||||
#" UCS-2LE encoding
|
||||
call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 'ucs-2le'}))
|
||||
call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 'ucs2le'}))
|
||||
END
|
||||
call v9.CheckLegacyAndVim9Success(lines)
|
||||
endfunc
|
||||
|
||||
@@ -734,6 +734,8 @@ static char *(features[]) =
|
||||
|
||||
static int included_patches[] =
|
||||
{ /* Add new patch number below this line */
|
||||
/**/
|
||||
2124,
|
||||
/**/
|
||||
2123,
|
||||
/**/
|
||||
|
||||
Reference in New Issue
Block a user