commit/commit-tree: correct latin1 to utf-8

When a line in the message is not valid UTF-8, "git mailinfo"
attempts to convert it to UTF-8 on the assumption that the input is
Latin-1 (and punts if it does not convert cleanly).  Applying the same
heuristic in "git commit" and "git commit-tree" allows the editor
output to be in Latin-1 and makes the overall system more consistent.

Signed-off-by: Junio C Hamano <gitster@pobox.com>
This commit is contained in:
Linus Torvalds
2012-06-28 11:24:14 -07:00
committed by Junio C Hamano
parent 4c8a9db6f7
commit 08a94a145c
2 changed files with 88 additions and 28 deletions
+2 -26
View File
@@ -481,36 +481,12 @@ static struct strbuf *decode_b_segment(const struct strbuf *b_seg)
return out; return out;
} }
/*
 * Pick a source charset for a line that arrived without one.
 *
 * If the target encoding is UTF-8 and the line already passes the
 * is_utf8() check (plain US-ASCII passes too), there is nothing to
 * convert and NULL is returned.
 *
 * In every other case the line is assumed, for historical reasons,
 * to be Latin1 and "ISO8859-1" is returned.
 */
static const char *guess_charset(const struct strbuf *line, const char *target_charset)
{
	/* && short-circuits, so is_utf8() only runs for a UTF-8 target. */
	int already_utf8 = is_encoding_utf8(target_charset) && is_utf8(line->buf);

	return already_utf8 ? NULL : "ISO8859-1";
}
static void convert_to_utf8(struct strbuf *line, const char *charset) static void convert_to_utf8(struct strbuf *line, const char *charset)
{ {
char *out; char *out;
if (!charset || !*charset) { if (!charset || !*charset)
charset = guess_charset(line, metainfo_charset); return;
if (!charset)
return;
}
if (!strcasecmp(metainfo_charset, charset)) if (!strcasecmp(metainfo_charset, charset))
return; return;
out = reencode_string(line->buf, metainfo_charset, charset); out = reencode_string(line->buf, metainfo_charset, charset);
+86 -2
View File
@@ -1112,8 +1112,92 @@ int commit_tree(const struct strbuf *msg, unsigned char *tree,
return result; return result;
} }
/*
 * Scan the first 'len' bytes of 'buf' for a malformed UTF-8 sequence.
 *
 * Returns the offset (relative to 'buf') of the lead byte of the first
 * bad sequence, or -1 if the whole range is well-formed.  A non-positive
 * 'len' is treated as an empty range and yields -1.
 *
 * A sequence is rejected when:
 *  - the lead byte is a stray continuation byte or announces more than
 *    three continuation bytes (such sequences would encode values past
 *    U+10FFFF),
 *  - the buffer ends before the announced continuation bytes,
 *  - a continuation byte is not of the form 10xxxxxx,
 *  - the decoded value is overlong for its length or above U+10FFFF,
 *  - the decoded value is a UTF-16 surrogate (U+D800..U+DFFF).
 */
static int find_invalid_utf8(const char *buf, int len)
{
	/* Largest code point expressible with N continuation bytes. */
	static const unsigned int max_codepoint[] = {
		0x7f, 0x7ff, 0xffff, 0x10ffff
	};
	int offset = 0;

	while (len > 0) {
		unsigned char c = *buf++;
		int bytes, bad_offset;
		unsigned int codepoint, min_val, max_val;

		len--;
		offset++;

		/* Simple US-ASCII? No worries. */
		if (c < 0x80)
			continue;

		bad_offset = offset - 1;

		/*
		 * Count how many more high bits are set: that's how
		 * many more bytes this sequence should have.  'c' is
		 * an unsigned char, so the shift discards the bits
		 * already counted.
		 */
		bytes = 0;
		while (c & 0x40) {
			c <<= 1;
			bytes++;
		}

		/*
		 * Must be between 1 and 3 more bytes; anything longer
		 * can only encode values beyond U+10FFFF.
		 */
		if (bytes < 1 || bytes > 3)
			return bad_offset;

		/* Do we *have* that many bytes? */
		if (len < bytes)
			return bad_offset;

		/*
		 * After the shifts above the payload bits of the lead
		 * byte sit just below the (now leading) "10" marker;
		 * move them to the bottom of the value.
		 */
		codepoint = (c & 0x7f) >> bytes;
		min_val = max_codepoint[bytes - 1] + 1;
		max_val = max_codepoint[bytes];

		offset += bytes;
		len -= bytes;

		/* Verify the continuation bytes and accumulate the value */
		do {
			unsigned char cont = *buf++;

			if ((cont & 0xc0) != 0x80)
				return bad_offset;
			codepoint = (codepoint << 6) | (cont & 0x3f);
		} while (--bytes);

		/* Overlong form, or past the end of the Unicode range */
		if (codepoint < min_val || codepoint > max_val)
			return bad_offset;

		/* Surrogates belong to UTF-16 and must not appear in UTF-8 */
		if (codepoint >= 0xd800 && codepoint <= 0xdfff)
			return bad_offset;
	}
	return -1;
}
/*
 * Make sure the buffer holds proper utf8, repairing it in place.
 *
 * Each byte that find_invalid_utf8() reports as the start of a bad
 * sequence is taken to be a Latin1 character and is replaced by its
 * two-byte utf8 encoding.  What counts as "bad" is decided entirely
 * by find_invalid_utf8().
 *
 * Returns 1 if the buffer was already clean, 0 if anything had to
 * be rewritten.
 */
static int verify_utf8(struct strbuf *buf)
{
	long cursor = 0;
	int was_clean = 1;
	int bad_at;

	while ((bad_at = find_invalid_utf8(buf->buf + cursor, buf->len - cursor)) >= 0) {
		unsigned char latin1;
		unsigned char encoded[2];

		was_clean = 0;
		cursor += bad_at;

		/* The offending byte is always in the range 128-255 */
		latin1 = buf->buf[cursor];
		encoded[0] = 0xc0 + (latin1 >> 6);
		encoded[1] = 0x80 + (latin1 & 0x3f);

		/* Swap the single byte for its two-byte encoding */
		strbuf_remove(buf, cursor, 1);
		strbuf_insert(buf, cursor, encoded, 2);

		/* Resume scanning right after the bytes just written */
		cursor += 2;
	}
	return was_clean;
}
static const char commit_utf8_warn[] = static const char commit_utf8_warn[] =
"Warning: commit message does not conform to UTF-8.\n" "Warning: commit message did not conform to UTF-8.\n"
"You may want to amend it after fixing the message, or set the config\n" "You may want to amend it after fixing the message, or set the config\n"
"variable i18n.commitencoding to the encoding your project uses.\n"; "variable i18n.commitencoding to the encoding your project uses.\n";
@@ -1170,7 +1254,7 @@ int commit_tree_extended(const struct strbuf *msg, unsigned char *tree,
strbuf_addbuf(&buffer, msg); strbuf_addbuf(&buffer, msg);
/* And check the encoding */ /* And check the encoding */
if (encoding_is_utf8 && !is_utf8(buffer.buf)) if (encoding_is_utf8 && !verify_utf8(&buffer))
fprintf(stderr, commit_utf8_warn); fprintf(stderr, commit_utf8_warn);
if (sign_commit && do_sign_commit(&buffer, sign_commit)) if (sign_commit && do_sign_commit(&buffer, sign_commit))