commit/commit-tree: correct latin1 to utf-8

When a line in the message is not valid UTF-8, "git mailinfo" attempts to convert it to UTF-8, assuming the input is latin1 (and punts if it does not convert cleanly). Using the same heuristics in "git commit" and "git commit-tree" lets the editor output be in latin1 to make the overall system more consistent. Signed-off-by: Junio C Hamano <gitster@pobox.com>
2026-05-31 11:19:02 +02:00 · 2012-06-28 11:24:14 -07:00
parent 4c8a9db6f7
commit 08a94a145c
2 changed files with 88 additions and 28 deletions
@@ -481,36 +481,12 @@ static struct strbuf *decode_b_segment(const struct strbuf *b_seg)
 	return out;
 }
 /*
 * Guess the source charset of a line that carries no charset label.
 *
 * Returns NULL when the target charset is UTF-8 and the line already
 * passes is_utf8() (US-ASCII included), meaning no conversion is
 * needed.  In every other case the line is presumed to be Latin1,
 * for historical reasons, and "ISO8859-1" is returned.
 */
 static const char *guess_charset(const struct strbuf *line, const char *target_charset)
 {
 	/* Already valid for a UTF-8 target: leave the line alone. */
 	if (is_encoding_utf8(target_charset) && is_utf8(line->buf))
 		return NULL;
 	/* Historical fallback. */
 	return "ISO8859-1";
 }
 static void convert_to_utf8(struct strbuf *line, const char *charset)
 {
 	char *out;
-	if (!charset || !*charset) {
+	if (!charset || !*charset)
-		charset = guess_charset(line, metainfo_charset);
+		return;
 		if (!charset)
 			return;
 	}
 	if (!strcasecmp(metainfo_charset, charset))
 		return;
 	out = reencode_string(line->buf, metainfo_charset, charset);
@@ -1112,8 +1112,92 @@ int commit_tree(const struct strbuf *msg, unsigned char *tree,
 	return result;
 }
 /*
 * Scan the first len bytes of buf and return the offset of the first
 * byte that does not begin a well-formed UTF-8 sequence, or -1 when
 * the whole range is well-formed.  A non-positive len is treated as
 * an empty range.
 *
 * On top of the structural check (a lead byte followed by the right
 * number of 10xxxxxx continuation bytes) a sequence is rejected when
 * it is longer than four bytes, is an overlong encoding, encodes a
 * UTF-16 surrogate (U+D800..U+DFFF), or encodes a value above
 * U+10FFFF; RFC 3629 allows none of these.
 */
 static int find_invalid_utf8(const char *buf, int len)
 {
 	/* Largest code point representable with N continuation bytes. */
 	static const unsigned int max_codepoint[] = {
 		0x7f, 0x7ff, 0xffff, 0x10ffff
 	};
 	int offset = 0;
 	while (len > 0) {
 		unsigned char c = *buf++;
 		int bytes, bad_offset;
 		unsigned int codepoint, min_val, max_val;
 		len--;
 		offset++;
 		/* Simple US-ASCII? No worries. */
 		if (c < 0x80)
 			continue;
 		bad_offset = offset - 1;
 		/*
 		 * Count how many more high bits are set: that's how
 		 * many more bytes this sequence should have.
 		 */
 		bytes = 0;
 		while (c & 0x40) {
 			c <<= 1;
 			bytes++;
 		}
 		/*
 		 * Must be between 1 and 3 more bytes.  Zero means a stray
 		 * continuation byte; longer sequences can only encode
 		 * values beyond U+10FFFF.
 		 */
 		if (bytes < 1 || bytes > 3)
 			return bad_offset;
 		/* Do we *have* that many bytes? */
 		if (len < bytes)
 			return bad_offset;
 		/*
 		 * The shifts above moved the lead byte's payload up by
 		 * 'bytes' bits; move it back down to start the code point.
 		 */
 		codepoint = (c & 0x7f) >> bytes;
 		min_val = max_codepoint[bytes - 1] + 1;
 		max_val = max_codepoint[bytes];
 		offset += bytes;
 		len -= bytes;
 		/* Verify the continuation bytes and fold in their payload */
 		do {
 			unsigned char cont = *buf++;
 			if ((cont & 0xc0) != 0x80)
 				return bad_offset;
 			codepoint = (codepoint << 6) | (cont & 0x3f);
 		} while (--bytes);
 		/* Overlong form, or a value above U+10FFFF. */
 		if (codepoint < min_val || codepoint > max_val)
 			return bad_offset;
 		/* Surrogates exist only for UTF-16. */
 		if (codepoint >= 0xd800 && codepoint <= 0xdfff)
 			return bad_offset;
 	}
 	return -1;
 }
 /*
 * Make sure the buffer holds proper UTF-8, repairing it in place.
 *
 * Each byte that find_invalid_utf8() reports as invalid is taken to
 * be a Latin1 character and is replaced with its two-byte UTF-8
 * encoding; scanning then resumes just past the replacement.
 *
 * Returns 1 if the buffer needed no change, 0 if at least one byte
 * had to be converted.
 */
 static int verify_utf8(struct strbuf *buf)
 {
 	int clean = 1;
 	long cursor = 0;
 	int rel;
 	while ((rel = find_invalid_utf8(buf->buf + cursor, buf->len - cursor)) >= 0) {
 		unsigned char latin1;
 		unsigned char utf8[2];
 		clean = 0;
 		/* The scan offset is relative to where this scan began */
 		cursor += rel;
 		latin1 = buf->buf[cursor];
 		/* Only bytes in 128-255 are ever reported as invalid */
 		utf8[0] = 0xc0 + (latin1 >> 6);
 		utf8[1] = 0x80 + (latin1 & 0x3f);
 		strbuf_remove(buf, cursor, 1);
 		strbuf_insert(buf, cursor, utf8, 2);
 		cursor += 2;
 	}
 	return clean;
 }
 /*
 * Warning text that commit_tree_extended() writes to stderr when the
 * commit encoding is UTF-8 and the UTF-8 check on the commit buffer
 * fails.
 */
 static const char commit_utf8_warn[] =
-"Warning: commit message does not conform to UTF-8.\n"
+"Warning: commit message did not conform to UTF-8.\n"
 "You may want to amend it after fixing the message, or set the config\n"
 "variable i18n.commitencoding to the encoding your project uses.\n";
@@ -1170,7 +1254,7 @@ int commit_tree_extended(const struct strbuf *msg, unsigned char *tree,
 	strbuf_addbuf(&buffer, msg);
 	/* And check the encoding */
-	if (encoding_is_utf8 && !is_utf8(buffer.buf))
+	if (encoding_is_utf8 && !verify_utf8(&buffer))
 		fprintf(stderr, commit_utf8_warn);
 	if (sign_commit && do_sign_commit(&buffer, sign_commit))