From 1ddc0481cfb744d51e235a8cccf97d4afb498743 Mon Sep 17 00:00:00 2001 From: "brian m. carlson" Date: Mon, 27 Apr 2026 22:18:33 +0000 Subject: [PATCH 1/2] commit: name UTF-8 function appropriately We have a function named verify_utf8, but it does more than verify: it modifies the buffer if it is not UTF-8. This is different from what most people would expect, so call the function ensure_utf8, since it mutates the buffer in some cases. Signed-off-by: brian m. carlson Signed-off-by: Junio C Hamano --- commit.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/commit.c b/commit.c index 80d8d07875..790dd2faed 100644 --- a/commit.c +++ b/commit.c @@ -1637,12 +1637,12 @@ static int find_invalid_utf8(const char *buf, int len) } /* - * This verifies that the buffer is in proper utf8 format. + * This ensures that the buffer is in proper utf8 format. * * If it isn't, it assumes any non-utf8 characters are Latin1, * and does the conversion. */ -static int verify_utf8(struct strbuf *buf) +static int ensure_utf8(struct strbuf *buf) { int ok = 1; long pos = 0; @@ -1819,7 +1819,7 @@ int commit_tree_extended(const char *msg, size_t msg_len, } /* And check the encoding. */ - if (encoding_is_utf8 && (!verify_utf8(&buffer) || !verify_utf8(&compat_buffer))) + if (encoding_is_utf8 && (!ensure_utf8(&buffer) || !ensure_utf8(&compat_buffer))) fprintf(stderr, _(commit_utf8_warn)); if (r->compat_hash_algo) { From 7735d7eee3a586181dc397afa5aa8f02e009833b Mon Sep 17 00:00:00 2001 From: "brian m. carlson" Date: Mon, 27 Apr 2026 22:18:34 +0000 Subject: [PATCH 2/2] commit: sign commit after mutating buffer The ensure_utf8 function can mutate the buffer to change its encoding, so we must call it before signing the buffer so that we do not invalidate the signature, which is made over raw bytes. Fix a bug which caused the compatibility code to not convert the compatibility buffer if the main buffer was invalid UTF-8. 
We expect both buffers to be valid UTF-8 or both invalid, since the only data that would differ between them would be hex object IDs, which are always valid UTF-8. Add a test for this case using 0xfe and 0xff, which are never valid in UTF-8. Reported-by: Kushal Das Signed-off-by: brian m. carlson Signed-off-by: Junio C Hamano --- commit.c | 15 +++++++++++---- t/t7510-signed-commit.sh | 10 ++++++++++ 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/commit.c b/commit.c index 790dd2faed..e5d725fe93 100644 --- a/commit.c +++ b/commit.c @@ -1726,6 +1726,7 @@ int commit_tree_extended(const char *msg, size_t msg_len, struct repository *r = the_repository; int result = 0; int encoding_is_utf8; + bool warned = false; struct strbuf buffer = STRBUF_INIT, compat_buffer = STRBUF_INIT; struct strbuf sig = STRBUF_INIT, compat_sig = STRBUF_INIT; struct object_id *parent_buf = NULL, *compat_oid = NULL; @@ -1747,6 +1748,13 @@ int commit_tree_extended(const char *msg, size_t msg_len, oidcpy(&parent_buf[i++], &p->item->object.oid); write_commit_tree(&buffer, msg, msg_len, tree, parent_buf, nparents, author, committer, extra); + + /* And check the encoding. */ + if (encoding_is_utf8 && !ensure_utf8(&buffer)) { + fprintf(stderr, _(commit_utf8_warn)); + warned = true; + } + if (sign_commit && sign_buffer(&buffer, &sig, sign_commit, SIGN_BUFFER_USE_DEFAULT_KEY)) { result = -1; @@ -1780,6 +1788,9 @@ int commit_tree_extended(const char *msg, size_t msg_len, free_commit_extra_headers(compat_extra); free(mapped_parents); + if (encoding_is_utf8 && !ensure_utf8(&compat_buffer) && !warned) + fprintf(stderr, _(commit_utf8_warn)); + if (sign_commit && sign_buffer(&compat_buffer, &compat_sig, sign_commit, SIGN_BUFFER_USE_DEFAULT_KEY)) { @@ -1818,10 +1829,6 @@ int commit_tree_extended(const char *msg, size_t msg_len, } } - /* And check the encoding. 
*/ - if (encoding_is_utf8 && (!ensure_utf8(&buffer) || !ensure_utf8(&compat_buffer))) - fprintf(stderr, _(commit_utf8_warn)); - if (r->compat_hash_algo) { hash_object_file(r->compat_hash_algo, compat_buffer.buf, compat_buffer.len, OBJ_COMMIT, &compat_oid_buf); diff --git a/t/t7510-signed-commit.sh b/t/t7510-signed-commit.sh index 1201c85ba6..aa9108da54 100755 --- a/t/t7510-signed-commit.sh +++ b/t/t7510-signed-commit.sh @@ -462,4 +462,14 @@ test_expect_success 'custom `gpg.program`' ' git commit -S --allow-empty -m signed-commit ' +test_expect_success GPG 'commit verifies with non-UTF-8 commit message' ' + printf "I hate\\376\\377UTF-8\\n" >message && + echo unusual-message >file && + git add file && + test_tick && git commit -S -F message 2>err && + git verify-commit HEAD && + grep "commit message did not conform to UTF-8" err >lines && + test_line_count = 1 lines +' + test_done