diff --git a/Documentation/Makefile b/Documentation/Makefile index a3fbd29744..627204928e 100644 --- a/Documentation/Makefile +++ b/Documentation/Makefile @@ -34,6 +34,7 @@ MAN5_TXT += gitformat-bundle.adoc MAN5_TXT += gitformat-chunk.adoc MAN5_TXT += gitformat-commit-graph.adoc MAN5_TXT += gitformat-index.adoc +MAN5_TXT += gitformat-loose.adoc MAN5_TXT += gitformat-pack.adoc MAN5_TXT += gitformat-signature.adoc MAN5_TXT += githooks.adoc diff --git a/Documentation/fsck-msgids.adoc b/Documentation/fsck-msgids.adoc index 81f11ba125..acac9683af 100644 --- a/Documentation/fsck-msgids.adoc +++ b/Documentation/fsck-msgids.adoc @@ -10,6 +10,12 @@ `badFilemode`:: (INFO) A tree contains a bad filemode entry. +`badGpgsig`:: + (ERROR) A tag contains a bad (truncated) signature (e.g., `gpgsig`) header. + +`badHeaderContinuation`:: + (ERROR) A continuation header (such as for `gpgsig`) is unexpectedly truncated. + `badName`:: (ERROR) An author/committer name is empty. diff --git a/Documentation/git-rev-parse.adoc b/Documentation/git-rev-parse.adoc index 18383e52af..5398691f3f 100644 --- a/Documentation/git-rev-parse.adoc +++ b/Documentation/git-rev-parse.adoc @@ -324,11 +324,12 @@ The following options are unaffected by `--path-format`: path of the current directory relative to the top-level directory. ---show-object-format[=(storage|input|output)]:: - Show the object format (hash algorithm) used for the repository - for storage inside the `.git` directory, input, or output. For - input, multiple algorithms may be printed, space-separated. - If not specified, the default is "storage". +--show-object-format[=(storage|input|output|compat)]:: + Show the object format (hash algorithm) used for the repository for storage + inside the `.git` directory, input, output, or compatibility. For input, + multiple algorithms may be printed, space-separated. If `compat` is + requested and no compatibility algorithm is enabled, prints an empty line. 
If + not specified, the default is "storage". --show-ref-format:: Show the reference storage format used for the repository. diff --git a/Documentation/gitformat-loose.adoc b/Documentation/gitformat-loose.adoc new file mode 100644 index 0000000000..947993663e --- /dev/null +++ b/Documentation/gitformat-loose.adoc @@ -0,0 +1,53 @@ +gitformat-loose(5) +================== + +NAME +---- +gitformat-loose - Git loose object format + + +SYNOPSIS +-------- +[verse] +$GIT_DIR/objects/[0-9a-f][0-9a-f]/* + +DESCRIPTION +----------- + +Loose objects are how Git stores individual objects, where every object is +written as a separate file. + +Over the lifetime of a repository, objects are usually written as loose objects +initially. Eventually, these loose objects will be compacted into packfiles +via repository maintenance to improve disk space usage and speed up the lookup +of these objects. + +== Loose objects + +Each loose object contains a prefix, followed immediately by the data of the +object. The prefix contains `<type> <size>\0`. `<type>` is one of `blob`, +`tree`, `commit`, or `tag` and `size` is the size of the data (without the +prefix) as a decimal integer expressed in ASCII. + +The entire contents, prefix and data concatenated, is then compressed with zlib +and the compressed data is stored in the file. The object ID of the object is +the SHA-1 or SHA-256 (as appropriate) hash of the uncompressed data. + +The file for the loose object is stored under the `objects` directory, with the +first two hex characters of the object ID being the directory and the remaining +characters being the file name. This is done to shard the data and avoid too +many files being in one directory, since some file systems perform poorly with +many items in a directory. 
+ +As an example, the empty tree contains the data (when uncompressed) `tree 0\0` +and, in a SHA-256 repository, would have the object ID +`6ef19b41225c5369f1c104d45d8d85efa9b057b53b14b4b9b939dd74decc5321` and would be +stored under +`$GIT_DIR/objects/6e/f19b41225c5369f1c104d45d8d85efa9b057b53b14b4b9b939dd74decc5321`. + +Similarly, a blob containing the contents `abc` would have the uncompressed +data of `blob 3\0abc`. + +GIT +--- +Part of the linkgit:git[1] suite diff --git a/Documentation/gitformat-pack.adoc b/Documentation/gitformat-pack.adoc index d6ae229be5..1b4db4aa61 100644 --- a/Documentation/gitformat-pack.adoc +++ b/Documentation/gitformat-pack.adoc @@ -32,6 +32,10 @@ In a repository using the traditional SHA-1, pack checksums, index checksums, and object IDs (object names) mentioned below are all computed using SHA-1. Similarly, in SHA-256 repositories, these values are computed using SHA-256. +CRC32 checksums are always computed over the entire packed object, including +the header (n-byte type and length); the base object name or offset, if any; +and the entire compressed object. The CRC32 algorithm used is that of zlib. + == pack-*.pack files have the following format: - A header appears at the beginning and consists of the following: @@ -80,6 +84,16 @@ Valid object types are: Type 5 is reserved for future expansion. Type 0 is invalid. +=== Object encoding + +Unlike loose objects, packed objects do not have a prefix containing the type, +size, and a NUL byte. These are not necessary because they can be determined by +the n-byte type and length that prefixes the data and so they are omitted from +the compressed and deltified data. + +The computation of the object ID still uses this prefix by reconstructing it +from the type and length as needed. + === Size encoding This document uses the following "size encoding" of non-negative @@ -92,6 +106,11 @@ values are more significant. 
This size encoding should not be confused with the "offset encoding", which is also used in this document. +When encoding the size of an undeltified object in a pack, the size is that of +the uncompressed raw object. For deltified objects, it is the size of the +uncompressed delta. The base object name or offset is not included in the size +computation. + === Deltified representation Conceptually there are only four object types: commit, tree, tag and diff --git a/Documentation/meson.build b/Documentation/meson.build index 44f94cdb7b..9d24f2da54 100644 --- a/Documentation/meson.build +++ b/Documentation/meson.build @@ -173,6 +173,7 @@ manpages = { 'gitformat-chunk.adoc' : 5, 'gitformat-commit-graph.adoc' : 5, 'gitformat-index.adoc' : 5, + 'gitformat-loose.adoc' : 5, 'gitformat-pack.adoc' : 5, 'gitformat-signature.adoc' : 5, 'githooks.adoc' : 5, diff --git a/Documentation/technical/hash-function-transition.adoc b/Documentation/technical/hash-function-transition.adoc index f047fd80ca..2359d7d106 100644 --- a/Documentation/technical/hash-function-transition.adoc +++ b/Documentation/technical/hash-function-transition.adoc @@ -227,9 +227,9 @@ network byte order): ** 4-byte length in bytes of shortened object names. This is the shortest possible length needed to make names in the shortened object name table unambiguous. - ** 4-byte integer, recording where tables relating to this format + ** 8-byte integer, recording where tables relating to this format are stored in this index file, as an offset from the beginning. - * 4-byte offset to the trailer from the beginning of this file. + * 8-byte offset to the trailer from the beginning of this file. * Zero or more additional key/value pairs (4-byte key, 4-byte value). Only one key is supported: 'PSRC'. 
See the "Loose objects and unreachable objects" section for supported values and how this @@ -260,12 +260,10 @@ network byte order): compressed data to be copied directly from pack to pack during repacking without undetected data corruption. - * A table of 4-byte offset values. For an object in the table of - sorted shortened object names, the value at the corresponding - index in this table indicates where that object can be found in - the pack file. These are usually 31-bit pack file offsets, but - large offsets are encoded as an index into the next table with the - most significant bit set. + * A table of 4-byte offset values. The index of this table in pack order + indicates where that object can be found in the pack file. These are + usually 31-bit pack file offsets, but large offsets are encoded as + an index into the next table with the most significant bit set. * A table of 8-byte offset entries (empty for pack files less than 2 GiB). Pack files are organized with heavily used objects toward @@ -276,10 +274,14 @@ network byte order): up to and not including the table of CRC32 values. - Zero or more NUL bytes. - The trailer consists of the following: - * A copy of the 20-byte SHA-256 checksum at the end of the + * A copy of the full main hash checksum at the end of the corresponding packfile. - * 20-byte SHA-256 checksum of all of the above. + * Full main hash checksum of all of the above. + +The "full main hash" is a full-length hash of the main (not compatibility) +algorithm in the repository. Thus, if the main algorithm is SHA-256, this is +a 32-byte SHA-256 hash and for SHA-1, it's a 20-byte SHA-1 hash. Loose object index ~~~~~~~~~~~~~~~~~~ @@ -427,17 +429,19 @@ ordinary unsigned commit. Signed Tags ~~~~~~~~~~~ -We add a new field "gpgsig-sha256" to the tag object format to allow -signing tags without relying on SHA-1. 
Its signed payload is the -SHA-256 content of the tag with its gpgsig-sha256 field and "-----BEGIN PGP -SIGNATURE-----" delimited in-body signature removed. +We add new fields "gpgsig" and "gpgsig-sha256" to the tag object format to +allow signing tags in both formats. The in-body signature is used for the +signature in the current hash algorithm and the header is used for the +signature in the other algorithm. Thus, a dual-signature tag will contain both +an in-body signature and a gpgsig-sha256 header for the SHA-1 format of an +object or both an in-body signature and a gpgsig header for the SHA-256 format +of an object. -This means tags can be signed +The signed payload of the tag is the content of the tag in the current +algorithm with both its gpgsig and gpgsig-sha256 fields and +"-----BEGIN PGP SIGNATURE-----" delimited in-body signature removed. -1. using SHA-1 only, as in existing signed tag objects -2. using both SHA-1 and SHA-256, by using gpgsig-sha256 and an in-body - signature. -3. using only SHA-256, by only using the gpgsig-sha256 field. +This means tags can be signed using one or both algorithms. Mergetag embedding ~~~~~~~~~~~~~~~~~~ diff --git a/builtin/rev-parse.c b/builtin/rev-parse.c index 9da92b990d..7b3711cf34 100644 --- a/builtin/rev-parse.c +++ b/builtin/rev-parse.c @@ -1107,11 +1107,20 @@ int cmd_rev_parse(int argc, const char *val = arg ? 
arg : "storage"; if (strcmp(val, "storage") && + strcmp(val, "compat") && strcmp(val, "input") && strcmp(val, "output")) die(_("unknown mode for --show-object-format: %s"), arg); - puts(the_hash_algo->name); + + if (!strcmp(val, "compat")) { + if (the_repository->compat_hash_algo) + puts(the_repository->compat_hash_algo->name); + else + putchar('\n'); + } else { + puts(the_hash_algo->name); + } continue; } if (!strcmp(arg, "--show-ref-format")) { diff --git a/fsck.c b/fsck.c index 171b424dd5..341e100d24 100644 --- a/fsck.c +++ b/fsck.c @@ -1067,6 +1067,24 @@ int fsck_tag_standalone(const struct object_id *oid, const char *buffer, else ret = fsck_ident(&buffer, oid, OBJ_TAG, options); + if (buffer < buffer_end && (skip_prefix(buffer, "gpgsig ", &buffer) || skip_prefix(buffer, "gpgsig-sha256 ", &buffer))) { + eol = memchr(buffer, '\n', buffer_end - buffer); + if (!eol) { + ret = report(options, oid, OBJ_TAG, FSCK_MSG_BAD_GPGSIG, "invalid format - unexpected end after 'gpgsig' or 'gpgsig-sha256' line"); + goto done; + } + buffer = eol + 1; + + while (buffer < buffer_end && starts_with(buffer, " ")) { + eol = memchr(buffer, '\n', buffer_end - buffer); + if (!eol) { + ret = report(options, oid, OBJ_TAG, FSCK_MSG_BAD_HEADER_CONTINUATION, "invalid format - unexpected end in 'gpgsig' or 'gpgsig-sha256' continuation line"); + goto done; + } + buffer = eol + 1; + } + } + if (buffer < buffer_end && !starts_with(buffer, "\n")) { /* * The verify_headers() check will allow diff --git a/fsck.h b/fsck.h index 759df97655..cb6ef32f4f 100644 --- a/fsck.h +++ b/fsck.h @@ -25,9 +25,11 @@ enum fsck_msg_type { FUNC(NUL_IN_HEADER, FATAL) \ FUNC(UNTERMINATED_HEADER, FATAL) \ /* errors */ \ + FUNC(BAD_HEADER_CONTINUATION, ERROR) \ FUNC(BAD_DATE, ERROR) \ FUNC(BAD_DATE_OVERFLOW, ERROR) \ FUNC(BAD_EMAIL, ERROR) \ + FUNC(BAD_GPGSIG, ERROR) \ FUNC(BAD_NAME, ERROR) \ FUNC(BAD_OBJECT_SHA1, ERROR) \ FUNC(BAD_PACKED_REF_ENTRY, ERROR) \ diff --git a/t/t1010-mktree.sh b/t/t1010-mktree.sh index 
e9973f7494..312fe6717a 100755 --- a/t/t1010-mktree.sh +++ b/t/t1010-mktree.sh @@ -11,10 +11,13 @@ test_expect_success setup ' git add "$d" || return 1 done && echo zero >one && - git update-index --add --info-only one && - git write-tree --missing-ok >tree.missing && - git ls-tree $(cat tree.missing) >top.missing && - git ls-tree -r $(cat tree.missing) >all.missing && + if test_have_prereq BROKEN_OBJECTS + then + git update-index --add --info-only one && + git write-tree --missing-ok >tree.missing && + git ls-tree $(cat tree.missing) >top.missing && + git ls-tree -r $(cat tree.missing) >all.missing + fi && echo one >one && git add one && git write-tree >tree && @@ -53,7 +56,7 @@ test_expect_success 'ls-tree output in wrong order given to mktree (2)' ' test_cmp tree.withsub actual ' -test_expect_success 'allow missing object with --missing' ' +test_expect_success BROKEN_OBJECTS 'allow missing object with --missing' ' git mktree --missing <top.missing >actual && test_cmp tree.missing actual ' diff --git a/t/t1450-fsck.sh b/t/t1450-fsck.sh index 5ae86c42be..c4b651c2dc 100755 --- a/t/t1450-fsck.sh +++ b/t/t1450-fsck.sh @@ -454,6 +454,60 @@ test_expect_success 'tag with NUL in header' ' test_grep "error in tag $tag.*unterminated header: NUL at offset" out ' +test_expect_success 'tag accepts gpgsig header even if not validly signed' ' + test_oid_cache <<-\EOF && + header sha1:gpgsig-sha256 + header sha256:gpgsig + EOF + header=$(test_oid header) && + sha=$(git rev-parse HEAD) && + cat >good-tag <<-EOF && + object $sha + type commit + tag good + tagger T A Gger 1234567890 -0000 + $header -----BEGIN PGP SIGNATURE----- + Not a valid signature + -----END PGP SIGNATURE----- + + This is a good tag. 
+ EOF + + tag=$(git hash-object --literally -t tag -w --stdin bad-tag <<-EOF && + object $sha + type commit + tag good + tagger T A Gger 1234567890 -0000 + $header -----BEGIN PGP SIGNATURE----- + Not a valid signature + -----END PGP SIGNATURE----- + junk + + This is a bad tag with junk at the end of the headers. + EOF + + tag=$(git hash-object --literally -t tag -w --stdin out && + test_grep "error in tag $tag.*invalid format - extra header" out +' + test_expect_success 'cleaned up' ' git fsck >actual 2>&1 && test_must_be_empty actual diff --git a/t/t1500-rev-parse.sh b/t/t1500-rev-parse.sh index 58a4583088..7739ab611b 100755 --- a/t/t1500-rev-parse.sh +++ b/t/t1500-rev-parse.sh @@ -207,6 +207,40 @@ test_expect_success 'rev-parse --show-object-format in repo' ' grep "unknown mode for --show-object-format: squeamish-ossifrage" err ' + +test_expect_success 'rev-parse --show-object-format in repo with compat mode' ' + mkdir repo && + ( + sane_unset GIT_DEFAULT_HASH && + cd repo && + git init --object-format=sha256 && + git config extensions.compatobjectformat sha1 && + echo sha256 >expect && + git rev-parse --show-object-format >actual && + test_cmp expect actual && + git rev-parse --show-object-format=storage >actual && + test_cmp expect actual && + git rev-parse --show-object-format=input >actual && + test_cmp expect actual && + git rev-parse --show-object-format=output >actual && + test_cmp expect actual && + echo sha1 >expect && + git rev-parse --show-object-format=compat >actual && + test_cmp expect actual && + test_must_fail git rev-parse --show-object-format=squeamish-ossifrage 2>err && + grep "unknown mode for --show-object-format: squeamish-ossifrage" err + ) && + mkdir repo2 && + ( + sane_unset GIT_DEFAULT_HASH && + cd repo2 && + git init --object-format=sha256 && + echo >expect && + git rev-parse --show-object-format=compat >actual && + test_cmp expect actual + ) +' + test_expect_success 'rev-parse --show-ref-format' ' test_detect_ref_format >expect && git 
rev-parse --show-ref-format >actual && diff --git a/t/test-lib-functions.sh b/t/test-lib-functions.sh index a28de7b19b..52d7759bf5 100644 --- a/t/test-lib-functions.sh +++ b/t/test-lib-functions.sh @@ -1708,11 +1708,16 @@ test_set_hash () { # Detect the hash algorithm in use. test_detect_hash () { case "${GIT_TEST_DEFAULT_HASH:-$GIT_TEST_BUILTIN_HASH}" in - "sha256") + *:*) + test_hash_algo="${GIT_TEST_DEFAULT_HASH%%:*}" + test_compat_hash_algo="${GIT_TEST_DEFAULT_HASH##*:}" + test_repo_compat_hash_algo="$test_compat_hash_algo" + ;; + sha256) test_hash_algo=sha256 test_compat_hash_algo=sha1 ;; - *) + sha1) test_hash_algo=sha1 test_compat_hash_algo=sha256 ;; diff --git a/t/test-lib.sh b/t/test-lib.sh index 562f950fb0..ef0ab7ec2d 100644 --- a/t/test-lib.sh +++ b/t/test-lib.sh @@ -1924,6 +1924,19 @@ test_lazy_prereq DEFAULT_HASH_ALGORITHM ' test_lazy_prereq DEFAULT_REPO_FORMAT ' test_have_prereq SHA1,REFFILES ' +# BROKEN_OBJECTS is a test whether we can write deliberately broken objects and +# expect them to work. When running using SHA-256 mode with SHA-1 +# compatibility, we cannot write such objects because there's no SHA-1 +# compatibility value for a nonexistent object. +test_lazy_prereq BROKEN_OBJECTS ' + ! test_have_prereq COMPAT_HASH +' + +# COMPAT_HASH is a test if we're operating in a repository with SHA-256 with +# SHA-1 compatibility. +test_lazy_prereq COMPAT_HASH ' + test -n "$test_repo_compat_hash_algo" +' # Ensure that no test accidentally triggers a Git command # that runs the actual maintenance scheduler, affecting a user's