From f9221e2cf5049805d9151b3db6a5eef07b1cc92e Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Wed, 23 Jun 2021 14:39:07 -0400 Subject: [PATCH 1/4] csum-file: introduce checksum_valid() Introduce a new function which checks the validity of a file's trailing checksum. This is similar to hashfd_check(), but different since it is intended to be used by callers who aren't writing the same data (like `git index-pack --verify`), but who instead want to validate the integrity of data that they are reading. Rewrite the first of two callers which could benefit from this new function in pack-check.c. Subsequent callers will be added in the following patches. Helped-by: Jeff King Signed-off-by: Jeff King Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- csum-file.c | 16 ++++++++++++++++ csum-file.h | 3 +++ pack-check.c | 11 +---------- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/csum-file.c b/csum-file.c index 7510950fa3..60f58f662a 100644 --- a/csum-file.c +++ b/csum-file.c @@ -187,3 +187,19 @@ uint32_t crc32_end(struct hashfile *f) f->do_crc = 0; return f->crc32; } + +int hashfile_checksum_valid(const unsigned char *data, size_t total_len) +{ + unsigned char got[GIT_MAX_RAWSZ]; + git_hash_ctx ctx; + size_t data_len = total_len - the_hash_algo->rawsz; + + if (total_len < the_hash_algo->rawsz) + return 0; /* say "too short"? */ + + the_hash_algo->init_fn(&ctx); + the_hash_algo->update_fn(&ctx, data, data_len); + the_hash_algo->final_fn(got, &ctx); + + return hasheq(got, data + data_len); +} diff --git a/csum-file.h b/csum-file.h index e54d53d1d0..87e3879f1c 100644 --- a/csum-file.h +++ b/csum-file.h @@ -42,6 +42,9 @@ void hashflush(struct hashfile *f); void crc32_begin(struct hashfile *); uint32_t crc32_end(struct hashfile *); +/* Verify checksum validity while reading. Returns non-zero on success. 
*/ +int hashfile_checksum_valid(const unsigned char *data, size_t len); + /* * Returns the total number of bytes fed to the hashfile so far (including ones * that have not been written out to the descriptor yet). diff --git a/pack-check.c b/pack-check.c index 4b089fe8ec..c8e560d71a 100644 --- a/pack-check.c +++ b/pack-check.c @@ -164,22 +164,13 @@ static int verify_packfile(struct repository *r, int verify_pack_index(struct packed_git *p) { - size_t len; - const unsigned char *index_base; - git_hash_ctx ctx; - unsigned char hash[GIT_MAX_RAWSZ]; int err = 0; if (open_pack_index(p)) return error("packfile %s index not opened", p->pack_name); - index_base = p->index_data; - len = p->index_size - the_hash_algo->rawsz; /* Verify SHA1 sum of the index file */ - the_hash_algo->init_fn(&ctx); - the_hash_algo->update_fn(&ctx, index_base, len); - the_hash_algo->final_fn(hash, &ctx); - if (!hasheq(hash, index_base + len)) + if (!hashfile_checksum_valid(p->index_data, p->index_size)) err = error("Packfile index for %s hash mismatch", p->pack_name); return err; From 15316a4732eeb0dab27ba406cb80e8704cb9b46d Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Wed, 23 Jun 2021 14:39:09 -0400 Subject: [PATCH 2/4] commit-graph: rewrite to use checksum_valid() Rewrite an existing caller in `git commit-graph verify` to take advantage of checksum_valid(). Note that the replacement isn't a verbatim cut-and-paste, since the new function avoids using hashfile at all and instead talks to the_hash_algo directly, but it is functionally equivalent. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- commit-graph.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/commit-graph.c b/commit-graph.c index 2bcb4e0f89..1a2602da61 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -2422,14 +2422,16 @@ static void graph_report(const char *fmt, ...) 
#define GENERATION_ZERO_EXISTS 1 #define GENERATION_NUMBER_EXISTS 2 +static int commit_graph_checksum_valid(struct commit_graph *g) +{ + return hashfile_checksum_valid(g->data, g->data_len); +} + int verify_commit_graph(struct repository *r, struct commit_graph *g, int flags) { uint32_t i, cur_fanout_pos = 0; struct object_id prev_oid, cur_oid; - unsigned char checksum[GIT_MAX_HEXSZ]; int generation_zero = 0; - struct hashfile *f; - int devnull; struct progress *progress = NULL; int local_error = 0; @@ -2442,11 +2444,7 @@ int verify_commit_graph(struct repository *r, struct commit_graph *g, int flags) if (verify_commit_graph_error) return verify_commit_graph_error; - devnull = open("/dev/null", O_WRONLY); - f = hashfd(devnull, NULL); - hashwrite(f, g->data, g->data_len - g->hash_len); - finalize_hashfile(f, checksum, CSUM_CLOSE); - if (!hasheq(checksum, g->data + g->data_len - g->hash_len)) { + if (!commit_graph_checksum_valid(g)) { graph_report(_("the commit-graph file has incorrect checksum and is likely corrupt")); verify_commit_graph_error = VERIFY_COMMIT_GRAPH_ERROR_HASH; } From ec1e28ef9c30468d2e76e41c88a1611e63047f61 Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Wed, 23 Jun 2021 14:39:12 -0400 Subject: [PATCH 3/4] midx: don't reuse corrupt MIDXs when writing When writing a new multi-pack index, Git tries to reuse as much of the data from an existing MIDX as possible, like object offsets. This is done to avoid re-opening a bunch of *.idx files unnecessarily, but can lead to problems if the data we are reusing is corrupt. That's because we'll blindly reuse data from an existing MIDX without checking its trailing checksum for validity. So if there is memory corruption while writing a MIDX, or disk corruption in the intervening period between writing and reuse, we'll blindly propagate those bad values forward. 
Suppose we experience a memory corruption while writing a MIDX such that we write an incorrect object offset (or alternatively, the disk corrupts the data after being written, but before being reused). Then when we go to write a new MIDX, we'll reuse the bad object offset without checking its validity. This means that the MIDX we just wrote is broken, but its trailing checksum is intact, since we never bothered to look at the values before writing. In the above, a "git multi-pack-index verify" would have caught the problem before writing, but writing a new MIDX wouldn't have noticed anything wrong, blindly carrying forward the corrupt offset. Individual pack indexes check their validity by verifying the crc32 attached to each entry when carrying data forward during a repack. We could solve this problem for MIDXs in the same way, but individual crc32's don't make much sense, since their entries are so small. Likewise, checking the whole file on every read may be prohibitively expensive if a repository has a lot of objects, packs, or both. But we can check the trailing checksum when reusing an existing MIDX when writing a new one. And a corrupt MIDX need not stop us from writing a new one, since we can just avoid reusing the existing one at all and pretend as if we are writing a new MIDX from scratch. 
Suggested-by: Derrick Stolee Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- midx.c | 10 ++++++++++ t/t5319-multi-pack-index.sh | 8 ++++++++ 2 files changed, 18 insertions(+) diff --git a/midx.c b/midx.c index 21d6a05e88..a12cbbf928 100644 --- a/midx.c +++ b/midx.c @@ -885,6 +885,11 @@ static void write_midx_reverse_index(char *midx_name, unsigned char *midx_hash, static void clear_midx_files_ext(struct repository *r, const char *ext, unsigned char *keep_hash); +static int midx_checksum_valid(struct multi_pack_index *m) +{ + return hashfile_checksum_valid(m->data, m->data_len); +} + static int write_midx_internal(const char *object_dir, struct multi_pack_index *m, struct string_list *packs_to_drop, const char *preferred_pack_name, @@ -911,6 +916,11 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * else ctx.m = load_multi_pack_index(object_dir, 1); + if (ctx.m && !midx_checksum_valid(ctx.m)) { + warning(_("ignoring existing multi-pack-index; checksum mismatch")); + ctx.m = NULL; + } + ctx.nr = 0; ctx.alloc = ctx.m ? 
ctx.m->num_packs : 16; ctx.info = NULL; diff --git a/t/t5319-multi-pack-index.sh b/t/t5319-multi-pack-index.sh index 5641d158df..d582f370c4 100755 --- a/t/t5319-multi-pack-index.sh +++ b/t/t5319-multi-pack-index.sh @@ -410,6 +410,14 @@ test_expect_success 'git-fsck incorrect offset' ' "git -c core.multipackindex=true fsck" ' +test_expect_success 'corrupt MIDX is not reused' ' + corrupt_midx_and_verify $MIDX_BYTE_OFFSET "\377" $objdir \ + "incorrect object offset" && + git multi-pack-index write 2>err && + test_i18ngrep checksum.mismatch err && + git multi-pack-index verify +' + test_expect_success 'repack progress off for redirected stderr' ' GIT_PROGRESS_DELAY=0 git multi-pack-index --object-dir=$objdir repack 2>err && test_line_count = 0 err From f89ecf79888a48e0adf14d0e05c69ee09e853fd5 Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Wed, 23 Jun 2021 14:39:15 -0400 Subject: [PATCH 4/4] midx: report checksum mismatches during 'verify' 'git multi-pack-index verify' inspects the data in an existing MIDX for correctness by checking that the recorded object offsets are correct, and so on. But it does not check that the file's trailing checksum matches the data that it records. So, if an on-disk corruption happened to occur in the final few bytes (and all other data was recorded correctly), we would: - get a clean result from 'git multi-pack-index verify', but - be unable to reuse the existing MIDX when writing a new one (since we now check for checksum mismatches before reusing a MIDX) Teach the 'verify' sub-command to recognize corruption in the checksum by calling midx_checksum_valid(). 
Suggested-by: Derrick Stolee Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- midx.c | 3 +++ t/t5319-multi-pack-index.sh | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/midx.c b/midx.c index a12cbbf928..9a35b0255d 100644 --- a/midx.c +++ b/midx.c @@ -1228,6 +1228,9 @@ int verify_midx_file(struct repository *r, const char *object_dir, unsigned flag return result; } + if (!midx_checksum_valid(m)) + midx_report(_("incorrect checksum")); + if (flags & MIDX_PROGRESS) progress = start_delayed_progress(_("Looking for referenced packfiles"), m->num_packs); diff --git a/t/t5319-multi-pack-index.sh b/t/t5319-multi-pack-index.sh index d582f370c4..7609f1ea64 100755 --- a/t/t5319-multi-pack-index.sh +++ b/t/t5319-multi-pack-index.sh @@ -418,6 +418,11 @@ test_expect_success 'corrupt MIDX is not reused' ' git multi-pack-index verify ' +test_expect_success 'verify incorrect checksum' ' + pos=$(($(wc -c <$objdir/pack/multi-pack-index) - 1)) && + corrupt_midx_and_verify $pos "\377" $objdir "incorrect checksum" +' + test_expect_success 'repack progress off for redirected stderr' ' GIT_PROGRESS_DELAY=0 git multi-pack-index --object-dir=$objdir repack 2>err && test_line_count = 0 err