1
0
Fork 0
mirror of https://github.com/git/git.git synced 2024-05-09 06:46:16 +02:00

Merge branch 'tb/path-filter-fix' into jch

The Bloom filter used for path limited history traversal was broken
on systems whose "char" is unsigned; update the implementation and
bump the format version to 2.

* tb/path-filter-fix:
  bloom: introduce `deinit_bloom_filters()`
  commit-graph: reuse existing Bloom filters where possible
  object.h: fix mis-aligned flag bits table
  commit-graph: new Bloom filter version that fixes murmur3
  commit-graph: unconditionally load Bloom filters
  bloom: prepare to discard incompatible Bloom filters
  bloom: annotate filters with hash version
  repo-settings: introduce commitgraph.changedPathsVersion
  t4216: test changed path filters with high bit paths
  t/helper/test-read-graph: implement `bloom-filters` mode
  bloom.h: make `load_bloom_filter_from_graph()` public
  t/helper/test-read-graph.c: extract `dump_graph_info()`
  gitformat-commit-graph: describe version 2 of BDAT
  commit-graph: ensure Bloom filters are read with consistent settings
  revision.c: consult Bloom filters for root commits
  t/t4216-log-bloom.sh: harden `test_bloom_filters_not_used()`
This commit is contained in:
Junio C Hamano 2024-04-26 09:22:47 -07:00
commit 3d5247292d
14 changed files with 721 additions and 58 deletions

View File

@ -9,6 +9,29 @@ commitGraph.maxNewFilters::
commit-graph write` (c.f., linkgit:git-commit-graph[1]).
commitGraph.readChangedPaths::
If true, then git will use the changed-path Bloom filters in the
commit-graph file (if it exists, and they are present). Defaults to
true. See linkgit:git-commit-graph[1] for more information.
Deprecated. Equivalent to commitGraph.changedPathsVersion=-1 if true, and
commitGraph.changedPathsVersion=0 if false. (If commitGraph.changedPathsVersion
is also set, commitGraph.changedPathsVersion takes precedence.)
commitGraph.changedPathsVersion::
Specifies the version of the changed-path Bloom filters that Git will read and
write. May be -1, 0, 1, or 2. Note that values greater than 1 may be
incompatible with older versions of Git which do not yet understand
those versions. Use caution when operating in a mixed-version
environment.
+
Defaults to -1.
+
If -1, Git will use the version of the changed-path Bloom filters in the
repository, defaulting to 1 if there are none.
+
If 0, Git will not read any Bloom filters, and will write version 1 Bloom
filters when instructed to write.
+
If 1, Git will only read version 1 Bloom filters, and will write version 1
Bloom filters.
+
If 2, Git will only read version 2 Bloom filters, and will write version 2
Bloom filters.
+
See linkgit:git-commit-graph[1] for more information.

View File

@ -142,13 +142,16 @@ All multi-byte numbers are in network byte order.
==== Bloom Filter Data (ID: {'B', 'D', 'A', 'T'}) [Optional]
* It starts with header consisting of three unsigned 32-bit integers:
- Version of the hash algorithm being used. We currently only support
value 1 which corresponds to the 32-bit version of the murmur3 hash
- Version of the hash algorithm being used. We currently support
value 2 which corresponds to the 32-bit version of the murmur3 hash
implemented exactly as described in
https://en.wikipedia.org/wiki/MurmurHash#Algorithm and the double
hashing technique using seed values 0x293ae76f and 0x7e646e2c as
described in https://doi.org/10.1007/978-3-540-30494-4_26 "Bloom Filters
in Probabilistic Verification"
in Probabilistic Verification". Version 1 Bloom filters have a bug that appears
when char is signed and the repository has path names that have characters >=
0x80; Git supports reading and writing them, but this ability will be removed
in a future version of Git.
- The number of times a path is hashed and hence the number of bit positions
that cumulatively determine whether a file is present in the commit.
- The minimum number of bits 'b' per entry in the Bloom filter. If the filter

208
bloom.c
View File

@ -6,6 +6,9 @@
#include "commit-graph.h"
#include "commit.h"
#include "commit-slab.h"
#include "tree.h"
#include "tree-walk.h"
#include "config.h"
define_commit_slab(bloom_filter_slab, struct bloom_filter);
@ -48,9 +51,9 @@ static int check_bloom_offset(struct commit_graph *g, uint32_t pos,
return -1;
}
static int load_bloom_filter_from_graph(struct commit_graph *g,
struct bloom_filter *filter,
uint32_t graph_pos)
int load_bloom_filter_from_graph(struct commit_graph *g,
struct bloom_filter *filter,
uint32_t graph_pos)
{
uint32_t lex_pos, start_index, end_index;
@ -88,6 +91,8 @@ static int load_bloom_filter_from_graph(struct commit_graph *g,
filter->data = (unsigned char *)(g->chunk_bloom_data +
sizeof(unsigned char) * start_index +
BLOOMDATA_CHUNK_HEADER_SIZE);
filter->version = g->bloom_filter_settings->hash_version;
filter->to_free = NULL;
return 1;
}
@ -99,7 +104,64 @@ static int load_bloom_filter_from_graph(struct commit_graph *g,
* Not considered to be cryptographically secure.
* Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
*/
uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len)
uint32_t murmur3_seeded_v2(uint32_t seed, const char *data, size_t len)
{
	const uint32_t c1 = 0xcc9e2d51;
	const uint32_t c2 = 0x1b873593;
	const uint32_t r1 = 15;
	const uint32_t r2 = 13;
	const uint32_t m = 5;
	const uint32_t n = 0xe6546b64;
	/*
	 * Number of complete 32-bit words in the input. This is a size_t
	 * (as is the loop index below) because an `int` cannot represent
	 * the word count of every possible `len`.
	 */
	const size_t len4 = len / sizeof(uint32_t);
	size_t i;
	uint32_t k1 = 0;
	const char *tail;
	uint32_t k;

	/*
	 * Mix the input one little-endian 32-bit word at a time. Each byte
	 * is converted to `unsigned char` before being widened so that
	 * bytes >= 0x80 are never sign-extended, regardless of whether
	 * plain `char` is signed on this platform.
	 */
	for (i = 0; i < len4; i++) {
		uint32_t byte1 = (uint32_t)(unsigned char)data[4*i];
		uint32_t byte2 = ((uint32_t)(unsigned char)data[4*i + 1]) << 8;
		uint32_t byte3 = ((uint32_t)(unsigned char)data[4*i + 2]) << 16;
		uint32_t byte4 = ((uint32_t)(unsigned char)data[4*i + 3]) << 24;
		k = byte1 | byte2 | byte3 | byte4;
		k *= c1;
		k = rotate_left(k, r1);
		k *= c2;

		seed ^= k;
		seed = rotate_left(seed, r2) * m + n;
	}

	/* Fold in the 1-3 trailing bytes that do not fill a whole word. */
	tail = (data + len4 * sizeof(uint32_t));

	switch (len & (sizeof(uint32_t) - 1)) {
	case 3:
		k1 ^= ((uint32_t)(unsigned char)tail[2]) << 16;
		/*-fallthrough*/
	case 2:
		k1 ^= ((uint32_t)(unsigned char)tail[1]) << 8;
		/*-fallthrough*/
	case 1:
		k1 ^= ((uint32_t)(unsigned char)tail[0]) << 0;
		k1 *= c1;
		k1 = rotate_left(k1, r1);
		k1 *= c2;
		seed ^= k1;
		break;
	}

	/* Final avalanche: mix in the length, then scramble the bits. */
	seed ^= (uint32_t)len;
	seed ^= (seed >> 16);
	seed *= 0x85ebca6b;
	seed ^= (seed >> 13);
	seed *= 0xc2b2ae35;
	seed ^= (seed >> 16);

	return seed;
}
static uint32_t murmur3_seeded_v1(uint32_t seed, const char *data, size_t len)
{
const uint32_t c1 = 0xcc9e2d51;
const uint32_t c2 = 0x1b873593;
@ -164,8 +226,14 @@ void fill_bloom_key(const char *data,
int i;
const uint32_t seed0 = 0x293ae76f;
const uint32_t seed1 = 0x7e646e2c;
const uint32_t hash0 = murmur3_seeded(seed0, data, len);
const uint32_t hash1 = murmur3_seeded(seed1, data, len);
uint32_t hash0, hash1;
if (settings->hash_version == 2) {
hash0 = murmur3_seeded_v2(seed0, data, len);
hash1 = murmur3_seeded_v2(seed1, data, len);
} else {
hash0 = murmur3_seeded_v1(seed0, data, len);
hash1 = murmur3_seeded_v1(seed1, data, len);
}
key->hashes = (uint32_t *)xcalloc(settings->num_hashes, sizeof(uint32_t));
for (i = 0; i < settings->num_hashes; i++)
@ -197,6 +265,18 @@ void init_bloom_filters(void)
init_bloom_filter_slab(&bloom_filters);
}
/*
 * Release the buffer recorded in `filter->to_free`, if any.
 * A NULL `filter` is silently ignored.
 */
static void free_one_bloom_filter(struct bloom_filter *filter)
{
	if (filter)
		free(filter->to_free);
}
/*
 * Tear down the file-level `bloom_filters` slab, handing
 * free_one_bloom_filter() to the slab's deep-clear routine so that the
 * `to_free` buffer of each stored filter can be released.
 * NOTE(review): presumably the callback is invoked once per slab entry;
 * confirm against the commit-slab implementation.
 */
void deinit_bloom_filters(void)
{
deep_clear_bloom_filter_slab(&bloom_filters, free_one_bloom_filter);
}
static int pathmap_cmp(const void *hashmap_cmp_fn_data UNUSED,
const struct hashmap_entry *eptr,
const struct hashmap_entry *entry_or_key,
@ -210,11 +290,97 @@ static int pathmap_cmp(const void *hashmap_cmp_fn_data UNUSED,
return strcmp(e1->path, e2->path);
}
static void init_truncated_large_filter(struct bloom_filter *filter)
static void init_truncated_large_filter(struct bloom_filter *filter,
int version)
{
filter->data = xmalloc(1);
filter->data = filter->to_free = xmalloc(1);
filter->data[0] = 0xFF;
filter->len = 1;
filter->version = version;
}
#define VISITED (1u<<21)
#define HIGH_BITS (1u<<22)
/*
 * Report whether tree `t`, or any subtree reachable from it, contains an
 * entry whose path has a byte with the high bit (0x80) set.
 *
 * The result is cached in the tree's object flags: VISITED marks a tree
 * that has already been scanned, HIGH_BITS marks a positive answer. A
 * tree that fails to parse is conservatively reported as a match.
 */
static int has_entries_with_high_bit(struct repository *r, struct tree *t)
{
	struct tree_desc desc;
	struct name_entry entry;
	int found = 0;

	if (parse_tree(t))
		return 1;

	/* Already scanned: answer from the cached flag. */
	if (t->object.flags & VISITED)
		return !!(t->object.flags & HIGH_BITS);

	init_tree_desc(&desc, &t->object.oid, t->buffer, t->size);
	while (!found && tree_entry(&desc, &entry)) {
		size_t pos;

		for (pos = 0; pos < entry.pathlen; pos++) {
			if (entry.path[pos] & 0x80) {
				found = 1;
				break;
			}
		}

		/* Only descend when this entry's own name was clean. */
		if (!found && S_ISDIR(entry.mode)) {
			struct tree *sub = lookup_tree(r, &entry.oid);
			if (sub && has_entries_with_high_bit(r, sub))
				found = 1;
		}
	}

	if (found)
		t->object.flags |= HIGH_BITS;
	t->object.flags |= VISITED;

	return !!(t->object.flags & HIGH_BITS);
}
/*
 * Report whether the root tree of commit `c` (or anything below it) has a
 * path containing a high-bit byte. If the commit cannot be parsed or has
 * no tree, answer 1 so that callers treat it as if it had such paths.
 */
static int commit_tree_has_high_bit_paths(struct repository *r,
					  struct commit *c)
{
	struct tree *root;

	if (repo_parse_commit(r, c))
		return 1;

	root = repo_get_commit_tree(r, c);
	return root ? has_entries_with_high_bit(r, root) : 1;
}
/*
 * Try to reuse `filter` under a different hash version without
 * recomputing it.
 *
 * The filter is relabelled with `hash_version` and returned only when
 * neither the tree of `c` nor the tree of the first entry in `c->parents`
 * (if there is one) contains a path with a high-bit byte. Otherwise NULL
 * is returned and `filter` is left untouched.
 */
static struct bloom_filter *upgrade_filter(struct repository *r, struct commit *c,
					   struct bloom_filter *filter,
					   int hash_version)
{
	struct commit_list *first_parent = c->parents;

	if (commit_tree_has_high_bit_paths(r, c) ||
	    (first_parent &&
	     commit_tree_has_high_bit_paths(r, first_parent->item)))
		return NULL;

	filter->version = hash_version;
	return filter;
}
/*
 * Look up (without computing) the Bloom filter for commit `c` and return
 * it only if its hash version is acceptable under the repository's
 * `commit_graph_changed_paths_version` setting. Returns NULL when no
 * filter is available or when its version does not match.
 */
struct bloom_filter *get_bloom_filter(struct repository *r, struct commit *c)
{
	struct bloom_filter *filter =
		get_or_compute_bloom_filter(r, c, 0, NULL, NULL);
	int wanted;

	if (!filter)
		return NULL;

	prepare_repo_settings(r);
	wanted = r->settings.commit_graph_changed_paths_version;

	/* -1 accepts any version; anything else must match exactly. */
	if (wanted != -1 && wanted != filter->version)
		return NULL; /* unusable filter */

	return filter;
}
struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
@ -242,8 +408,23 @@ struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
filter, graph_pos);
}
if (filter->data && filter->len)
return filter;
if (filter->data && filter->len) {
struct bloom_filter *upgrade;
if (!settings || settings->hash_version == filter->version)
return filter;
/* version mismatch, see if we can upgrade */
if (compute_if_not_present &&
git_env_bool("GIT_TEST_UPGRADE_BLOOM_FILTERS", 1)) {
upgrade = upgrade_filter(r, c, filter,
settings->hash_version);
if (upgrade) {
if (computed)
*computed |= BLOOM_UPGRADED;
return upgrade;
}
}
}
if (!compute_if_not_present)
return NULL;
@ -299,19 +480,22 @@ struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
}
if (hashmap_get_size(&pathmap) > settings->max_changed_paths) {
init_truncated_large_filter(filter);
init_truncated_large_filter(filter,
settings->hash_version);
if (computed)
*computed |= BLOOM_TRUNC_LARGE;
goto cleanup;
}
filter->len = (hashmap_get_size(&pathmap) * settings->bits_per_entry + BITS_PER_WORD - 1) / BITS_PER_WORD;
filter->version = settings->hash_version;
if (!filter->len) {
if (computed)
*computed |= BLOOM_TRUNC_EMPTY;
filter->len = 1;
}
CALLOC_ARRAY(filter->data, filter->len);
filter->to_free = filter->data;
hashmap_for_each_entry(&pathmap, &iter, e, entry) {
struct bloom_key key;
@ -325,7 +509,7 @@ struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
} else {
for (i = 0; i < diff_queued_diff.nr; i++)
diff_free_filepair(diff_queued_diff.queue[i]);
init_truncated_large_filter(filter);
init_truncated_large_filter(filter, settings->hash_version);
if (computed)
*computed |= BLOOM_TRUNC_LARGE;

38
bloom.h
View File

@ -3,13 +3,16 @@
struct commit;
struct repository;
struct commit_graph;
struct bloom_filter_settings {
/*
* The version of the hashing technique being used.
* We currently only support version = 1 which is
* The newest version is 2, which is
* the seeded murmur3 hashing technique implemented
* in bloom.c.
* in bloom.c. Bloom filters of version 1 were created
* with prior versions of Git, which had a bug in the
* implementation of the hash function.
*/
uint32_t hash_version;
@ -52,6 +55,9 @@ struct bloom_filter_settings {
struct bloom_filter {
/*
 * The filter's bytes. This either points into the commit-graph's
 * Bloom data chunk or at a buffer allocated while computing the
 * filter.
 */
unsigned char *data;
/* Number of bytes available at `data`. */
size_t len;
/* Hash version (see bloom_filter_settings.hash_version) of this filter. */
int version;
/*
 * Buffer that is passed to free() when the filter slab is cleared;
 * NULL when `data` was loaded from the commit-graph.
 */
void *to_free;
};
/*
@ -68,6 +74,10 @@ struct bloom_key {
uint32_t *hashes;
};
int load_bloom_filter_from_graph(struct commit_graph *g,
struct bloom_filter *filter,
uint32_t graph_pos);
/*
* Calculate the murmur3 32-bit hash value for the given data
* using the given seed.
@ -75,7 +85,7 @@ struct bloom_key {
* Not considered to be cryptographically secure.
* Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
*/
uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len);
uint32_t murmur3_seeded_v2(uint32_t seed, const char *data, size_t len);
void fill_bloom_key(const char *data,
size_t len,
@ -88,12 +98,14 @@ void add_key_to_filter(const struct bloom_key *key,
const struct bloom_filter_settings *settings);
void init_bloom_filters(void);
void deinit_bloom_filters(void);
enum bloom_filter_computed {
BLOOM_NOT_COMPUTED = (1 << 0),
BLOOM_COMPUTED = (1 << 1),
BLOOM_TRUNC_LARGE = (1 << 2),
BLOOM_TRUNC_EMPTY = (1 << 3),
BLOOM_UPGRADED = (1 << 4),
};
struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
@ -102,8 +114,24 @@ struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
const struct bloom_filter_settings *settings,
enum bloom_filter_computed *computed);
#define get_bloom_filter(r, c) get_or_compute_bloom_filter( \
(r), (c), 0, NULL, NULL)
/*
* Find the Bloom filter associated with the given commit "c".
*
* If any of the following are true
*
* - the repository does not have a commit-graph, or
* - the repository disables reading from the commit-graph, or
* - the given commit does not have a Bloom filter computed, or
* - there is a Bloom filter for commit "c", but it cannot be read
* because the filter uses an incompatible version of murmur3
*
* , then `get_bloom_filter()` will return NULL. Otherwise, the corresponding
* Bloom filter will be returned.
*
* For callers who wish to inspect Bloom filters with incompatible hash
* versions, use get_or_compute_bloom_filter().
*/
struct bloom_filter *get_bloom_filter(struct repository *r, struct commit *c);
int bloom_filter_contains(const struct bloom_filter *filter,
const struct bloom_key *key,

View File

@ -344,7 +344,6 @@ static int graph_read_bloom_data(const unsigned char *chunk_start,
size_t chunk_size, void *data)
{
struct commit_graph *g = data;
uint32_t hash_version;
if (chunk_size < BLOOMDATA_CHUNK_HEADER_SIZE) {
warning(_("ignoring too-small changed-path chunk"
@ -356,13 +355,9 @@ static int graph_read_bloom_data(const unsigned char *chunk_start,
g->chunk_bloom_data = chunk_start;
g->chunk_bloom_data_size = chunk_size;
hash_version = get_be32(chunk_start);
if (hash_version != 1)
return 0;
g->bloom_filter_settings = xmalloc(sizeof(struct bloom_filter_settings));
g->bloom_filter_settings->hash_version = hash_version;
g->bloom_filter_settings->hash_version = get_be32(chunk_start);
g->bloom_filter_settings->num_hashes = get_be32(chunk_start + 4);
g->bloom_filter_settings->bits_per_entry = get_be32(chunk_start + 8);
g->bloom_filter_settings->max_changed_paths = DEFAULT_BLOOM_MAX_CHANGES;
@ -459,7 +454,7 @@ struct commit_graph *parse_commit_graph(struct repo_settings *s,
graph->read_generation_data = 1;
}
if (s->commit_graph_read_changed_paths) {
if (s->commit_graph_changed_paths_version) {
read_chunk(cf, GRAPH_CHUNKID_BLOOMINDEXES,
graph_read_bloom_index, graph);
read_chunk(cf, GRAPH_CHUNKID_BLOOMDATA,
@ -543,6 +538,31 @@ static int validate_mixed_generation_chain(struct commit_graph *g)
return 0;
}
/*
 * Walk the commit-graph chain starting at `g` and make sure every layer
 * that carries Bloom filters agrees with the first such layer found.
 *
 * A later layer whose bits_per_entry, num_hashes or hash_version differs
 * has its Bloom chunks and settings dropped, and a warning is emitted.
 */
static void validate_mixed_bloom_settings(struct commit_graph *g)
{
	struct bloom_filter_settings *reference = NULL;

	while (g) {
		struct bloom_filter_settings *cur = g->bloom_filter_settings;

		if (cur && !reference) {
			/* First layer with filters sets the expectation. */
			reference = cur;
		} else if (cur &&
			   (cur->bits_per_entry != reference->bits_per_entry ||
			    cur->num_hashes != reference->num_hashes ||
			    cur->hash_version != reference->hash_version)) {
			g->chunk_bloom_indexes = NULL;
			g->chunk_bloom_data = NULL;
			FREE_AND_NULL(g->bloom_filter_settings);

			warning(_("disabling Bloom filters for commit-graph "
				  "layer '%s' due to incompatible settings"),
				oid_to_hex(&g->oid));
		}

		g = g->base_graph;
	}
}
static int add_graph_to_chain(struct commit_graph *g,
struct commit_graph *chain,
struct object_id *oids,
@ -666,6 +686,7 @@ struct commit_graph *load_commit_graph_chain_fd_st(struct repository *r,
}
validate_mixed_generation_chain(graph_chain);
validate_mixed_bloom_settings(graph_chain);
free(oids);
fclose(fp);
@ -810,6 +831,7 @@ void close_commit_graph(struct raw_object_store *o)
return;
clear_commit_graph_data_slab(&commit_graph_data_slab);
deinit_bloom_filters();
free_commit_graph(o->commit_graph);
o->commit_graph = NULL;
}
@ -1147,6 +1169,7 @@ struct write_commit_graph_context {
int count_bloom_filter_not_computed;
int count_bloom_filter_trunc_empty;
int count_bloom_filter_trunc_large;
int count_bloom_filter_upgraded;
};
static int write_graph_chunk_fanout(struct hashfile *f,
@ -1754,6 +1777,8 @@ static void trace2_bloom_filter_write_statistics(struct write_commit_graph_conte
ctx->count_bloom_filter_trunc_empty);
trace2_data_intmax("commit-graph", ctx->r, "filter-trunc-large",
ctx->count_bloom_filter_trunc_large);
trace2_data_intmax("commit-graph", ctx->r, "filter-upgraded",
ctx->count_bloom_filter_upgraded);
}
static void compute_bloom_filters(struct write_commit_graph_context *ctx)
@ -1795,6 +1820,8 @@ static void compute_bloom_filters(struct write_commit_graph_context *ctx)
ctx->count_bloom_filter_trunc_empty++;
if (computed & BLOOM_TRUNC_LARGE)
ctx->count_bloom_filter_trunc_large++;
} else if (computed & BLOOM_UPGRADED) {
ctx->count_bloom_filter_upgraded++;
} else if (computed & BLOOM_NOT_COMPUTED)
ctx->count_bloom_filter_not_computed++;
ctx->total_bloom_filter_data_size += filter
@ -2478,6 +2505,13 @@ int write_commit_graph(struct object_directory *odb,
}
if (!commit_graph_compatible(r))
return 0;
if (r->settings.commit_graph_changed_paths_version < -1
|| r->settings.commit_graph_changed_paths_version > 2) {
warning(_("attempting to write a commit-graph, but "
"'commitgraph.changedPathsVersion' (%d) is not supported"),
r->settings.commit_graph_changed_paths_version);
return 0;
}
CALLOC_ARRAY(ctx, 1);
ctx->r = r;
@ -2490,6 +2524,7 @@ int write_commit_graph(struct object_directory *odb,
ctx->write_generation_data = (get_configured_generation_version(r) == 2);
ctx->num_generation_data_overflows = 0;
bloom_settings.hash_version = r->settings.commit_graph_changed_paths_version;
bloom_settings.bits_per_entry = git_env_ulong("GIT_TEST_BLOOM_SETTINGS_BITS_PER_ENTRY",
bloom_settings.bits_per_entry);
bloom_settings.num_hashes = git_env_ulong("GIT_TEST_BLOOM_SETTINGS_NUM_HASHES",
@ -2519,12 +2554,20 @@ int write_commit_graph(struct object_directory *odb,
g = ctx->r->objects->commit_graph;
/* We have changed-paths already. Keep them in the next graph */
if (g && g->chunk_bloom_data) {
if (g && g->bloom_filter_settings) {
ctx->changed_paths = 1;
ctx->bloom_settings = g->bloom_filter_settings;
/* don't propagate the hash_version unless unspecified */
if (bloom_settings.hash_version == -1)
bloom_settings.hash_version = g->bloom_filter_settings->hash_version;
bloom_settings.bits_per_entry = g->bloom_filter_settings->bits_per_entry;
bloom_settings.num_hashes = g->bloom_filter_settings->num_hashes;
bloom_settings.max_changed_paths = g->bloom_filter_settings->max_changed_paths;
}
}
bloom_settings.hash_version = bloom_settings.hash_version == 2 ? 2 : 1;
if (ctx->split) {
struct commit_graph *g = ctx->r->objects->commit_graph;
@ -2607,6 +2650,9 @@ int write_commit_graph(struct object_directory *odb,
res = write_commit_graph_file(ctx);
if (ctx->changed_paths)
deinit_bloom_filters();
if (ctx->split)
mark_commit_graphs(ctx);

View File

@ -62,7 +62,7 @@ void object_array_init(struct object_array *array);
/*
* object flag allocation:
* revision.h: 0---------10 15 23------27
* revision.h: 0---------10 15 23------27
* fetch-pack.c: 01 67
* negotiator/default.c: 2--5
* walker.c: 0-2
@ -75,6 +75,7 @@ void object_array_init(struct object_array *array);
* commit-reach.c: 16-----19
* sha1-name.c: 20
* list-objects-filter.c: 21
* bloom.c: 2122
* builtin/fsck.c: 0--3
* builtin/gc.c: 0
* builtin/index-pack.c: 2021

View File

@ -19,7 +19,7 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size)
* possible.
*/
the_repository->settings.commit_graph_generation_version = 2;
the_repository->settings.commit_graph_read_changed_paths = 1;
the_repository->settings.commit_graph_changed_paths_version = 1;
g = parse_commit_graph(&the_repository->settings, (void *)data, size);
repo_clear(the_repository);
free_commit_graph(g);

View File

@ -23,6 +23,7 @@ void prepare_repo_settings(struct repository *r)
int value;
const char *strval;
int manyfiles;
int read_changed_paths;
if (!r->gitdir)
BUG("Cannot add settings for uninitialized repository");
@ -54,7 +55,10 @@ void prepare_repo_settings(struct repository *r)
/* Commit graph config or default, does not cascade (simple) */
repo_cfg_bool(r, "core.commitgraph", &r->settings.core_commit_graph, 1);
repo_cfg_int(r, "commitgraph.generationversion", &r->settings.commit_graph_generation_version, 2);
repo_cfg_bool(r, "commitgraph.readchangedpaths", &r->settings.commit_graph_read_changed_paths, 1);
repo_cfg_bool(r, "commitgraph.readchangedpaths", &read_changed_paths, 1);
repo_cfg_int(r, "commitgraph.changedpathsversion",
&r->settings.commit_graph_changed_paths_version,
read_changed_paths ? -1 : 0);
repo_cfg_bool(r, "gc.writecommitgraph", &r->settings.gc_write_commit_graph, 1);
repo_cfg_bool(r, "fetch.writecommitgraph", &r->settings.fetch_write_commit_graph, 0);

View File

@ -33,7 +33,7 @@ struct repo_settings {
int core_commit_graph;
int commit_graph_generation_version;
int commit_graph_read_changed_paths;
int commit_graph_changed_paths_version;
int gc_write_commit_graph;
int fetch_write_commit_graph;
int command_requires_full_index;

View File

@ -844,17 +844,28 @@ static int rev_compare_tree(struct rev_info *revs,
return tree_difference;
}
static int rev_same_tree_as_empty(struct rev_info *revs, struct commit *commit)
static int rev_same_tree_as_empty(struct rev_info *revs, struct commit *commit,
int nth_parent)
{
struct tree *t1 = repo_get_commit_tree(the_repository, commit);
int bloom_ret = -1;
if (!t1)
return 0;
if (!nth_parent && revs->bloom_keys_nr) {
bloom_ret = check_maybe_different_in_bloom_filter(revs, commit);
if (!bloom_ret)
return 1;
}
tree_difference = REV_TREE_SAME;
revs->pruning.flags.has_changes = 0;
diff_tree_oid(NULL, &t1->object.oid, "", &revs->pruning);
if (bloom_ret == 1 && tree_difference == REV_TREE_SAME)
count_bloom_filter_false_positive++;
return tree_difference == REV_TREE_SAME;
}
@ -892,7 +903,7 @@ static int compact_treesame(struct rev_info *revs, struct commit *commit, unsign
if (nth_parent != 0)
die("compact_treesame %u", nth_parent);
old_same = !!(commit->object.flags & TREESAME);
if (rev_same_tree_as_empty(revs, commit))
if (rev_same_tree_as_empty(revs, commit, nth_parent))
commit->object.flags |= TREESAME;
else
commit->object.flags &= ~TREESAME;
@ -988,7 +999,14 @@ static void try_to_simplify_commit(struct rev_info *revs, struct commit *commit)
return;
if (!commit->parents) {
if (rev_same_tree_as_empty(revs, commit))
/*
* Pretend as if we are comparing ourselves to the
* (non-existent) first parent of this commit object. Even
* though no such parent exists, this commit's changed-path
* Bloom filter (if one exists) is computed relative to the
* empty tree, so using Bloom filters is allowed here.
*/
if (rev_same_tree_as_empty(revs, commit, 0))
commit->object.flags |= TREESAME;
return;
}
@ -1069,7 +1087,7 @@ static void try_to_simplify_commit(struct rev_info *revs, struct commit *commit)
case REV_TREE_NEW:
if (revs->remove_empty_trees &&
rev_same_tree_as_empty(revs, p)) {
rev_same_tree_as_empty(revs, p, nth_parent)) {
/* We are adding all the specified
* paths from this parent, so the
* history beyond this parent is not

View File

@ -49,6 +49,7 @@ static void get_bloom_filter_for_commit(const struct object_id *commit_oid)
static const char *bloom_usage = "\n"
" test-tool bloom get_murmur3 <string>\n"
" test-tool bloom get_murmur3_seven_highbit\n"
" test-tool bloom generate_filter <string> [<string>...]\n"
" test-tool bloom get_filter_for_commit <commit-hex>\n";
@ -63,7 +64,13 @@ int cmd__bloom(int argc, const char **argv)
uint32_t hashed;
if (argc < 3)
usage(bloom_usage);
hashed = murmur3_seeded(0, argv[2], strlen(argv[2]));
hashed = murmur3_seeded_v2(0, argv[2], strlen(argv[2]));
printf("Murmur3 Hash with seed=0:0x%08x\n", hashed);
}
if (!strcmp(argv[1], "get_murmur3_seven_highbit")) {
uint32_t hashed;
hashed = murmur3_seeded_v2(0, "\x99\xaa\xbb\xcc\xdd\xee\xff", 7);
printf("Murmur3 Hash with seed=0:0x%08x\n", hashed);
}

View File

@ -5,20 +5,8 @@
#include "bloom.h"
#include "setup.h"
int cmd__read_graph(int argc UNUSED, const char **argv UNUSED)
static void dump_graph_info(struct commit_graph *graph)
{
struct commit_graph *graph = NULL;
struct object_directory *odb;
setup_git_directory();
odb = the_repository->objects->odb;
prepare_repo_settings(the_repository);
graph = read_commit_graph_one(the_repository, odb);
if (!graph)
return 1;
printf("header: %08x %d %d %d %d\n",
ntohl(*(uint32_t*)graph->data),
*(unsigned char*)(graph->data + 4),
@ -57,8 +45,57 @@ int cmd__read_graph(int argc UNUSED, const char **argv UNUSED)
if (graph->topo_levels)
printf(" topo_levels");
printf("\n");
}
/*
 * Print the Bloom filter stored at every graph position, one filter per
 * line, as lowercase hex. Positions whose filter cannot be loaded are
 * reported on stderr; zero-length filters produce no output.
 */
static void dump_graph_bloom_filters(struct commit_graph *graph)
{
	uint32_t pos;

	for (pos = 0; pos < graph->num_commits + graph->num_commits_in_base; pos++) {
		struct bloom_filter filter = { 0 };
		size_t byte;

		if (load_bloom_filter_from_graph(graph, &filter, pos) < 0) {
			fprintf(stderr, "missing Bloom filter for graph "
				"position %"PRIu32"\n", pos);
			continue;
		}

		if (!filter.len)
			continue;

		for (byte = 0; byte < filter.len; byte++)
			printf("%02x", filter.data[byte]);
		printf("\n");
	}
}
int cmd__read_graph(int argc, const char **argv)
{
struct commit_graph *graph = NULL;
struct object_directory *odb;
int ret = 0;
setup_git_directory();
odb = the_repository->objects->odb;
prepare_repo_settings(the_repository);
graph = read_commit_graph_one(the_repository, odb);
if (!graph) {
ret = 1;
goto done;
}
if (argc <= 1)
dump_graph_info(graph);
else if (!strcmp(argv[1], "bloom-filters"))
dump_graph_bloom_filters(graph);
else {
fprintf(stderr, "unknown sub-command: '%s'\n", argv[1]);
ret = 1;
}
done:
UNLEAK(graph);
return 0;
return ret;
}

View File

@ -29,6 +29,14 @@ test_expect_success 'compute unseeded murmur3 hash for test string 2' '
test_cmp expect actual
'
test_expect_success 'compute unseeded murmur3 hash for test string 3' '
cat >expect <<-\EOF &&
Murmur3 Hash with seed=0:0xa183ccfd
EOF
test-tool bloom get_murmur3_seven_highbit >actual &&
test_cmp expect actual
'
test_expect_success 'compute bloom key for empty string' '
cat >expect <<-\EOF &&
Hashes:0x5615800c|0x5b966560|0x61174ab4|0x66983008|0x6c19155c|0x7199fab0|0x771ae004|

View File

@ -82,7 +82,23 @@ test_bloom_filters_used () {
test_bloom_filters_not_used () {
log_args=$1
setup "$log_args" &&
! grep -q "statistics:{\"filter_not_present\":" "$TRASH_DIRECTORY/trace.perf" &&
if grep -q "statistics:{\"filter_not_present\":" "$TRASH_DIRECTORY/trace.perf"
then
# if the Bloom filter system is initialized, ensure that no
# filters were used
data="statistics:{"
# unusable filters (e.g., those computed with a
# different value of commitGraph.changedPathsVersion)
# are counted in the filter_not_present bucket, so any
# value is OK there.
data="$data\"filter_not_present\":[0-9][0-9]*,"
data="$data\"maybe\":0,"
data="$data\"definitely_not\":0,"
data="$data\"false_positive\":0}"
grep -q "$data" "$TRASH_DIRECTORY/trace.perf"
fi &&
test_cmp log_wo_bloom log_w_bloom
}
@ -163,7 +179,7 @@ test_expect_success 'setup - add commit-graph to the chain with Bloom filters' '
test_bloom_filters_used_when_some_filters_are_missing () {
log_args=$1
bloom_trace_prefix="statistics:{\"filter_not_present\":3,\"maybe\":6,\"definitely_not\":9"
bloom_trace_prefix="statistics:{\"filter_not_present\":3,\"maybe\":6,\"definitely_not\":10"
setup "$log_args" &&
grep -q "$bloom_trace_prefix" "$TRASH_DIRECTORY/trace.perf" &&
test_cmp log_wo_bloom log_w_bloom
@ -206,6 +222,10 @@ test_filter_trunc_large () {
grep "\"key\":\"filter-trunc-large\",\"value\":\"$1\"" $2
}
# Succeed if file $2 contains a "filter-upgraded" key whose value is $1.
# NOTE(review): $2 is presumably a trace2 event log written during
# `git commit-graph write`; confirm against the callers.
test_filter_upgraded () {
grep "\"key\":\"filter-upgraded\",\"value\":\"$1\"" $2
}
test_expect_success 'correctly report changes over limit' '
git init limits &&
(
@ -405,8 +425,292 @@ test_expect_success 'Bloom generation backfills empty commits' '
)
'
# Locations, relative to a repository's top level, used by the tests
# below as "$repo/$graph", "$repo/$graphdir" and "$repo/$chain": the
# single-file commit-graph, the directory holding split commit-graph
# layers, and the chain file inside that directory.
graph=.git/objects/info/commit-graph
graphdir=.git/objects/info/commit-graphs
chain=$graphdir/commit-graph-chain
# Create the repository ($repo) shared by the mixed-settings tests and
# add three commits, "one", "two" and "three", each made with
# test_commit and the file argument "file".
test_expect_success 'setup for mixed Bloom setting tests' '
	repo=mixed-bloom-settings &&
	git init $repo &&
	test_commit -C $repo one file &&
	test_commit -C $repo two file &&
	test_commit -C $repo three file
'
# Build a three-layer split commit-graph in $repo: a base layer whose
# Bloom filters use GIT_TEST_BLOOM_SETTINGS_NUM_HASHES=3, a middle
# layer without changed-path data, and a top layer with default
# settings.  "git log -- file" must give the same output as with
# core.commitGraph=false, and stderr must mention that the Bloom
# filters of the base layer ($layer, the first line of the chain file)
# were disabled.  $layer stays set for the test that follows.
test_expect_success 'ensure Bloom filters with incompatible settings are ignored' '
# Compute Bloom filters with "unusual" settings.
git -C $repo rev-parse one >in &&
GIT_TEST_BLOOM_SETTINGS_NUM_HASHES=3 git -C $repo commit-graph write \
--stdin-commits --changed-paths --split <in &&
layer=$(head -n 1 $repo/$chain) &&
# A commit-graph layer without Bloom filters "hides" the layers
# below ...
git -C $repo rev-parse two >in &&
git -C $repo commit-graph write --stdin-commits --no-changed-paths \
--split=no-merge <in &&
# Another commit-graph layer that has Bloom filters, but with
# standard settings, and is thus incompatible with the base
# layer written above.
git -C $repo rev-parse HEAD >in &&
git -C $repo commit-graph write --stdin-commits --changed-paths \
--split=no-merge <in &&
test_line_count = 3 $repo/$chain &&
# Ensure that incompatible Bloom filters are ignored.
git -C $repo -c core.commitGraph=false log --oneline --no-decorate -- file \
>expect 2>err &&
git -C $repo log --oneline --no-decorate -- file >actual 2>err &&
test_cmp expect actual &&
grep "disabling Bloom filters for commit-graph layer .$layer." err
'
# Rewrite the split graph from the previous test as one non-split
# commit-graph.  The write itself must warn about the incompatible
# base layer ($layer); afterwards only "$repo/$graph" may exist, and a
# path-limited log must match the core.commitGraph=false output, trace
# "filter_not_present":0 and print nothing on stderr.
test_expect_success 'merge graph layers with incompatible Bloom settings' '
	# Ensure that incompatible Bloom filters are ignored when
	# merging existing layers.
	git -C $repo commit-graph write --reachable --changed-paths 2>err &&
	grep "disabling Bloom filters for commit-graph layer .$layer." err &&

	test_path_is_file $repo/$graph &&
	test_dir_is_empty $repo/$graphdir &&

	git -C $repo -c core.commitGraph=false log --oneline --no-decorate -- \
		file >expect &&

	trace_out="$(pwd)/trace.perf" &&
	# Trace2 appends to an existing target file; start from a clean
	# slate so the statistics check below cannot be satisfied by a
	# line that an earlier test left behind.
	rm -f "$trace_out" &&
	GIT_TRACE2_PERF="$trace_out" \
		git -C $repo log --oneline --no-decorate -- file >actual 2>err &&

	test_cmp expect actual &&
	grep "statistics:{\"filter_not_present\":0," trace.perf &&
	test_must_be_empty err
'
# Build a two-layer split commit-graph in $repo whose base layer is
# written with the default changed-path settings and whose top layer
# is written with commitGraph.changedPathsVersion=2.  A path-limited
# log must match the output taken before any graph existed, and stderr
# must consist of exactly one warning naming the base layer.
#
# The pathspec is "file", the path every commit in $repo touches (as
# in the two tests above).  It must not be "$CENT": that variable is
# assigned further down in this script, so it would expand to nothing
# here and leave the log without any pathspec.
test_expect_success 'ensure Bloom filter with incompatible versions are ignored' '
	rm "$repo/$graph" &&

	git -C $repo log --oneline --no-decorate -- file >expect &&

	# Compute v1 Bloom filters for commits at the bottom.
	git -C $repo rev-parse HEAD^ >in &&
	git -C $repo commit-graph write --stdin-commits --changed-paths \
		--split <in &&

	# Compute v2 Bloom filters for the rest of the commits at the top.
	git -C $repo rev-parse HEAD >in &&
	git -C $repo -c commitGraph.changedPathsVersion=2 commit-graph write \
		--stdin-commits --changed-paths --split=no-merge <in &&

	test_line_count = 2 $repo/$chain &&

	git -C $repo log --oneline --no-decorate -- file >actual 2>err &&
	test_cmp expect actual &&

	layer="$(head -n 1 $repo/$chain)" &&
	cat >expect.err <<-EOF &&
	warning: disabling Bloom filters for commit-graph layer $SQ$layer$SQ due to incompatible settings
	EOF
	test_cmp expect.err err
'
# Dump the Bloom filters of the commit-graph in the current directory
# into filters.dat (via `test-tool read-graph bloom-filters`) and print
# only the first line of that dump.
get_first_changed_path_filter () {
	test-tool read-graph bloom-filters >filters.dat &&
	sed -n 1p filters.dat
}
# chosen to be the same under all Unicode normalization forms
# (the octal escapes are the bytes 0xC2 0xA2, i.e. U+00A2 CENT SIGN in
# UTF-8; both bytes have the high bit set)
CENT=$(printf "\302\242")
# Create repository "highbit1" with one commit whose file argument is
# "$CENT", then write a commit-graph with changed-path filters without
# setting commitgraph.changedPathsVersion.
test_expect_success 'set up repo with high bit path, version 1 changed-path' '
git init highbit1 &&
test_commit -C highbit1 c1 "$CENT" &&
git -C highbit1 commit-graph write --reachable --changed-paths
'
# Write the reference filter value ("52a9") and the filter actually
# stored in highbit1 to highbit1/expect and highbit1/actual.  The two
# files are deliberately not compared here; the comparison happens
# right after this test, outside of any test body.
test_expect_success 'setup check value of version 1 changed-path' '
(
cd highbit1 &&
echo "52a9" >expect &&
get_first_changed_path_filter >actual
)
'
# expect will not match actual if char is unsigned by default. Write the test
# in this way, so that a user running this test script can still see if the two
# files match. (It will appear as an ordinary success if they match, and a skip
# if not.)
#
# The files compared are the ones written by the 'setup check value of
# version 1 changed-path' test; a match sets the SIGNED_CHAR_BY_DEFAULT
# prerequisite.
if test_cmp highbit1/expect highbit1/actual
then
test_set_prereq SIGNED_CHAR_BY_DEFAULT
fi
# Placeholder test guarded by SIGNED_CHAR_BY_DEFAULT: its body always
# succeeds, so the result only reflects whether the prerequisite was
# set (run) or not (skipped).
test_expect_success SIGNED_CHAR_BY_DEFAULT 'check value of version 1 changed-path' '
# Only the prereq matters for this test.
true
'
# Add a second commit ("anotherc1", file argument "another$CENT") to
# highbit1 so that the following tests query a non-root commit.
#
# NOTE(review): the rationale inside the test body says "git log" does
# not use Bloom filters for root commits.  The merge that introduced
# these tests also lists "revision.c: consult Bloom filters for root
# commits", so that rationale may be out of date -- confirm whether the
# extra commit is still needed.
test_expect_success 'setup make another commit' '
# "git log" does not use Bloom filters for root commits - see how, in
# revision.c, rev_compare_tree() (the only code path that eventually calls
# get_bloom_filter()) is only called by try_to_simplify_commit() when the commit
# has one parent. Therefore, make another commit so that we perform the tests on
# a non-root commit.
test_commit -C highbit1 anotherc1 "another$CENT"
'
# Inside highbit1, with no commitgraph.changedPathsVersion configured
# yet, a log limited to "another$CENT" must pass
# test_bloom_filters_used (defined elsewhere in this script).
test_expect_success 'version 1 changed-path used when version 1 requested' '
(
cd highbit1 &&
test_bloom_filters_used "-- another$CENT"
)
'
# Add commitgraph.changedPathsVersion=2 to highbit1's configuration
# (the entry is not removed afterwards) and check that the existing
# filters are then not used for the same path-limited log.
test_expect_success 'version 1 changed-path not used when version 2 requested' '
(
cd highbit1 &&
git config --add commitgraph.changedPathsVersion 2 &&
test_bloom_filters_not_used "-- another$CENT"
)
'
# Add a further commitgraph.changedPathsVersion entry, -1, on top of
# the "2" added by the previous test (presumably the last entry is the
# one that takes effect -- confirm) and check that the existing filters
# are used again.
test_expect_success 'version 1 changed-path used when autodetect requested' '
(
cd highbit1 &&
git config --add commitgraph.changedPathsVersion -1 &&
test_bloom_filters_used "-- another$CENT"
)
'
# Add a third commit to highbit1 and rewrite its commit-graph; the
# "options:" line printed by `test-tool read-graph` must still read
# "bloom(1,10,7) read_generation_data".
test_expect_success 'when writing another commit graph, preserve existing version 1 of changed-path' '
test_commit -C highbit1 c1double "$CENT$CENT" &&
git -C highbit1 commit-graph write --reachable --changed-paths &&
(
cd highbit1 &&
git config --add commitgraph.changedPathsVersion -1 &&
echo "options: bloom(1,10,7) read_generation_data" >expect &&
test-tool read-graph >full &&
grep options full >actual &&
test_cmp expect actual
)
'
# Create repository "highbit2" with commitgraph.changedPathsVersion=2
# configured before the first commit ("c2", file argument "$CENT"),
# then write a commit-graph with changed-path filters.
test_expect_success 'set up repo with high bit path, version 2 changed-path' '
git init highbit2 &&
git -C highbit2 config --add commitgraph.changedPathsVersion 2 &&
test_commit -C highbit2 c2 "$CENT" &&
git -C highbit2 commit-graph write --reachable --changed-paths
'
# The first filter stored in highbit2's commit-graph must be "c01f".
# Unlike the version 1 check, this comparison is made unconditionally
# inside the test.
test_expect_success 'check value of version 2 changed-path' '
(
cd highbit2 &&
echo "c01f" >expect &&
get_first_changed_path_filter >actual &&
test_cmp expect actual
)
'
# Add a second commit ("anotherc2", file argument "another$CENT") to
# highbit2 so that the following tests query a non-root commit.
#
# NOTE(review): the rationale inside the test body says "git log" does
# not use Bloom filters for root commits.  The merge that introduced
# these tests also lists "revision.c: consult Bloom filters for root
# commits", so that rationale may be out of date -- confirm whether the
# extra commit is still needed.
test_expect_success 'setup make another commit' '
# "git log" does not use Bloom filters for root commits - see how, in
# revision.c, rev_compare_tree() (the only code path that eventually calls
# get_bloom_filter()) is only called by try_to_simplify_commit() when the commit
# has one parent. Therefore, make another commit so that we perform the tests on
# a non-root commit.
test_commit -C highbit2 anotherc2 "another$CENT"
'
# Inside highbit2, where commitgraph.changedPathsVersion=2 was set at
# creation time, a log limited to "another$CENT" must pass
# test_bloom_filters_used (defined elsewhere in this script).
test_expect_success 'version 2 changed-path used when version 2 requested' '
(
cd highbit2 &&
test_bloom_filters_used "-- another$CENT"
)
'
# Add commitgraph.changedPathsVersion=1 to highbit2's configuration
# (the entry is not removed afterwards) and check that the existing
# filters are then not used for the same path-limited log.
test_expect_success 'version 2 changed-path not used when version 1 requested' '
(
cd highbit2 &&
git config --add commitgraph.changedPathsVersion 1 &&
test_bloom_filters_not_used "-- another$CENT"
)
'
# Add a further commitgraph.changedPathsVersion entry, -1, on top of
# the earlier ones (presumably the last entry is the one that takes
# effect -- confirm) and check that the existing filters are used
# again.
test_expect_success 'version 2 changed-path used when autodetect requested' '
(
cd highbit2 &&
git config --add commitgraph.changedPathsVersion -1 &&
test_bloom_filters_used "-- another$CENT"
)
'
# Add a third commit to highbit2 and rewrite its commit-graph; the
# "options:" line printed by `test-tool read-graph` must still read
# "bloom(2,10,7) read_generation_data".
test_expect_success 'when writing another commit graph, preserve existing version 2 of changed-path' '
test_commit -C highbit2 c2double "$CENT$CENT" &&
git -C highbit2 commit-graph write --reachable --changed-paths &&
(
cd highbit2 &&
git config --add commitgraph.changedPathsVersion -1 &&
echo "options: bloom(2,10,7) read_generation_data" >expect &&
test-tool read-graph >full &&
grep options full >actual &&
test_cmp expect actual
)
'
# In a fresh repository "doublewrite" with one commit on a high-bit
# path:
#
#  1. write the graph with changedPathsVersion=1 and expect one filter
#     computed and none upgraded;
#  2. for the values -2 and 3, expect the write to emit exactly one
#     "is not supported" warning on stderr;
#  3. write again with changedPathsVersion=2 and expect, once more, one
#     filter computed and none upgraded, with the stored filter equal
#     to "c01f".
#
# trace2.txt is truncated before every traced write: the trace2 event
# target is appended to, so without this the checks in step 3 could be
# satisfied by the events recorded in step 1.
test_expect_success 'when writing commit graph, do not reuse changed-path of another version' '
	git init doublewrite &&
	test_commit -C doublewrite c "$CENT" &&

	git -C doublewrite config --add commitgraph.changedPathsVersion 1 &&
	>trace2.txt &&
	GIT_TRACE2_EVENT="$(pwd)/trace2.txt" \
		git -C doublewrite commit-graph write --reachable --changed-paths &&
	test_filter_computed 1 trace2.txt &&
	test_filter_upgraded 0 trace2.txt &&

	git -C doublewrite commit-graph write --reachable --changed-paths &&
	for v in -2 3
	do
		git -C doublewrite config --add commitgraph.changedPathsVersion $v &&
		git -C doublewrite commit-graph write --reachable --changed-paths 2>err &&
		cat >expect <<-EOF &&
		warning: attempting to write a commit-graph, but ${SQ}commitgraph.changedPathsVersion${SQ} ($v) is not supported
		EOF
		test_cmp expect err || return 1
	done &&

	git -C doublewrite config --add commitgraph.changedPathsVersion 2 &&
	>trace2.txt &&
	GIT_TRACE2_EVENT="$(pwd)/trace2.txt" \
		git -C doublewrite commit-graph write --reachable --changed-paths &&
	test_filter_computed 1 trace2.txt &&
	test_filter_upgraded 0 trace2.txt &&

	(
		cd doublewrite &&
		echo "c01f" >expect &&
		get_first_changed_path_filter >actual &&
		test_cmp expect actual
	)
'
# In a fresh repository "upgrade" whose only commit touches a path
# without high-bit bytes ("no-high-bits"):
#
#  1. write the graph with changedPathsVersion=1 and expect one filter
#     computed and none upgraded;
#  2. write again with changedPathsVersion=2 and expect no filter
#     computed and one upgraded.
#
# trace2.txt is truncated before every traced write: the trace2 event
# target is appended to, and the file may still hold events from an
# earlier test or from step 1, which could otherwise satisfy the
# checks.
test_expect_success 'when writing commit graph, reuse changed-path of another version where possible' '
	git init upgrade &&
	test_commit -C upgrade base no-high-bits &&

	git -C upgrade config --add commitgraph.changedPathsVersion 1 &&
	>trace2.txt &&
	GIT_TRACE2_EVENT="$(pwd)/trace2.txt" \
		git -C upgrade commit-graph write --reachable --changed-paths &&
	test_filter_computed 1 trace2.txt &&
	test_filter_upgraded 0 trace2.txt &&

	git -C upgrade config --add commitgraph.changedPathsVersion 2 &&
	>trace2.txt &&
	GIT_TRACE2_EVENT="$(pwd)/trace2.txt" \
		git -C upgrade commit-graph write --reachable --changed-paths &&
	test_filter_computed 0 trace2.txt &&
	test_filter_upgraded 1 trace2.txt
'
corrupt_graph () {
graph=.git/objects/info/commit-graph &&
test_when_finished "rm -rf $graph" &&
git commit-graph write --reachable --changed-paths &&
corrupt_chunk_file $graph "$@"