2018-04-02 22:34:19 +02:00
|
|
|
#ifndef COMMIT_GRAPH_H
|
|
|
|
#define COMMIT_GRAPH_H
|
|
|
|
|
2023-05-16 08:34:06 +02:00
|
|
|
#include "object-store-ll.h"
|
2020-04-14 06:04:25 +02:00
|
|
|
#include "oidset.h"
|
2018-04-10 14:56:02 +02:00
|
|
|
|
2018-08-29 14:49:04 +02:00
|
|
|
#define GIT_TEST_COMMIT_GRAPH "GIT_TEST_COMMIT_GRAPH"
|
2020-06-23 19:47:01 +02:00
|
|
|
#define GIT_TEST_COMMIT_GRAPH_DIE_ON_PARSE "GIT_TEST_COMMIT_GRAPH_DIE_ON_PARSE"
|
2020-04-06 18:59:55 +02:00
|
|
|
#define GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS "GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS"
|
2018-08-29 14:49:04 +02:00
|
|
|
|
commit-graph: introduce envvar to disable commit existence checks
Our `lookup_commit_in_graph()` helper tries to look up commits from the
commit graph and, if it doesn't exist there, falls back to parsing it
from the object database instead. This is intended to speed up the
lookup of any such commit that exists in the database. There is an edge
case though where the commit exists in the graph, but not in the object
database. To avoid returning such stale commits the helper function thus
double checks that any such commit parsed from the graph also exists in
the object database. This makes the function safe to use even when
commit graphs aren't updated regularly.
We're about to introduce the same pattern into other parts of our code
base though, namely `repo_parse_commit_internal()`. Here the extra
sanity check is a bit of a tougher sell: `lookup_commit_in_graph()` was
a newly introduced helper, and as such there was no performance hit by
adding this sanity check. If we added `repo_parse_commit_internal()`
with that sanity check right from the beginning as well, this would
probably never have been an issue to begin with. But by retrofitting it
with this sanity check now we do add a performance regression to
preexisting code, and thus there is a desire to avoid this or at least
give an escape hatch.
In practice, there is no inherent reason why either of those functions
should have the sanity check whereas the other one does not: either both
of them are able to detect this issue or none of them should be. This
also means that the default of whether we do the check should likely be
the same for both. To err on the side of caution, we thus rather want to
make `repo_parse_commit_internal()` stricter than to loosen the checks
that we already have in `lookup_commit_in_graph()`.
The escape hatch is added in the form of a new GIT_COMMIT_GRAPH_PARANOIA
environment variable that mirrors GIT_REF_PARANOIA. If enabled, which is
the default, we will double check that commits looked up in the commit
graph via `lookup_commit_in_graph()` also exist in the object database.
This same check will also be added in `repo_parse_commit_internal()`.
Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-10-31 08:16:13 +01:00
|
|
|
/*
|
|
|
|
* This environment variable controls whether commits looked up via the
|
|
|
|
* commit graph will be double checked to exist in the object database.
|
|
|
|
*/
|
|
|
|
#define GIT_COMMIT_GRAPH_PARANOIA "GIT_COMMIT_GRAPH_PARANOIA"
|
|
|
|
|
2020-04-16 22:14:03 +02:00
|
|
|
/*
|
|
|
|
* This method is only used to enhance coverage of the commit-graph
|
|
|
|
* feature in the test suite with the GIT_TEST_COMMIT_GRAPH and
|
|
|
|
* GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS environment variables. Do not
|
|
|
|
* call this method oustide of a builtin, and only if you know what
|
|
|
|
* you are doing!
|
|
|
|
*/
|
|
|
|
void git_test_write_commit_graph_or_die(void);
|
|
|
|
|
2018-07-12 00:42:39 +02:00
|
|
|
struct commit;
|
2020-04-06 18:59:49 +02:00
|
|
|
struct bloom_filter_settings;
|
2020-06-05 15:00:28 +02:00
|
|
|
struct repository;
|
|
|
|
struct raw_object_store;
|
|
|
|
struct string_list;
|
2018-07-12 00:42:39 +02:00
|
|
|
|
commit-graph.c: remove path normalization, comparison
As of the previous patch, all calls to 'commit-graph.c' functions which
perform path normalization (for e.g., 'get_commit_graph_filename()') are
of the form 'ctx->odb->path', which is always in normalized form.
Now that there are no callers passing non-normalized paths to these
functions, ensure that future callers are bound by the same restrictions
by making these functions take a 'struct object_directory *' instead of
a 'const char *'. To match, replace all calls with arguments of the form
'ctx->odb->path' with 'ctx->odb' To recover the path, functions that
perform path manipulation simply use 'odb->path'.
Further, avoid string comparisons with arguments of the form
'odb->path', and instead prefer raw pointer comparisons, which
accomplish the same effect, but are far less brittle.
This has a pleasant side-effect of making these functions much more
robust to paths that cannot be normalized by 'normalize_path_copy()',
i.e., because they are outside of the current working directory.
For example, prior to this patch, Valgrind reports that the following
uninitialized memory read [1]:
$ ( cd t && GIT_DIR=../.git valgrind git rev-parse HEAD^ )
because 'normalize_path_copy()' can't normalize '../.git' (since it's
relative to but above of the current working directory) [2].
By using a 'struct object_directory *' directly,
'get_commit_graph_filename()' does not need to normalize, because all
paths are relative to the current working directory since they are
always read from the '->path' of an object directory.
[1]: https://lore.kernel.org/git/20191027042116.GA5801@sigill.intra.peff.net.
[2]: The bug here is that 'get_commit_graph_filename()' returns the
result of 'normalize_path_copy()' without checking the return
value.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-02-03 22:18:02 +01:00
|
|
|
char *get_commit_graph_filename(struct object_directory *odb);
|
2020-09-17 20:11:46 +02:00
|
|
|
char *get_commit_graph_chain_filename(struct object_directory *odb);
|
2019-03-25 13:08:30 +01:00
|
|
|
int open_commit_graph(const char *graph_file, int *fd, struct stat *st);
|
commit-graph: factor out chain opening function
The load_commit_graph_chain() function opens the chain file and all of
the slices of graph that it points to. If there is no chain file (which
is a totally normal condition), we return NULL. But if we run into
errors with the chain file or loading the actual graph data, we also
return NULL, and the caller cannot tell the difference.
The caller can check for ENOENT for the unremarkable "no such file"
case. But I'm hesitant to assume that the rest of the function would
never accidentally set errno to ENOENT itself, since it is opening the
slice files (and that would mean the caller fails to notice a real
error).
So let's break this into two functions: one to open the file, and one to
actually load it. This matches the interface we provide for the
non-chain graph file, which will also come in handy in a moment when we
fix some bugs in the "git commit-graph verify" code.
Some notes:
- I've kept the "1 is good, 0 is bad" return convention (and the weird
"fd" out-parameter) used by the matching open_commit_graph()
function and other parts of the commit-graph code. This is unlike
most of the rest of Git (which would just return the fd, with -1 for
error), but it makes sense to stay consistent with the adjacent bits
of the API here.
- The existing chain loading function will quietly return if the file
is too small to hold a single entry. I've retained that behavior
(and explicitly set ENOENT in the opener function) for now, under
the notion that it's probably valid (though I'd imagine unusual) to
have an empty chain file.
There are two small behavior changes here, but I think both are strictly
positive:
1. The original blindly did a stat() before checking if fopen()
succeeded, meaning we were making a pointless extra stat call.
2. We now use fstat() to check the file size. The previous code using
a regular stat() on the pathname meant we could technically race
with somebody updating the chain file, and end up with a size that
does not match what we just opened with fopen(). I doubt anybody
ever hit this in practice, but it may have caused an out-of-bounds
read.
We'll retain the load_commit_graph_chain() function which does both the
open and reading steps (most existing callers do not care about seeing
errors anyway, since loading commit-graphs is optimistic).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-09-28 06:38:07 +02:00
|
|
|
int open_commit_graph_chain(const char *chain_file, int *fd, struct stat *st);
|
2018-04-10 14:56:02 +02:00
|
|
|
|
2018-04-10 14:56:05 +02:00
|
|
|
/*
|
|
|
|
* Given a commit struct, try to fill the commit struct info, including:
|
|
|
|
* 1. tree object
|
|
|
|
* 2. date
|
|
|
|
* 3. parents.
|
|
|
|
*
|
|
|
|
* Returns 1 if and only if the commit was found in the packed graph.
|
|
|
|
*
|
|
|
|
* See parse_commit_buffer() for the fallback after this call.
|
|
|
|
*/
|
2018-07-12 00:42:42 +02:00
|
|
|
int parse_commit_in_graph(struct repository *r, struct commit *item);
|
2018-04-10 14:56:05 +02:00
|
|
|
|
2022-07-13 01:10:31 +02:00
|
|
|
/*
|
|
|
|
* Fills `*pos` with the graph position of `c`, and returns 1 if `c` is
|
|
|
|
* found in the commit-graph belonging to `r`, or 0 otherwise.
|
|
|
|
* Initializes the commit-graph belonging to `r` if it hasn't been
|
|
|
|
* already.
|
|
|
|
*
|
|
|
|
* Note: this is a low-level helper that does not alter any slab data
|
|
|
|
* associated with `c`. Useful in circumstances where the slab data is
|
|
|
|
* already being modified (e.g., writing the commit-graph itself).
|
|
|
|
*
|
|
|
|
* In most cases, callers should use `parse_commit_in_graph()` instead.
|
|
|
|
*/
|
|
|
|
int repo_find_commit_pos_in_graph(struct repository *r, struct commit *c,
|
|
|
|
uint32_t *pos);
|
|
|
|
|
2021-08-09 10:12:03 +02:00
|
|
|
/*
|
|
|
|
* Look up the given commit ID in the commit-graph. This will only return a
|
|
|
|
* commit if the ID exists both in the graph and in the object database such
|
|
|
|
* that we don't return commits whose object has been pruned. Otherwise, this
|
|
|
|
* function returns `NULL`.
|
|
|
|
*/
|
|
|
|
struct commit *lookup_commit_in_graph(struct repository *repo, const struct object_id *id);
|
|
|
|
|
2018-05-01 14:47:13 +02:00
|
|
|
/*
|
|
|
|
* It is possible that we loaded commit contents from the commit buffer,
|
|
|
|
* but we also want to ensure the commit-graph content is correctly
|
|
|
|
* checked and filled. Fill the graph_pos and generation members of
|
|
|
|
* the given commit.
|
|
|
|
*/
|
2018-07-12 00:42:42 +02:00
|
|
|
void load_commit_graph_info(struct repository *r, struct commit *item);
|
2018-05-01 14:47:13 +02:00
|
|
|
|
2018-07-12 00:42:42 +02:00
|
|
|
struct tree *get_commit_tree_in_graph(struct repository *r,
|
|
|
|
const struct commit *c);
|
2018-04-06 21:09:46 +02:00
|
|
|
|
2018-04-10 14:56:02 +02:00
|
|
|
struct commit_graph {
|
|
|
|
const unsigned char *data;
|
|
|
|
size_t data_len;
|
|
|
|
|
|
|
|
unsigned char hash_len;
|
|
|
|
unsigned char num_chunks;
|
|
|
|
uint32_t num_commits;
|
|
|
|
struct object_id oid;
|
2019-06-18 20:14:27 +02:00
|
|
|
char *filename;
|
2020-02-03 22:18:00 +01:00
|
|
|
struct object_directory *odb;
|
2018-04-10 14:56:02 +02:00
|
|
|
|
2019-06-18 20:14:24 +02:00
|
|
|
uint32_t num_commits_in_base;
|
commit-graph: use generation v2 only if entire chain does
Since there are released versions of Git that understand generation
numbers in the commit-graph's CDAT chunk but do not understand the GDAT
chunk, the following scenario is possible:
1. "New" Git writes a commit-graph with the GDAT chunk.
2. "Old" Git writes a split commit-graph on top without a GDAT chunk.
If each layer of split commit-graph is treated independently, as it was
the case before this commit, with Git inspecting only the current layer
for chunk_generation_data pointer, commits in the lower layer (one with
GDAT) whould have corrected commit date as their generation number,
while commits in the upper layer would have topological levels as their
generation. Corrected commit dates usually have much larger values than
topological levels. This means that if we take two commits, one from the
upper layer, and one reachable from it in the lower layer, then the
expectation that the generation of a parent is smaller than the
generation of a child would be violated.
It is difficult to expose this issue in a test. Since we _start_ with
artificially low generation numbers, any commit walk that prioritizes
generation numbers will walk all of the commits with high generation
number before walking the commits with low generation number. In all the
cases I tried, the commit-graph layers themselves "protect" any
incorrect behavior since none of the commits in the lower layer can
reach the commits in the upper layer.
This issue would manifest itself as a performance problem in this case,
especially with something like "git log --graph" since the low
generation numbers would cause the in-degree queue to walk all of the
commits in the lower layer before allowing the topo-order queue to write
anything to output (depending on the size of the upper layer).
Therefore, When writing the new layer in split commit-graph, we write a
GDAT chunk only if the topmost layer has a GDAT chunk. This guarantees
that if a layer has GDAT chunk, all lower layers must have a GDAT chunk
as well.
Rewriting layers follows similar approach: if the topmost layer below
the set of layers being rewritten (in the split commit-graph chain)
exists, and it does not contain GDAT chunk, then the result of rewrite
does not have GDAT chunks either.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Abhishek Kumar <abhishekkumar8222@gmail.com>
Reviewed-by: Taylor Blau <me@ttaylorr.com>
Reviewed-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-01-16 19:11:16 +01:00
|
|
|
unsigned int read_generation_data;
|
2019-06-18 20:14:24 +02:00
|
|
|
struct commit_graph *base_graph;
|
|
|
|
|
2018-04-10 14:56:02 +02:00
|
|
|
const uint32_t *chunk_oid_fanout;
|
|
|
|
const unsigned char *chunk_oid_lookup;
|
|
|
|
const unsigned char *chunk_commit_data;
|
commit-graph: implement generation data chunk
As discovered by Ævar, we cannot increment graph version to
distinguish between generation numbers v1 and v2 [1]. Thus, one of
pre-requistes before implementing generation number v2 was to
distinguish between graph versions in a backwards compatible manner.
We are going to introduce a new chunk called Generation DATa chunk (or
GDAT). GDAT will store corrected committer date offsets whereas CDAT
will still store topological level.
Old Git does not understand GDAT chunk and would ignore it, reading
topological levels from CDAT. New Git can parse GDAT and take advantage
of newer generation numbers, falling back to topological levels when
GDAT chunk is missing (as it would happen with a commit-graph written
by old Git).
We introduce a test environment variable 'GIT_TEST_COMMIT_GRAPH_NO_GDAT'
which forces commit-graph file to be written without generation data
chunk to emulate a commit-graph file written by old Git.
To minimize the space required to store corrrected commit date, Git
stores corrected commit date offsets into the commit-graph file, instea
of corrected commit dates. This saves us 4 bytes per commit, decreasing
the GDAT chunk size by half, but it's possible for the offset to
overflow the 4-bytes allocated for storage. As such overflows are and
should be exceedingly rare, we use the following overflow management
scheme:
We introduce a new commit-graph chunk, Generation Data OVerflow ('GDOV')
to store corrected commit dates for commits with offsets greater than
GENERATION_NUMBER_V2_OFFSET_MAX.
If the offset is greater than GENERATION_NUMBER_V2_OFFSET_MAX, we set
the MSB of the offset and the other bits store the position of corrected
commit date in GDOV chunk, similar to how Extra Edge List is maintained.
We test the overflow-related code with the following repo history:
F - N - U
/ \
U - N - U N
\ /
N - F - N
Where the commits denoted by U have committer date of zero seconds
since Unix epoch, the commits denoted by N have committer date of
1112354055 (default committer date for the test suite) seconds since
Unix epoch and the commits denoted by F have committer date of
(2 ^ 31 - 2) seconds since Unix epoch.
The largest offset observed is 2 ^ 31, just large enough to overflow.
[1]: https://lore.kernel.org/git/87a7gdspo4.fsf@evledraar.gmail.com/
Signed-off-by: Abhishek Kumar <abhishekkumar8222@gmail.com>
Reviewed-by: Taylor Blau <me@ttaylorr.com>
Reviewed-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-01-16 19:11:15 +01:00
|
|
|
const unsigned char *chunk_generation_data;
|
|
|
|
const unsigned char *chunk_generation_data_overflow;
|
commit-graph: bounds-check generation overflow chunk
If the generation entry in a commit-graph doesn't fit, we instead insert
an offset into a generation overflow chunk. But since we don't record
the size of the chunk, we may read outside the chunk if the offset we
find on disk is malicious or corrupted.
We can't check the size of the chunk up-front; it will vary based on how
many entries need overflow. So instead, we'll do a bounds-check before
accessing the chunk memory. Unfortunately there is no error-return from
this function, so we'll just have to die(), which is what it does for
other forms of corruption.
As with other cases, we can drop the st_mult() call, since we know our
bounds-checked value will fit within a size_t.
Before this patch, the test here actually "works" because we read
garbage data from the next chunk. And since that garbage data happens
not to provide a generation number which changes the output, it appears
to work. We could construct a case that actually segfaults or produces
wrong output, but it would be a bit tricky. For our purposes its
sufficient to check that we've detected the bounds error.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-10-09 23:05:47 +02:00
|
|
|
size_t chunk_generation_data_overflow_size;
|
commit-graph: rename "large edges" to "extra edges"
The optional 'Large Edge List' chunk of the commit graph file stores
parent information for commits with more than two parents, and the
names of most of the macros, variables, struct fields, and functions
related to this chunk contain the term "large edges", e.g.
write_graph_chunk_large_edges(). However, it's not a really great
term, as the edges to the second and subsequent parents stored in this
chunk are not any larger than the edges to the first and second
parents stored in the "main" 'Commit Data' chunk. It's the number of
edges, IOW number of parents, that is larger compared to non-merge and
"regular" two-parent merge commits. And indeed, two functions in
'commit-graph.c' have a local variable called 'num_extra_edges' that
refer to the same thing, and this "extra edges" term is much better at
describing these edges.
So let's rename all these references to "large edges" in macro,
variable, function, etc. names to "extra edges". There is a
GRAPH_OCTOPUS_EDGES_NEEDED macro as well; for the sake of consistency
rename it to GRAPH_EXTRA_EDGES_NEEDED.
We can do so safely without causing any incompatibility issues,
because the term "large edges" doesn't come up in the file format
itself in any form (the chunk's magic is {'E', 'D', 'G', 'E'}, there
is no 'L' in there), but only in the specification text. The string
"large edges", however, does come up in the output of 'git
commit-graph read' and in tests looking at its input, but that command
is explicitly documented as debugging aid, so we can change its output
and the affected tests safely.
Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-01-19 21:21:13 +01:00
|
|
|
const unsigned char *chunk_extra_edges;
|
commit-graph: detect out-of-bounds extra-edges pointers
If an entry in a commit-graph file has more than 2 parents, the
fixed-size parent fields instead point to an offset within an "extra
edges" chunk. We blindly follow these, assuming that the chunk is
present and sufficiently large; this can lead to an out-of-bounds read
for a corrupt or malicious file.
We can fix this by recording the size of the chunk and adding a
bounds-check in fill_commit_in_graph(). There are a few tricky bits:
1. We'll switch from working with a pointer to an offset. This makes
some corner cases just fall out naturally:
a. If we did not find an EDGE chunk at all, our size will
correctly be zero (so everything is "out of bounds").
b. Comparing "size / 4" lets us make sure we have at least 4 bytes
to read, and we never compute a pointer more than one element
past the end of the array (computing a larger pointer is
probably OK in practice, but is technically undefined
behavior).
c. The current code casts to "uint32_t *". Replacing it with an
offset avoids any comparison between different types of pointer
(since the chunk is stored as "unsigned char *").
2. This is the first case in which fill_commit_in_graph() may return
anything but success. We need to make sure to roll back the
"parsed" flag (and any parents we might have added before running
out of buffer) so that the caller can cleanly fall back to
loading the commit object itself.
It's a little non-trivial to do this, and we might benefit from
factoring it out. But we can wait on that until we actually see a
second case where we return an error.
As a bonus, this lets us drop the st_mult() call. Since we've already
done a bounds check, we know there won't be any integer overflow (it
would imply our buffer is larger than a size_t can hold).
The included test does not actually segfault before this patch (though
you could construct a case where it does). Instead, it reads garbage
from the next chunk which results in it complaining about a bogus parent
id. This is sufficient for our needs, though (we care that the fallback
succeeds, and that stderr mentions the out-of-bounds read).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-10-09 23:05:38 +02:00
|
|
|
size_t chunk_extra_edges_size;
|
2019-06-18 20:14:26 +02:00
|
|
|
const unsigned char *chunk_base_graphs;
|
commit-graph: bounds-check base graphs chunk
When we are loading a commit-graph chain, we check that each slice of the
chain points to the appropriate set of base graphs via its BASE chunk.
But since we don't record the size of the chunk, we may access
out-of-bounds memory if the file is corrupted.
Since we know the number of entries we expect to find (based on the
position within the commit-graph-chain file), we can just check the size
up front.
In theory this would also let us drop the st_mult() call a few lines
later when we actually access the memory, since we know that the
computed offset will fit in a size_t. But because the operands
"g->hash_len" and "n" have types "unsigned char" and "int", we'd have to
cast to size_t first. Leaving the st_mult() does that cast, and makes it
more obvious that we don't have an overflow problem.
Note that the test does not actually segfault before this patch, since
it just reads garbage from the chunk after BASE (and indeed, it even
rejects the file because that garbage does not have the expected hash
value). You could construct a file with BASE at the end that did
segfault, but corrupting the existing one is easy, and we can check
stderr for the expected message.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-10-09 23:05:41 +02:00
|
|
|
size_t chunk_base_graphs_size;
|
2020-04-06 18:59:49 +02:00
|
|
|
const unsigned char *chunk_bloom_indexes;
|
|
|
|
const unsigned char *chunk_bloom_data;
|
commit-graph: check bounds when accessing BDAT chunk
When loading Bloom filters from a commit-graph file, we use the offset
values in the BIDX chunk to index into the memory mapped for the BDAT
chunk. But since we don't record how big the BDAT chunk is, we just
trust that the BIDX offsets won't cause us to read outside of the chunk
memory. A corrupted or malicious commit-graph file will cause us to
segfault (in practice this isn't a very interesting attack, since
commit-graph files are local-only, and the worst case is an
out-of-bounds read).
We can't fix this by checking the chunk size during parsing, since the
data in the BDAT chunk doesn't have a fixed size (that's why we need the
BIDX in the first place). So we'll fix it in two parts:
1. Record the BDAT chunk size during parsing, and then later check
that the BIDX offsets we look up are within bounds.
2. Because the offsets are relative to the end of the BDAT header, we
must also make sure that the BDAT chunk is at least as large as the
expected header size. Otherwise, we overflow when trying to move
past the header, even for an offset of "0". We can check this
early, during the parsing stage.
The error messages are rather verbose, but since this is not something
you'd expect to see outside of severe bugs or corruption, it makes sense
to err on the side of too many details. Sadly we can't mention the
filename during the chunk-parsing stage, as we haven't set g->filename
at this point, nor passed it down through the stack.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-10-09 23:05:50 +02:00
|
|
|
size_t chunk_bloom_data_size;
|
2020-04-06 18:59:49 +02:00
|
|
|
|
2021-01-16 19:11:12 +01:00
|
|
|
struct topo_level_slab *topo_levels;
|
2020-04-06 18:59:49 +02:00
|
|
|
struct bloom_filter_settings *bloom_filter_settings;
|
2018-04-10 14:56:02 +02:00
|
|
|
};
|
|
|
|
|
2020-09-09 17:22:56 +02:00
|
|
|
struct commit_graph *load_commit_graph_one_fd_st(struct repository *r,
|
|
|
|
int fd, struct stat *st,
|
2020-02-03 22:18:04 +01:00
|
|
|
struct object_directory *odb);
|
commit-graph: factor out chain opening function
The load_commit_graph_chain() function opens the chain file and all of
the slices of graph that it points to. If there is no chain file (which
is a totally normal condition), we return NULL. But if we run into
errors with the chain file or loading the actual graph data, we also
return NULL, and the caller cannot tell the difference.
The caller can check for ENOENT for the unremarkable "no such file"
case. But I'm hesitant to assume that the rest of the function would
never accidentally set errno to ENOENT itself, since it is opening the
slice files (and that would mean the caller fails to notice a real
error).
So let's break this into two functions: one to open the file, and one to
actually load it. This matches the interface we provide for the
non-chain graph file, which will also come in handy in a moment when we
fix some bugs in the "git commit-graph verify" code.
Some notes:
- I've kept the "1 is good, 0 is bad" return convention (and the weird
"fd" out-parameter) used by the matching open_commit_graph()
function and other parts of the commit-graph code. This is unlike
most of the rest of Git (which would just return the fd, with -1 for
error), but it makes sense to stay consistent with the adjacent bits
of the API here.
- The existing chain loading function will quietly return if the file
is too small to hold a single entry. I've retained that behavior
(and explicitly set ENOENT in the opener function) for now, under
the notion that it's probably valid (though I'd imagine unusual) to
have an empty chain file.
There are two small behavior changes here, but I think both are strictly
positive:
1. The original blindly did a stat() before checking if fopen()
succeeded, meaning we were making a pointless extra stat call.
2. We now use fstat() to check the file size. The previous code using
a regular stat() on the pathname meant we could technically race
with somebody updating the chain file, and end up with a size that
does not match what we just opened with fopen(). I doubt anybody
ever hit this in practice, but it may have caused an out-of-bounds
read.
We'll retain the load_commit_graph_chain() function which does both the
open and reading steps (most existing callers do not care about seeing
errors anyway, since loading commit-graphs is optimistic).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-09-28 06:38:07 +02:00
|
|
|
struct commit_graph *load_commit_graph_chain_fd_st(struct repository *r,
|
2023-09-28 06:39:51 +02:00
|
|
|
int fd, struct stat *st,
|
|
|
|
int *incomplete_chain);
|
2020-02-03 22:18:00 +01:00
|
|
|
struct commit_graph *read_commit_graph_one(struct repository *r,
|
|
|
|
struct object_directory *odb);
|
commit-graph: pass repo_settings instead of repository
The parse_commit_graph() function takes a 'struct repository *' pointer,
but it only ever accesses config settings (either directly or through
the .settings field of the repo struct). Move all relevant config
settings into the repo_settings struct, and update parse_commit_graph()
and its existing callers so that it takes 'struct repo_settings *'
instead.
Callers of parse_commit_graph() will now need to call
prepare_repo_settings() themselves, or initialize a 'struct
repo_settings' directly.
Prior to ab14d0676c (commit-graph: pass a 'struct repository *' in more
places, 2020-09-09), parsing a commit-graph was a pure function
depending only on the contents of the commit-graph itself. Commit
ab14d0676c introduced a dependency on a `struct repository` pointer, and
later commits such as b66d84756f (commit-graph: respect
'commitGraph.readChangedPaths', 2020-09-09) added dependencies on config
settings, which were accessed through the `settings` field of the
repository pointer. This field was initialized via a call to
`prepare_repo_settings()`.
Additionally, this fixes an issue in fuzz-commit-graph: In 44c7e62
(2021-12-06, repo-settings:prepare_repo_settings only in git repos),
prepare_repo_settings was changed to issue a BUG() if it is called by a
process whose CWD is not a Git repository.
The combination of commits mentioned above broke fuzz-commit-graph,
which attempts to parse arbitrary fuzzing-engine-provided bytes as a
commit graph file. Prior to this change, parse_commit_graph() called
prepare_repo_settings(), but since we run the fuzz tests without a valid
repository, we are hitting the BUG() from 44c7e62 for every test case.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Josh Steadmon <steadmon@google.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-07-14 23:43:06 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Callers should initialize the repo_settings with prepare_repo_settings()
|
|
|
|
* prior to calling parse_commit_graph().
|
|
|
|
*/
|
|
|
|
struct commit_graph *parse_commit_graph(struct repo_settings *s,
|
2020-09-09 17:22:56 +02:00
|
|
|
void *graph_map, size_t graph_size);
|
2019-01-15 23:25:50 +01:00
|
|
|
|
commit-reach: use can_all_from_reach
The is_descendant_of method previously used in_merge_bases() to check if
the commit can reach any of the commits in the provided list. This had
two performance problems:
1. The performance is quadratic in worst-case.
2. A single in_merge_bases() call requires walking beyond the target
commit in order to find the full set of boundary commits that may be
merge-bases.
The can_all_from_reach method avoids this quadratic behavior and can
limit the search beyond the target commits using generation numbers. It
requires a small prototype adjustment to stop using commit-date as a
cutoff, as that optimization is no longer appropriate here.
Since in_merge_bases() uses paint_down_to_common(), is_descendant_of()
naturally found cutoffs to avoid walking the entire commit graph. Since
we want to always return the correct result, we cannot use the
min_commit_date cutoff in can_all_from_reach. We then rely on generation
numbers to provide the cutoff.
Since not all repos will have a commit-graph file, nor will we always
have generation numbers computed for a commit-graph file, create a new
method, generation_numbers_enabled(), that checks for a commit-graph
file and sees if the first commit in the file has a non-zero generation
number. In the case that we do not have generation numbers, use the old
logic for is_descendant_of().
Performance was meausured on a copy of the Linux repository using the
'test-tool reach is_descendant_of' command using this input:
A:v4.9
X:v4.10
X:v4.11
X:v4.12
X:v4.13
X:v4.14
X:v4.15
X:v4.16
X:v4.17
X.v3.0
Note that this input is tailored to demonstrate the quadratic nature of
the previous method, as it will compute merge-bases for v4.9 versus all
of the later versions before checking against v4.1.
Before: 0.26 s
After: 0.21 s
Since we previously used the is_descendant_of method in the ref_newer
method, we also measured performance there using
'test-tool reach ref_newer' with this input:
A:v4.9
B:v3.19
Before: 0.10 s
After: 0.08 s
By adding a new commit with parent v3.19, we test the non-reachable case
of ref_newer:
Before: 0.09 s
After: 0.08 s
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-20 18:33:30 +02:00
|
|
|
/*
|
|
|
|
* Return 1 if and only if the repository has a commit-graph
|
|
|
|
* file and generation numbers are computed in that file.
|
|
|
|
*/
|
|
|
|
int generation_numbers_enabled(struct repository *r);
|
|
|
|
|
commit-reach: use corrected commit dates in paint_down_to_common()
091f4cf (commit: don't use generation numbers if not needed,
2018-08-30) changed paint_down_to_common() to use commit dates instead
of generation numbers v1 (topological levels) as the performance
regressed on certain topologies. With generation number v2 (corrected
commit dates) implemented, we no longer have to rely on commit dates and
can use generation numbers.
For example, the command `git merge-base v4.8 v4.9` on the Linux
repository walks 167468 commits, taking 0.135s for committer date and
167496 commits, taking 0.157s for corrected committer date respectively.
While using corrected commit dates, Git walks nearly the same number of
commits as commit date, the process is slower as for each comparision we
have to access a commit-slab (for corrected committer date) instead of
accessing struct member (for committer date).
This change incidentally broke the fragile t6404-recursive-merge test.
t6404-recursive-merge sets up a unique repository where all commits have
the same committer date without a well-defined merge-base.
While running tests with GIT_TEST_COMMIT_GRAPH unset, we use committer
date as a heuristic in paint_down_to_common(). 6404.1 'combined merge
conflicts' merges commits in the order:
- Merge C with B to form an intermediate commit.
- Merge the intermediate commit with A.
With GIT_TEST_COMMIT_GRAPH=1, we write a commit-graph and subsequently
use the corrected committer date, which changes the order in which
commits are merged:
- Merge A with B to form an intermediate commit.
- Merge the intermediate commit with C.
While resulting repositories are equivalent, 6404.4 'virtual trees were
processed' fails with GIT_TEST_COMMIT_GRAPH=1 as we are selecting
different merge-bases and thus have different object ids for the
intermediate commits.
As this has already causes problems (as noted in 859fdc0 (commit-graph:
define GIT_TEST_COMMIT_GRAPH, 2018-08-29)), we disable commit graph
within t6404-recursive-merge.
Signed-off-by: Abhishek Kumar <abhishekkumar8222@gmail.com>
Reviewed-by: Taylor Blau <me@ttaylorr.com>
Reviewed-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-01-16 19:11:17 +01:00
|
|
|
/*
|
|
|
|
* Return 1 if and only if the repository has a commit-graph
|
|
|
|
* file and generation data chunk has been written for the file.
|
|
|
|
*/
|
|
|
|
int corrected_commit_dates_enabled(struct repository *r);
|
|
|
|
|
commit-graph: introduce 'get_bloom_filter_settings()'
Many places in the code often need a pointer to the commit-graph's
'struct bloom_filter_settings', in which case they often take the value
from the top-most commit-graph.
In the non-split case, this works as expected. In the split case,
however, things get a little tricky. Not all layers in a chain of
incremental commit-graphs are required to themselves have Bloom data,
and so whether or not some part of the code uses Bloom filters depends
entirely on whether or not the top-most level of the commit-graph chain
has Bloom filters.
This has been the behavior since Bloom filters were introduced, and has
been codified into the tests since a759bfa9ee (t4216: add end to end
tests for git log with Bloom filters, 2020-04-06). In fact, t4216.130
requires that Bloom filters are not used in exactly the case described
earlier.
There is no reason that this needs to be the case, since it is perfectly
valid for commits in an earlier layer to have Bloom filters when commits
in a newer layer do not.
Since Bloom settings are guaranteed in practice to be the same for any
layer in a chain that has Bloom data, it is sufficient to traverse the
'->base_graph' pointer until either (1) a non-null 'struct
bloom_filter_settings *' is found, or (2) until we are at the root of
the commit-graph chain.
Introduce a 'get_bloom_filter_settings()' function that does just this,
and use it instead of purely dereferencing the top-most graph's
'->bloom_filter_settings' pointer.
While we're at it, add an additional test in t5324 to guard against code
in the commit-graph writing machinery that doesn't correctly handle a
NULL 'struct bloom_filter *'.
Co-authored-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-09-09 17:22:44 +02:00
|
|
|
struct bloom_filter_settings *get_bloom_filter_settings(struct repository *r);
|
|
|
|
|
2019-08-05 10:02:39 +02:00
|
|
|
enum commit_graph_write_flags {
|
|
|
|
COMMIT_GRAPH_WRITE_APPEND = (1 << 0),
|
|
|
|
COMMIT_GRAPH_WRITE_PROGRESS = (1 << 1),
|
2019-08-05 10:02:40 +02:00
|
|
|
COMMIT_GRAPH_WRITE_SPLIT = (1 << 2),
|
commit-graph: drop COMMIT_GRAPH_WRITE_CHECK_OIDS flag
Since 7c5c9b9c57 (commit-graph: error out on invalid commit oids in
'write --stdin-commits', 2019-08-05), the commit-graph builtin dies on
receiving non-commit OIDs as input to '--stdin-commits'.
This behavior can be cumbersome to work around in, say, the case of
piping 'git for-each-ref' to 'git commit-graph write --stdin-commits' if
the caller does not want to cull out non-commits themselves. In this
situation, it would be ideal if 'git commit-graph write' wrote the graph
containing the inputs that did pertain to commits, and silently ignored
the remainder of the input.
Some options have been proposed to the effect of '--[no-]check-oids'
which would allow callers to have the commit-graph builtin do just that.
After some discussion, it is difficult to imagine a caller who wouldn't
want to pass '--no-check-oids', suggesting that we should get rid of the
behavior of complaining about non-commit inputs altogether.
If callers do wish to retain this behavior, they can easily work around
this change by doing the following:
git for-each-ref --format='%(objectname) %(objecttype) %(*objecttype)' |
awk '
!/commit/ { print "not-a-commit:"$1 }
/commit/ { print $1 }
' |
git commit-graph write --stdin-commits
To make it so that valid OIDs that refer to non-existent objects are
indeed an error after loosening the error handling, perform an extra
lookup to make sure that object indeed exists before sending it to the
commit-graph internals.
Helped-by: Jeff King <peff@peff.net>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-05-13 23:59:55 +02:00
|
|
|
COMMIT_GRAPH_WRITE_BLOOM_FILTERS = (1 << 3),
|
2020-07-30 22:20:31 +02:00
|
|
|
COMMIT_GRAPH_NO_WRITE_BLOOM_FILTERS = (1 << 4),
|
2019-08-05 10:02:39 +02:00
|
|
|
};
|
2019-06-12 15:29:38 +02:00
|
|
|
|
2020-04-14 06:04:08 +02:00
|
|
|
enum commit_graph_split_flags {
|
builtin/commit-graph.c: introduce split strategy 'no-merge'
In the previous commit, we laid the groundwork for supporting different
splitting strategies. In this commit, we introduce the first splitting
strategy: 'no-merge'.
Passing '--split=no-merge' is useful for callers which wish to write a
new incremental commit-graph, but do not want to spend effort condensing
the incremental chain [1]. Previously, this was possible by passing
'--size-multiple=0', but this no longer the case following 63020f175f
(commit-graph: prefer default size_mult when given zero, 2020-01-02).
When '--split=no-merge' is given, the commit-graph machinery will never
condense an existing chain, and it will always write a new incremental.
[1]: This might occur when, for example, a server administrator running
some program after each push may want to ensure that each job runs
proportional in time to the size of the push, and does not "jump" when
the commit-graph machinery decides to trigger a merge.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-04-14 06:04:12 +02:00
|
|
|
COMMIT_GRAPH_SPLIT_UNSPECIFIED = 0,
|
builtin/commit-graph.c: introduce split strategy 'replace'
When using split commit-graphs, it is sometimes useful to completely
replace the commit-graph chain with a new base.
For example, consider a scenario in which a repository builds a new
commit-graph incremental for each push. Occasionally (say, after some
fixed number of pushes), they may wish to rebuild the commit-graph chain
with all reachable commits.
They can do so with
$ git commit-graph write --reachable
but this removes the chain entirely and replaces it with a single
commit-graph in 'objects/info/commit-graph'. Unfortunately, this means
that the next push will have to move this commit-graph into the first
layer of a new chain, and then write its new commits on top.
Avoid such copying entirely by allowing the caller to specify that they
wish to replace the entirety of their commit-graph chain, while also
specifying that the new commit-graph should become the basis of a fresh,
length-one chain.
This addresses the above situation by making it possible for the caller
to instead write:
$ git commit-graph write --reachable --split=replace
which writes a new length-one chain to 'objects/info/commit-graphs',
making the commit-graph incremental generated by the subsequent push
relatively cheap by avoiding the aforementioned copy.
In order to do this, remove an assumption in 'write_commit_graph_file'
that chains are always at least two incrementals long.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-04-14 06:04:17 +02:00
|
|
|
COMMIT_GRAPH_SPLIT_MERGE_PROHIBITED = 1,
|
|
|
|
COMMIT_GRAPH_SPLIT_REPLACE = 2
|
2020-04-14 06:04:08 +02:00
|
|
|
};
|
|
|
|
|
2020-09-18 04:59:49 +02:00
|
|
|
struct commit_graph_opts {
|
2019-06-18 20:14:32 +02:00
|
|
|
int size_multiple;
|
|
|
|
int max_commits;
|
|
|
|
timestamp_t expire_time;
|
2020-09-18 04:59:49 +02:00
|
|
|
enum commit_graph_split_flags split_flags;
|
2020-09-18 15:27:27 +02:00
|
|
|
int max_new_filters;
|
2019-06-18 20:14:32 +02:00
|
|
|
};
|
|
|
|
|
2019-06-12 15:29:37 +02:00
|
|
|
/*
|
|
|
|
* The write_commit_graph* methods return zero on success
|
|
|
|
* and a negative value on failure. Note that if the repository
|
|
|
|
* is not compatible with the commit-graph feature, then the
|
|
|
|
* methods will return 0 without writing a commit-graph.
|
|
|
|
*/
|
2020-02-04 06:51:50 +01:00
|
|
|
int write_commit_graph_reachable(struct object_directory *odb,
|
2019-08-05 10:02:39 +02:00
|
|
|
enum commit_graph_write_flags flags,
|
2020-09-18 04:59:49 +02:00
|
|
|
const struct commit_graph_opts *opts);
|
2020-02-04 06:51:50 +01:00
|
|
|
int write_commit_graph(struct object_directory *odb,
|
2022-03-04 19:32:12 +01:00
|
|
|
const struct string_list *pack_indexes,
|
2020-04-14 06:04:25 +02:00
|
|
|
struct oidset *commits,
|
2019-08-05 10:02:39 +02:00
|
|
|
enum commit_graph_write_flags flags,
|
2020-09-18 04:59:49 +02:00
|
|
|
const struct commit_graph_opts *opts);
|
2018-04-02 22:34:19 +02:00
|
|
|
|
2019-06-18 20:14:32 +02:00
|
|
|
#define COMMIT_GRAPH_VERIFY_SHALLOW (1 << 0)
|
|
|
|
|
|
|
|
int verify_commit_graph(struct repository *r, struct commit_graph *g, int flags);
|
2018-06-27 15:24:32 +02:00
|
|
|
|
2019-05-17 20:41:47 +02:00
|
|
|
void close_commit_graph(struct raw_object_store *);
|
2018-07-12 00:42:40 +02:00
|
|
|
void free_commit_graph(struct commit_graph *);
|
|
|
|
|
upload-pack: disable commit graph more gently for shallow traversal
When the client has asked for certain shallow options like
"deepen-since", we do a custom rev-list walk that pretends to be
shallow. Before doing so, we have to disable the commit-graph, since it
is not compatible with the shallow view of the repository. That's
handled by 829a321569 (commit-graph: close_commit_graph before shallow
walk, 2018-08-20). That commit literally closes and frees our
repo->objects->commit_graph struct.
That creates an interesting problem for commits that have _already_ been
parsed using the commit graph. Their commit->object.parsed flag is set,
their commit->graph_pos is set, but their commit->maybe_tree may still
be NULL. When somebody later calls repo_get_commit_tree(), we see that
we haven't loaded the tree oid yet and try to get it from the commit
graph. But since it has been freed, we segfault!
So the root of the issue is a data dependency between the commit's
lazy-load of the tree oid and the fact that the commit graph can go
away mid-process. How can we resolve it?
There are a couple of general approaches:
1. The obvious answer is to avoid loading the tree from the graph when
we see that it's NULL. But then what do we return for the tree oid?
If we return NULL, our caller in do_traverse() will rightly
complain that we have no tree. We'd have to fallback to loading the
actual commit object and re-parsing it. That requires teaching
parse_commit_buffer() to understand re-parsing (i.e., not starting
from a clean slate and not leaking any allocated bits like parent
list pointers).
2. When we close the commit graph, walk through the set of in-memory
objects and clear any graph_pos pointers. But this means we also
have to "unparse" any such commits so that we know they still need
to open the commit object to fill in their trees. So it's no less
complicated than (1), and is more expensive (since we clear objects
we might not later need).
3. Stop freeing the commit-graph struct. Continue to let it be used
for lazy-loads of tree oids, but let upload-pack specify that it
shouldn't be used for further commit parsing.
4. Push the whole shallow rev-list out to its own sub-process, with
the commit-graph disabled from the start, giving it a clean memory
space to work from.
I've chosen (3) here. Options (1) and (2) would work, but are
non-trivial to implement. Option (4) is more expensive, and I'm not sure
how complicated it is (shelling out for the actual rev-list part is
easy, but we do then parse the resulting commits internally, and I'm not
clear which parts need to be handling shallow-ness).
The new test in t5500 triggers this segfault, but see the comments there
for how horribly intimate it has to be with how both upload-pack and
commit graphs work.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-09-12 16:44:45 +02:00
|
|
|
/*
|
|
|
|
* Disable further use of the commit graph in this process when parsing a
|
|
|
|
* "struct commit".
|
|
|
|
*/
|
|
|
|
void disable_commit_graph(struct repository *r);
|
|
|
|
|
2020-06-17 11:14:09 +02:00
|
|
|
struct commit_graph_data {
|
|
|
|
uint32_t graph_pos;
|
2021-01-16 19:11:13 +01:00
|
|
|
timestamp_t generation;
|
2020-06-17 11:14:09 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Commits should be parsed before accessing generation, graph positions.
|
|
|
|
*/
|
2021-01-16 19:11:13 +01:00
|
|
|
timestamp_t commit_graph_generation(const struct commit *);
|
2020-06-17 11:14:09 +02:00
|
|
|
uint32_t commit_graph_position(const struct commit *);
|
2023-03-20 12:26:52 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* After this method, all commits reachable from those in the given
|
|
|
|
* list will have non-zero, non-infinite generation numbers.
|
|
|
|
*/
|
|
|
|
void ensure_generations_valid(struct repository *r,
|
|
|
|
struct commit **commits, size_t nr);
|
|
|
|
|
2018-04-02 22:34:19 +02:00
|
|
|
#endif
|