1
0
Fork 0
mirror of https://github.com/git/git.git synced 2024-05-20 09:56:10 +02:00
git/pack-bitmap.h
Eric Wong 4bfdf5800f treewide: switch to khashl for memory savings
khashl is an updated version of khash with less memory overhead
(one bit/bucket instead of two) than the original khash and
similar overall performance.  According to its author,
insertions are simpler (linear probing) but deletions may be
slightly slower[1].  Of course, the majority of hash tables in
git do not delete individual elements.

Overall memory usage did not decrease much, as the hash tables
and elements we store in them are big and currently dwarf the
overhead of the khash internals.  Only around 10 MB in
allocations (and a few dozen KB peak use out of ~6 GB) is saved
when doing a no-op `git gc' of a Linux kernel object store with
thousands of refs and islands.

A summary of differences I've found from khash to khashl:

* two 32-bit ints (instead of four) in the top-level struct

* 2 heap allocations (instead of 3) for maps
  (though I wonder if locality suffers when probing is necessary)

* 1 bit of metadata per-bucket (no tombstones for deleted elements)

* 0.75 load factor.  Lowered slightly from 0.77, but no FP multiply
  and responsible for the aforementioned struct size reduction

* FNV-1A instead of x31 hash for strings

* Fibonacci hashing (__kh_h2b), probably good for FNV-1A, but
  I'm skeptical of its usefulness for our SHA-*-based use cases

* linear probing instead of quadratic

* Wang's integer hash functions (currently unused)

* optional hash value caching and ensemble APIs (currently unused)

* some API differences (see below), but not enough to easily
  use both khash and khashl in the same compilation unit

This patch was made with two additional goals to ease review:

1) minimize changes outside of khash*.h files

2) minimize and document all differences from upstream[2] khashl.h

Our khashl.h differences from upstream:

* favor portability constructs from our codebase:
  MAYBE_UNUSED over klib_unused, inline over kh_inline, and
  various integer types

* disable packed attribute to satisfy -Werror=address-of-packed-member,
  AFAIK it doesn't change any of the data structures we use

* port the following commits over from our old khash.h:
  9249ca26ac (khash: factor out kh_release_*, 2018-10-04)
  2756ca4347 (use REALLOC_ARRAY for changing the allocation size of arrays, 2014-09-16)
  5632e838f8 (khash: clarify that allocations never fail, 2021-07-03)

* use our memory allocation wrappers

* provide wrappers for compatibility with existing callers using the
  khash API.  The khashl function naming convention is: ${NOUN}_${VERB}
  while the khash convention is: kh_${VERB}_${NOUN}.  The kh_${NAME}_t
  typedef and naming convention are preserved via __KHASH_COMPAT macro
  to ease review (despite the `_t' suffix being reserved and typedefs
  being discouraged in the Linux kernel).

* copy relevant API docs over from khash.h for identically named macros

* preserve kh_begin, kh_foreach, kh_foreach_value from khash.h since
  khashl.h doesn't provide them

* flesh out KHASHL_{SET,MAP}_INIT wrappers with *_clear, *_resize,
  and *_release functions

* sparse fixes from Junio and Jeff

[1] https://attractivechaos.wordpress.com/2019/12/28/deletion-from-hash-tables-without-tombstones/
[2] git clone https://github.com/attractivechaos/klib.git
    2895a16cb55e (support an ensemble of hash tables, 2023-12-18)

khashl.h API differences from khash.h which affected this change:

* KHASHL_MAP_INIT and KHASHL_SET_INIT macros replace KHASH_INIT

* user-supplied hash and equality functions use different names

* object-store-ll.h avoided the kh_*_t convention (since I dislike
  typedef) and was the only place where I had to change a definition.

Signed-off-by: Eric Wong <e@80x24.org>
Helped-by: Junio C Hamano <gitster@pobox.com>
Helped-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2024-03-28 08:52:43 -07:00

130 lines
3.9 KiB
C

#ifndef PACK_BITMAP_H
#define PACK_BITMAP_H
#include "ewah/ewok.h"
#include "khashl.h"
#include "pack.h"
#include "pack-objects.h"
#include "string-list.h"
/*
* Forward declarations. This header only refers to these types through
* pointers, so their full definitions are not required here.
*/
struct commit;
struct repository;
struct rev_info;
/*
* Four-byte signature "BITM". It is initialized character by character, so
* there is no trailing NUL and the array is exactly four bytes long; do not
* treat it as a C string. Its length sizes bitmap_disk_header.magic.
*/
static const char BITMAP_IDX_SIGNATURE[] = {'B', 'I', 'T', 'M'};
/*
* Fixed-size header record of a bitmap index.
* NOTE(review): the name suggests this mirrors the on-disk layout, so field
* order and widths should not be changed without checking the code that
* reads and writes it.
*/
struct bitmap_disk_header {
/* Same length as BITMAP_IDX_SIGNATURE (sized via ARRAY_SIZE). */
char magic[ARRAY_SIZE(BITMAP_IDX_SIGNATURE)];
uint16_t version;
/* NOTE(review): presumably a mask of enum pack_bitmap_opts bits -- confirm. */
uint16_t options;
uint32_t entry_count;
/* Buffer of GIT_MAX_RAWSZ bytes; the hash in use may be shorter -- TODO confirm. */
unsigned char checksum[GIT_MAX_RAWSZ];
};
/*
* Unsigned single-bit mask with bit 22 set.
* NOTE(review): presumably an object flag bit; confirm that bit 22 does not
* clash with flag bits allocated elsewhere in the project.
*/
#define NEEDS_BITMAP (1u << 22)
/*
* Size in bytes of one triplet in the lookup table extension:
*
* (commit_pos, offset, xor_row)
*
* The fields are 32, 64 and 32 bits wide respectively, i.e. 4 + 8 + 4 bytes.
*/
#define BITMAP_LOOKUP_TABLE_TRIPLET_WIDTH (4 + 8 + 4)
/*
* Option bits for a bitmap index; each enumerator is a distinct single bit
* so they can be OR-ed together.
* NOTE(review): presumably these are the bits stored in
* bitmap_disk_header.options -- confirm before renumbering anything.
*/
enum pack_bitmap_opts {
	BITMAP_OPT_FULL_DAG     = (1 << 0),
	BITMAP_OPT_HASH_CACHE   = (1 << 2),
	BITMAP_OPT_LOOKUP_TABLE = (1 << 4),
};
/*
* Flag bits for bitmaps.
* NOTE(review): where this flag is stored and what "reuse" refers to is not
* visible in this header -- check the users of BITMAP_FLAG_REUSE.
*/
enum pack_bitmap_flags {
	BITMAP_FLAG_REUSE = (1 << 0),
};
/*
* Callback type accepted by traverse_bitmap_commit_list() as its
* show_reachable argument. The callback receives an object id, the object
* type, a flags word, a 32-bit hash, and a pack/offset pair, and returns
* an int.
* NOTE(review): the meaning of `flags`, `hash` and the return value is not
* defined in this header -- check the implementation before relying on them.
*/
typedef int (*show_reachable_fn)(
const struct object_id *oid,
enum object_type type,
int flags,
uint32_t hash,
struct packed_git *found_pack,
off_t found_offset);
/*
* Incomplete type: this header never defines the structure and only passes
* pointers to it, so callers must treat it as opaque.
*/
struct bitmap_index;
/*
* Associates one pack with a position/count pair and a pack identifier.
* NOTE(review): bitmap_pos and bitmap_nr look like the first bit position
* and the number of bits that belong to pack `p` -- confirm against the
* code that fills this structure in.
*/
struct bitmapped_pack {
struct packed_git *p;
uint32_t bitmap_pos;
uint32_t bitmap_nr;
uint32_t pack_int_id; /* MIDX only */
};
/*
* Obtain a bitmap_index for a repository, or for a multi-pack index.
* NOTE(review): presumably both return NULL when no usable bitmap exists and
* the result is released with free_bitmap_index() -- confirm in the
* implementation.
*/
struct bitmap_index *prepare_bitmap_git(struct repository *r);
struct bitmap_index *prepare_midx_bitmap_git(struct multi_pack_index *midx);
/*
* Reports four counters (commits, trees, blobs, tags) through the uint32_t
* out-pointers.
* NOTE(review): whether any of the pointers may be NULL is not visible here.
*/
void count_bitmap_commit_list(struct bitmap_index *, uint32_t *commits,
uint32_t *trees, uint32_t *blobs, uint32_t *tags);
/*
* Walks a bitmap result using the given show_reachable_fn callback.
* NOTE(review): presumably the callback is invoked once per object in the
* result -- confirm in the implementation.
*/
void traverse_bitmap_commit_list(struct bitmap_index *,
struct rev_info *revs,
show_reachable_fn show_reachable);
/*
* Entry points with a test_ prefix.
* NOTE(review): presumably used only by test tooling; the meaning of the int
* return values is not visible in this header.
*/
void test_bitmap_walk(struct rev_info *revs);
int test_bitmap_commits(struct repository *r);
int test_bitmap_hashes(struct repository *r);
/*
* Expands to a string literal equal to the macro's own name.
* NOTE(review): presumably the name of an environment variable read by the
* test suite -- confirm where it is consumed.
*/
#define GIT_TEST_PACK_USE_BITMAP_BOUNDARY_TRAVERSAL \
"GIT_TEST_PACK_USE_BITMAP_BOUNDARY_TRAVERSAL"
/*
* Sets up a bitmap-based traversal for `revs` and returns the bitmap_index
* to use with the query functions declared in this header (for example
* bitmap_has_oid_in_uninteresting()).
* NOTE(review): presumably returns NULL when bitmaps cannot serve the walk;
* the exact effect of `filter_provided_objects` is not visible here.
*/
struct bitmap_index *prepare_bitmap_walk(struct rev_info *revs,
int filter_provided_objects);
/*
* Produces three outputs through pointer parameters: an array of
* bitmapped_pack (`packs_out`), its element count (`packs_nr_out`) and a
* bitmap (`reuse_out`).
* NOTE(review): ownership of the returned array and bitmap, and the meaning
* of `multi_pack_reuse`, are not visible here -- check the implementation.
*/
void reuse_partial_packfile_from_bitmap(struct bitmap_index *bitmap_git,
struct bitmapped_pack **packs_out,
size_t *packs_nr_out,
struct bitmap **reuse_out,
int multi_pack_reuse);
/*
* Takes an existing bitmap index, a packing_data mapping and an oid-keyed
* hash map (`reused_bitmaps`), plus a progress switch.
* NOTE(review): presumably `reused_bitmaps` is populated by this call and a
* negative return signals failure -- confirm in the implementation.
*/
int rebuild_existing_bitmaps(struct bitmap_index *, struct packing_data *mapping,
kh_oid_map_t *reused_bitmaps, int show_progress);
/*
* Releases a bitmap_index.
* NOTE(review): whether a NULL argument is accepted is not visible here.
*/
void free_bitmap_index(struct bitmap_index *);
/*
* Queries `bitmap` for the object `oid`, using the given bitmap_index.
* NOTE(review): presumably returns non-zero when the object's bit is set in
* `bitmap` -- confirm in the implementation.
*/
int bitmap_walk_contains(struct bitmap_index *,
struct bitmap *bitmap, const struct object_id *oid);
/*
* After a traversal has been performed by prepare_bitmap_walk(), this can be
* queried to see if a particular object was reachable from any of the
* objects flagged as UNINTERESTING.
*/
int bitmap_has_oid_in_uninteresting(struct bitmap_index *, const struct object_id *oid);
/*
* Returns an off_t computed from a bitmap_index and a rev_info.
* NOTE(review): by its name this is the total on-disk size of the selected
* objects -- confirm in the implementation.
*/
off_t get_disk_usage_from_bitmap(struct bitmap_index *, struct rev_info *);
/*
* Bitmap writer configuration. None of these take a writer handle.
* NOTE(review): this suggests the writer state is global to the
* implementation -- confirm before calling from more than one context.
*/
void bitmap_writer_show_progress(int show);
/*
* NOTE(review): the parameter is named `sha1`, but the length of the buffer
* that is read is not visible here -- confirm it matches the hash in use.
*/
void bitmap_writer_set_checksum(const unsigned char *sha1);
/* `index` is an array of `index_nr` pack_idx_entry pointers. */
void bitmap_writer_build_type_index(struct packing_data *to_pack,
struct pack_idx_entry **index,
uint32_t index_nr);
/*
* Returns a uint32_t array derived from a bitmap index and a packing_data.
* NOTE(review): presumably heap-allocated and owned by the caller, and
* suitable as the `reposition` argument of rebuild_bitmap() -- confirm.
*/
uint32_t *create_bitmap_mapping(struct bitmap_index *bitmap_git,
struct packing_data *mapping);
/*
* Fills `dest` from the compressed bitmap `source` using the `reposition`
* table, which is only read (const).
* NOTE(review): the meaning of the int return value is not visible here.
*/
int rebuild_bitmap(const uint32_t *reposition,
struct ewah_bitmap *source,
struct bitmap *dest);
/*
* Looks up the compressed bitmap associated with `commit`.
* NOTE(review): presumably returns NULL when the commit has no stored
* bitmap; ownership of the result is not visible here -- confirm.
*/
struct ewah_bitmap *bitmap_for_commit(struct bitmap_index *bitmap_git,
struct commit *commit);
/*
* `indexed_commits` is an array of `indexed_commits_nr` commit pointers.
* NOTE(review): how `max_bitmaps` limits the selection (and whether a
* negative value means "no limit") is not visible here.
*/
void bitmap_writer_select_commits(struct commit **indexed_commits,
unsigned int indexed_commits_nr, int max_bitmaps);
/* NOTE(review): the meaning of the int return value is not visible here. */
int bitmap_writer_build(struct packing_data *to_pack);
/*
* Takes the index entries, an output file name and a 16-bit options word.
* NOTE(review): presumably `options` carries enum pack_bitmap_opts bits and
* the bitmap is written to `filename` -- confirm in the implementation.
*/
void bitmap_writer_finish(struct pack_idx_entry **index,
uint32_t index_nr,
const char *filename,
uint16_t options);
/*
* Return the bitmap file name for a multi-pack index or for a pack.
* NOTE(review): the non-const char * return suggests a freshly allocated
* string that the caller must free -- confirm in the implementation.
*/
char *midx_bitmap_filename(struct multi_pack_index *midx);
char *pack_bitmap_filename(struct packed_git *p);
/* NOTE(review): presumably non-zero when the bitmap belongs to a MIDX. */
int bitmap_is_midx(struct bitmap_index *bitmap_git);
/*
* The returned string_list is const, so callers must not modify it.
* NOTE(review): whether NULL can be returned is not visible here.
*/
const struct string_list *bitmap_preferred_tips(struct repository *r);
/* NOTE(review): presumably non-zero when `refname` is a preferred tip. */
int bitmap_is_preferred_refname(struct repository *r, const char *refname);
/* NOTE(review): the meaning of the int return value is not visible here. */
int verify_bitmap_files(struct repository *r);
#endif