From 7ce7c7607b14248b1a3ae7cdd1c079eb3b6efa09 Mon Sep 17 00:00:00 2001 From: Steffen Prohaska Date: Thu, 21 Aug 2014 18:05:08 +0200 Subject: [PATCH 1/7] convert: drop arguments other than 'path' from would_convert_to_git() It is only the path that matters in the decision whether to filter or not. Clarify this by making path the only argument of would_convert_to_git(). Signed-off-by: Steffen Prohaska Signed-off-by: Junio C Hamano --- convert.h | 5 ++--- sha1_file.c | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/convert.h b/convert.h index 0c2143c152..c638b33632 100644 --- a/convert.h +++ b/convert.h @@ -40,10 +40,9 @@ extern int convert_to_working_tree(const char *path, const char *src, size_t len, struct strbuf *dst); extern int renormalize_buffer(const char *path, const char *src, size_t len, struct strbuf *dst); -static inline int would_convert_to_git(const char *path, const char *src, - size_t len, enum safe_crlf checksafe) +static inline int would_convert_to_git(const char *path) { - return convert_to_git(path, src, len, NULL, checksafe); + return convert_to_git(path, NULL, 0, NULL, 0); } /***************************************************************** diff --git a/sha1_file.c b/sha1_file.c index 3f70b1d86a..00c07f233b 100644 --- a/sha1_file.c +++ b/sha1_file.c @@ -3144,7 +3144,7 @@ int index_fd(unsigned char *sha1, int fd, struct stat *st, if (!S_ISREG(st->st_mode)) ret = index_pipe(sha1, fd, type, path, flags); else if (size <= big_file_threshold || type != OBJ_BLOB || - (path && would_convert_to_git(path, NULL, 0, 0))) + (path && would_convert_to_git(path))) ret = index_core(sha1, fd, size, type, path, flags); else ret = index_stream(sha1, fd, size, type, path, flags); From 23b0c4782e5f9357e6189253021053a4404ddf4e Mon Sep 17 00:00:00 2001 From: Steffen Prohaska Date: Tue, 26 Aug 2014 17:23:21 +0200 Subject: [PATCH 2/7] config.c: add git_env_ulong() to parse environment variable The new function parses an integeral value that fits in unsigned long in human readable form, i.e. possibly with unit suffix, e.g. 10k = 10240, etc., from an environment variable. Parsing of GIT_MMAP_LIMIT and GIT_ALLOC_LIMIT will use it in later patches. Signed-off-by: Steffen Prohaska Signed-off-by: Junio C Hamano --- cache.h | 1 + config.c | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/cache.h b/cache.h index fcb511db70..b820b6adc1 100644 --- a/cache.h +++ b/cache.h @@ -1318,6 +1318,7 @@ extern int git_config_rename_section_in_file(const char *, const char *, const c extern const char *git_etc_gitconfig(void); extern int check_repository_format_version(const char *var, const char *value, void *cb); extern int git_env_bool(const char *, int); +extern unsigned long git_env_ulong(const char *, unsigned long); extern int git_config_system(void); extern int config_error_nonbool(const char *); #if defined(__GNUC__) diff --git a/config.c b/config.c index 058505cb8d..adab91909e 100644 --- a/config.c +++ b/config.c @@ -1116,12 +1116,28 @@ const char *git_etc_gitconfig(void) return system_wide; } +/* + * Parse environment variable 'k' as a boolean (in various + * possible spellings); if missing, use the default value 'def'. + */ int git_env_bool(const char *k, int def) { const char *v = getenv(k); return v ? git_config_bool(k, v) : def; } +/* + * Parse environment variable 'k' as ulong with possibly a unit + * suffix; if missing, use the default value 'val'. + */ +unsigned long git_env_ulong(const char *k, unsigned long val) +{ + const char *v = getenv(k); + if (v && !git_parse_ulong(v, &val)) + die("failed to parse %s", k); + return val; +} + int git_config_system(void) { return !git_env_bool("GIT_CONFIG_NOSYSTEM", 0); From 9927d9627fe01d780e754a02cf9adc36e084c587 Mon Sep 17 00:00:00 2001 From: Steffen Prohaska Date: Tue, 26 Aug 2014 17:23:22 +0200 Subject: [PATCH 3/7] memory_limit: use git_env_ulong() to parse GIT_ALLOC_LIMIT GIT_ALLOC_LIMIT limits xmalloc()'s size, which is of type size_t. Better use git_env_ulong() to parse the environment variable, so that the postfixes 'k', 'm', and 'g' can be used; and use size_t to store the limit for consistency. The change to size_t has no direct practical impact, because the environment variable is only meant to be used for our own tests, and we use it to test small sizes. The cast of size in the call to die() is changed to uintmax_t to match the format string PRIuMAX. Signed-off-by: Steffen Prohaska Signed-off-by: Junio C Hamano --- t/t1050-large.sh | 2 +- wrapper.c | 15 ++++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/t/t1050-large.sh b/t/t1050-large.sh index aea493646e..e7657ab323 100755 --- a/t/t1050-large.sh +++ b/t/t1050-large.sh @@ -13,7 +13,7 @@ test_expect_success setup ' echo X | dd of=large2 bs=1k seek=2000 && echo X | dd of=large3 bs=1k seek=2000 && echo Y | dd of=huge bs=1k seek=2500 && - GIT_ALLOC_LIMIT=1500 && + GIT_ALLOC_LIMIT=1500k && export GIT_ALLOC_LIMIT ' diff --git a/wrapper.c b/wrapper.c index bc1bfb8600..c5204f7a07 100644 --- a/wrapper.c +++ b/wrapper.c @@ -11,14 +11,15 @@ static void (*try_to_free_routine)(size_t size) = do_nothing; static void memory_limit_check(size_t size) { - static int limit = -1; - if (limit == -1) { - const char *env = getenv("GIT_ALLOC_LIMIT"); - limit = env ? atoi(env) * 1024 : 0; + static size_t limit = 0; + if (!limit) { + limit = git_env_ulong("GIT_ALLOC_LIMIT", 0); + if (!limit) + limit = SIZE_MAX; } - if (limit && size > limit) - die("attempting to allocate %"PRIuMAX" over limit %d", - (intmax_t)size, limit); + if (size > limit) + die("attempting to allocate %"PRIuMAX" over limit %"PRIuMAX, + (uintmax_t)size, (uintmax_t)limit); } try_to_free_t set_try_to_free_routine(try_to_free_t routine) From 02710228dd79d9f9c6fa180233491639b603c06d Mon Sep 17 00:00:00 2001 From: Steffen Prohaska Date: Tue, 26 Aug 2014 17:23:23 +0200 Subject: [PATCH 4/7] mmap_limit: introduce GIT_MMAP_LIMIT to allow testing expected mmap size In order to test expectations about mmap in a way similar to testing expectations about malloc with GIT_ALLOC_LIMIT introduced by d41489a6 (Add more large blob test cases, 2012-03-07), introduce a new environment variable GIT_MMAP_LIMIT to limit the largest allowed mmap length. xmmap() is modified to check the size of the requested region and fail it if it is beyond the limit. Together with GIT_ALLOC_LIMIT tests can now confirm expectations about memory consumption. Signed-off-by: Steffen Prohaska Signed-off-by: Junio C Hamano --- sha1_file.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/sha1_file.c b/sha1_file.c index 00c07f233b..d9b51578cc 100644 --- a/sha1_file.c +++ b/sha1_file.c @@ -663,10 +663,26 @@ void release_pack_memory(size_t need) ; /* nothing */ } +static void mmap_limit_check(size_t length) +{ + static size_t limit = 0; + if (!limit) { + limit = git_env_ulong("GIT_MMAP_LIMIT", 0); + if (!limit) + limit = SIZE_MAX; + } + if (length > limit) + die("attempting to mmap %"PRIuMAX" over limit %"PRIuMAX, + (uintmax_t)length, (uintmax_t)limit); +} + void *xmmap(void *start, size_t length, int prot, int flags, int fd, off_t offset) { - void *ret = mmap(start, length, prot, flags, fd, offset); + void *ret; + + mmap_limit_check(length); + ret = mmap(start, length, prot, flags, fd, offset); if (ret == MAP_FAILED) { if (!length) return NULL; From b29763aa9bcbb99a59aec3820e30ff1864cfa765 Mon Sep 17 00:00:00 2001 From: Steffen Prohaska Date: Tue, 26 Aug 2014 17:23:24 +0200 Subject: [PATCH 5/7] copy_fd(): do not close the input file descriptor The caller, not this function, opened the file descriptor; it is selfish for the callee to close it when it is done reading from it. The caller may want an option to rewind and re-read the contents after it returns. Simplify the loop to copy the input in full to the output; its body essentially is what a call to write_in_full() helper does. Signed-off-by: Steffen Prohaska Helped-by: Jeff King Signed-off-by: Junio C Hamano --- copy.c | 26 +++++--------------------- lockfile.c | 3 +++ 2 files changed, 8 insertions(+), 21 deletions(-) diff --git a/copy.c b/copy.c index a7f58fd905..f2970ec462 100644 --- a/copy.c +++ b/copy.c @@ -4,34 +4,17 @@ int copy_fd(int ifd, int ofd) { while (1) { char buffer[8192]; - char *buf = buffer; ssize_t len = xread(ifd, buffer, sizeof(buffer)); if (!len) break; if (len < 0) { - int read_error = errno; - close(ifd); return error("copy-fd: read returned %s", - strerror(read_error)); - } - while (len) { - int written = xwrite(ofd, buf, len); - if (written > 0) { - buf += written; - len -= written; - } - else if (!written) { - close(ifd); - return error("copy-fd: write returned 0"); - } else { - int write_error = errno; - close(ifd); - return error("copy-fd: write returned %s", - strerror(write_error)); - } + strerror(errno)); } + if (write_in_full(ofd, buffer, len) < 0) + return error("copy-fd: write returned %s", + strerror(errno)); } - close(ifd); return 0; } @@ -60,6 +43,7 @@ int copy_file(const char *dst, const char *src, int mode) return fdo; } status = copy_fd(fdi, fdo); + close(fdi); if (close(fdo) != 0) return error("%s: close error: %s", dst, strerror(errno)); diff --git a/lockfile.c b/lockfile.c index 2564a7f544..2448d30cd0 100644 --- a/lockfile.c +++ b/lockfile.c @@ -224,8 +224,11 @@ int hold_lock_file_for_append(struct lock_file *lk, const char *path, int flags) } else if (copy_fd(orig_fd, fd)) { if (flags & LOCK_DIE_ON_ERROR) exit(128); + close(orig_fd); close(fd); return -1; + } else { + close(orig_fd); } return fd; } From 9035d75a2be9d80d82676504d69553245017f6d4 Mon Sep 17 00:00:00 2001 From: Steffen Prohaska Date: Tue, 26 Aug 2014 17:23:25 +0200 Subject: [PATCH 6/7] convert: stream from fd to required clean filter to reduce used address space The data is streamed to the filter process anyway. Better avoid mapping the file if possible. This is especially useful if a clean filter reduces the size, for example if it computes a sha1 for binary data, like git media. The file size that the previous implementation could handle was limited by the available address space; large files for example could not be handled with (32-bit) msysgit. The new implementation can filter files of any size as long as the filter output is small enough. The new code path is only taken if the filter is required. The filter consumes data directly from the fd. If it fails, the original data is not immediately available. The condition can easily be handled as a fatal error, which is expected for a required filter anyway. If the filter was not required, the condition would need to be handled in a different way, like seeking to 0 and reading the data. But this would require more restructuring of the code and is probably not worth it. The obvious approach of falling back to reading all data would not help achieving the main purpose of this patch, which is to handle large files with limited address space. If reading all data is an option, we can simply take the old code path right away and mmap the entire file. The environment variable GIT_MMAP_LIMIT, which has been introduced in a previous commit is used to test that the expected code path is taken. A related test that exercises required filters is modified to verify that the data actually has been modified on its way from the file system to the object store. Signed-off-by: Steffen Prohaska Signed-off-by: Junio C Hamano --- convert.c | 55 ++++++++++++++++++++++++++++++++++++++----- convert.h | 5 ++++ sha1_file.c | 27 ++++++++++++++++++++- t/t0021-conversion.sh | 24 +++++++++++++++---- 4 files changed, 99 insertions(+), 12 deletions(-) diff --git a/convert.c b/convert.c index cb5fbb45ea..677d339a8b 100644 --- a/convert.c +++ b/convert.c @@ -312,11 +312,12 @@ static int crlf_to_worktree(const char *path, const char *src, size_t len, struct filter_params { const char *src; unsigned long size; + int fd; const char *cmd; const char *path; }; -static int filter_buffer(int in, int out, void *data) +static int filter_buffer_or_fd(int in, int out, void *data) { /* * Spawn cmd and feed the buffer contents through its stdin. @@ -355,7 +356,12 @@ static int filter_buffer(int in, int out, void *data) sigchain_push(SIGPIPE, SIG_IGN); - write_err = (write_in_full(child_process.in, params->src, params->size) < 0); + if (params->src) { + write_err = (write_in_full(child_process.in, params->src, params->size) < 0); + } else { + write_err = copy_fd(params->fd, child_process.in); + } + if (close(child_process.in)) write_err = 1; if (write_err) @@ -371,7 +377,7 @@ static int filter_buffer(int in, int out, void *data) return (write_err || status); } -static int apply_filter(const char *path, const char *src, size_t len, +static int apply_filter(const char *path, const char *src, size_t len, int fd, struct strbuf *dst, const char *cmd) { /* @@ -392,11 +398,12 @@ static int apply_filter(const char *path, const char *src, size_t len, return 1; memset(&async, 0, sizeof(async)); - async.proc = filter_buffer; + async.proc = filter_buffer_or_fd; async.data = ¶ms; async.out = -1; params.src = src; params.size = len; + params.fd = fd; params.cmd = cmd; params.path = path; @@ -747,6 +754,25 @@ static void convert_attrs(struct conv_attrs *ca, const char *path) } } +int would_convert_to_git_filter_fd(const char *path) +{ + struct conv_attrs ca; + + convert_attrs(&ca, path); + if (!ca.drv) + return 0; + + /* + * Apply a filter to an fd only if the filter is required to succeed. + * We must die if the filter fails, because the original data before + * filtering is not available. + */ + if (!ca.drv->required) + return 0; + + return apply_filter(path, NULL, 0, -1, NULL, ca.drv->clean); +} + int convert_to_git(const char *path, const char *src, size_t len, struct strbuf *dst, enum safe_crlf checksafe) { @@ -761,7 +787,7 @@ int convert_to_git(const char *path, const char *src, size_t len, required = ca.drv->required; } - ret |= apply_filter(path, src, len, dst, filter); + ret |= apply_filter(path, src, len, -1, dst, filter); if (!ret && required) die("%s: clean filter '%s' failed", path, ca.drv->name); @@ -778,6 +804,23 @@ int convert_to_git(const char *path, const char *src, size_t len, return ret | ident_to_git(path, src, len, dst, ca.ident); } +void convert_to_git_filter_fd(const char *path, int fd, struct strbuf *dst, + enum safe_crlf checksafe) +{ + struct conv_attrs ca; + convert_attrs(&ca, path); + + assert(ca.drv); + assert(ca.drv->clean); + + if (!apply_filter(path, NULL, 0, fd, dst, ca.drv->clean)) + die("%s: clean filter '%s' failed", path, ca.drv->name); + + ca.crlf_action = input_crlf_action(ca.crlf_action, ca.eol_attr); + crlf_to_git(path, dst->buf, dst->len, dst, ca.crlf_action, checksafe); + ident_to_git(path, dst->buf, dst->len, dst, ca.ident); +} + static int convert_to_working_tree_internal(const char *path, const char *src, size_t len, struct strbuf *dst, int normalizing) @@ -811,7 +854,7 @@ static int convert_to_working_tree_internal(const char *path, const char *src, } } - ret_filter = apply_filter(path, src, len, dst, filter); + ret_filter = apply_filter(path, src, len, -1, dst, filter); if (!ret_filter && required) die("%s: smudge filter %s failed", path, ca.drv->name); diff --git a/convert.h b/convert.h index c638b33632..d9d853cd3d 100644 --- a/convert.h +++ b/convert.h @@ -44,6 +44,11 @@ static inline int would_convert_to_git(const char *path) { return convert_to_git(path, NULL, 0, NULL, 0); } +/* Precondition: would_convert_to_git_filter_fd(path) == true */ +extern void convert_to_git_filter_fd(const char *path, int fd, + struct strbuf *dst, + enum safe_crlf checksafe); +extern int would_convert_to_git_filter_fd(const char *path); /***************************************************************** * diff --git a/sha1_file.c b/sha1_file.c index d9b51578cc..423ec64e87 100644 --- a/sha1_file.c +++ b/sha1_file.c @@ -3090,6 +3090,29 @@ static int index_mem(unsigned char *sha1, void *buf, size_t size, return ret; } +static int index_stream_convert_blob(unsigned char *sha1, int fd, + const char *path, unsigned flags) +{ + int ret; + const int write_object = flags & HASH_WRITE_OBJECT; + struct strbuf sbuf = STRBUF_INIT; + + assert(path); + assert(would_convert_to_git_filter_fd(path)); + + convert_to_git_filter_fd(path, fd, &sbuf, + write_object ? safe_crlf : SAFE_CRLF_FALSE); + + if (write_object) + ret = write_sha1_file(sbuf.buf, sbuf.len, typename(OBJ_BLOB), + sha1); + else + ret = hash_sha1_file(sbuf.buf, sbuf.len, typename(OBJ_BLOB), + sha1); + strbuf_release(&sbuf); + return ret; +} + static int index_pipe(unsigned char *sha1, int fd, enum object_type type, const char *path, unsigned flags) { @@ -3157,7 +3180,9 @@ int index_fd(unsigned char *sha1, int fd, struct stat *st, int ret; size_t size = xsize_t(st->st_size); - if (!S_ISREG(st->st_mode)) + if (type == OBJ_BLOB && path && would_convert_to_git_filter_fd(path)) + ret = index_stream_convert_blob(sha1, fd, path, flags); + else if (!S_ISREG(st->st_mode)) ret = index_pipe(sha1, fd, type, path, flags); else if (size <= big_file_threshold || type != OBJ_BLOB || (path && would_convert_to_git(path))) diff --git a/t/t0021-conversion.sh b/t/t0021-conversion.sh index f890c54d13..ca7d2a630a 100755 --- a/t/t0021-conversion.sh +++ b/t/t0021-conversion.sh @@ -153,17 +153,23 @@ test_expect_success 'filter shell-escaped filenames' ' : ' -test_expect_success 'required filter success' ' - git config filter.required.smudge cat && - git config filter.required.clean cat && +test_expect_success 'required filter should filter data' ' + git config filter.required.smudge ./rot13.sh && + git config filter.required.clean ./rot13.sh && git config filter.required.required true && echo "*.r filter=required" >.gitattributes && - echo test >test.r && + cat test.o >test.r && git add test.r && + rm -f test.r && - git checkout -- test.r + git checkout -- test.r && + cmp test.o test.r && + + ./rot13.sh expected && + git cat-file blob :test.r >actual && + cmp expected actual ' test_expect_success 'required filter smudge failure' ' @@ -190,6 +196,14 @@ test_expect_success 'required filter clean failure' ' test_must_fail git add test.fc ' +test_expect_success 'filtering large input to small output should use little memory' ' + git config filter.devnull.clean "cat >/dev/null" && + git config filter.devnull.required true && + for i in $(test_seq 1 30); do printf "%1048576d" 1; done >30MB && + echo "30MB filter=devnull" >.gitattributes && + GIT_MMAP_LIMIT=1m GIT_ALLOC_LIMIT=1m git add 30MB +' + test_expect_success EXPENSIVE 'filter large file' ' git config filter.largefile.smudge cat && git config filter.largefile.clean cat && From 9079ab7cb6768fa04e086303f90be043b423cca4 Mon Sep 17 00:00:00 2001 From: Steffen Prohaska Date: Sun, 21 Sep 2014 12:03:26 +0200 Subject: [PATCH 7/7] sha1_file: don't convert off_t to size_t too early to avoid potential die() xsize_t() checks if an off_t argument can be safely converted to a size_t return value. If the check is executed too early, it could fail for large files on 32-bit architectures even if the size_t code path is not taken. Other paths might be able to handle the large file. Specifically, index_stream_convert_blob() is able to handle a large file if a filter is configured that returns a small result. Signed-off-by: Steffen Prohaska Signed-off-by: Junio C Hamano --- sha1_file.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/sha1_file.c b/sha1_file.c index 423ec64e87..7b2612f733 100644 --- a/sha1_file.c +++ b/sha1_file.c @@ -3178,17 +3178,22 @@ int index_fd(unsigned char *sha1, int fd, struct stat *st, enum object_type type, const char *path, unsigned flags) { int ret; - size_t size = xsize_t(st->st_size); + /* + * Call xsize_t() only when needed to avoid potentially unnecessary + * die() for large files. + */ if (type == OBJ_BLOB && path && would_convert_to_git_filter_fd(path)) ret = index_stream_convert_blob(sha1, fd, path, flags); else if (!S_ISREG(st->st_mode)) ret = index_pipe(sha1, fd, type, path, flags); - else if (size <= big_file_threshold || type != OBJ_BLOB || + else if (st->st_size <= big_file_threshold || type != OBJ_BLOB || (path && would_convert_to_git(path))) - ret = index_core(sha1, fd, size, type, path, flags); + ret = index_core(sha1, fd, xsize_t(st->st_size), type, path, + flags); else - ret = index_stream(sha1, fd, size, type, path, flags); + ret = index_stream(sha1, fd, xsize_t(st->st_size), type, path, + flags); close(fd); return ret; }