Automatic CRLF <-> LF conversion controlled by "core.autocrlf".

What this patch does:

  * config.c / environment.c / cache.h: add a global "auto_crlf" flag,
    defined as 0 and set from the boolean config key "core.autocrlf".
  * convert.c (new, linked via LIB_OBJS in the Makefile): adds
    convert_to_git(), which drops every CR from a buffer, and
    convert_to_working_tree(), which inserts a CR before every LF that
    is not already preceded by one.  Both return 0 and leave the
    caller's buffer untouched when auto_crlf is off, the buffer is
    empty, there is nothing to convert, the buffer contains a CR that
    is not followed by LF, or the printable/nonprintable heuristic
    calls it binary.  When they return 1, *bufp and *sizep describe a
    new xmalloc()ed buffer; the old buffer is NOT released, so the
    caller must release both.
  * diff.c (diff_populate_filespec), entry.c (write_entry) and
    sha1_file.c (index_fd): replace the former FIXME placeholders with
    calls to the converters and swap/free buffers accordingly.
    index_fd() passes NULL as the path.

Review notes (nothing in the hunks below was changed for these):

  * The "path" argument is not used by either converter yet.
  * is_binary() takes "size" but never reads it.
  * struct text_stat counts in "unsigned" while buffer sizes are
    "unsigned long"; on platforms where these differ in width the
    counters could wrap for very large inputs, and the counts feed the
    xmalloc() size computation -- worth confirming.

Formatting note: this copy of the patch had lost its line breaks and
leading whitespace.  Line structure was rebuilt so that every hunk
matches its "@@" line counts; indentation inside hunks is assumed to
be tabs (git coding style) and should be checked against the target
tree before applying.

diff --git a/Makefile b/Makefile
index 40bdcff696..60496ff957 100644
--- a/Makefile
+++ b/Makefile
@@ -262,7 +262,8 @@ LIB_OBJS = \
 	revision.o pager.o tree-walk.o xdiff-interface.o \
 	write_or_die.o trace.o list-objects.o grep.o \
 	alloc.o merge-file.o path-list.o help.o unpack-trees.o $(DIFF_OBJS) \
-	color.o wt-status.o archive-zip.o archive-tar.o shallow.o utf8.o
+	color.o wt-status.o archive-zip.o archive-tar.o shallow.o utf8.o \
+	convert.o
 
 BUILTIN_OBJS = \
 	builtin-add.o \
diff --git a/cache.h b/cache.h
index c62b0b090d..9c019e8bba 100644
--- a/cache.h
+++ b/cache.h
@@ -201,6 +201,7 @@ extern const char *apply_default_whitespace;
 extern int zlib_compression_level;
 extern size_t packed_git_window_size;
 extern size_t packed_git_limit;
+extern int auto_crlf;
 
 #define GIT_REPO_VERSION 0
 extern int repository_format_version;
@@ -468,4 +469,8 @@ extern int nfvasprintf(char **str, const char *fmt, va_list va);
 extern void trace_printf(const char *format, ...);
 extern void trace_argv_printf(const char **argv, int count, const char *format, ...);
 
+/* convert.c */
+extern int convert_to_git(const char *path, char **bufp, unsigned long *sizep);
+extern int convert_to_working_tree(const char *path, char **bufp, unsigned long *sizep);
+
 #endif /* CACHE_H */
diff --git a/config.c b/config.c
index d82107124a..ffe02129a5 100644
--- a/config.c
+++ b/config.c
@@ -324,6 +324,11 @@ int git_default_config(const char *var, const char *value)
 		return 0;
 	}
 
+	if (!strcmp(var, "core.autocrlf")) {
+		auto_crlf = git_config_bool(var, value);
+		return 0;
+	}
+
 	if (!strcmp(var, "user.name")) {
 		strlcpy(git_default_name, value, sizeof(git_default_name));
 		return 0;
diff --git a/convert.c b/convert.c
new file mode 100644
index 0000000000..13beb70582
--- /dev/null
+++ b/convert.c
@@ -0,0 +1,186 @@
+#include "cache.h"
+/*
+ * convert.c - convert a file when checking it out and checking it in.
+ *
+ * This should use the pathname to decide on whether it wants to do some
+ * more interesting conversions (automatic gzip/unzip, general format
+ * conversions etc etc), but by default it just does automatic CRLF<->LF
+ * translation when the "auto_crlf" option is set.
+ */
+
+struct text_stat {
+	/* CR, LF and CRLF counts */
+	unsigned cr, lf, crlf;
+
+	/* These are just approximations! */
+	unsigned printable, nonprintable;
+};
+
+static void gather_stats(const char *buf, unsigned long size, struct text_stat *stats)
+{
+	unsigned long i;
+
+	memset(stats, 0, sizeof(*stats));
+
+	for (i = 0; i < size; i++) {
+		unsigned char c = buf[i];
+		if (c == '\r') {
+			stats->cr++;
+			if (i+1 < size && buf[i+1] == '\n')
+				stats->crlf++;
+			continue;
+		}
+		if (c == '\n') {
+			stats->lf++;
+			continue;
+		}
+		if (c == 127)
+			/* DEL */
+			stats->nonprintable++;
+		else if (c < 32) {
+			switch (c) {
+				/* BS, HT, ESC and FF */
+			case '\b': case '\t': case '\033': case '\014':
+				stats->printable++;
+				break;
+			default:
+				stats->nonprintable++;
+			}
+		}
+		else
+			stats->printable++;
+	}
+}
+
+/*
+ * The same heuristics as diff.c::mmfile_is_binary()
+ */
+static int is_binary(unsigned long size, struct text_stat *stats)
+{
+
+	if ((stats->printable >> 7) < stats->nonprintable)
+		return 1;
+	/*
+	 * Other heuristics? Average line length might be relevant,
+	 * as might LF vs CR vs CRLF counts..
+	 *
+	 * NOTE! It might be normal to have a low ratio of CRLF to LF
+	 * (somebody starts with a LF-only file and edits it with an editor
+	 * that adds CRLF only to lines that are added..). But do we
+	 * want to support CR-only? Probably not.
+	 */
+	return 0;
+}
+
+int convert_to_git(const char *path, char **bufp, unsigned long *sizep)
+{
+	char *buffer, *nbuf;
+	unsigned long size, nsize;
+	struct text_stat stats;
+
+	/*
+	 * FIXME! Other pluggable conversions should go here,
+	 * based on filename patterns. Right now we just do the
+	 * stupid auto-CRLF one.
+	 */
+	if (!auto_crlf)
+		return 0;
+
+	size = *sizep;
+	if (!size)
+		return 0;
+	buffer = *bufp;
+
+	gather_stats(buffer, size, &stats);
+
+	/* No CR? Nothing to convert, regardless. */
+	if (!stats.cr)
+		return 0;
+
+	/*
+	 * We're currently not going to even try to convert stuff
+	 * that has bare CR characters. Does anybody do that crazy
+	 * stuff?
+	 */
+	if (stats.cr != stats.crlf)
+		return 0;
+
+	/*
+	 * And add some heuristics for binary vs text, of course...
+	 */
+	if (is_binary(size, &stats))
+		return 0;
+
+	/*
+	 * Ok, allocate a new buffer, fill it in, and return true
+	 * to let the caller know that we switched buffers on it.
+	 */
+	nsize = size - stats.crlf;
+	nbuf = xmalloc(nsize);
+	*bufp = nbuf;
+	*sizep = nsize;
+	do {
+		unsigned char c = *buffer++;
+		if (c != '\r')
+			*nbuf++ = c;
+	} while (--size);
+
+	return 1;
+}
+
+int convert_to_working_tree(const char *path, char **bufp, unsigned long *sizep)
+{
+	char *buffer, *nbuf;
+	unsigned long size, nsize;
+	struct text_stat stats;
+	unsigned char last;
+
+	/*
+	 * FIXME! Other pluggable conversions should go here,
+	 * based on filename patterns. Right now we just do the
+	 * stupid auto-CRLF one.
+	 */
+	if (!auto_crlf)
+		return 0;
+
+	size = *sizep;
+	if (!size)
+		return 0;
+	buffer = *bufp;
+
+	gather_stats(buffer, size, &stats);
+
+	/* No LF? Nothing to convert, regardless. */
+	if (!stats.lf)
+		return 0;
+
+	/* Was it already in CRLF format? */
+	if (stats.lf == stats.crlf)
+		return 0;
+
+	/* If we have any bare CR characters, we're not going to touch it */
+	if (stats.cr != stats.crlf)
+		return 0;
+
+	if (is_binary(size, &stats))
+		return 0;
+
+	/*
+	 * Ok, allocate a new buffer, fill it in, and return true
+	 * to let the caller know that we switched buffers on it.
+	 */
+	nsize = size + stats.lf - stats.crlf;
+	nbuf = xmalloc(nsize);
+	*bufp = nbuf;
+	*sizep = nsize;
+	last = 0;
+	do {
+		unsigned char c = *buffer++;
+		if (c == '\n' && last != '\r')
+			*nbuf++ = '\r';
+		*nbuf++ = c;
+		last = c;
+	} while (--size);
+
+	return 1;
+}
diff --git a/diff.c b/diff.c
index 13b9b6c560..561587cace 100644
--- a/diff.c
+++ b/diff.c
@@ -1332,6 +1332,9 @@ int diff_populate_filespec(struct diff_filespec *s, int size_only)
 	    reuse_worktree_file(s->path, s->sha1, 0)) {
 		struct stat st;
 		int fd;
+		char *buf;
+		unsigned long size;
+
 		if (lstat(s->path, &st) < 0) {
 			if (errno == ENOENT) {
 			err_empty:
@@ -1364,7 +1367,19 @@ int diff_populate_filespec(struct diff_filespec *s, int size_only)
 		s->data = xmmap(NULL, s->size, PROT_READ, MAP_PRIVATE, fd, 0);
 		close(fd);
 		s->should_munmap = 1;
-		/* FIXME! CRLF -> LF conversion goes here, based on "s->path" */
+
+		/*
+		 * Convert from working tree format to canonical git format
+		 */
+		buf = s->data;
+		size = s->size;
+		if (convert_to_git(s->path, &buf, &size)) {
+			munmap(s->data, s->size);
+			s->should_munmap = 0;
+			s->data = buf;
+			s->size = size;
+			s->should_free = 1;
+		}
 	}
 	else {
 		char type[20];
diff --git a/entry.c b/entry.c
index c2641ddefd..472a9ef321 100644
--- a/entry.c
+++ b/entry.c
@@ -78,6 +78,9 @@ static int write_entry(struct cache_entry *ce, char *path, struct checkout *stat
 			path, sha1_to_hex(ce->sha1));
 	}
 	switch (ntohl(ce->ce_mode) & S_IFMT) {
+		char *buf;
+		unsigned long nsize;
+
 	case S_IFREG:
 		if (to_tempfile) {
 			strcpy(path, ".merge_file_XXXXXX");
@@ -89,7 +92,18 @@ static int write_entry(struct cache_entry *ce, char *path, struct checkout *stat
 			return error("git-checkout-index: unable to create file %s (%s)",
 				path, strerror(errno));
 		}
-		/* FIXME: LF -> CRLF conversion goes here, based on "ce->name" */
+
+		/*
+		 * Convert from git internal format to working tree format
+		 */
+		buf = new;
+		nsize = size;
+		if (convert_to_working_tree(ce->name, &buf, &nsize)) {
+			free(new);
+			new = buf;
+			size = nsize;
+		}
+
 		wrote = write_in_full(fd, new, size);
 		close(fd);
 		free(new);
diff --git a/environment.c b/environment.c
index 54c22f8248..2fa0960412 100644
--- a/environment.c
+++ b/environment.c
@@ -28,6 +28,7 @@ size_t packed_git_window_size = DEFAULT_PACKED_GIT_WINDOW_SIZE;
 size_t packed_git_limit = DEFAULT_PACKED_GIT_LIMIT;
 int pager_in_use;
 int pager_use_color = 1;
+int auto_crlf = 0;
 
 static const char *git_dir;
 static char *git_object_dir, *git_index_file, *git_refs_dir, *git_graft_file;
diff --git a/sha1_file.c b/sha1_file.c
index 8ad7fad825..6ec67b2923 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -2082,7 +2082,7 @@ int index_fd(unsigned char *sha1, int fd, struct stat *st, int write_object, con
 {
 	unsigned long size = st->st_size;
 	void *buf;
-	int ret;
+	int ret, re_allocated = 0;
 
 	buf = "";
 	if (size)
@@ -2091,11 +2091,30 @@ int index_fd(unsigned char *sha1, int fd, struct stat *st, int write_object, con
 
 	if (!type)
 		type = blob_type;
-	/* FIXME: CRLF -> LF conversion here for blobs! We'll need the path! */
+
+	/*
+	 * Convert blobs to git internal format
+	 */
+	if (!strcmp(type, blob_type)) {
+		unsigned long nsize = size;
+		char *nbuf = buf;
+		if (convert_to_git(NULL, &nbuf, &nsize)) {
+			if (size)
+				munmap(buf, size);
+			size = nsize;
+			buf = nbuf;
+			re_allocated = 1;
+		}
+	}
+
 	if (write_object)
 		ret = write_sha1_file(buf, size, type, sha1);
 	else
 		ret = hash_sha1_file(buf, size, type, sha1);
+	if (re_allocated) {
+		free(buf);
+		return ret;
+	}
 	if (size)
 		munmap(buf, size);
 	return ret;