diff --git a/t/t4034-diff-words.sh b/t/t4034-diff-words.sh
index 15764ee9ac8..74586f3813c 100755
--- a/t/t4034-diff-words.sh
+++ b/t/t4034-diff-words.sh
@@ -69,6 +69,19 @@ test_language_driver () {
 		echo "* diff='"$lang"'" >.gitattributes &&
 		word_diff --color-words
 	'
+	# Repeat the comparison under an Icelandic UTF-8 locale.  The locale
+	# variables are exported in a subshell because one-shot "VAR=val cmd"
+	# assignments are not portable when cmd is a shell function
+	# (NOTE(review): word_diff is assumed to be one; confirm).
+	test_expect_success "diff driver '$lang' in Icelandic" '
+		(
+			LANG=is_IS.UTF-8 &&
+			LANGUAGE=is &&
+			LC_ALL="$is_IS_locale" &&
+			export LANG LANGUAGE LC_ALL &&
+			word_diff --color-words
+		)
+	'
 }
 
 test_expect_success setup '
diff --git a/userdiff.c b/userdiff.c
index 09203fbc354..eaec6ebb5e9 100644
--- a/userdiff.c
+++ b/userdiff.c
@@ -17,6 +17,7 @@ static int drivers_alloc;
 		.cflags = REG_EXTENDED, \
 	}, \
 	.word_regex = wrx "|[^[:space:]]|[\xc0-\xff][\x80-\xbf]+", \
+	.word_regex_multi_byte = wrx "|[^[:space:]]", \
 }
 #define IPATTERN(lang, rx, wrx) { \
 	.name = lang, \
@@ -26,6 +27,7 @@ static int drivers_alloc;
 		.cflags = REG_EXTENDED | REG_ICASE, \
 	}, \
 	.word_regex = wrx "|[^[:space:]]|[\xc0-\xff][\x80-\xbf]+", \
+	.word_regex_multi_byte = wrx "|[^[:space:]]", \
 }
 
 /*
@@ -294,7 +296,7 @@ PATTERNS("scheme",
 	 /* All other words should be delimited by spaces or parentheses */
 	 "|([^][)(}{[ \t])+"),
 PATTERNS("tex", "^(\\\\((sub)*section|chapter|part)\\*{0,1}\\{.*)$",
-	 "\\\\[a-zA-Z@]+|\\\\.|[a-zA-Z0-9\x80-\xff]+"),
+	 "\\\\[a-zA-Z@]+|\\\\.|([a-zA-Z0-9]|[^\x01-\x7f])+"),
 { "default", NULL, NULL, -1, { NULL, 0 } },
 };
 #undef PATTERNS
@@ -330,6 +332,31 @@ static int userdiff_find_by_namelen_cb(struct userdiff_driver *driver,
 	return 0;
 }
 
+/*
+ * Report whether regexec(3) treats a UTF-8 multi-byte sequence as a
+ * single character, by checking that "[^[:space:]]" matches both bytes
+ * of U+00A3 (encoded as "\xc2\xa3") at once.  The answer is computed on
+ * the first call and cached for later calls.
+ */
+static int regexec_supports_multi_byte_chars(void)
+{
+	static const char not_space[] = "[^[:space:]]";
+	static const char utf8_multi_byte_char[] = "\xc2\xa3";
+	regex_t re;
+	regmatch_t match;
+	static int result = -1;
+
+	if (result != -1)
+		return result;
+	if (regcomp(&re, not_space, REG_EXTENDED))
+		BUG("invalid regular expression: %s", not_space);
+	result = !regexec(&re, utf8_multi_byte_char, 1, &match, 0) &&
+		match.rm_so == 0 &&
+		(size_t)match.rm_eo == strlen(utf8_multi_byte_char);
+	regfree(&re);
+	return result;
+}
+
 static struct userdiff_driver *userdiff_find_by_namelen(const char *name, size_t len)
 {
 	struct find_by_namelen_data udcbdata = {
@@ -405,7 +432,19 @@ int userdiff_config(const char *k, const char *v)
 struct userdiff_driver *userdiff_find_by_name(const char *name)
 {
 	int len = strlen(name);
-	return userdiff_find_by_namelen(name, len);
+	struct userdiff_driver *driver = userdiff_find_by_namelen(name, len);
+
+	/*
+	 * The first time a driver that carries a multi-byte word regex is
+	 * looked up, switch to that regex if regexec(3) handles multi-byte
+	 * characters, then clear the field so the choice is made only once.
+	 */
+	if (driver && driver->word_regex_multi_byte) {
+		if (regexec_supports_multi_byte_chars())
+			driver->word_regex = driver->word_regex_multi_byte;
+		driver->word_regex_multi_byte = NULL;
+	}
+	return driver;
 }
 
 struct userdiff_driver *userdiff_find_by_path(struct index_state *istate,
diff --git a/userdiff.h b/userdiff.h
index 24419db6973..d726804c3e5 100644
--- a/userdiff.h
+++ b/userdiff.h
@@ -18,6 +18,11 @@ struct userdiff_driver {
 	int binary;
 	struct userdiff_funcname funcname;
 	const char *word_regex;
+	/*
+	 * Variant of word_regex that userdiff_find_by_name() installs in its
+	 * place when regexec(3) matches multi-byte characters as a unit.
+	 */
+	const char *word_regex_multi_byte;
 	const char *textconv;
 	struct notes_cache *textconv_cache;
 	int textconv_want_cache;