Make IsVendor quicker
Although iterating across the regexps is quicker than naively concatenating them, it is still quite slow. This PR proposes a slightly cleverer solution. First, instead of just concatenating with capturing groups, this PR uses non-capturing groups, which speeds up the regexp processing. Secondly, we sort the regexps into three groups: those that have to match at the start of the path, those that have to match at the start of the path or of a path segment, and the rest. This makes a considerable speed improvement. Thirdly, the regexps are sorted within those groups, which also speeds things up. All in all, for a non-vendored file this makes IsVendor around twice as fast. Signed-off-by: Andrew Thornton <art27@cantab.net>
This commit is contained in:
parent
d2d4c32d4d
commit
20726a1de3
115
utils.go
115
utils.go
@ -3,6 +3,8 @@ package enry
|
||||
import (
|
||||
"bytes"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"github.com/go-enry/go-enry/v2/data"
|
||||
@ -61,9 +63,11 @@ func IsDotFile(path string) bool {
|
||||
return strings.HasPrefix(base, ".") && base != "."
|
||||
}
|
||||
|
||||
var isVendorRegExp *regexp.Regexp
|
||||
|
||||
// IsVendor returns whether or not path is a vendor path.
|
||||
func IsVendor(path string) bool {
|
||||
return matchRegexSlice(data.VendorMatchers, path)
|
||||
return isVendorRegExp.MatchString(path)
|
||||
}
|
||||
|
||||
// IsTest returns whether or not path is a test path.
|
||||
@ -131,3 +135,112 @@ func IsGenerated(path string, content []byte) bool {
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func init() {
|
||||
// We now collate the individual regexps that make up the VendorMatchers to
|
||||
// produce a single large regexp which is around twice as fast to test than
|
||||
// simply iterating through all the regexps or naïvely collating the
|
||||
// regexps.
|
||||
//
|
||||
// ---
|
||||
//
|
||||
// data.VendorMatchers here is a slice containing individual regexps that
|
||||
// match a vendor file therefore if we want to test if a filename is a
|
||||
// Vendor we need to test whether that filename matches one or more of
|
||||
// those regexps.
|
||||
//
|
||||
// Now we could test each matcher in turn using a shortcircuiting test i.e.
|
||||
//
|
||||
// func IsVendor(filename string) bool {
|
||||
// for _, matcher := range data.VendorMatchers {
|
||||
// if matcher.Match(filename) {
|
||||
// return true
|
||||
// }
|
||||
// }
|
||||
// return false
|
||||
// }
|
||||
//
|
||||
// Or concatentate all these regexps using groups i.e.
|
||||
//
|
||||
// `(regexp1)|(regexp2)|(regexp3)|...`
|
||||
//
|
||||
// However both of these are relatively slow and they don't take advantage
|
||||
// of the inherent structure within our regexps...
|
||||
//
|
||||
// If we look at our regexps there are essentially three types of regexp:
|
||||
//
|
||||
// 1. Those that start with `^`
|
||||
// 2. Those that start with `(^|/)`
|
||||
// 3. Others
|
||||
//
|
||||
// If we collate our regexps into these groups that will significantly
|
||||
// reduce the likelihood of backtracking within the regexp trie matcher.
|
||||
//
|
||||
// A further improvement is to use non-capturing groups as otherwise the
|
||||
// regexp parser, whilst matching, will have to allocate slices for
|
||||
// matching positions. (A future improvement here could be in the use of
|
||||
// enforcing non-capturing groups within the sub-regexps too.)
|
||||
//
|
||||
// Finally if we sort the segments we can help the matcher build a more
|
||||
// efficient matcher and trie.
|
||||
|
||||
// alias the VendorMatchers to simplify things
|
||||
matchers := data.VendorMatchers
|
||||
|
||||
// Create three temporary string slices for our three groups above - prefixes removed
|
||||
caretStrings := make([]string, 0, 10)
|
||||
caretSegmentStrings := make([]string, 0, 10)
|
||||
matcherStrings := make([]string, 0, len(matchers))
|
||||
|
||||
// Walk the matchers and check their string representation for each group prefix, remove it and add to the respective group slices
|
||||
for _, matcher := range matchers {
|
||||
str := matcher.String()
|
||||
if str[0] == '^' {
|
||||
caretStrings = append(caretStrings, str[1:])
|
||||
} else if str[0:5] == "(^|/)" {
|
||||
caretSegmentStrings = append(caretSegmentStrings, str[5:])
|
||||
} else {
|
||||
matcherStrings = append(matcherStrings, str)
|
||||
}
|
||||
}
|
||||
|
||||
// Sort the strings within each group - a potential further improvement could be in simplifying within these groups
|
||||
sort.Strings(caretSegmentStrings)
|
||||
sort.Strings(caretStrings)
|
||||
sort.Strings(matcherStrings)
|
||||
|
||||
// Now build the collated regexp
|
||||
sb := &strings.Builder{}
|
||||
|
||||
// Start with group 1 - those that started with `^`
|
||||
sb.WriteString("(?:^(?:")
|
||||
sb.WriteString(caretStrings[0])
|
||||
for _, matcher := range caretStrings[1:] {
|
||||
sb.WriteString(")|(?:")
|
||||
sb.WriteString(matcher)
|
||||
}
|
||||
sb.WriteString("))")
|
||||
sb.WriteString("|")
|
||||
|
||||
// Now add group 2 - those that started with `(^|/)`
|
||||
sb.WriteString("(?:(?:^|/)(?:")
|
||||
sb.WriteString(caretSegmentStrings[0])
|
||||
for _, matcher := range caretSegmentStrings[1:] {
|
||||
sb.WriteString(")|(?:")
|
||||
sb.WriteString(matcher)
|
||||
}
|
||||
sb.WriteString("))")
|
||||
sb.WriteString("|")
|
||||
|
||||
// Finally add the rest
|
||||
sb.WriteString("(?:")
|
||||
sb.WriteString(matcherStrings[0])
|
||||
for _, matcher := range matcherStrings[1:] {
|
||||
sb.WriteString(")|(?:")
|
||||
sb.WriteString(matcher)
|
||||
}
|
||||
sb.WriteString(")")
|
||||
|
||||
// Compile the whole thing as the isVendorRegExp
|
||||
isVendorRegExp = regexp.MustCompile(sb.String())
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user