Make IsVendor quicker
Although iterating across the regexps is quicker than naively concatenating them, it is still quite slow. This PR proposes a slightly cleverer solution. First, instead of concatenating the regexps with capturing groups, this PR uses non-capturing groups, which speeds up regexp processing. Secondly, we split the regexps into three groups: those that must match at the start of the path (`^`), those that must match at the start of the path or of a path segment (`(^|/)`), and the rest. This gives a considerable speed improvement. Thirdly, the regexps are sorted within those groups, which also speeds things up. All in all, for a non-vendored file this makes IsVendor around twice as fast. Signed-off-by: Andrew Thornton <art27@cantab.net>
This commit is contained in:
parent
d2d4c32d4d
commit
20726a1de3
115
utils.go
115
utils.go
@ -3,6 +3,8 @@ package enry
|
|||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"regexp"
|
||||||
|
"sort"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"github.com/go-enry/go-enry/v2/data"
|
"github.com/go-enry/go-enry/v2/data"
|
||||||
@ -61,9 +63,11 @@ func IsDotFile(path string) bool {
|
|||||||
return strings.HasPrefix(base, ".") && base != "."
|
return strings.HasPrefix(base, ".") && base != "."
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var isVendorRegExp *regexp.Regexp
|
||||||
|
|
||||||
// IsVendor returns whether or not path is a vendor path.
|
// IsVendor returns whether or not path is a vendor path.
|
||||||
func IsVendor(path string) bool {
|
func IsVendor(path string) bool {
|
||||||
return matchRegexSlice(data.VendorMatchers, path)
|
return isVendorRegExp.MatchString(path)
|
||||||
}
|
}
|
||||||
|
|
||||||
// IsTest returns whether or not path is a test path.
|
// IsTest returns whether or not path is a test path.
|
||||||
@ -131,3 +135,112 @@ func IsGenerated(path string, content []byte) bool {
|
|||||||
|
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
// We now collate the individual regexps that make up the VendorMatchers to
|
||||||
|
// produce a single large regexp which is around twice as fast to test than
|
||||||
|
// simply iterating through all the regexps or naïvely collating the
|
||||||
|
// regexps.
|
||||||
|
//
|
||||||
|
// ---
|
||||||
|
//
|
||||||
|
// data.VendorMatchers here is a slice containing individual regexps that
|
||||||
|
// match a vendor file therefore if we want to test if a filename is a
|
||||||
|
// Vendor we need to test whether that filename matches one or more of
|
||||||
|
// those regexps.
|
||||||
|
//
|
||||||
|
// Now we could test each matcher in turn using a shortcircuiting test i.e.
|
||||||
|
//
|
||||||
|
// func IsVendor(filename string) bool {
|
||||||
|
// for _, matcher := range data.VendorMatchers {
|
||||||
|
// if matcher.Match(filename) {
|
||||||
|
// return true
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// return false
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// Or concatentate all these regexps using groups i.e.
|
||||||
|
//
|
||||||
|
// `(regexp1)|(regexp2)|(regexp3)|...`
|
||||||
|
//
|
||||||
|
// However both of these are relatively slow and they don't take advantage
|
||||||
|
// of the inherent structure within our regexps...
|
||||||
|
//
|
||||||
|
// If we look at our regexps there are essentially three types of regexp:
|
||||||
|
//
|
||||||
|
// 1. Those that start with `^`
|
||||||
|
// 2. Those that start with `(^|/)`
|
||||||
|
// 3. Others
|
||||||
|
//
|
||||||
|
// If we collate our regexps into these groups that will significantly
|
||||||
|
// reduce the likelihood of backtracking within the regexp trie matcher.
|
||||||
|
//
|
||||||
|
// A further improvement is to use non-capturing groups as otherwise the
|
||||||
|
// regexp parser, whilst matching, will have to allocate slices for
|
||||||
|
// matching positions. (A future improvement here could be in the use of
|
||||||
|
// enforcing non-capturing groups within the sub-regexps too.)
|
||||||
|
//
|
||||||
|
// Finally if we sort the segments we can help the matcher build a more
|
||||||
|
// efficient matcher and trie.
|
||||||
|
|
||||||
|
// alias the VendorMatchers to simplify things
|
||||||
|
matchers := data.VendorMatchers
|
||||||
|
|
||||||
|
// Create three temporary string slices for our three groups above - prefixes removed
|
||||||
|
caretStrings := make([]string, 0, 10)
|
||||||
|
caretSegmentStrings := make([]string, 0, 10)
|
||||||
|
matcherStrings := make([]string, 0, len(matchers))
|
||||||
|
|
||||||
|
// Walk the matchers and check their string representation for each group prefix, remove it and add to the respective group slices
|
||||||
|
for _, matcher := range matchers {
|
||||||
|
str := matcher.String()
|
||||||
|
if str[0] == '^' {
|
||||||
|
caretStrings = append(caretStrings, str[1:])
|
||||||
|
} else if str[0:5] == "(^|/)" {
|
||||||
|
caretSegmentStrings = append(caretSegmentStrings, str[5:])
|
||||||
|
} else {
|
||||||
|
matcherStrings = append(matcherStrings, str)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sort the strings within each group - a potential further improvement could be in simplifying within these groups
|
||||||
|
sort.Strings(caretSegmentStrings)
|
||||||
|
sort.Strings(caretStrings)
|
||||||
|
sort.Strings(matcherStrings)
|
||||||
|
|
||||||
|
// Now build the collated regexp
|
||||||
|
sb := &strings.Builder{}
|
||||||
|
|
||||||
|
// Start with group 1 - those that started with `^`
|
||||||
|
sb.WriteString("(?:^(?:")
|
||||||
|
sb.WriteString(caretStrings[0])
|
||||||
|
for _, matcher := range caretStrings[1:] {
|
||||||
|
sb.WriteString(")|(?:")
|
||||||
|
sb.WriteString(matcher)
|
||||||
|
}
|
||||||
|
sb.WriteString("))")
|
||||||
|
sb.WriteString("|")
|
||||||
|
|
||||||
|
// Now add group 2 - those that started with `(^|/)`
|
||||||
|
sb.WriteString("(?:(?:^|/)(?:")
|
||||||
|
sb.WriteString(caretSegmentStrings[0])
|
||||||
|
for _, matcher := range caretSegmentStrings[1:] {
|
||||||
|
sb.WriteString(")|(?:")
|
||||||
|
sb.WriteString(matcher)
|
||||||
|
}
|
||||||
|
sb.WriteString("))")
|
||||||
|
sb.WriteString("|")
|
||||||
|
|
||||||
|
// Finally add the rest
|
||||||
|
sb.WriteString("(?:")
|
||||||
|
sb.WriteString(matcherStrings[0])
|
||||||
|
for _, matcher := range matcherStrings[1:] {
|
||||||
|
sb.WriteString(")|(?:")
|
||||||
|
sb.WriteString(matcher)
|
||||||
|
}
|
||||||
|
sb.WriteString(")")
|
||||||
|
|
||||||
|
// Compile the whole thing as the isVendorRegExp
|
||||||
|
isVendorRegExp = regexp.MustCompile(sb.String())
|
||||||
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user