Do not return empty lang.
It's better to return any potential candidate than nothing.

Signed-off-by: kuba-- <kuba@sourced.tech>
This commit is contained in:
parent
e067e45044
commit
5adfee5761
1
.gitignore
vendored
1
.gitignore
vendored
@ -6,4 +6,5 @@ Makefile.main
|
|||||||
.idea
|
.idea
|
||||||
.docsrv-resources
|
.docsrv-resources
|
||||||
build/
|
build/
|
||||||
|
vendor/
|
||||||
java/lib/
|
java/lib/
|
||||||
|
@ -26,9 +26,6 @@ type scoredLanguage struct {
|
|||||||
|
|
||||||
// Classify returns a sorted slice of possible languages sorted by decreasing language's probability
|
// Classify returns a sorted slice of possible languages sorted by decreasing language's probability
|
||||||
func (c *classifier) Classify(content []byte, candidates map[string]float64) []string {
|
func (c *classifier) Classify(content []byte, candidates map[string]float64) []string {
|
||||||
if len(content) == 0 {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
var languages map[string]float64
|
var languages map[string]float64
|
||||||
if len(candidates) == 0 {
|
if len(candidates) == 0 {
|
||||||
@ -44,15 +41,23 @@ func (c *classifier) Classify(content []byte, candidates map[string]float64) []s
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
tokens := tokenizer.Tokenize(content)
|
empty := len(content) == 0
|
||||||
scoredLangs := make([]*scoredLanguage, 0, len(languages))
|
scoredLangs := make([]*scoredLanguage, 0, len(languages))
|
||||||
for language := range languages {
|
|
||||||
scoredLang := &scoredLanguage{
|
|
||||||
language: language,
|
|
||||||
score: c.tokensLogProbability(tokens, language) + c.languagesLogProbabilities[language],
|
|
||||||
}
|
|
||||||
|
|
||||||
scoredLangs = append(scoredLangs, scoredLang)
|
var tokens []string
|
||||||
|
if !empty {
|
||||||
|
tokens = tokenizer.Tokenize(content)
|
||||||
|
}
|
||||||
|
|
||||||
|
for language := range languages {
|
||||||
|
score := c.languagesLogProbabilities[language]
|
||||||
|
if !empty {
|
||||||
|
score += c.tokensLogProbability(tokens, language)
|
||||||
|
}
|
||||||
|
scoredLangs = append(scoredLangs, &scoredLanguage{
|
||||||
|
language: language,
|
||||||
|
score: score,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
return sortLanguagesByScore(scoredLangs)
|
return sortLanguagesByScore(scoredLangs)
|
||||||
|
@ -41,11 +41,12 @@ func GetLanguage(filename string, content []byte) (language string) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func firstLanguage(languages []string) string {
|
func firstLanguage(languages []string) string {
|
||||||
if len(languages) == 0 {
|
for _, l := range languages {
|
||||||
return OtherLanguage
|
if l != "" {
|
||||||
|
return l
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
return OtherLanguage
|
||||||
return languages[0]
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetLanguageByModeline returns detected language. If there are more than one possibles languages
|
// GetLanguageByModeline returns detected language. If there are more than one possibles languages
|
||||||
|
@ -101,9 +101,10 @@ func (s *EnryTestSuite) TestGetLanguage() {
|
|||||||
expected string
|
expected string
|
||||||
safe bool
|
safe bool
|
||||||
}{
|
}{
|
||||||
|
{name: "TestGetLanguage_0", filename: "foo.h", content: []byte{}, expected: "C"},
|
||||||
{name: "TestGetLanguage_1", filename: "foo.py", content: []byte{}, expected: "Python"},
|
{name: "TestGetLanguage_1", filename: "foo.py", content: []byte{}, expected: "Python"},
|
||||||
{name: "TestGetLanguage_2", filename: "foo.m", content: []byte(":- module"), expected: "Mercury"},
|
{name: "TestGetLanguage_2", filename: "foo.m", content: []byte(":- module"), expected: "Mercury"},
|
||||||
{name: "TestGetLanguage_3", filename: "foo.m", content: nil, expected: OtherLanguage},
|
{name: "TestGetLanguage_3", filename: "foo.m", content: nil, expected: "MATLAB"},
|
||||||
{name: "TestGetLanguage_4", filename: "foo.mo", content: []byte{0xDE, 0x12, 0x04, 0x95, 0x00, 0x00, 0x00, 0x00}, expected: OtherLanguage},
|
{name: "TestGetLanguage_4", filename: "foo.mo", content: []byte{0xDE, 0x12, 0x04, 0x95, 0x00, 0x00, 0x00, 0x00}, expected: OtherLanguage},
|
||||||
{name: "TestGetLanguage_5", filename: "", content: nil, expected: OtherLanguage},
|
{name: "TestGetLanguage_5", filename: "", content: nil, expected: OtherLanguage},
|
||||||
}
|
}
|
||||||
@ -276,6 +277,7 @@ func (s *EnryTestSuite) TestGetLanguagesByExtension() {
|
|||||||
candidates []string
|
candidates []string
|
||||||
expected []string
|
expected []string
|
||||||
}{
|
}{
|
||||||
|
{name: "TestGetLanguagesByExtension_0", filename: "foo.h", expected: []string{"C", "C++", "Objective-C"}},
|
||||||
{name: "TestGetLanguagesByExtension_1", filename: "foo.foo", expected: nil},
|
{name: "TestGetLanguagesByExtension_1", filename: "foo.foo", expected: nil},
|
||||||
{name: "TestGetLanguagesByExtension_2", filename: "foo.go", expected: []string{"Go"}},
|
{name: "TestGetLanguagesByExtension_2", filename: "foo.go", expected: []string{"Go"}},
|
||||||
{name: "TestGetLanguagesByExtension_3", filename: "foo.go.php", expected: []string{"Hack", "PHP"}},
|
{name: "TestGetLanguagesByExtension_3", filename: "foo.go.php", expected: []string{"Hack", "PHP"}},
|
||||||
@ -301,7 +303,7 @@ func (s *EnryTestSuite) TestGetLanguagesByClassifier() {
|
|||||||
{name: "TestGetLanguagesByClassifier_4", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: []string{"python", "ruby", "c++"}, expected: "C++"},
|
{name: "TestGetLanguagesByClassifier_4", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: []string{"python", "ruby", "c++"}, expected: "C++"},
|
||||||
{name: "TestGetLanguagesByClassifier_5", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: []string{"ruby"}, expected: "Ruby"},
|
{name: "TestGetLanguagesByClassifier_5", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: []string{"ruby"}, expected: "Ruby"},
|
||||||
{name: "TestGetLanguagesByClassifier_6", filename: filepath.Join(s.samplesDir, "Python/django-models-base.py"), candidates: []string{"python", "ruby", "c", "c++"}, expected: "Python"},
|
{name: "TestGetLanguagesByClassifier_6", filename: filepath.Join(s.samplesDir, "Python/django-models-base.py"), candidates: []string{"python", "ruby", "c", "c++"}, expected: "Python"},
|
||||||
{name: "TestGetLanguagesByClassifier_7", filename: "", candidates: []string{"python"}, expected: OtherLanguage},
|
{name: "TestGetLanguagesByClassifier_7", filename: "", candidates: []string{"python"}, expected: "Python"},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, test := range test {
|
for _, test := range test {
|
||||||
@ -339,7 +341,7 @@ func (s *EnryTestSuite) TestGetLanguagesBySpecificClassifier() {
|
|||||||
{name: "TestGetLanguagesByClassifier_4", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: []string{"python", "ruby", "c++"}, classifier: DefaultClassifier, expected: "C++"},
|
{name: "TestGetLanguagesByClassifier_4", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: []string{"python", "ruby", "c++"}, classifier: DefaultClassifier, expected: "C++"},
|
||||||
{name: "TestGetLanguagesByClassifier_5", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: []string{"ruby"}, classifier: DefaultClassifier, expected: "Ruby"},
|
{name: "TestGetLanguagesByClassifier_5", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: []string{"ruby"}, classifier: DefaultClassifier, expected: "Ruby"},
|
||||||
{name: "TestGetLanguagesByClassifier_6", filename: filepath.Join(s.samplesDir, "Python/django-models-base.py"), candidates: []string{"python", "ruby", "c", "c++"}, classifier: DefaultClassifier, expected: "Python"},
|
{name: "TestGetLanguagesByClassifier_6", filename: filepath.Join(s.samplesDir, "Python/django-models-base.py"), candidates: []string{"python", "ruby", "c", "c++"}, classifier: DefaultClassifier, expected: "Python"},
|
||||||
{name: "TestGetLanguagesByClassifier_7", filename: os.DevNull, candidates: nil, classifier: DefaultClassifier, expected: OtherLanguage},
|
{name: "TestGetLanguagesByClassifier_7", filename: os.DevNull, candidates: nil, classifier: DefaultClassifier, expected: "XML"},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, test := range test {
|
for _, test := range test {
|
||||||
|
Loading…
Reference in New Issue
Block a user