From 5adfee5761204c9c65d29dd9a418fc0519ad46ad Mon Sep 17 00:00:00 2001 From: kuba-- Date: Thu, 14 Mar 2019 13:26:00 +0100 Subject: [PATCH] Do not return empty lang. It's better to return any potential candidate than nothing. Signed-off-by: kuba-- --- .gitignore | 1 + classifier.go | 25 +++++++++++++++---------- common.go | 9 +++++---- common_test.go | 8 +++++--- 4 files changed, 26 insertions(+), 17 deletions(-) diff --git a/.gitignore b/.gitignore index c2cb0a9..b76c892 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,5 @@ Makefile.main .idea .docsrv-resources build/ +vendor/ java/lib/ diff --git a/classifier.go b/classifier.go index 8f72296..fed1d0e 100644 --- a/classifier.go +++ b/classifier.go @@ -26,9 +26,6 @@ type scoredLanguage struct { // Classify returns a sorted slice of possible languages sorted by decreasing language's probability func (c *classifier) Classify(content []byte, candidates map[string]float64) []string { - if len(content) == 0 { - return nil - } var languages map[string]float64 if len(candidates) == 0 { @@ -44,15 +41,23 @@ func (c *classifier) Classify(content []byte, candidates map[string]float64) []s } } - tokens := tokenizer.Tokenize(content) + empty := len(content) == 0 scoredLangs := make([]*scoredLanguage, 0, len(languages)) - for language := range languages { - scoredLang := &scoredLanguage{ - language: language, - score: c.tokensLogProbability(tokens, language) + c.languagesLogProbabilities[language], - } - scoredLangs = append(scoredLangs, scoredLang) + var tokens []string + if !empty { + tokens = tokenizer.Tokenize(content) + } + + for language := range languages { + score := c.languagesLogProbabilities[language] + if !empty { + score += c.tokensLogProbability(tokens, language) + } + scoredLangs = append(scoredLangs, &scoredLanguage{ + language: language, + score: score, + }) } return sortLanguagesByScore(scoredLangs) diff --git a/common.go b/common.go index 9bc968a..3486274 100644 --- a/common.go +++ b/common.go @@ -41,11 +41,12 @@ func GetLanguage(filename string, content []byte) (language string) { } func firstLanguage(languages []string) string { - if len(languages) == 0 { - return OtherLanguage + for _, l := range languages { + if l != "" { + return l + } } - - return languages[0] + return OtherLanguage } // GetLanguageByModeline returns detected language. If there are more than one possibles languages diff --git a/common_test.go b/common_test.go index edaed4c..4a5f8a0 100644 --- a/common_test.go +++ b/common_test.go @@ -101,9 +101,10 @@ func (s *EnryTestSuite) TestGetLanguage() { expected string safe bool }{ + {name: "TestGetLanguage_0", filename: "foo.h", content: []byte{}, expected: "C"}, {name: "TestGetLanguage_1", filename: "foo.py", content: []byte{}, expected: "Python"}, {name: "TestGetLanguage_2", filename: "foo.m", content: []byte(":- module"), expected: "Mercury"}, - {name: "TestGetLanguage_3", filename: "foo.m", content: nil, expected: OtherLanguage}, + {name: "TestGetLanguage_3", filename: "foo.m", content: nil, expected: "MATLAB"}, {name: "TestGetLanguage_4", filename: "foo.mo", content: []byte{0xDE, 0x12, 0x04, 0x95, 0x00, 0x00, 0x00, 0x00}, expected: OtherLanguage}, {name: "TestGetLanguage_5", filename: "", content: nil, expected: OtherLanguage}, } @@ -276,6 +277,7 @@ func (s *EnryTestSuite) TestGetLanguagesByExtension() { candidates []string expected []string }{ + {name: "TestGetLanguagesByExtension_0", filename: "foo.h", expected: []string{"C", "C++", "Objective-C"}}, {name: "TestGetLanguagesByExtension_1", filename: "foo.foo", expected: nil}, {name: "TestGetLanguagesByExtension_2", filename: "foo.go", expected: []string{"Go"}}, {name: "TestGetLanguagesByExtension_3", filename: "foo.go.php", expected: []string{"Hack", "PHP"}}, @@ -301,7 +303,7 @@ func (s *EnryTestSuite) TestGetLanguagesByClassifier() { {name: "TestGetLanguagesByClassifier_4", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: []string{"python", "ruby", "c++"}, expected: "C++"}, {name: "TestGetLanguagesByClassifier_5", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: []string{"ruby"}, expected: "Ruby"}, {name: "TestGetLanguagesByClassifier_6", filename: filepath.Join(s.samplesDir, "Python/django-models-base.py"), candidates: []string{"python", "ruby", "c", "c++"}, expected: "Python"}, - {name: "TestGetLanguagesByClassifier_7", filename: "", candidates: []string{"python"}, expected: OtherLanguage}, + {name: "TestGetLanguagesByClassifier_7", filename: "", candidates: []string{"python"}, expected: "Python"}, } for _, test := range test { @@ -339,7 +341,7 @@ func (s *EnryTestSuite) TestGetLanguagesBySpecificClassifier() { {name: "TestGetLanguagesByClassifier_4", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: []string{"python", "ruby", "c++"}, classifier: DefaultClassifier, expected: "C++"}, {name: "TestGetLanguagesByClassifier_5", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: []string{"ruby"}, classifier: DefaultClassifier, expected: "Ruby"}, {name: "TestGetLanguagesByClassifier_6", filename: filepath.Join(s.samplesDir, "Python/django-models-base.py"), candidates: []string{"python", "ruby", "c", "c++"}, classifier: DefaultClassifier, expected: "Python"}, - {name: "TestGetLanguagesByClassifier_7", filename: os.DevNull, candidates: nil, classifier: DefaultClassifier, expected: OtherLanguage}, + {name: "TestGetLanguagesByClassifier_7", filename: os.DevNull, candidates: nil, classifier: DefaultClassifier, expected: "XML"}, } for _, test := range test {