Skip to content

Commit

Permalink
Merge pull request #205 from kuba--/fix-langs
Browse files Browse the repository at this point in the history
Do not return empty lang.
  • Loading branch information
bzz committed Mar 15, 2019
2 parents 56214e3 + 5adfee5 commit 6526da7
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 16 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@ Makefile.main
.idea
.docsrv-resources
build/
vendor/
java/lib/
23 changes: 14 additions & 9 deletions classifier.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,6 @@ type scoredLanguage struct {

// Classify returns a slice of possible languages sorted by decreasing probability.
func (c *classifier) Classify(content []byte, candidates map[string]float64) []string {
if len(content) == 0 {
return nil
}

var languages map[string]float64
if len(candidates) == 0 {
Expand All @@ -44,15 +41,23 @@ func (c *classifier) Classify(content []byte, candidates map[string]float64) []s
}
}

tokens := tokenizer.Tokenize(content)
empty := len(content) == 0
scoredLangs := make([]*scoredLanguage, 0, len(languages))

var tokens []string
if !empty {
tokens = tokenizer.Tokenize(content)
}

for language := range languages {
scoredLang := &scoredLanguage{
language: language,
score: c.tokensLogProbability(tokens, language) + c.languagesLogProbabilities[language],
score := c.languagesLogProbabilities[language]
if !empty {
score += c.tokensLogProbability(tokens, language)
}

scoredLangs = append(scoredLangs, scoredLang)
scoredLangs = append(scoredLangs, &scoredLanguage{
language: language,
score: score,
})
}

return sortLanguagesByScore(scoredLangs)
Expand Down
9 changes: 5 additions & 4 deletions common.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,12 @@ func GetLanguage(filename string, content []byte) (language string) {
}

func firstLanguage(languages []string) string {
if len(languages) == 0 {
return OtherLanguage
for _, l := range languages {
if l != "" {
return l
}
}

return languages[0]
return OtherLanguage
}

// GetLanguageByModeline returns the detected language. If there is more than one possible language
Expand Down
8 changes: 5 additions & 3 deletions common_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,10 @@ func (s *EnryTestSuite) TestGetLanguage() {
expected string
safe bool
}{
{name: "TestGetLanguage_0", filename: "foo.h", content: []byte{}, expected: "C"},
{name: "TestGetLanguage_1", filename: "foo.py", content: []byte{}, expected: "Python"},
{name: "TestGetLanguage_2", filename: "foo.m", content: []byte(":- module"), expected: "Mercury"},
{name: "TestGetLanguage_3", filename: "foo.m", content: nil, expected: OtherLanguage},
{name: "TestGetLanguage_3", filename: "foo.m", content: nil, expected: "MATLAB"},
{name: "TestGetLanguage_4", filename: "foo.mo", content: []byte{0xDE, 0x12, 0x04, 0x95, 0x00, 0x00, 0x00, 0x00}, expected: OtherLanguage},
{name: "TestGetLanguage_5", filename: "", content: nil, expected: OtherLanguage},
}
Expand Down Expand Up @@ -276,6 +277,7 @@ func (s *EnryTestSuite) TestGetLanguagesByExtension() {
candidates []string
expected []string
}{
{name: "TestGetLanguagesByExtension_0", filename: "foo.h", expected: []string{"C", "C++", "Objective-C"}},
{name: "TestGetLanguagesByExtension_1", filename: "foo.foo", expected: nil},
{name: "TestGetLanguagesByExtension_2", filename: "foo.go", expected: []string{"Go"}},
{name: "TestGetLanguagesByExtension_3", filename: "foo.go.php", expected: []string{"Hack", "PHP"}},
Expand All @@ -301,7 +303,7 @@ func (s *EnryTestSuite) TestGetLanguagesByClassifier() {
{name: "TestGetLanguagesByClassifier_4", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: []string{"python", "ruby", "c++"}, expected: "C++"},
{name: "TestGetLanguagesByClassifier_5", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: []string{"ruby"}, expected: "Ruby"},
{name: "TestGetLanguagesByClassifier_6", filename: filepath.Join(s.samplesDir, "Python/django-models-base.py"), candidates: []string{"python", "ruby", "c", "c++"}, expected: "Python"},
{name: "TestGetLanguagesByClassifier_7", filename: "", candidates: []string{"python"}, expected: OtherLanguage},
{name: "TestGetLanguagesByClassifier_7", filename: "", candidates: []string{"python"}, expected: "Python"},
}

for _, test := range test {
Expand Down Expand Up @@ -339,7 +341,7 @@ func (s *EnryTestSuite) TestGetLanguagesBySpecificClassifier() {
{name: "TestGetLanguagesByClassifier_4", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: []string{"python", "ruby", "c++"}, classifier: DefaultClassifier, expected: "C++"},
{name: "TestGetLanguagesByClassifier_5", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: []string{"ruby"}, classifier: DefaultClassifier, expected: "Ruby"},
{name: "TestGetLanguagesByClassifier_6", filename: filepath.Join(s.samplesDir, "Python/django-models-base.py"), candidates: []string{"python", "ruby", "c", "c++"}, classifier: DefaultClassifier, expected: "Python"},
{name: "TestGetLanguagesByClassifier_7", filename: os.DevNull, candidates: nil, classifier: DefaultClassifier, expected: OtherLanguage},
{name: "TestGetLanguagesByClassifier_7", filename: os.DevNull, candidates: nil, classifier: DefaultClassifier, expected: "XML"},
}

for _, test := range test {
Expand Down

0 comments on commit 6526da7

Please sign in to comment.