2016-07-13 19:05:09 +02:00
package slinguist
2017-05-29 10:05:16 +02:00
import (
2017-05-31 12:07:46 +02:00
"math"
2017-05-29 10:05:16 +02:00
"path/filepath"
"strings"
2016-07-18 16:20:12 +02:00
)
2016-07-13 19:05:09 +02:00
2017-05-29 10:05:16 +02:00
// OtherLanguage is the fallback language name returned when a function cannot determine a specific language.
const OtherLanguage = "Other"
2016-07-13 19:05:09 +02:00
2017-05-31 12:07:46 +02:00
// Strategy fixes the signature of the functions that can be used as a detection strategy:
// each one receives a filename and the file content and returns a slice of languages.
type Strategy func ( filename string , content [ ] byte ) ( languages [ ] string )
// strategies lists the Strategy functions in the order GetLanguage applies them.
var strategies = [ ] Strategy {
GetLanguagesByModeline ,
GetLanguagesByFilename ,
GetLanguagesByShebang ,
GetLanguagesByExtension ,
GetLanguagesByContent ,
}
2017-05-29 10:05:16 +02:00
// GetLanguage applies a sequence of strategies based on the given filename and content
// to find out the most probably language to return.
2016-07-18 16:20:12 +02:00
func GetLanguage ( filename string , content [ ] byte ) string {
2017-05-31 12:07:46 +02:00
candidates := map [ string ] float64 { }
for _ , strategy := range strategies {
languages := strategy ( filename , content )
if len ( languages ) == 1 {
return languages [ 0 ]
}
if len ( languages ) > 0 {
for _ , language := range languages {
candidates [ language ] ++
}
}
2016-07-18 16:20:12 +02:00
}
2016-07-13 19:05:09 +02:00
2017-05-31 12:07:46 +02:00
if len ( candidates ) == 0 {
return OtherLanguage
2017-05-25 12:34:32 +02:00
}
2017-05-31 12:07:46 +02:00
lang := GetLanguageByClassifier ( content , candidates , nil )
2016-07-18 16:20:12 +02:00
return lang
}
2017-05-29 10:05:16 +02:00
// GetLanguageByModeline returns the language of the given content looking for the modeline,
// and safe to indicate the sureness of returned language.
func GetLanguageByModeline ( content [ ] byte ) ( lang string , safe bool ) {
2017-05-31 12:07:46 +02:00
return getLangAndSafe ( "" , content , GetLanguagesByModeline )
2017-05-29 10:05:16 +02:00
}
// GetLanguageByFilename returns a language based on the given filename, and safe to indicate
// the sureness of returned language.
func GetLanguageByFilename ( filename string ) ( lang string , safe bool ) {
2017-05-31 12:07:46 +02:00
return getLangAndSafe ( filename , nil , GetLanguagesByFilename )
2017-05-29 10:05:16 +02:00
}
2017-05-31 12:07:46 +02:00
// GetLanguagesByFilename returns a slice of possible languages for the given filename, content will be ignored.
// It accomplish the signature to be a Strategy type.
func GetLanguagesByFilename ( filename string , content [ ] byte ) [ ] string {
return languagesByFilename [ filename ]
2017-05-29 10:05:16 +02:00
}
// GetLanguageByShebang returns the language of the given content looking for the shebang line,
// and safe to indicate the sureness of returned language.
func GetLanguageByShebang ( content [ ] byte ) ( lang string , safe bool ) {
2017-05-31 12:07:46 +02:00
return getLangAndSafe ( "" , content , GetLanguagesByShebang )
2017-05-29 10:05:16 +02:00
}
// GetLanguageByExtension returns a language based on the given filename, and safe to indicate
// the sureness of returned language.
func GetLanguageByExtension ( filename string ) ( lang string , safe bool ) {
2017-05-31 12:07:46 +02:00
return getLangAndSafe ( filename , nil , GetLanguagesByExtension )
2017-05-29 10:05:16 +02:00
}
2017-05-31 12:07:46 +02:00
// GetLanguagesByExtension returns a slice of possible languages for the given filename, content will be ignored.
// It accomplish the signature to be a Strategy type.
func GetLanguagesByExtension ( filename string , content [ ] byte ) [ ] string {
2017-05-29 10:05:16 +02:00
ext := strings . ToLower ( filepath . Ext ( filename ) )
2017-05-31 12:07:46 +02:00
return languagesByExtension [ ext ]
2017-05-29 10:05:16 +02:00
}
// GetLanguageByContent returns a language based on the filename and heuristics applies to the content,
// and safe to indicate the sureness of returned language.
func GetLanguageByContent ( filename string , content [ ] byte ) ( lang string , safe bool ) {
2017-05-31 12:07:46 +02:00
return getLangAndSafe ( filename , content , GetLanguagesByContent )
2017-05-29 10:05:16 +02:00
}
2017-05-31 12:07:46 +02:00
// GetLanguagesByContent returns a slice of possible languages for the given content, filename will be ignored.
// It accomplish the signature to be a Strategy type.
func GetLanguagesByContent ( filename string , content [ ] byte ) [ ] string {
2017-05-29 10:05:16 +02:00
ext := strings . ToLower ( filepath . Ext ( filename ) )
2017-05-31 12:07:46 +02:00
fnMatcher , ok := contentMatchers [ ext ]
if ! ok {
return nil
}
return fnMatcher ( content )
}
func getLangAndSafe ( filename string , content [ ] byte , getLanguageByStrategy Strategy ) ( lang string , safe bool ) {
languages := getLanguageByStrategy ( filename , content )
if len ( languages ) == 0 {
2017-05-29 10:05:16 +02:00
lang = OtherLanguage
2017-05-31 12:07:46 +02:00
return
2017-05-29 10:05:16 +02:00
}
2017-05-31 12:07:46 +02:00
lang = languages [ 0 ]
safe = len ( languages ) == 1
2017-05-29 10:05:16 +02:00
return
}
// GetLanguageByClassifier takes in a content and a list of candidates, and apply the classifier's Classify method to
2017-05-31 12:07:46 +02:00
// get the most probably language. If classifier is null then DefaultClassfier will be used. If there aren't candidates
// OtherLanguage is returned.
func GetLanguageByClassifier ( content [ ] byte , candidates map [ string ] float64 , classifier Classifier ) string {
scores := GetLanguagesByClassifier ( content , candidates , classifier )
if len ( scores ) == 0 {
return OtherLanguage
}
return getLangugeHigherScore ( scores )
}
// getLangugeHigherScore returns the key of scores that holds the highest score.
//
// Go randomizes map iteration order, so ties are broken by choosing the
// lexicographically smallest language name; this keeps the result stable across
// runs. A non-empty map always yields one of its keys, even when every score is
// -Inf, NaN or -math.MaxFloat64. An empty or nil map yields "".
func getLangugeHigherScore(scores map[string]float64) string {
	var (
		language string
		higher   float64
		found    bool
	)

	for lang, score := range scores {
		// NaN never compares greater than or equal to anything, which would make the
		// outcome depend on iteration order; rank it as the lowest possible score.
		if math.IsNaN(score) {
			score = math.Inf(-1)
		}

		// The first entry is always taken so that no sentinel start value can hide a
		// legitimate (very low) score; later entries must win on score, then on name.
		if !found || score > higher || (score == higher && lang < language) {
			language, higher, found = lang, score, true
		}
	}

	return language
}
// GetLanguagesByClassifier returns a map of possible languages as keys and a score as value based on content and candidates. The values can be ordered
// with the highest value as the most probably language. If classifier is null then DefaultClassfier will be used.
func GetLanguagesByClassifier ( content [ ] byte , candidates map [ string ] float64 , classifier Classifier ) map [ string ] float64 {
if classifier == nil {
classifier = DefaultClassifier
}
return classifier . Classify ( content , candidates )
2017-05-29 10:05:16 +02:00
}
// GetLanguageExtensions returns the extensions registered for the given language, as stored
// in extensionsByLanguage.
func GetLanguageExtensions(language string) []string {
	extensions := extensionsByLanguage[language]
	return extensions
}
// Type represents a language's type: either data, programming, markup, prose, or unknown.
type Type int
// Values a Type can take. Unknown comes first, so it is the zero value of Type (iota starts at 0).
const (
Unknown Type = iota
Data
Programming
Markup
Prose
)
// GetLanguageType returns the type registered for the given language in languagesType,
// or Unknown when the language has no entry.
func GetLanguageType(language string) (langType Type) {
	if known, found := languagesType[language]; found {
		return known
	}

	return Unknown
}
// GetLanguageByAlias returns either the language related to the given alias and ok set to true
// or Otherlanguage and ok set to false if the alias is not recognized.
func GetLanguageByAlias ( alias string ) ( lang string , ok bool ) {
a := strings . Split ( alias , ` , ` ) [ 0 ]
a = strings . ToLower ( a )
lang , ok = languagesByAlias [ a ]
if ! ok {
lang = OtherLanguage
2016-07-13 19:05:09 +02:00
}
2016-07-18 16:20:12 +02:00
2016-07-13 19:05:09 +02:00
return
}