go-enry/common.go

449 lines
14 KiB
Go
Raw Normal View History

2017-06-13 13:56:07 +02:00
package enry
2016-07-13 19:05:09 +02:00
2017-05-29 10:05:16 +02:00
import (
2017-06-12 13:42:20 +02:00
"bufio"
"bytes"
2017-05-29 10:05:16 +02:00
"path/filepath"
2017-06-12 13:42:20 +02:00
"regexp"
2017-05-29 10:05:16 +02:00
"strings"
"gopkg.in/src-d/enry.v1/data"
2016-07-18 16:20:12 +02:00
)
2016-07-13 19:05:09 +02:00
2017-05-29 10:05:16 +02:00
// OtherLanguage is used as a zero value when a function cannot return a specific language.
const OtherLanguage = ""
2016-07-13 19:05:09 +02:00
2017-05-31 12:07:46 +02:00
// Strategy fixes the signature of the functions that can be used as a detection strategy.
// A strategy receives the filename, the file content and the candidates gathered so far,
// and returns the languages it considers possible.
type Strategy func(filename string, content []byte, candidates []string) (languages []string)
2017-05-31 12:07:46 +02:00
2017-06-12 13:42:20 +02:00
// DefaultStrategies is the strategies' sequence GetLanguage uses to detect languages.
var DefaultStrategies = []Strategy{
2017-05-31 12:07:46 +02:00
GetLanguagesByModeline,
GetLanguagesByFilename,
GetLanguagesByShebang,
GetLanguagesByExtension,
GetLanguagesByContent,
2017-06-12 13:42:20 +02:00
GetLanguagesByClassifier,
2017-05-31 12:07:46 +02:00
}
// DefaultClassifier is the Classifier used by GetLanguagesByClassifier. It is built from the
// log-probability tables and token totals shipped in the data package.
var DefaultClassifier Classifier = &classifier{
languagesLogProbabilities: data.LanguagesLogProbabilities,
tokensLogProbabilities: data.TokensLogProbabilities,
tokensTotal: data.TokensTotal,
}
2017-05-29 10:05:16 +02:00
// GetLanguage applies a sequence of strategies based on the given filename and content
// to find out the most probably language to return.
func GetLanguage(filename string, content []byte) (language string) {
languages := GetLanguages(filename, content)
2017-06-12 13:42:20 +02:00
return firstLanguage(languages)
}
func firstLanguage(languages []string) string {
if len(languages) == 0 {
2017-05-31 12:07:46 +02:00
return OtherLanguage
}
2017-06-12 13:42:20 +02:00
return languages[0]
}
// GetLanguageByModeline returns the language detected from an editor modeline in content.
// When more than one language is possible it returns the first candidate and safe is false;
// safe is true only when exactly one language was found.
func GetLanguageByModeline(content []byte) (language string, safe bool) {
	language, safe = getLanguageByStrategy(GetLanguagesByModeline, "", content, nil)
	return language, safe
}
// GetLanguageByEmacsModeline returns detected language. If there are more than one possibles languages
// it returns the first language by alphabetically order and safe to false.
func GetLanguageByEmacsModeline(content []byte) (language string, safe bool) {
return getLanguageByStrategy(GetLanguagesByEmacsModeline, "", content, nil)
2016-07-18 16:20:12 +02:00
}
2017-06-12 13:42:20 +02:00
// GetLanguageByVimModeline returns detected language. If there are more than one possibles languages
// it returns the first language by alphabetically order and safe to false.
func GetLanguageByVimModeline(content []byte) (language string, safe bool) {
return getLanguageByStrategy(GetLanguagesByVimModeline, "", content, nil)
2017-05-29 10:05:16 +02:00
}
2017-06-12 13:42:20 +02:00
// GetLanguageByFilename returns the language detected from the file's base name.
// When more than one language is possible it returns the first candidate and safe is false;
// safe is true only when exactly one language was found.
func GetLanguageByFilename(filename string) (language string, safe bool) {
	language, safe = getLanguageByStrategy(GetLanguagesByFilename, filename, nil, nil)
	return language, safe
}
// GetLanguageByShebang returns the language detected from the shebang line of content.
// When more than one language is possible it returns the first candidate and safe is false;
// safe is true only when exactly one language was found.
func GetLanguageByShebang(content []byte) (language string, safe bool) {
	language, safe = getLanguageByStrategy(GetLanguagesByShebang, "", content, nil)
	return language, safe
}
// GetLanguageByExtension returns the language detected from the extension of filename.
// When more than one language is possible it returns the first candidate and safe is false;
// safe is true only when exactly one language was found.
func GetLanguageByExtension(filename string) (language string, safe bool) {
	language, safe = getLanguageByStrategy(GetLanguagesByExtension, filename, nil, nil)
	return language, safe
}
// GetLanguageByContent returns the language detected by the content strategy. The underlying
// strategy is invoked with an empty filename.
// When more than one language is possible it returns the first candidate and safe is false;
// safe is true only when exactly one language was found.
func GetLanguageByContent(content []byte) (language string, safe bool) {
	language, safe = getLanguageByStrategy(GetLanguagesByContent, "", content, nil)
	return language, safe
}
// GetLanguageByClassifier returns the most probable language for the given content among
// candidates, using DefaultClassifier. If no candidates are provided it returns OtherLanguage.
// safe is true only when the classifier reported exactly one language.
func GetLanguageByClassifier(content []byte, candidates []string) (language string, safe bool) {
	language, safe = getLanguageByStrategy(GetLanguagesByClassifier, "", content, candidates)
	return language, safe
}
// getLanguageByStrategy runs a single strategy and reduces its result to the first language
// plus a flag telling whether that result was unambiguous.
func getLanguageByStrategy(strategy Strategy, filename string, content []byte, candidates []string) (string, bool) {
	return getFirstLanguageAndSafe(strategy(filename, content, candidates))
}
// getFirstLanguageAndSafe returns the first entry of languages (OtherLanguage if empty);
// safe reports whether languages holds exactly one entry.
func getFirstLanguageAndSafe(languages []string) (language string, safe bool) {
	return firstLanguage(languages), len(languages) == 1
}
2017-06-12 13:42:20 +02:00
// GetLanguageBySpecificClassifier returns the most probable language for the given content,
// using the supplied classifier instead of DefaultClassifier. safe is true only when the
// classifier reported exactly one language.
func GetLanguageBySpecificClassifier(content []byte, candidates []string, classifier Classifier) (language string, safe bool) {
	return getFirstLanguageAndSafe(GetLanguagesBySpecificClassifier(content, candidates, classifier))
}
// GetLanguages applies DefaultStrategies in order, based on the given filename and content,
// and returns the most probable languages. Binary content yields nil. As soon as a strategy
// reports exactly one language that result is returned; otherwise every non-empty result is
// added to the candidates handed to the following strategies, and the result of the last
// strategy is returned.
func GetLanguages(filename string, content []byte) []string {
	if IsBinary(content) {
		return nil
	}

	var detected []string
	pool := []string{}
	for _, detect := range DefaultStrategies {
		detected = detect(filename, content, pool)
		switch n := len(detected); {
		case n == 1:
			// unambiguous answer, stop here
			return detected
		case n > 1:
			pool = append(pool, detected...)
		}
	}

	return detected
}
2017-06-12 13:42:20 +02:00
// GetLanguagesByModeline returns a slice of possible languages for the given content, filename will be ignored.
// It complies with the signature to be a Strategy type.
2017-06-12 13:42:20 +02:00
func GetLanguagesByModeline(filename string, content []byte, candidates []string) []string {
headFoot := getHeaderAndFooter(content)
var languages []string
for _, getLang := range modelinesFunc {
languages = getLang("", headFoot, candidates)
if len(languages) > 0 {
break
}
}
return languages
2017-05-29 10:05:16 +02:00
}
2017-06-12 13:42:20 +02:00
// modelinesFunc lists the modeline strategies GetLanguagesByModeline tries, in order:
// the Emacs modeline first, then the Vim modeline.
var modelinesFunc = []Strategy{
GetLanguagesByEmacsModeline,
GetLanguagesByVimModeline,
}
// getHeaderAndFooter returns the first and the last searchScope lines of content joined
// together. When content has fewer than 2*searchScope newlines it is returned unchanged.
func getHeaderAndFooter(content []byte) []byte {
	const searchScope = 5
	if bytes.Count(content, []byte("\n")) < 2*searchScope {
		return content
	}

	// headScope yields the offset of the newline closing the header; keep that newline so the
	// last header line is not glued onto the first footer line (which would corrupt a
	// modeline sitting on either of them).
	headerEnd := headScope(content, searchScope) + 1
	footerStart := footScope(content, searchScope)

	headerAndFooter := make([]byte, 0, headerEnd+len(content)-footerStart)
	headerAndFooter = append(headerAndFooter, content[:headerEnd]...)
	headerAndFooter = append(headerAndFooter, content[footerStart:]...)
	return headerAndFooter
}

// headScope returns the offset in content of the scope-th newline.
// The caller must guarantee that content holds at least scope newlines.
func headScope(content []byte, scope int) (index int) {
	for i := 0; i < scope; i++ {
		eol := bytes.IndexByte(content, '\n')
		content = content[eol+1:]
		index += eol
	}

	// every iteration but the last also skipped one newline byte
	return index + scope - 1
}

// footScope returns the offset in content of the byte following the scope-th newline
// counted from the end. The caller must guarantee that content holds at least scope newlines.
func footScope(content []byte, scope int) (index int) {
	for i := 0; i < scope; i++ {
		index = bytes.LastIndexByte(content, '\n')
		content = content[:index]
	}

	return index + 1
}
// reEmacsModeline captures the text between the `-*-` markers of an Emacs modeline.
var reEmacsModeline = regexp.MustCompile(`.*-\*-\s*(.+?)\s*-\*-.*(?m:$)`)

// reEmacsLang captures the value of the `mode:` variable inside an Emacs modeline.
var reEmacsLang = regexp.MustCompile(`.*(?i:mode)\s*:\s*([^\s;]+)\s*;*.*`)

// reVimModeline captures everything that follows the `vi:`/`vim:`/`ex:` prefix of a modeline.
var reVimModeline = regexp.MustCompile(`(?:(?m:\s|^)vi(?:m[<=>]?\d+|m)?|[\t\x20]*ex)\s*[:]\s*(.*)(?m:$)`)

// reVimLang captures the value assigned to `filetype`, `ft` or `syntax` in a Vim modeline.
var reVimLang = regexp.MustCompile(`(?i:filetype|ft|syntax)\s*=(\w+)(?:\s|:|$)`)
// GetLanguagesByEmacsModeline returns a slice of possible languages for the given content, filename and candidates
// will be ignored. It complies with the signature to be a Strategy type.
2017-06-12 13:42:20 +02:00
func GetLanguagesByEmacsModeline(filename string, content []byte, candidates []string) []string {
matched := reEmacsModeline.FindAllSubmatch(content, -1)
if matched == nil {
return nil
}
// only take the last matched line, discard previous lines
lastLineMatched := matched[len(matched)-1][1]
matchedAlias := reEmacsLang.FindSubmatch(lastLineMatched)
var alias string
if matchedAlias != nil {
alias = string(matchedAlias[1])
} else {
alias = string(lastLineMatched)
}
language, ok := GetLanguageByAlias(alias)
if !ok {
return nil
}
return []string{language}
}
// GetLanguagesByVimModeline returns a slice of possible languages for the given content, filename and candidates
// will be ignored. It complies with the signature to be a Strategy type.
2017-06-12 13:42:20 +02:00
func GetLanguagesByVimModeline(filename string, content []byte, candidates []string) []string {
matched := reVimModeline.FindAllSubmatch(content, -1)
if matched == nil {
return nil
}
// only take the last matched line, discard previous lines
lastLineMatched := matched[len(matched)-1][1]
matchedAlias := reVimLang.FindAllSubmatch(lastLineMatched, -1)
if matchedAlias == nil {
return nil
}
alias := string(matchedAlias[0][1])
if len(matchedAlias) > 1 {
// cases:
// matchedAlias = [["syntax=ruby " "ruby"] ["ft=python " "python"] ["filetype=perl " "perl"]] returns OtherLanguage;
// matchedAlias = [["syntax=python " "python"] ["ft=python " "python"] ["filetype=python " "python"]] returns "Python";
for _, match := range matchedAlias {
otherAlias := string(match[1])
if otherAlias != alias {
return nil
}
}
}
language, ok := GetLanguageByAlias(alias)
if !ok {
return nil
}
return []string{language}
}
// GetLanguagesByFilename returns a slice of possible languages for the given filename, content and candidates
// will be ignored. It complies with the signature to be a Strategy type.
2017-06-12 13:42:20 +02:00
func GetLanguagesByFilename(filename string, content []byte, candidates []string) []string {
return data.LanguagesByFilename[filepath.Base(filename)]
2017-05-29 10:05:16 +02:00
}
2017-06-12 13:42:20 +02:00
// GetLanguagesByShebang returns a slice of possible languages for the given content, filename and candidates
// will be ignored. It complies with the signature to be a Strategy type.
2017-06-12 13:42:20 +02:00
func GetLanguagesByShebang(filename string, content []byte, candidates []string) (languages []string) {
interpreter := getInterpreter(content)
return data.LanguagesByInterpreter[interpreter]
2017-06-12 13:42:20 +02:00
}
var (
	// shebangExecHack matches the `exec <interpreter> ... $0 ... $@` trick used by sh scripts
	// that re-execute themselves with another interpreter.
	shebangExecHack = regexp.MustCompile(`exec (\w+).+\$0.+\$@`)
	// pythonVersion matches interpreters carrying a minor version, such as python2.7.
	pythonVersion = regexp.MustCompile(`python\d\.\d+`)
)

// getInterpreter returns the interpreter named on the shebang line of data, or "" when the
// first line is not a shebang or names no command.
func getInterpreter(data []byte) (interpreter string) {
	line := getFirstLine(data)
	if !hasShebang(line) {
		return ""
	}

	// skip shebang
	line = bytes.TrimSpace(line[2:])
	splitted := bytes.Fields(line)
	if len(splitted) == 0 {
		// a bare "#!" names no interpreter
		return ""
	}

	if bytes.Contains(splitted[0], []byte("env")) {
		// "#!/usr/bin/env <interpreter>": the interpreter is the first argument
		if len(splitted) > 1 {
			interpreter = string(splitted[1])
		}
	} else {
		// keep only the last path component of the command
		splittedPath := bytes.Split(splitted[0], []byte{'/'})
		interpreter = string(splittedPath[len(splittedPath)-1])
	}

	if interpreter == "sh" {
		interpreter = lookForMultilineExec(data)
	}

	if pythonVersion.MatchString(interpreter) {
		// drop the minor version: python2.7 -> python2
		interpreter = interpreter[:strings.Index(interpreter, `.`)]
	}

	return interpreter
}

// getFirstLine returns the first line of data without its line terminator ("\n" or "\r\n").
func getFirstLine(data []byte) []byte {
	if eol := bytes.IndexByte(data, '\n'); eol >= 0 {
		data = data[:eol]
	}

	return bytes.TrimSuffix(data, []byte("\r"))
}

// hasShebang reports whether line starts with "#!".
func hasShebang(line []byte) bool {
	const shebang = `#!`
	return bytes.HasPrefix(line, []byte(shebang))
}

// lookForMultilineExec scans the first lines of an sh script for the exec hack and returns
// the interpreter it names; it returns "sh" when the hack is not found.
func lookForMultilineExec(data []byte) string {
	const magicNumOfLines = 5

	buf := bufio.NewScanner(bytes.NewReader(data))
	for i := 0; i < magicNumOfLines && buf.Scan(); i++ {
		if match := shebangExecHack.FindSubmatch(buf.Bytes()); match != nil {
			return string(match[1])
		}
	}

	// A scanner error only means the hack could not be found; fall back to plain sh.
	return "sh"
}
// GetLanguagesByExtension returns a slice of possible languages for the given filename, content and candidates
// will be ignored. It complies with the signature to be a Strategy type.
2017-06-12 13:42:20 +02:00
func GetLanguagesByExtension(filename string, content []byte, candidates []string) []string {
2017-06-13 13:56:07 +02:00
if !strings.Contains(filename, ".") {
return nil
}
filename = strings.ToLower(filename)
dots := getDotIndexes(filename)
for _, dot := range dots {
ext := filename[dot:]
languages, ok := data.LanguagesByExtension[ext]
2017-06-13 13:56:07 +02:00
if ok {
return languages
}
}
return nil
}
// getDotIndexes returns the byte offsets of every '.' in filename, in increasing order.
// The result is empty (but non-nil) when filename holds no dot.
func getDotIndexes(filename string) []int {
	dots := make([]int, 0, 2)
	// '.' is ASCII, so scanning bytes yields the same offsets as ranging over runes.
	for i := 0; i < len(filename); i++ {
		if filename[i] == '.' {
			dots = append(dots, i)
		}
	}

	return dots
}
2017-06-12 13:42:20 +02:00
// GetLanguagesByContent returns a slice of possible languages for the given content, filename and candidates
// will be ignored. It complies with the signature to be a Strategy type.
2017-06-12 13:42:20 +02:00
func GetLanguagesByContent(filename string, content []byte, candidates []string) []string {
2017-05-29 10:05:16 +02:00
ext := strings.ToLower(filepath.Ext(filename))
2017-06-28 11:04:51 +02:00
fnMatcher, ok := data.ContentMatchers[ext]
2017-05-31 12:07:46 +02:00
if !ok {
return nil
}
return fnMatcher(content)
}
2017-06-12 13:42:20 +02:00
// GetLanguagesByClassifier uses DefaultClassifier as a Classifier and returns a sorted slice of possible languages ordered by
// decreasing language's probability. If there are not candidates it returns nil. It complies with the signature to be a Strategy type.
2017-06-12 13:42:20 +02:00
func GetLanguagesByClassifier(filename string, content []byte, candidates []string) (languages []string) {
if len(candidates) == 0 {
return nil
2017-05-31 12:07:46 +02:00
}
2017-06-12 13:42:20 +02:00
return GetLanguagesBySpecificClassifier(content, candidates, DefaultClassifier)
2017-05-31 12:07:46 +02:00
}
2017-06-12 13:42:20 +02:00
// GetLanguagesBySpecificClassifier returns a slice of possible languages. It takes in a Classifier to be used.
func GetLanguagesBySpecificClassifier(content []byte, candidates []string, classifier Classifier) (languages []string) {
mapCandidates := make(map[string]float64)
for _, candidate := range candidates {
mapCandidates[candidate]++
2017-05-31 12:07:46 +02:00
}
2017-06-12 13:42:20 +02:00
return classifier.Classify(content, mapCandidates)
2017-05-29 10:05:16 +02:00
}
// GetLanguageExtensions returns the different extensions being used by the language.
func GetLanguageExtensions(language string) []string {
return data.ExtensionsByLanguage[language]
2017-05-29 10:05:16 +02:00
}
// Type represents a language's type: either data, programming, markup, prose, or unknown.
type Type int
// Type's values. Unknown is the zero value and is what GetLanguageType reports for a
// language it does not know.
const (
// Unknown is used when the language's type is not known.
Unknown Type = iota
// Data marks data languages.
Data
// Programming marks programming languages.
Programming
// Markup marks markup languages.
Markup
// Prose marks prose languages.
Prose
)
2017-06-12 13:42:20 +02:00
// GetLanguageType returns the type of the given language.
2017-05-29 10:05:16 +02:00
func GetLanguageType(language string) (langType Type) {
intType, ok := data.LanguagesType[language]
langType = Type(intType)
2017-05-29 10:05:16 +02:00
if !ok {
langType = Unknown
}
return langType
}
// GetLanguageByAlias returns either the language related to the given alias and ok set to true
// or Otherlanguage and ok set to false if the alias is not recognized.
func GetLanguageByAlias(alias string) (lang string, ok bool) {
a := strings.Split(alias, `,`)[0]
a = strings.ToLower(a)
lang, ok = data.LanguagesByAlias[a]
2017-05-29 10:05:16 +02:00
if !ok {
lang = OtherLanguage
2016-07-13 19:05:09 +02:00
}
2016-07-18 16:20:12 +02:00
2016-07-13 19:05:09 +02:00
return
}