Skip to content
55 changes: 55 additions & 0 deletions licensedb/internal/investigation.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (

"gopkg.in/src-d/go-license-detector.v2/licensedb/filer"
"gopkg.in/src-d/go-license-detector.v2/licensedb/internal/processors"
"gopkg.in/src-d/enry.v1"
)

var (
Expand Down Expand Up @@ -157,3 +158,57 @@ func InvestigateReadmeText(text []byte, fs filer.Filer) map[string]float32 {
func IsLicenseDirectory(fileName string) bool {
	// The name is lower-cased first, so the match against licenseDirectoryRe
	// does not depend on the case of the input.
	lowered := strings.ToLower(fileName)
	matched := licenseDirectoryRe.MatchString(lowered)
	return matched
}

// ExtractSourceFiles searches for source code files and their returns header comments, when available.
// Enry is used to get possible valuable files.
func ExtractSourceFiles(files []string, fs filer.Filer) [][]byte {
candidates := [][]byte{}
langs := []string{}
for _, file := range files {
lang, safe := enry.GetLanguageByExtension(file)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Better to use GetLanguage which is more accurate.

if safe == true {
langs = append(langs, lang)
text, err := fs.ReadFile(file)
if err == nil {
if preprocessor, exists := filePreprocessors[paths.Ext(file)]; exists {
text = preprocessor(text)
}
candidates = append(candidates, text)
}
}
}
if len(candidates) > 0 {
candidates = ExtractHeaderComments(candidates, langs)
}
return candidates
}

// ExtractHeaderComments is meant to search source code files for header comments and to output
// the license text found in them.
//
// It is currently a placeholder: both arguments are ignored and the result is always an empty,
// non-nil slice.
func ExtractHeaderComments(candidates [][]byte, lang []string) [][]byte {
	// TODO: split code from comments, preferably keeping only the header comments.
	return [][]byte{}
}

// InvestigateHeaderComments runs InvestigateHeaderComment on every text and merges the results:
// for each name it keeps the largest value reported by any of the texts. Values which are not
// greater than zero are never recorded. The fs argument is not used.
func InvestigateHeaderComments(texts [][]byte, fs filer.Filer) map[string]float32 {
	// TODO: split license-comments from description-comments.
	best := make(map[string]float32)
	for i := range texts {
		for license, score := range InvestigateHeaderComment(texts[i]) {
			// A missing key reads as zero, so the first positive score always wins.
			if current := best[license]; score > current {
				best[license] = score
			}
		}
	}
	return best
}

// InvestigateHeaderComment passes a single header comment, converted to a string, to
// QueryLicenseText of the global license database and returns that result unchanged.
// The keys are presumably license names and the values similarity scores - verify against
// QueryLicenseText.
func InvestigateHeaderComment(text []byte) map[string]float32 {
	db := globalLicenseDatabase()
	return db.QueryLicenseText(string(text))
}
13 changes: 10 additions & 3 deletions licensedb/licensedb.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,17 @@ func Detect(fs filer.Filer) (map[string]float32, error) {
}
// Plan B: take the README, find the section about the license and apply NER
candidates = internal.ExtractReadmeFiles(fileNames, fs)
if len(candidates) == 0 {
return nil, ErrNoLicenseFound
if len(candidates) > 0 {
licenses = internal.InvestigateReadmeTexts(candidates, fs)
if len(licenses) > 0 {
return licenses, nil
}
}
// Plan C: look for licence texts in source code files with comments at header
candidates = internal.ExtractSourceFiles(fileNames, fs)
if len(candidates) > 0 {
licenses = internal.InvestigateHeaderComments(candidates, fs)
}
licenses = internal.InvestigateReadmeTexts(candidates, fs)
if len(licenses) == 0 {
return nil, ErrNoLicenseFound
}
Expand Down