Browse Source

optimize keywords (#841)

* optimize keywords

* use defaults for concurrency again
Zachary Rice 3 năm trước cách đây
mục cha
commit
48b79facfa
4 tập tin đã thay đổi với 56 bổ sung và 17 xóa
  1. 8 2
      config/config.go
  2. 45 15
      detect/detect.go
  3. 1 0
      go.mod
  4. 2 0
      go.sum

+ 8 - 2
config/config.go

@@ -43,10 +43,14 @@ type Config struct {
 	Description string
 	Description string
 	Rules       []*Rule
 	Rules       []*Rule
 	Allowlist   Allowlist
 	Allowlist   Allowlist
+	Keywords    []string
 }
 }
 
 
 func (vc *ViperConfig) Translate() (Config, error) {
 func (vc *ViperConfig) Translate() (Config, error) {
-	var rules []*Rule
+	var (
+		rules    []*Rule
+		keywords []string
+	)
 	for _, r := range vc.Rules {
 	for _, r := range vc.Rules {
 		var allowlistRegexes []*regexp.Regexp
 		var allowlistRegexes []*regexp.Regexp
 		for _, a := range r.Allowlist.Regexes {
 		for _, a := range r.Allowlist.Regexes {
@@ -59,6 +63,8 @@ func (vc *ViperConfig) Translate() (Config, error) {
 
 
 		if r.Keywords == nil {
 		if r.Keywords == nil {
 			r.Keywords = []string{}
 			r.Keywords = []string{}
+		} else {
+			keywords = append(keywords, r.Keywords...)
 		}
 		}
 
 
 		if r.Tags == nil {
 		if r.Tags == nil {
@@ -96,7 +102,6 @@ func (vc *ViperConfig) Translate() (Config, error) {
 			return Config{}, fmt.Errorf("%s invalid regex secret group %d, max regex secret group %d", r.Description, r.SecretGroup, r.Regex.NumSubexp())
 			return Config{}, fmt.Errorf("%s invalid regex secret group %d, max regex secret group %d", r.Description, r.SecretGroup, r.Regex.NumSubexp())
 		}
 		}
 		rules = append(rules, r)
 		rules = append(rules, r)
-
 	}
 	}
 	var allowlistRegexes []*regexp.Regexp
 	var allowlistRegexes []*regexp.Regexp
 	for _, a := range vc.Allowlist.Regexes {
 	for _, a := range vc.Allowlist.Regexes {
@@ -114,5 +119,6 @@ func (vc *ViperConfig) Translate() (Config, error) {
 			Paths:   allowlistPaths,
 			Paths:   allowlistPaths,
 			Commits: vc.Allowlist.Commits,
 			Commits: vc.Allowlist.Commits,
 		},
 		},
+		Keywords: keywords,
 	}, nil
 	}, nil
 }
 }

+ 45 - 15
detect/detect.go

@@ -16,6 +16,7 @@ import (
 	"github.com/fatih/semgroup"
 	"github.com/fatih/semgroup"
 	"github.com/gitleaks/go-gitdiff/gitdiff"
 	"github.com/gitleaks/go-gitdiff/gitdiff"
 	"github.com/h2non/filetype"
 	"github.com/h2non/filetype"
+	ahocorasick "github.com/petar-dambovaliev/aho-corasick"
 	"github.com/rs/zerolog/log"
 	"github.com/rs/zerolog/log"
 	"github.com/spf13/viper"
 	"github.com/spf13/viper"
 )
 )
@@ -59,6 +60,10 @@ type Detector struct {
 	// of the detector's scan which can then be used to generate a
 	// of the detector's scan which can then be used to generate a
 	// report.
 	// report.
 	findings []report.Finding
 	findings []report.Finding
+
+	// prefilter is an ahocorasick struct used for doing efficient string
+	// matching given a set of words (keywords from the rules in the config)
+	prefilter ahocorasick.AhoCorasick
 }
 }
 
 
 // Fragment contains the data to be scanned
 // Fragment contains the data to be scanned
@@ -75,15 +80,27 @@ type Fragment struct {
 	// newlineIndices is a list of indices of newlines in the raw content.
 	// newlineIndices is a list of indices of newlines in the raw content.
 	// This is used to calculate the line location of a finding
 	// This is used to calculate the line location of a finding
 	newlineIndices [][]int
 	newlineIndices [][]int
+
+	// keywords is a map of all the keywords contained within the contents
+	// of this fragment
+	keywords map[string]bool
 }
 }
 
 
 // NewDetector creates a new detector with the given config
 // NewDetector creates a new detector with the given config
 func NewDetector(cfg config.Config) *Detector {
 func NewDetector(cfg config.Config) *Detector {
+	builder := ahocorasick.NewAhoCorasickBuilder(ahocorasick.Opts{
+		AsciiCaseInsensitive: true,
+		MatchOnlyWholeWords:  false,
+		MatchKind:            ahocorasick.LeftMostLongestMatch,
+		DFA:                  true,
+	})
+
 	return &Detector{
 	return &Detector{
 		commitMap:    make(map[string]bool),
 		commitMap:    make(map[string]bool),
 		findingMutex: &sync.Mutex{},
 		findingMutex: &sync.Mutex{},
 		findings:     make([]report.Finding, 0),
 		findings:     make([]report.Finding, 0),
 		Config:       cfg,
 		Config:       cfg,
+		prefilter:    builder.Build(cfg.Keywords),
 	}
 	}
 }
 }
 
 
@@ -154,18 +171,6 @@ func (d *Detector) detectRule(fragment Fragment, rule *config.Rule) []report.Fin
 		return findings
 		return findings
 	}
 	}
 
 
-	containsKeyword := false
-	for _, k := range rule.Keywords {
-		if strings.Contains(strings.ToLower(fragment.Raw),
-			strings.ToLower(k)) {
-			containsKeyword = true
-			break
-		}
-	}
-	if !containsKeyword && len(rule.Keywords) != 0 {
-		return findings
-	}
-
 	matchIndices := rule.Regex.FindAllStringIndex(fragment.Raw, -1)
 	matchIndices := rule.Regex.FindAllStringIndex(fragment.Raw, -1)
 	for _, matchIndex := range matchIndices {
 	for _, matchIndex := range matchIndices {
 		// extract secret from match
 		// extract secret from match
@@ -194,13 +199,13 @@ func (d *Detector) detectRule(fragment Fragment, rule *config.Rule) []report.Fin
 			gitleaksAllowSignature) {
 			gitleaksAllowSignature) {
 			continue
 			continue
 		}
 		}
-		
+
 		// check if the secret is in the allowlist
 		// check if the secret is in the allowlist
 		if rule.Allowlist.RegexAllowed(finding.Secret) ||
 		if rule.Allowlist.RegexAllowed(finding.Secret) ||
 			d.Config.Allowlist.RegexAllowed(finding.Secret) {
 			d.Config.Allowlist.RegexAllowed(finding.Secret) {
 			continue
 			continue
 		}
 		}
-		
+
 		// extract secret from secret group if set
 		// extract secret from secret group if set
 		if rule.SecretGroup != 0 {
 		if rule.SecretGroup != 0 {
 			groups := rule.Regex.FindStringSubmatch(secret)
 			groups := rule.Regex.FindStringSubmatch(secret)
@@ -374,6 +379,10 @@ func (d *Detector) DetectFiles(source string) ([]report.Finding, error) {
 // Detect scans the given fragment and returns a list of findings
 // Detect scans the given fragment and returns a list of findings
 func (d *Detector) Detect(fragment Fragment) []report.Finding {
 func (d *Detector) Detect(fragment Fragment) []report.Finding {
 	var findings []report.Finding
 	var findings []report.Finding
+
+	// initiate fragment keywords
+	fragment.keywords = make(map[string]bool)
+
 	// check if filepath is allowed
 	// check if filepath is allowed
 	if fragment.FilePath != "" && (d.Config.Allowlist.PathAllowed(fragment.FilePath) ||
 	if fragment.FilePath != "" && (d.Config.Allowlist.PathAllowed(fragment.FilePath) ||
 		fragment.FilePath == d.Config.Path) {
 		fragment.FilePath == d.Config.Path) {
@@ -383,8 +392,29 @@ func (d *Detector) Detect(fragment Fragment) []report.Finding {
 	// add newline indices for location calculation in detectRule
 	// add newline indices for location calculation in detectRule
 	fragment.newlineIndices = regexp.MustCompile("\n").FindAllStringIndex(fragment.Raw, -1)
 	fragment.newlineIndices = regexp.MustCompile("\n").FindAllStringIndex(fragment.Raw, -1)
 
 
+	// build keyword map for prefiltering rules
+	matches := d.prefilter.FindAll(strings.ToLower(fragment.Raw))
+	for _, m := range matches {
+		fragment.keywords[strings.ToLower(fragment.Raw[m.Start():m.End()])] = true
+	}
+
 	for _, rule := range d.Config.Rules {
 	for _, rule := range d.Config.Rules {
-		findings = append(findings, d.detectRule(fragment, rule)...)
+		if len(rule.Keywords) == 0 {
+			// if no keywords are associated with the rule, always scan the
+			// fragment using the rule
+			findings = append(findings, d.detectRule(fragment, rule)...)
+			continue
+		}
+		fragmentContainsKeyword := false
+		// check if keywords are in the fragment
+		for _, k := range rule.Keywords {
+			if _, ok := fragment.keywords[strings.ToLower(k)]; ok {
+				fragmentContainsKeyword = true
+			}
+		}
+		if fragmentContainsKeyword {
+			findings = append(findings, d.detectRule(fragment, rule)...)
+		}
 	}
 	}
 	return filter(findings, d.Redact)
 	return filter(findings, d.Redact)
 }
 }

+ 1 - 0
go.mod

@@ -20,6 +20,7 @@ require (
 	github.com/magiconair/properties v1.8.5 // indirect
 	github.com/magiconair/properties v1.8.5 // indirect
 	github.com/mitchellh/mapstructure v1.4.1 // indirect
 	github.com/mitchellh/mapstructure v1.4.1 // indirect
 	github.com/pelletier/go-toml v1.9.3 // indirect
 	github.com/pelletier/go-toml v1.9.3 // indirect
+	github.com/petar-dambovaliev/aho-corasick v0.0.0-20211021192214-5ab2d9280aa9 // indirect
 	github.com/pmezard/go-difflib v1.0.0 // indirect
 	github.com/pmezard/go-difflib v1.0.0 // indirect
 	github.com/spf13/afero v1.6.0 // indirect
 	github.com/spf13/afero v1.6.0 // indirect
 	github.com/spf13/cast v1.3.1 // indirect
 	github.com/spf13/cast v1.3.1 // indirect

+ 2 - 0
go.sum

@@ -203,6 +203,8 @@ github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3Rllmb
 github.com/pascaldekloe/goe v0.0.0-20180627143212-57f6aae5913c/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc=
 github.com/pascaldekloe/goe v0.0.0-20180627143212-57f6aae5913c/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc=
 github.com/pelletier/go-toml v1.9.3 h1:zeC5b1GviRUyKYd6OJPvBU/mcVDVoL1OhT17FCt5dSQ=
 github.com/pelletier/go-toml v1.9.3 h1:zeC5b1GviRUyKYd6OJPvBU/mcVDVoL1OhT17FCt5dSQ=
 github.com/pelletier/go-toml v1.9.3/go.mod h1:u1nR/EPcESfeI/szUZKdtJ0xRNbUoANCkoOuaOx1Y+c=
 github.com/pelletier/go-toml v1.9.3/go.mod h1:u1nR/EPcESfeI/szUZKdtJ0xRNbUoANCkoOuaOx1Y+c=
+github.com/petar-dambovaliev/aho-corasick v0.0.0-20211021192214-5ab2d9280aa9 h1:lL+y4Xv20pVlCGyLzNHRC0I0rIHhIL1lTvHizoS/dU8=
+github.com/petar-dambovaliev/aho-corasick v0.0.0-20211021192214-5ab2d9280aa9/go.mod h1:EHPiTAKtiFmrMldLUNswFwfZ2eJIYBHktdaUTZxYWRw=
 github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
 github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
 github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
 github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
 github.com/pkg/sftp v1.10.1/go.mod h1:lYOWFsE0bwd1+KfKJaKeuokY15vzFx25BLbzYYoAxZI=
 github.com/pkg/sftp v1.10.1/go.mod h1:lYOWFsE0bwd1+KfKJaKeuokY15vzFx25BLbzYYoAxZI=