hai 6 meses · 5f737c311b
--- a/config/config.go
+++ b/config/config.go
@@ -45,9 +45,10 @@ type ViperConfig struct {
 
				 		// TODO: Remove this in 9.x.
			
 
				 		AllowList *viperRuleAllowlist
			
 
				 
			
 
				-		Allowlists []*viperRuleAllowlist
			
 
				-		Required   []*viperRequired
			
 
				-		SkipReport bool
			
 
				+		Allowlists  []*viperRuleAllowlist
			
 
				+		Required    []*viperRequired
			
 
				+		SkipReport  bool
			
 
				+		SmartFilter bool
			
 
				 	}
			
 
				 	// Deprecated: this is a shim for backwards-compatibility.
			
 
				 	// TODO: Remove this in 9.x.
			
@@ -141,6 +142,7 @@ func (vc *ViperConfig) Translate() (Config, error) {
 
				 			Keywords:    vr.Keywords,
			
 
				 			Tags:        vr.Tags,
			
 
				 			SkipReport:  vr.SkipReport,
			
 
				+			SmartFilter: vr.SmartFilter,
			
 
				 		}
			
 
				 
			
 
				 		// Parse the rule allowlists, including the older format for backwards compatibility.
			
--- a/config/gitleaks.toml
+++ b/config/gitleaks.toml
@@ -626,6 +626,7 @@ keywords = [
 
				     "secret",
			
 
				     "token",
			
 
				 ]
			
 
				+smartFilter=true
			
 
				 [[rules.allowlists]]
			
 
				 regexes = [
			
 
				     '''^[a-zA-Z_.-]+$''',
			
--- a/config/rule.go
+++ b/config/rule.go
@@ -52,6 +52,8 @@ type Rule struct {
 
				 	RequiredRules []*Required
			
 
				 
			
 
				 	SkipReport bool
			
 
				+
			
 
				+	SmartFilter bool
			
 
				 }
			
 
				 
			
 
				 type Required struct {
			
--- a/detect/detect.go
+++ b/detect/detect.go
@@ -18,7 +18,12 @@ import (
 
				 	"github.com/zricethezav/gitleaks/v8/sources"
			
 
				 
			
 
				 	ahocorasick "github.com/BobuSumisu/aho-corasick"
			
 
				+	tiktoken_loader "github.com/pkoukk/tiktoken-go-loader"
			
 
				+	"github.com/zricethezav/icanhazwordz"
			
 
				+
			
 
				+	"github.com/agnivade/levenshtein"
			
 
				 	"github.com/fatih/semgroup"
			
 
				+	"github.com/pkoukk/tiktoken-go"
			
 
				 	"github.com/rs/zerolog"
			
 
				 	"github.com/spf13/viper"
			
 
				 	"golang.org/x/exp/maps"
			
@@ -104,6 +109,10 @@ type Detector struct {
 
				 	Reporter   report.Reporter
			
 
				 
			
 
				 	TotalBytes atomic.Uint64
			
 
				+
			
 
				+	tokenizer *tiktoken.Tiktoken
			
 
				+
			
 
				+	nltkSearcher *icanhazwordz.Searcher
			
 
				 }
			
 
				 
			
 
				 // Fragment is an alias for sources.Fragment for backwards compatibility
			
@@ -113,6 +122,13 @@ type Fragment sources.Fragment
 
				 
			
 
				 // NewDetector creates a new detector with the given config
			
 
				 func NewDetector(cfg config.Config) *Detector {
			
 
				+	// grab offline tiktoken encoder
			
 
				+	tiktoken.SetBpeLoader(tiktoken_loader.NewOfflineLoader())
			
 
				+	tke, err := tiktoken.GetEncoding("cl100k_base")
			
 
				+	if err != nil {
			
 
				+		logging.Warn().Err(err).Msgf("Could not pull down cl100k_base tiktokenizer")
			
 
				+	}
			
 
				+
			
 
				 	return &Detector{
			
 
				 		commitMap:      make(map[string]bool),
			
 
				 		gitleaksIgnore: make(map[string]struct{}),
			
@@ -122,6 +138,10 @@ func NewDetector(cfg config.Config) *Detector {
 
				 		Config:         cfg,
			
 
				 		prefilter:      *ahocorasick.NewTrieBuilder().AddStrings(maps.Keys(cfg.Keywords)).Build(),
			
 
				 		Sema:           semgroup.NewGroup(context.Background(), 40),
			
 
				+
			
 
				+		// tokenizer and nltkSearcher are used for a generic filter
			
 
				+		tokenizer:    tke,
			
 
				+		nltkSearcher: icanhazwordz.NewSearcher(icanhazwordz.Filter{MinLength: 4, PreferLongestNonOverlapping: true}),
			
 
				 	}
			
 
				 }
			
 
				 
			
@@ -357,7 +377,7 @@ func (d *Detector) detectRule(fragment Fragment, currentRaw string, r config.Rul
 
				 		}()
			
 
				 	)
			
 
				 
			
 
				-	if r.SkipReport == true && !fragment.InheritedFromFinding {
			
 
				+	if r.SkipReport && !fragment.InheritedFromFinding {
			
 
				 		return findings
			
 
				 	}
			
 
				 
			
@@ -530,6 +550,16 @@ func (d *Detector) detectRule(fragment Fragment, currentRaw string, r config.Rul
 
				 			}
			
 
				 		}
			
 
				 
			
 
				+		// apply the smart filter when the rule has SmartFilter enabled
			
 
				+		if r.SmartFilter {
			
 
				+			if !d.passesSmartFilter(finding.Secret) {
			
 
				+				// logger.Info().
			
 
				+				// 	Str("finding", finding.Secret).
			
 
				+				// 	Msg("skipping finding: fails smart filter")
			
 
				+				continue
			
 
				+			}
			
 
				+		}
			
 
				+
			
 
				 		// check if the result matches any of the global allowlists.
			
 
				 		if isAllowed, event := checkFindingAllowed(logger, finding, fragment, currentLine, d.Config.Allowlists); isAllowed {
			
 
				 			event.Msg("skipping finding: global allowlist")
			
@@ -553,6 +583,95 @@ func (d *Detector) detectRule(fragment Fragment, currentRaw string, r config.Rul
 
				 	return d.processRequiredRules(fragment, currentRaw, r, encodedSegments, findings, logger)
			
 
				 }
			
 
				 
			
 
				+// passesSmartFilter applies heuristics to determine if a string is likely a real looking secret
			
 
				+// rather than random text or common words. It uses token density, character distribution,
			
 
				+// and word analysis to filter out false positives. Returns true if the string passes
			
 
				+// the filter (likely a secret), false if it should be skipped.
			
 
				+func (d *Detector) passesSmartFilter(secret string) bool {
			
 
				+	tokens := d.tokenizer.Encode(secret, nil, nil)
			
 
				+	tokenLen := len(tokens)
			
 
				+	// token vals < 100
			
 
				+	numShortTokens := 0
			
 
				+	for _, t := range tokens {
			
 
				+		if t < 100 {
			
 
				+			numShortTokens++
			
 
				+		}
			
 
				+	}
			
 
				+	// token vals > 100
			
 
				+	// longTokens := tokenLen - numShortTokens
			
 
				+	density := len(secret) / tokenLen
			
 
				+	shortTokenRatio := float32(numShortTokens / tokenLen)
			
 
				+
			
 
				+	result := d.nltkSearcher.Find(secret)
			
 
				+	fourPlusCharWords := len(result.Matches)
			
 
				+
			
 
				+	// check if the secret has a close levenshtein distance to any of the results
			
 
				+	// if it does, consider this c4. normalize cases
			
 
				+	c4 := false
			
 
				+	secretLower := strings.ToLower(secret)
			
 
				+	for _, match := range result.Matches {
			
 
				+		// Only check against words with 5+ characters
			
 
				+		if len(match.Word) <= 5 {
			
 
				+			continue
			
 
				+		}
			
 
				+		wordLower := strings.ToLower(match.Word)
			
 
				+		distance := levenshtein.ComputeDistance(secretLower, wordLower)
			
 
				+		// Consider it close if distance is <= 2 or <= 20% of the longer string length
			
 
				+		maxLen := max(len(secretLower), len(wordLower))
			
 
				+		threshold := max(2, maxLen/5)
			
 
				+		if distance <= threshold {
			
 
				+			c4 = true
			
 
				+			break
			
 
				+		}
			
 
				+
			
 
				+		// OR check if the match is 6 characters or more _and_ is a substring of the
			
 
				+		// potential secret
			
 
				+		if len(match.Word) <= 6 {
			
 
				+			continue
			
 
				+		}
			
 
				+
			
 
				+		if strings.Contains(strings.ToLower(secret), strings.ToLower(match.Word)) {
			
 
				+			c4 = true
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	c1 := density <= 2.0
			
 
				+	c2 := shortTokenRatio >= 0.25
			
 
				+	c3 := len(secret) >= 9 && float32(density) <= 2.5
			
 
				+
			
 
				+	likelySecret := c1 || c2 || c3
			
 
				+
			
 
				+	// Check for irregularly cased English words
			
 
				+	// majorityIrregularlyCased := false
			
 
				+	hasIrregularCasing := false
			
 
				+	for _, word := range result.UniqueWords {
			
 
				+		if isIrregularlyCased(word, secret) {
			
 
				+			hasIrregularCasing = true
			
 
				+			break
			
 
				+		}
			
 
				+	}
			
 
				+	// if len(result.UniqueWords) > 0 && irregularlyCased > len(result.UniqueWords)/2 {
			
 
				+	// 	majorityIrregularlyCased = true
			
 
				+	// }
			
 
				+	// fmt.Println(hasIrregularCasing, fourPlusCharWords, secret)
			
 
				+	// fmt.Println(c1, c2, c3, fourPlusCharWords)
			
 
				+
			
 
				+	// Make exception if words have irregular casing
			
 
				+	if hasIrregularCasing && fourPlusCharWords > 1 {
			
 
				+		pass := likelySecret
			
 
				+		if !pass {
			
 
				+			// fmt.Println("skipping: ", secret, "--", c1, c2, c3, fourPlusCharWords, result.UniqueWords, "(irregular casing exception)")
			
 
				+		}
			
 
				+		return pass
			
 
				+	}
			
 
				+
			
 
				+	pass := likelySecret && !(fourPlusCharWords > 1) && !c4
			
 
				+	if !pass {
			
 
				+		// fmt.Println("skipping: ", secret, "--", c1, c2, c3, fourPlusCharWords, result.UniqueWords)
			
 
				+	}
			
 
				+	return pass
			
 
				+}
			
 
				+
			
 
				 // processRequiredRules handles the logic for multi-part rules with auxiliary findings
			
 
				 func (d *Detector) processRequiredRules(fragment Fragment, currentRaw string, r config.Rule, encodedSegments []*codec.EncodedSegment, primaryFindings []report.Finding, logger zerolog.Logger) []report.Finding {
			
 
				 	if len(primaryFindings) == 0 {
			
--- a/detect/utils.go
+++ b/detect/utils.go
@@ -8,6 +8,7 @@ import (
 
				 	"strings"
			
 
				 
			
 
				 	"github.com/zricethezav/gitleaks/v8/cmd/scm"
			
 
				+	"github.com/zricethezav/gitleaks/v8/detect/codec"
			
 
				 	"github.com/zricethezav/gitleaks/v8/logging"
			
 
				 	"github.com/zricethezav/gitleaks/v8/report"
			
 
				 	"github.com/zricethezav/gitleaks/v8/sources"
			
@@ -136,14 +137,32 @@ func shannonEntropy(data string) (entropy float64) {
 
				 // filter will dedupe and redact findings
			
 
				 func filter(findings []report.Finding, redact uint) []report.Finding {
			
 
				 	var retFindings []report.Finding
			
 
				+
			
 
				+	decoder := codec.NewDecoder()
			
 
				+	encodedSegments := []*codec.EncodedSegment{}
			
 
				+
			
 
				 	for _, f := range findings {
			
 
				 		include := true
			
 
				 		if strings.Contains(strings.ToLower(f.RuleID), "generic") {
			
 
				 			for _, fPrime := range findings {
			
 
				+				// Also treat the generic finding as a duplicate when its decoded secret equals fPrime's secret (only decoded when fPrime carries a "decoded" tag).
			
 
				+				isDecoded := false
			
 
				+				decodedSecret := ""
			
 
				+				for _, t := range fPrime.Tags {
			
 
				+					if strings.Contains(t, "decoded") {
			
 
				+						isDecoded = true
			
 
				+					}
			
 
				+				}
			
 
				+
			
 
				+				if isDecoded {
			
 
				+					decodedSecret, _ = decoder.Decode(f.Secret, encodedSegments)
			
 
				+					decodedSecret = strings.TrimSuffix(decodedSecret, "\n")
			
 
				+				}
			
 
				+
			
 
				 				if f.StartLine == fPrime.StartLine &&
			
 
				 					f.Commit == fPrime.Commit &&
			
 
				 					f.RuleID != fPrime.RuleID &&
			
 
				-					strings.Contains(fPrime.Secret, f.Secret) &&
			
 
				+					(strings.Contains(fPrime.Secret, f.Secret) || decodedSecret == fPrime.Secret) &&
			
 
				 					!strings.Contains(strings.ToLower(fPrime.RuleID), "generic") {
			
 
				 
			
 
				 					genericMatch := strings.ReplaceAll(f.Match, f.Secret, "REDACTED")
			
@@ -261,3 +280,39 @@ func printFinding(f report.Finding, noColor bool) {
 
				 	f.PrintRequiredFindings()
			
 
				 	fmt.Println("")
			
 
				 }
			
 
				+
			
 
				+func isIrregularlyCased(word, secret string) bool {
			
 
				+	// Find the word in the secret (case-insensitive)
			
 
				+	secretLower := strings.ToLower(secret)
			
 
				+	wordLower := strings.ToLower(word)
			
 
				+
			
 
				+	index := strings.Index(secretLower, wordLower)
			
 
				+	if index == -1 {
			
 
				+		return false
			
 
				+	}
			
 
				+
			
 
				+	// Extract the actual casing from the secret
			
 
				+	actualWord := secret[index : index+len(word)]
			
 
				+
			
 
				+	// Check if it matches conventional casing patterns
			
 
				+	return !isConventionallyCased(actualWord)
			
 
				+}
			
 
				+
			
 
				// isConventionallyCased reports whether word uses one of the usual casing
				// patterns: entirely lowercase, entirely uppercase, or title case (as
				// produced by strings.Title on the lowercased word). The empty string is
				// treated as conventional.
				func isConventionallyCased(word string) bool {
					if word == "" {
						return true
					}

					lowered := strings.ToLower(word)
					switch word {
					case lowered, strings.ToUpper(word), strings.Title(lowered):
						return true
					}
					return false
				}
			
--- a/go.mod
+++ b/go.mod
@@ -1,6 +1,8 @@
 
				 module github.com/zricethezav/gitleaks/v8
			
 
				 
			
 
				-go 1.23.8
			
 
				+go 1.24
			
 
				+
			
 
				+toolchain go1.24.3
			
 
				 
			
 
				 require (
			
 
				 	github.com/BobuSumisu/aho-corasick v1.0.3
			
@@ -11,10 +13,13 @@ require (
 
				 	github.com/google/go-cmp v0.6.0
			
 
				 	github.com/h2non/filetype v1.1.3
			
 
				 	github.com/mholt/archives v0.1.2
			
 
				+	github.com/pkoukk/tiktoken-go v0.1.7
			
 
				+	github.com/pkoukk/tiktoken-go-loader v0.0.1
			
 
				 	github.com/rs/zerolog v1.33.0
			
 
				 	github.com/spf13/cobra v1.9.1
			
 
				 	github.com/spf13/viper v1.19.0
			
 
				 	github.com/stretchr/testify v1.10.0
			
 
				+	github.com/zricethezav/icanhazwordz v0.0.3
			
 
				 	golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa
			
 
				 )
			
 
				 
			
@@ -28,6 +33,7 @@ require (
 
				 	github.com/bodgit/plumbing v1.3.0 // indirect
			
 
				 	github.com/bodgit/sevenzip v1.6.0 // indirect
			
 
				 	github.com/bodgit/windows v1.0.1 // indirect
			
 
				+	github.com/dlclark/regexp2 v1.10.0 // indirect
			
 
				 	github.com/dsnet/compress v0.0.2-0.20230904184137-39efe44ab707 // indirect
			
 
				 	github.com/google/uuid v1.6.0 // indirect
			
 
				 	github.com/hashicorp/errwrap v1.1.0 // indirect
			
--- a/go.sum
+++ b/go.sum
@@ -52,6 +52,8 @@ github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSs
 
				 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
			
 
				 github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
			
 
				 github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
			
 
				+github.com/dlclark/regexp2 v1.10.0 h1:+/GIL799phkJqYW+3YbOd8LCcbHzT0Pbo8zl70MHsq0=
			
 
				+github.com/dlclark/regexp2 v1.10.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8=
			
 
				 github.com/dsnet/compress v0.0.2-0.20230904184137-39efe44ab707 h1:2tV76y6Q9BB+NEBasnqvs7e49aEBFI8ejC89PSnWH+4=
			
 
				 github.com/dsnet/compress v0.0.2-0.20230904184137-39efe44ab707/go.mod h1:qssHWj60/X5sZFNxpG4HBPDHVqxNm4DfnCKgrbZOT+s=
			
 
				 github.com/dsnet/golib v0.0.0-20171103203638-1ea166775780/go.mod h1:Lj+Z9rebOhdfkVLjJ8T6VcRQv3SXugXy999NBtR9aFY=
			
@@ -172,6 +174,10 @@ github.com/pelletier/go-toml/v2 v2.2.3/go.mod h1:MfCQTFTvCcUyyvvwm1+G6H/jORL20Xl
 
				 github.com/pierrec/lz4/v4 v4.1.21 h1:yOVMLb6qSIDP67pl/5F7RepeKYu/VmTyEXvuMI5d9mQ=
			
 
				 github.com/pierrec/lz4/v4 v4.1.21/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4=
			
 
				 github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
			
 
				+github.com/pkoukk/tiktoken-go v0.1.7 h1:qOBHXX4PHtvIvmOtyg1EeKlwFRiMKAcoMp4Q+bLQDmw=
			
 
				+github.com/pkoukk/tiktoken-go v0.1.7/go.mod h1:9NiV+i9mJKGj1rYOT+njbv+ZwA/zJxYdewGl6qVatpg=
			
 
				+github.com/pkoukk/tiktoken-go-loader v0.0.1 h1:aOB2gRFzZTCCPi3YsOQXJO771P/5876JAsdebMyazig=
			
 
				+github.com/pkoukk/tiktoken-go-loader v0.0.1/go.mod h1:4mIkYyZooFlnenDlormIo6cd5wrlUKNr97wp9nGgEKo=
			
 
				 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
			
 
				 github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
			
 
				 github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
			
@@ -232,6 +238,8 @@ github.com/wasilibs/wazero-helpers v0.0.0-20240620070341-3dff1577cd52/go.mod h1:
 
				 github.com/xyproto/randomstring v1.0.5 h1:YtlWPoRdgMu3NZtP45drfy1GKoojuR7hmRcnhZqKjWU=
			
 
				 github.com/xyproto/randomstring v1.0.5/go.mod h1:rgmS5DeNXLivK7YprL0pY+lTuhNQW3iGxZ18UQApw/E=
			
 
				 github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
			
 
				+github.com/zricethezav/icanhazwordz v0.0.3 h1:fFNkjTgDPVfyeQtrwudcrnKGjowQ4VDYOq75ATT7WqU=
			
 
				+github.com/zricethezav/icanhazwordz v0.0.3/go.mod h1:5fMiqbebc6dKbzmgcHbU2TWrsjbgg5aF50JhLNfOtZ0=
			
 
				 go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU=
			
 
				 go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8=
			
 
				 go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw=