Zachary Rice hai 6 meses
pai
achega
5f737c311b
Modificáronse 7 ficheiros con 199 adicións e 6 borrados
  1. 5 3
      config/config.go
  2. 1 0
      config/gitleaks.toml
  3. 2 0
      config/rule.go
  4. 120 1
      detect/detect.go
  5. 56 1
      detect/utils.go
  6. 7 1
      go.mod
  7. 8 0
      go.sum

+ 5 - 3
config/config.go

@@ -45,9 +45,10 @@ type ViperConfig struct {
 		// TODO: Remove this in 9.x.
 		AllowList *viperRuleAllowlist
 
-		Allowlists []*viperRuleAllowlist
-		Required   []*viperRequired
-		SkipReport bool
+		Allowlists  []*viperRuleAllowlist
+		Required    []*viperRequired
+		SkipReport  bool
+		SmartFilter bool
 	}
 	// Deprecated: this is a shim for backwards-compatibility.
 	// TODO: Remove this in 9.x.
@@ -141,6 +142,7 @@ func (vc *ViperConfig) Translate() (Config, error) {
 			Keywords:    vr.Keywords,
 			Tags:        vr.Tags,
 			SkipReport:  vr.SkipReport,
+			SmartFilter: vr.SmartFilter,
 		}
 
 		// Parse the rule allowlists, including the older format for backwards compatibility.

+ 1 - 0
config/gitleaks.toml

@@ -626,6 +626,7 @@ keywords = [
     "secret",
     "token",
 ]
+smartFilter=true
 [[rules.allowlists]]
 regexes = [
     '''^[a-zA-Z_.-]+$''',

+ 2 - 0
config/rule.go

@@ -52,6 +52,8 @@ type Rule struct {
 	RequiredRules []*Required
 
 	SkipReport bool
+
+	SmartFilter bool
 }
 
 type Required struct {

+ 120 - 1
detect/detect.go

@@ -18,7 +18,12 @@ import (
 	"github.com/zricethezav/gitleaks/v8/sources"
 
 	ahocorasick "github.com/BobuSumisu/aho-corasick"
+	tiktoken_loader "github.com/pkoukk/tiktoken-go-loader"
+	"github.com/zricethezav/icanhazwordz"
+
+	"github.com/agnivade/levenshtein"
 	"github.com/fatih/semgroup"
+	"github.com/pkoukk/tiktoken-go"
 	"github.com/rs/zerolog"
 	"github.com/spf13/viper"
 	"golang.org/x/exp/maps"
@@ -104,6 +109,10 @@ type Detector struct {
 	Reporter   report.Reporter
 
 	TotalBytes atomic.Uint64
+
+	tokenizer *tiktoken.Tiktoken
+
+	nltkSearcher *icanhazwordz.Searcher
 }
 
 // Fragment is an alias for sources.Fragment for backwards compatibility
@@ -113,6 +122,13 @@ type Fragment sources.Fragment
 
 // NewDetector creates a new detector with the given config
 func NewDetector(cfg config.Config) *Detector {
+	// grab offline tiktoken encoder
+	tiktoken.SetBpeLoader(tiktoken_loader.NewOfflineLoader())
+	tke, err := tiktoken.GetEncoding("cl100k_base")
+	if err != nil {
+		logging.Warn().Err(err).Msgf("Could not pull down cl100k_base tiktokenizer")
+	}
+
 	return &Detector{
 		commitMap:      make(map[string]bool),
 		gitleaksIgnore: make(map[string]struct{}),
@@ -122,6 +138,10 @@ func NewDetector(cfg config.Config) *Detector {
 		Config:         cfg,
 		prefilter:      *ahocorasick.NewTrieBuilder().AddStrings(maps.Keys(cfg.Keywords)).Build(),
 		Sema:           semgroup.NewGroup(context.Background(), 40),
+
+		// tokenizer and nltkSearcher are used for a generic filter
+		tokenizer:    tke,
+		nltkSearcher: icanhazwordz.NewSearcher(icanhazwordz.Filter{MinLength: 4, PreferLongestNonOverlapping: true}),
 	}
 }
 
@@ -357,7 +377,7 @@ func (d *Detector) detectRule(fragment Fragment, currentRaw string, r config.Rul
 		}()
 	)
 
-	if r.SkipReport == true && !fragment.InheritedFromFinding {
+	if r.SkipReport && !fragment.InheritedFromFinding {
 		return findings
 	}
 
@@ -530,6 +550,16 @@ func (d *Detector) detectRule(fragment Fragment, currentRaw string, r config.Rul
 			}
 		}
 
+		// apply the smart filter when the rule opts in via SmartFilter
+		if r.SmartFilter {
+			if !d.passesSmartFilter(finding.Secret) {
+				// logger.Info().
+				// 	Str("finding", finding.Secret).
+				// 	Msg("skipping finding: fails smart filter")
+				continue
+			}
+		}
+
 		// check if the result matches any of the global allowlists.
 		if isAllowed, event := checkFindingAllowed(logger, finding, fragment, currentLine, d.Config.Allowlists); isAllowed {
 			event.Msg("skipping finding: global allowlist")
@@ -553,6 +583,95 @@ func (d *Detector) detectRule(fragment Fragment, currentRaw string, r config.Rul
 	return d.processRequiredRules(fragment, currentRaw, r, encodedSegments, findings, logger)
 }
 
+// passesSmartFilter applies heuristics to determine if a string is likely a real looking secret
+// rather than random text or common words. It uses token density, character distribution,
+// and word analysis to filter out false positives. Returns true if the string passes
+// the filter (likely a secret), false if it should be skipped.
+func (d *Detector) passesSmartFilter(secret string) bool {
+	tokens := d.tokenizer.Encode(secret, nil, nil)
+	tokenLen := len(tokens)
+	// token vals < 100
+	numShortTokens := 0
+	for _, t := range tokens {
+		if t < 100 {
+			numShortTokens++
+		}
+	}
+	// token vals > 100
+	// longTokens := tokenLen - numShortTokens
+	density := len(secret) / tokenLen
+	shortTokenRatio := float32(numShortTokens / tokenLen)
+
+	result := d.nltkSearcher.Find(secret)
+	fourPlusCharWords := len(result.Matches)
+
+	// check if the secret has a close levenshtein distance to any of the results
+	// if it does, consider this c4. normalize cases
+	c4 := false
+	secretLower := strings.ToLower(secret)
+	for _, match := range result.Matches {
+		// Only check against words with 5+ characters
+		if len(match.Word) <= 5 {
+			continue
+		}
+		wordLower := strings.ToLower(match.Word)
+		distance := levenshtein.ComputeDistance(secretLower, wordLower)
+		// Consider it close if distance is <= 2 or <= 20% of the longer string length
+		maxLen := max(len(secretLower), len(wordLower))
+		threshold := max(2, maxLen/5)
+		if distance <= threshold {
+			c4 = true
+			break
+		}
+
+		// OR check if the match is 6 characters or more _and_ is a substring of the
+		// potential secret
+		if len(match.Word) <= 6 {
+			continue
+		}
+
+		if strings.Contains(strings.ToLower(secret), strings.ToLower(match.Word)) {
+			c4 = true
+		}
+	}
+
+	c1 := density <= 2.0
+	c2 := shortTokenRatio >= 0.25
+	c3 := len(secret) >= 9 && float32(density) <= 2.5
+
+	likelySecret := c1 || c2 || c3
+
+	// Check for irregularly cased English words
+	// majorityIrregularlyCased := false
+	hasIrregularCasing := false
+	for _, word := range result.UniqueWords {
+		if isIrregularlyCased(word, secret) {
+			hasIrregularCasing = true
+			break
+		}
+	}
+	// if len(result.UniqueWords) > 0 && irregularlyCased > len(result.UniqueWords)/2 {
+	// 	majorityIrregularlyCased = true
+	// }
+	// fmt.Println(hasIrregularCasing, fourPlusCharWords, secret)
+	// fmt.Println(c1, c2, c3, fourPlusCharWords)
+
+	// Make exception if words have irregular casing
+	if hasIrregularCasing && fourPlusCharWords > 1 {
+		pass := likelySecret
+		if !pass {
+			// fmt.Println("skipping: ", secret, "--", c1, c2, c3, fourPlusCharWords, result.UniqueWords, "(irregular casing exception)")
+		}
+		return pass
+	}
+
+	pass := likelySecret && !(fourPlusCharWords > 1) && !c4
+	if !pass {
+		// fmt.Println("skipping: ", secret, "--", c1, c2, c3, fourPlusCharWords, result.UniqueWords)
+	}
+	return pass
+}
+
 // processRequiredRules handles the logic for multi-part rules with auxiliary findings
 func (d *Detector) processRequiredRules(fragment Fragment, currentRaw string, r config.Rule, encodedSegments []*codec.EncodedSegment, primaryFindings []report.Finding, logger zerolog.Logger) []report.Finding {
 	if len(primaryFindings) == 0 {

+ 56 - 1
detect/utils.go

@@ -8,6 +8,7 @@ import (
 	"strings"
 
 	"github.com/zricethezav/gitleaks/v8/cmd/scm"
+	"github.com/zricethezav/gitleaks/v8/detect/codec"
 	"github.com/zricethezav/gitleaks/v8/logging"
 	"github.com/zricethezav/gitleaks/v8/report"
 	"github.com/zricethezav/gitleaks/v8/sources"
@@ -136,14 +137,32 @@ func shannonEntropy(data string) (entropy float64) {
 // filter will dedupe and redact findings
 func filter(findings []report.Finding, redact uint) []report.Finding {
 	var retFindings []report.Finding
+
+	decoder := codec.NewDecoder()
+	encodedSegments := []*codec.EncodedSegment{}
+
 	for _, f := range findings {
 		include := true
 		if strings.Contains(strings.ToLower(f.RuleID), "generic") {
 			for _, fPrime := range findings {
+				// TODO also check if a decoded secret == the generic secret. If it does, skip the generic secret
+				isDecoded := false
+				decodedSecret := ""
+				for _, t := range fPrime.Tags {
+					if strings.Contains(t, "decoded") {
+						isDecoded = true
+					}
+				}
+
+				if isDecoded {
+					decodedSecret, _ = decoder.Decode(f.Secret, encodedSegments)
+					decodedSecret = strings.TrimSuffix(decodedSecret, "\n")
+				}
+
 				if f.StartLine == fPrime.StartLine &&
 					f.Commit == fPrime.Commit &&
 					f.RuleID != fPrime.RuleID &&
-					strings.Contains(fPrime.Secret, f.Secret) &&
+					(strings.Contains(fPrime.Secret, f.Secret) || decodedSecret == fPrime.Secret) &&
 					!strings.Contains(strings.ToLower(fPrime.RuleID), "generic") {
 
 					genericMatch := strings.ReplaceAll(f.Match, f.Secret, "REDACTED")
@@ -261,3 +280,39 @@ func printFinding(f report.Finding, noColor bool) {
 	f.PrintRequiredFindings()
 	fmt.Println("")
 }
+
// isIrregularlyCased reports whether the first case-insensitive occurrence of
// word inside secret is written with unconventional casing (see
// isConventionallyCased). It returns false when word is empty or does not
// occur in secret.
//
// The search runs over the original secret, one rune-aligned window at a time,
// so the extracted text is always a valid slice of secret. Searching a
// lower-cased copy and reusing its byte offsets is unsafe because lower-casing
// can change the byte length of non-ASCII characters.
func isIrregularlyCased(word, secret string) bool {
	wordLower := strings.ToLower(word)
	wordRunes := len([]rune(wordLower))
	if wordRunes == 0 {
		return false
	}

	for start := range secret {
		end, ok := runeWindowEnd(secret, start, wordRunes)
		if !ok {
			// Fewer than wordRunes runes remain; no later window can match.
			break
		}
		actualWord := secret[start:end]
		if strings.ToLower(actualWord) == wordLower {
			// Judge the casing exactly as it appears in the secret.
			return !isConventionallyCased(actualWord)
		}
	}
	return false
}

// runeWindowEnd returns the byte offset just past the n runes of s that begin
// at byte offset start. The boolean is false when fewer than n runes remain.
func runeWindowEnd(s string, start, n int) (int, bool) {
	count := 0
	for off := range s[start:] {
		if count == n {
			return start + off, true
		}
		count++
	}
	if count == n {
		return len(s), true
	}
	return 0, false
}

// isConventionallyCased reports whether word follows a conventional casing
// pattern. The empty string counts as conventional.
//
// Conventional patterns:
//   - all lowercase (common)
//   - all uppercase (acronyms, constants)
//   - title case: first letter uppercase, rest lowercase (proper nouns)
func isConventionallyCased(word string) bool {
	if len(word) == 0 {
		return true
	}

	lower := strings.ToLower(word)
	if lower == word || strings.ToUpper(word) == word {
		return true
	}

	// Title-case only the first rune; this replaces the deprecated
	// strings.Title for the single-word inputs handled here.
	firstLen := len(lower)
	for i := range lower {
		if i > 0 {
			firstLen = i
			break
		}
	}
	return strings.ToTitle(lower[:firstLen])+lower[firstLen:] == word
}

+ 7 - 1
go.mod

@@ -1,6 +1,8 @@
 module github.com/zricethezav/gitleaks/v8
 
-go 1.23.8
+go 1.24
+
+toolchain go1.24.3
 
 require (
 	github.com/BobuSumisu/aho-corasick v1.0.3
@@ -11,10 +13,13 @@ require (
 	github.com/google/go-cmp v0.6.0
 	github.com/h2non/filetype v1.1.3
 	github.com/mholt/archives v0.1.2
+	github.com/pkoukk/tiktoken-go v0.1.7
+	github.com/pkoukk/tiktoken-go-loader v0.0.1
 	github.com/rs/zerolog v1.33.0
 	github.com/spf13/cobra v1.9.1
 	github.com/spf13/viper v1.19.0
 	github.com/stretchr/testify v1.10.0
+	github.com/zricethezav/icanhazwordz v0.0.3
 	golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa
 )
 
@@ -28,6 +33,7 @@ require (
 	github.com/bodgit/plumbing v1.3.0 // indirect
 	github.com/bodgit/sevenzip v1.6.0 // indirect
 	github.com/bodgit/windows v1.0.1 // indirect
+	github.com/dlclark/regexp2 v1.10.0 // indirect
 	github.com/dsnet/compress v0.0.2-0.20230904184137-39efe44ab707 // indirect
 	github.com/google/uuid v1.6.0 // indirect
 	github.com/hashicorp/errwrap v1.1.0 // indirect

+ 8 - 0
go.sum

@@ -52,6 +52,8 @@ github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSs
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
 github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/dlclark/regexp2 v1.10.0 h1:+/GIL799phkJqYW+3YbOd8LCcbHzT0Pbo8zl70MHsq0=
+github.com/dlclark/regexp2 v1.10.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8=
 github.com/dsnet/compress v0.0.2-0.20230904184137-39efe44ab707 h1:2tV76y6Q9BB+NEBasnqvs7e49aEBFI8ejC89PSnWH+4=
 github.com/dsnet/compress v0.0.2-0.20230904184137-39efe44ab707/go.mod h1:qssHWj60/X5sZFNxpG4HBPDHVqxNm4DfnCKgrbZOT+s=
 github.com/dsnet/golib v0.0.0-20171103203638-1ea166775780/go.mod h1:Lj+Z9rebOhdfkVLjJ8T6VcRQv3SXugXy999NBtR9aFY=
@@ -172,6 +174,10 @@ github.com/pelletier/go-toml/v2 v2.2.3/go.mod h1:MfCQTFTvCcUyyvvwm1+G6H/jORL20Xl
 github.com/pierrec/lz4/v4 v4.1.21 h1:yOVMLb6qSIDP67pl/5F7RepeKYu/VmTyEXvuMI5d9mQ=
 github.com/pierrec/lz4/v4 v4.1.21/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4=
 github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
+github.com/pkoukk/tiktoken-go v0.1.7 h1:qOBHXX4PHtvIvmOtyg1EeKlwFRiMKAcoMp4Q+bLQDmw=
+github.com/pkoukk/tiktoken-go v0.1.7/go.mod h1:9NiV+i9mJKGj1rYOT+njbv+ZwA/zJxYdewGl6qVatpg=
+github.com/pkoukk/tiktoken-go-loader v0.0.1 h1:aOB2gRFzZTCCPi3YsOQXJO771P/5876JAsdebMyazig=
+github.com/pkoukk/tiktoken-go-loader v0.0.1/go.mod h1:4mIkYyZooFlnenDlormIo6cd5wrlUKNr97wp9nGgEKo=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
 github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
@@ -232,6 +238,8 @@ github.com/wasilibs/wazero-helpers v0.0.0-20240620070341-3dff1577cd52/go.mod h1:
 github.com/xyproto/randomstring v1.0.5 h1:YtlWPoRdgMu3NZtP45drfy1GKoojuR7hmRcnhZqKjWU=
 github.com/xyproto/randomstring v1.0.5/go.mod h1:rgmS5DeNXLivK7YprL0pY+lTuhNQW3iGxZ18UQApw/E=
 github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
+github.com/zricethezav/icanhazwordz v0.0.3 h1:fFNkjTgDPVfyeQtrwudcrnKGjowQ4VDYOq75ATT7WqU=
+github.com/zricethezav/icanhazwordz v0.0.3/go.mod h1:5fMiqbebc6dKbzmgcHbU2TWrsjbgg5aF50JhLNfOtZ0=
 go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU=
 go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8=
 go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw=