Browse Source

Respect contexts with timeouts (#1948)

rh-pre-commit.version: 2.3.2
rh-pre-commit.check-secrets: ENABLED
bplaxco 3 tháng trước
mục cha
commit
c5ccbb9468
13 tập tin đã thay đổi với 349 bổ sung và 244 xóa
  1. 68 64
      README.md
  2. 5 8
      cmd/detect.go
  3. 1 2
      cmd/directory.go
  4. 20 20
      cmd/generate/config/rules/notion.go
  5. 4 5
      cmd/git.go
  6. 1 1
      cmd/protect.go
  7. 14 1
      cmd/root.go
  8. 1 2
      cmd/stdin.go
  9. 57 34
      detect/detect.go
  10. 21 4
      detect/detect_test.go
  11. 64 59
      sources/file.go
  12. 43 32
      sources/files.go
  13. 50 12
      sources/git.go

+ 68 - 64
README.md

@@ -142,10 +142,13 @@ Detect hardcoded secrets................................................Skipped
 ## Usage
 
 ```
+Gitleaks scans code, past or present, for secrets
+
 Usage:
   gitleaks [command]
 
 Available Commands:
+  completion  Generate the autocompletion script for the specified shell
   dir         scan directories or files for secrets
   git         scan git repositories for secrets
   help        Help about any command
@@ -161,16 +164,16 @@ Flags:
                                       3. env var GITLEAKS_CONFIG_TOML with the file content
                                       4. (target path)/.gitleaks.toml
                                       If none of the four options are used, then gitleaks will use the default config
-      --diagnostics string            enable diagnostics (comma-separated list: cpu,mem,trace). cpu=CPU profiling, mem=memory profiling, trace=execution tracing
-      --diagnostics-dir string        directory to store diagnostics output files (defaults to current directory)
+      --diagnostics string            enable diagnostics (http OR comma-separated list: cpu,mem,trace). cpu=CPU prof, mem=memory prof, trace=exec tracing, http=serve via net/http/pprof
+      --diagnostics-dir string        directory to store diagnostics output files when not using http mode (defaults to current directory)
       --enable-rule strings           only enable specific rules by id
       --exit-code int                 exit code when leaks have been encountered (default 1)
   -i, --gitleaks-ignore-path string   path to .gitleaksignore file or folder containing one (default ".")
   -h, --help                          help for gitleaks
       --ignore-gitleaks-allow         ignore gitleaks:allow comments
   -l, --log-level string              log level (trace, debug, info, warn, error, fatal) (default "info")
-      --max-decode-depth int          allow recursive decoding up to this depth (default "0", no decoding is done)
       --max-archive-depth int         allow scanning into nested archives up to this depth (default "0", no archive traversal is done)
+      --max-decode-depth int          allow recursive decoding up to this depth (default "0", no decoding is done)
       --max-target-megabytes int      files larger than this will be skipped
       --no-banner                     suppress banner
       --no-color                      turn off color for verbose output
@@ -178,6 +181,7 @@ Flags:
   -f, --report-format string          output format (json, csv, junit, sarif, template)
   -r, --report-path string            report file
       --report-template string        template file used to generate the report (implies --report-format=template)
+      --timeout int                   set a timeout for gitleaks commands in seconds (default "0", no timeout is set)
   -v, --verbose                       show verbose output from scan
       --version                       version for gitleaks
 
@@ -422,7 +426,7 @@ In v8.28.0 Gitleaks introduced composite rules, which are made up of a single "p
 **Proximity matching:** Using the `withinLines` and `withinColumns` fields instructs the primary rule to only report a finding if the auxiliary `required` rules also find matches within the specified proximity. You can set:
 
 - **`withinLines: N`** - required findings must be within N lines (vertically)
-- **`withinColumns: N`** - required findings must be within N characters (horizontally)  
+- **`withinColumns: N`** - required findings must be within N characters (horizontally)
 - **Both** - creates a rectangular search area (both constraints must be satisfied)
 - **Neither** - fragment-level matching (required findings can be anywhere in the same fragment)
 
@@ -434,76 +438,76 @@ a = auxiliary (required) captured secret
 fragment = section of data gitleaks is looking at
 
 
-    *Fragment-level proximity*               
+    *Fragment-level proximity*
     Any required finding in the fragment
-          ┌────────┐                       
-   ┌──────┤fragment├─────┐                 
-   │      └──────┬─┤     │ ┌───────┐       
-   │             │a│◀────┼─│✓ MATCH│       
-   │          ┌─┐└─┘     │ └───────┘       
-   │┌─┐       │p│        │                 
-   ││a│    ┌─┐└─┘        │ ┌───────┐       
-   │└─┘    │a│◀──────────┼─│✓ MATCH│       
-   └─▲─────┴─┴───────────┘ └───────┘       
-     │    ┌───────┐                        
-     └────│✓ MATCH│                        
-          └───────┘                        
-                                           
-                                           
+          ┌────────┐
+   ┌──────┤fragment├─────┐
+   │      └──────┬─┤     │ ┌───────┐
+   │             │a│◀────┼─│✓ MATCH│
+   │          ┌─┐└─┘     │ └───────┘
+   │┌─┐       │p│        │
+   ││a│    ┌─┐└─┘        │ ┌───────┐
+   │└─┘    │a│◀──────────┼─│✓ MATCH│
+   └─▲─────┴─┴───────────┘ └───────┘
+     │    ┌───────┐
+     └────│✓ MATCH│
+          └───────┘
+
+
    *Column bounded proximity*
-   `withinColumns = 3`                    
-          ┌────────┐                       
-   ┌────┬─┤fragment├─┬───┐                 
-   │      └──────┬─┤     │ ┌───────────┐   
-   │    │        │a│◀┼───┼─│+1C ✓ MATCH│   
-   │          ┌─┐└─┘     │ └───────────┘   
-   │┌─┐ │     │p│    │   │                 
-┌──▶│a│  ┌─┐  └─┘        │ ┌───────────┐   
-│  │└─┘ ││a│◀────────┼───┼─│-2C ✓ MATCH│   
-│  │       ┘             │ └───────────┘   
-│  └── -3C ───0C─── +3C ─┘                 
-│  ┌─────────┐                             
-│  │ -4C ✗ NO│                             
-└──│  MATCH  │                             
-   └─────────┘                             
-                                           
-                                           
+   `withinColumns = 3`
+          ┌────────┐
+   ┌────┬─┤fragment├─┬───┐
+   │      └──────┬─┤     │ ┌───────────┐
+   │    │        │a│◀┼───┼─│+1C ✓ MATCH│
+   │          ┌─┐└─┘     │ └───────────┘
+   │┌─┐ │     │p│    │   │
+┌──▶│a│  ┌─┐  └─┘        │ ┌───────────┐
+│  │└─┘ ││a│◀────────┼───┼─│-2C ✓ MATCH│
+│  │       ┘             │ └───────────┘
+│  └── -3C ───0C─── +3C ─┘
+│  ┌─────────┐
+│  │ -4C ✗ NO│
+└──│  MATCH  │
+   └─────────┘
+
+
    *Line bounded proximity*
-   `withinLines = 4`                      
-         ┌────────┐                        
-   ┌─────┤fragment├─────┐                  
-  +4L─ ─ ┴────────┘─ ─ ─│                  
-   │                    │                  
-   │              ┌─┐   │ ┌────────────┐   
-   │         ┌─┐  │a│◀──┼─│+1L ✓ MATCH │   
-   0L  ┌─┐   │p│  └─┘   │ ├────────────┤   
-   │   │a│◀──┴─┴────────┼─│-1L ✓ MATCH │   
-   │   └─┘              │ └────────────┘   
-   │                    │ ┌─────────┐      
-  -4L─ ─ ─ ─ ─ ─ ─ ─┌─┐─│ │-5L ✗ NO │      
-   │                │a│◀┼─│  MATCH  │      
-   └────────────────┴─┴─┘ └─────────┘      
-                                           
-                                           
+   `withinLines = 4`
+         ┌────────┐
+   ┌─────┤fragment├─────┐
+  +4L─ ─ ┴────────┘─ ─ ─│
+   │                    │
+   │              ┌─┐   │ ┌────────────┐
+   │         ┌─┐  │a│◀──┼─│+1L ✓ MATCH │
+   0L  ┌─┐   │p│  └─┘   │ ├────────────┤
+   │   │a│◀──┴─┴────────┼─│-1L ✓ MATCH │
+   │   └─┘              │ └────────────┘
+   │                    │ ┌─────────┐
+  -4L─ ─ ─ ─ ─ ─ ─ ─┌─┐─│ │-5L ✗ NO │
+   │                │a│◀┼─│  MATCH  │
+   └────────────────┴─┴─┘ └─────────┘
+
+
    *Line and column bounded proximity*
-   `withinLines = 4`                      
-   `withinColumns = 3`                    
-         ┌────────┐                        
-   ┌─────┤fragment├─────┐                  
-  +4L   ┌└────────┴ ┐   │                  
+   `withinLines = 4`
+   `withinColumns = 3`
+         ┌────────┐
+   ┌─────┤fragment├─────┐
+  +4L   ┌└────────┴ ┐   │
    │            ┌─┐     │ ┌───────────────┐
    │    │       │a│◀┼───┼─│+2L/+1C ✓ MATCH│
    │         ┌─┐└─┘     │ └───────────────┘
-   0L   │    │p│    │   │                  
-   │         └─┘        │                  
-   │    │           │   │ ┌────────────┐   
-  -4L    ─ ─ ─ ─ ─ ─┌─┐ │ │-5L/+3C ✗ NO│   
-   │                │a│◀┼─│   MATCH    │   
-   └───-3C────0L───+3C┴─┘ └────────────┘   
+   0L   │    │p│    │   │
+   │         └─┘        │
+   │    │           │   │ ┌────────────┐
+  -4L    ─ ─ ─ ─ ─ ─┌─┐ │ │-5L/+3C ✗ NO│
+   │                │a│◀┼─│   MATCH    │
+   └───-3C────0L───+3C┴─┘ └────────────┘
 ```
 
 <details><summary>Some final quick thoughts on composite rules.</summary>This is an experimental feature! It's subject to change so don't go sellin' a new B2B SaaS feature built ontop of this feature. Scan type (git vs dir) based context is interesting. I'm monitoring the situation. Composite rules might not be super useful for git scans because gitleaks only looks at additions in the git history. It could be useful to scan non-additions in git history for `required` rules. Oh, right this is a readme, I'll shut up now.</details>
-  
+
 #### gitleaks:allow
 
 If you are knowingly committing a test secret that gitleaks will catch you can add a `gitleaks:allow` comment to that line which will instruct gitleaks

+ 5 - 8
cmd/detect.go

@@ -19,7 +19,6 @@
 package cmd
 
 import (
-	"context"
 	"os"
 	"time"
 
@@ -66,18 +65,16 @@ func runDetect(cmd *cobra.Command, args []string) {
 	exitCode := mustGetIntFlag(cmd, "exit-code")
 	noGit := mustGetBoolFlag(cmd, "no-git")
 	fromPipe := mustGetBoolFlag(cmd, "pipe")
-
 	// determine what type of scan:
 	// - git: scan the history of the repo
 	// - no-git: scan files by treating the repo as a plain directory
 	var (
 		err      error
 		findings []report.Finding
-		ctx      = context.Background()
 	)
 	if noGit {
 		findings, err = detector.DetectSource(
-			ctx, &sources.Files{
+			cmd.Context(), &sources.Files{
 				Config:          &cfg,
 				FollowSymlinks:  detector.FollowSymlinks,
 				MaxFileSize:     detector.MaxTargetMegaBytes * 1_000_000,
@@ -93,7 +90,7 @@ func runDetect(cmd *cobra.Command, args []string) {
 		}
 	} else if fromPipe {
 		findings, err = detector.DetectSource(
-			ctx, &sources.File{
+			cmd.Context(), &sources.File{
 				Content:         os.Stdin,
 				MaxArchiveDepth: detector.MaxArchiveDepth,
 			},
@@ -111,7 +108,7 @@ func runDetect(cmd *cobra.Command, args []string) {
 		)
 
 		logOpts := mustGetStringFlag(cmd, "log-opts")
-		if gitCmd, err = sources.NewGitLogCmd(sourcePath, logOpts); err != nil {
+		if gitCmd, err = sources.NewGitLogCmdContext(cmd.Context(), sourcePath, logOpts); err != nil {
 			logging.Fatal().Err(err).Msg("could not create Git cmd")
 		}
 
@@ -120,10 +117,10 @@ func runDetect(cmd *cobra.Command, args []string) {
 		}
 
 		findings, err = detector.DetectSource(
-			ctx, &sources.Git{
+			cmd.Context(), &sources.Git{
 				Cmd:             gitCmd,
 				Config:          &detector.Config,
-				Remote:          sources.NewRemoteInfo(scmPlatform, sourcePath),
+				Remote:          sources.NewRemoteInfoContext(cmd.Context(), scmPlatform, sourcePath),
 				Sema:            detector.Sema,
 				MaxArchiveDepth: detector.MaxArchiveDepth,
 			},

+ 1 - 2
cmd/directory.go

@@ -1,7 +1,6 @@
 package cmd
 
 import (
-	"context"
 	"time"
 
 	"github.com/spf13/cobra"
@@ -56,7 +55,7 @@ func runDirectory(cmd *cobra.Command, args []string) {
 	}
 
 	findings, err := detector.DetectSource(
-		context.Background(),
+		cmd.Context(),
 		&sources.Files{
 			Config:          &cfg,
 			FollowSymlinks:  detector.FollowSymlinks,

+ 20 - 20
cmd/generate/config/rules/notion.go

@@ -6,34 +6,34 @@ import (
 )
 
 func Notion() *config.Rule {
-    // Define the identifiers that match the Keywords
-    identifiers := []string{"ntn_"}
-    
-    // Define the regex pattern for Notion API token
-    secretRegex := `ntn_[0-9]{11}[A-Za-z0-9]{32}[A-Za-z0-9]{3}`
-    
-    regex := utils.GenerateUniqueTokenRegex(secretRegex, false)
-    
-    r := config.Rule{
-        Description: "Notion API token",
-        RuleID: "notion-api-token",
-        Regex: regex,
-        Entropy: 4,
-        Keywords: identifiers,
-    }
-
-    // validate
+	// Define the identifiers that match the Keywords
+	identifiers := []string{"ntn_"}
+
+	// Define the regex pattern for Notion API token
+	secretRegex := `ntn_[0-9]{11}[A-Za-z0-9]{32}[A-Za-z0-9]{3}`
+
+	regex := utils.GenerateUniqueTokenRegex(secretRegex, false)
+
+	r := config.Rule{
+		Description: "Notion API token",
+		RuleID:      "notion-api-token",
+		Regex:       regex,
+		Entropy:     4,
+		Keywords:    identifiers,
+	}
+
+	// validate
 	tps := []string{
 		"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",
 		"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",
 		"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",
 	}
 
-	fps:= []string{
+	fps := []string{
 		"ntn_12345678901",
 		"ntn_123456789012345678901234567890123456789012345678901234567890",
 		"ntn_12345678901abc",
 	}
 
-    return utils.Validate(r, tps, fps)
-}
+	return utils.Validate(r, tps, fps)
+}

+ 4 - 5
cmd/git.go

@@ -1,7 +1,6 @@
 package cmd
 
 import (
-	"context"
 	"time"
 
 	"github.com/spf13/cobra"
@@ -63,13 +62,13 @@ func runGit(cmd *cobra.Command, args []string) {
 	)
 
 	if preCommit || staged {
-		if gitCmd, err = sources.NewGitDiffCmd(source, staged); err != nil {
+		if gitCmd, err = sources.NewGitDiffCmdContext(cmd.Context(), source, staged); err != nil {
 			logging.Fatal().Err(err).Msg("could not create Git diff cmd")
 		}
 		// Remote info + links are irrelevant for staged changes.
 		scmPlatform = scm.NoPlatform
 	} else {
-		if gitCmd, err = sources.NewGitLogCmd(source, logOpts); err != nil {
+		if gitCmd, err = sources.NewGitLogCmdContext(cmd.Context(), source, logOpts); err != nil {
 			logging.Fatal().Err(err).Msg("could not create Git log cmd")
 		}
 		if scmPlatform, err = scm.PlatformFromString(mustGetStringFlag(cmd, "platform")); err != nil {
@@ -78,11 +77,11 @@ func runGit(cmd *cobra.Command, args []string) {
 	}
 
 	findings, err = detector.DetectSource(
-		context.Background(),
+		cmd.Context(),
 		&sources.Git{
 			Cmd:             gitCmd,
 			Config:          &detector.Config,
-			Remote:          sources.NewRemoteInfo(scmPlatform, source),
+			Remote:          sources.NewRemoteInfoContext(cmd.Context(), scmPlatform, source),
 			Sema:            detector.Sema,
 			MaxArchiveDepth: detector.MaxArchiveDepth,
 		},

+ 1 - 1
cmd/protect.go

@@ -53,7 +53,7 @@ func runProtect(cmd *cobra.Command, args []string) {
 		remote *detect.RemoteInfo
 	)
 
-	if gitCmd, err = sources.NewGitDiffCmd(source, staged); err != nil {
+	if gitCmd, err = sources.NewGitDiffCmdContext(cmd.Context(), source, staged); err != nil {
 		logging.Fatal().Err(err).Msg("could not create Git diff cmd")
 	}
 	remote = &detect.RemoteInfo{Platform: scm.NoPlatform}

+ 14 - 1
cmd/root.go

@@ -2,6 +2,7 @@ package cmd
 
 import (
 	"bytes"
+	"context"
 	"fmt"
 	"io"
 	"os"
@@ -44,6 +45,17 @@ var (
 		Use:     "gitleaks",
 		Short:   "Gitleaks scans code, past or present, for secrets",
 		Version: version.Version,
+		PersistentPreRunE: func(cmd *cobra.Command, args []string) error {
+			// Set the timeout for all the commands
+			if timeout, err := cmd.Flags().GetInt("timeout"); err != nil {
+				return err
+			} else if timeout > 0 {
+				ctx, cancel := context.WithTimeout(cmd.Context(), time.Duration(timeout)*time.Second)
+				cmd.SetContext(ctx)
+				cobra.OnFinalize(cancel)
+			}
+			return nil
+		},
 	}
 
 	// diagnostics manager is global to ensure it can be started before a scan begins
@@ -78,6 +90,7 @@ func init() {
 	rootCmd.PersistentFlags().StringP("gitleaks-ignore-path", "i", ".", "path to .gitleaksignore file or folder containing one")
 	rootCmd.PersistentFlags().Int("max-decode-depth", 0, "allow recursive decoding up to this depth (default \"0\", no decoding is done)")
 	rootCmd.PersistentFlags().Int("max-archive-depth", 0, "allow scanning into nested archives up to this depth (default \"0\", no archive traversal is done)")
+	rootCmd.PersistentFlags().Int("timeout", 0, "set a timeout for gitleaks commands in seconds (default \"0\", no timeout is set)")
 
 	// Add diagnostics flags
 	rootCmd.PersistentFlags().String("diagnostics", "", "enable diagnostics (http OR comma-separated list: cpu,mem,trace). cpu=CPU prof, mem=memory prof, trace=exec tracing, http=serve via net/http/pprof")
@@ -237,7 +250,7 @@ func Detector(cmd *cobra.Command, cfg config.Config, source string) *detect.Dete
 	var err error
 
 	// Setup common detector
-	detector := detect.NewDetector(cfg)
+	detector := detect.NewDetectorContext(cmd.Context(), cfg)
 
 	if detector.MaxDecodeDepth, err = cmd.Flags().GetInt("max-decode-depth"); err != nil {
 		logging.Fatal().Err(err).Send()

+ 1 - 2
cmd/stdin.go

@@ -1,7 +1,6 @@
 package cmd
 
 import (
-	"context"
 	"os"
 	"time"
 
@@ -38,7 +37,7 @@ func runStdIn(cmd *cobra.Command, _ []string) {
 	exitCode := mustGetIntFlag(cmd, "exit-code")
 
 	findings, err := detector.DetectSource(
-		context.Background(),
+		cmd.Context(),
 		&sources.File{
 			Content:         os.Stdin,
 			MaxArchiveDepth: detector.MaxArchiveDepth,

+ 57 - 34
detect/detect.go

@@ -113,6 +113,12 @@ type Fragment sources.Fragment
 
 // NewDetector creates a new detector with the given config
 func NewDetector(cfg config.Config) *Detector {
+	return NewDetectorContext(context.Background(), cfg)
+}
+
+// NewDetectorContext is the same as NewDetector but supports passing in a
+// context to use for timeouts
+func NewDetectorContext(ctx context.Context, cfg config.Config) *Detector {
 	return &Detector{
 		commitMap:      make(map[string]bool),
 		gitleaksIgnore: make(map[string]struct{}),
@@ -121,7 +127,7 @@ func NewDetector(cfg config.Config) *Detector {
 		findings:       make([]report.Finding, 0),
 		Config:         cfg,
 		prefilter:      *ahocorasick.NewTrieBuilder().AddStrings(maps.Keys(cfg.Keywords)).Build(),
-		Sema:           semgroup.NewGroup(context.Background(), 40),
+		Sema:           semgroup.NewGroup(ctx, 40),
 	}
 }
 
@@ -240,7 +246,7 @@ func (d *Detector) DetectSource(ctx context.Context, source sources.Source) ([]r
 			})
 		}
 
-		for _, finding := range d.Detect(Fragment(fragment)) {
+		for _, finding := range d.DetectContext(ctx, Fragment(fragment)) {
 			d.AddFinding(finding)
 		}
 
@@ -262,6 +268,12 @@ func (d *Detector) DetectSource(ctx context.Context, source sources.Source) ([]r
 
 // Detect scans the given fragment and returns a list of findings
 func (d *Detector) Detect(fragment Fragment) []report.Finding {
+	return d.DetectContext(context.Background(), fragment)
+}
+
+// DetectContext is the same as Detect but supports passing in a
+// context to use for timeouts
+func (d *Detector) DetectContext(ctx context.Context, fragment Fragment) []report.Finding {
 	if fragment.Bytes == nil {
 		d.TotalBytes.Add(uint64(len(fragment.Raw)))
 	}
@@ -298,46 +310,57 @@ func (d *Detector) Detect(fragment Fragment) []report.Finding {
 	currentDecodeDepth := 0
 	decoder := codec.NewDecoder()
 
+ScanLoop:
 	for {
-		// build keyword map for prefiltering rules
-		keywords := make(map[string]bool)
-		normalizedRaw := strings.ToLower(currentRaw)
-		matches := d.prefilter.MatchString(normalizedRaw)
-		for _, m := range matches {
-			keywords[normalizedRaw[m.Pos():int(m.Pos())+len(m.Match())]] = true
-		}
-
-		for _, rule := range d.Config.Rules {
-			if len(rule.Keywords) == 0 {
-				// if no keywords are associated with the rule always scan the
-				// fragment using the rule
-				findings = append(findings, d.detectRule(fragment, currentRaw, rule, encodedSegments)...)
-				continue
+		select {
+		case <-ctx.Done():
+			break ScanLoop
+		default:
+			// build keyword map for prefiltering rules
+			keywords := make(map[string]bool)
+			normalizedRaw := strings.ToLower(currentRaw)
+			matches := d.prefilter.MatchString(normalizedRaw)
+			for _, m := range matches {
+				keywords[normalizedRaw[m.Pos():int(m.Pos())+len(m.Match())]] = true
 			}
 
-			// check if keywords are in the fragment
-			for _, k := range rule.Keywords {
-				if _, ok := keywords[strings.ToLower(k)]; ok {
-					findings = append(findings, d.detectRule(fragment, currentRaw, rule, encodedSegments)...)
-					break
+			for _, rule := range d.Config.Rules {
+				select {
+				case <-ctx.Done():
+					break ScanLoop
+				default:
+					if len(rule.Keywords) == 0 {
+						// if no keywords are associated with the rule always scan the
+						// fragment using the rule
+						findings = append(findings, d.detectRule(fragment, currentRaw, rule, encodedSegments)...)
+						continue
+					}
+
+					// check if keywords are in the fragment
+					for _, k := range rule.Keywords {
+						if _, ok := keywords[strings.ToLower(k)]; ok {
+							findings = append(findings, d.detectRule(fragment, currentRaw, rule, encodedSegments)...)
+							break
+						}
+					}
 				}
 			}
-		}
 
-		// increment the depth by 1 as we start our decoding pass
-		currentDecodeDepth++
+			// increment the depth by 1 as we start our decoding pass
+			currentDecodeDepth++
 
-		// stop the loop if we've hit our max decoding depth
-		if currentDecodeDepth > d.MaxDecodeDepth {
-			break
-		}
+			// stop the loop if we've hit our max decoding depth
+			if currentDecodeDepth > d.MaxDecodeDepth {
+				break ScanLoop
+			}
 
-		// decode the currentRaw for the next pass
-		currentRaw, encodedSegments = decoder.Decode(currentRaw, encodedSegments)
+			// decode the currentRaw for the next pass
+			currentRaw, encodedSegments = decoder.Decode(currentRaw, encodedSegments)
 
-		// stop the loop when there's nothing else to decode
-		if len(encodedSegments) == 0 {
-			break
+			// stop the loop when there's nothing else to decode
+			if len(encodedSegments) == 0 {
+				break ScanLoop
+			}
 		}
 	}
 
@@ -357,7 +380,7 @@ func (d *Detector) detectRule(fragment Fragment, currentRaw string, r config.Rul
 		}()
 	)
 
-	if r.SkipReport == true && !fragment.InheritedFromFinding {
+	if r.SkipReport && !fragment.InheritedFromFinding {
 		return findings
 	}
 

+ 21 - 4
detect/detect_test.go

@@ -1530,6 +1530,7 @@ func TestDetectWithArchives(t *testing.T) {
 	tests := []struct {
 		cfgName          string
 		source           string
+		expireContext    bool
 		expectedFindings []report.Finding
 	}{
 		{
@@ -2059,6 +2060,12 @@ func TestDetectWithArchives(t *testing.T) {
 				},
 			},
 		},
+		{
+			source:           filepath.Join(archivesBasePath, "nested.tar.gz"),
+			cfgName:          "archives",
+			expireContext:    true,
+			expectedFindings: []report.Finding{},
+		},
 	}
 
 	for _, tt := range tests {
@@ -2073,13 +2080,17 @@ func TestDetectWithArchives(t *testing.T) {
 			err = viper.Unmarshal(&vc)
 			require.NoError(t, err)
 
+			ctx, cancel := context.WithCancel(context.Background())
+			if tt.expireContext {
+				cancel()
+			}
+
 			cfg, _ := vc.Translate()
-			detector := NewDetector(cfg)
+			detector := NewDetectorContext(ctx, cfg)
 			detector.MaxArchiveDepth = 8
 
 			findings, err := detector.DetectSource(
-				context.Background(),
-				&sources.Files{
+				ctx, &sources.Files{
 					Path:            tt.source,
 					Sema:            detector.Sema,
 					Config:          &cfg,
@@ -2087,7 +2098,13 @@ func TestDetectWithArchives(t *testing.T) {
 				},
 			)
 
-			require.NoError(t, err)
+			if tt.expireContext {
+				require.EqualError(t, err, "context canceled")
+			} else {
+				cancel()
+				require.NoError(t, err)
+			}
+
 			// TODO: Temporary mitigation.
 			// https://github.com/gitleaks/gitleaks/issues/1641
 			normalizedFindings := make([]report.Finding, len(findings))

+ 64 - 59
sources/file.go

@@ -70,12 +70,12 @@ func (s *File) Fragments(ctx context.Context, yield FragmentsFunc) error {
 			return s.extractorFragments(ctx, extractor, s.Content, yield)
 		}
 		if decompressor, ok := format.(archives.Decompressor); ok {
-			return s.decompressorFragments(decompressor, s.Content, yield)
+			return s.decompressorFragments(ctx, decompressor, s.Content, yield)
 		}
 		logging.Warn().Str("path", s.FullPath()).Msg("skipping unknown archive type")
 	}
 
-	return s.fileFragments(bufio.NewReader(s.Content), yield)
+	return s.fileFragments(ctx, bufio.NewReader(s.Content), yield)
 }
 
 // extractorFragments recursively crawls archives and yields fragments
@@ -139,14 +139,14 @@ func (s *File) extractorFragments(ctx context.Context, extractor archives.Extrac
 }
 
 // decompressorFragments recursively crawls archives and yields fragments
-func (s *File) decompressorFragments(decompressor archives.Decompressor, reader io.Reader, yield FragmentsFunc) error {
+func (s *File) decompressorFragments(ctx context.Context, decompressor archives.Decompressor, reader io.Reader, yield FragmentsFunc) error {
 	innerReader, err := decompressor.OpenReader(reader)
 	if err != nil {
 		logging.Error().Str("path", s.FullPath()).Msg("could read compressed file")
 		return nil
 	}
 
-	if err := s.fileFragments(bufio.NewReader(innerReader), yield); err != nil {
+	if err := s.fileFragments(ctx, bufio.NewReader(innerReader), yield); err != nil {
 		_ = innerReader.Close()
 		return err
 	}
@@ -156,7 +156,7 @@ func (s *File) decompressorFragments(decompressor archives.Decompressor, reader
 }
 
 // fileFragments reads the file into fragments to yield
-func (s *File) fileFragments(reader *bufio.Reader, yield FragmentsFunc) error {
+func (s *File) fileFragments(ctx context.Context, reader *bufio.Reader, yield FragmentsFunc) error {
 	// Create a buffer if the caller hasn't provided one
 	if s.Buffer == nil {
 		s.Buffer = make([]byte, defaultBufferSize)
@@ -164,75 +164,80 @@ func (s *File) fileFragments(reader *bufio.Reader, yield FragmentsFunc) error {
 
 	totalLines := 0
 	for {
-		fragment := Fragment{
-			FilePath: s.FullPath(),
-		}
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		default:
+			fragment := Fragment{
+				FilePath: s.FullPath(),
+			}
 
-		n, err := reader.Read(s.Buffer)
-		if n == 0 {
-			if err != nil && err != io.EOF {
-				return yield(fragment, fmt.Errorf("could not read file: %w", err))
+			n, err := reader.Read(s.Buffer)
+			if n == 0 {
+				if err != nil && err != io.EOF {
+					return yield(fragment, fmt.Errorf("could not read file: %w", err))
+				}
+
+				return nil
 			}
 
-			return nil
-		}
+			// Only check the filetype at the start of file.
+			if totalLines == 0 {
+				// TODO: could other optimizations be introduced here?
+				if mimetype, err := filetype.Match(s.Buffer[:n]); err != nil {
+					return yield(
+						fragment,
+						fmt.Errorf("could not read file: could not determine type: %w", err),
+					)
+				} else if mimetype.MIME.Type == "application" {
+					logging.Debug().
+						Str("mime_type", mimetype.MIME.Value).
+						Str("path", s.FullPath()).
+						Msgf("skipping binary file")
+
+					return nil
+				}
+			}
 
-		// Only check the filetype at the start of file.
-		if totalLines == 0 {
-			// TODO: could other optimizations be introduced here?
-			if mimetype, err := filetype.Match(s.Buffer[:n]); err != nil {
+			// Try to split chunks across large areas of whitespace, if possible.
+			peekBuf := bytes.NewBuffer(s.Buffer[:n])
+			if err := readUntilSafeBoundary(reader, n, maxPeekSize, peekBuf); err != nil {
 				return yield(
 					fragment,
-					fmt.Errorf("could not read file: could not determine type: %w", err),
+					fmt.Errorf("could not read file: could not read until safe boundary: %w", err),
 				)
-			} else if mimetype.MIME.Type == "application" {
-				logging.Debug().
-					Str("mime_type", mimetype.MIME.Value).
-					Str("path", s.FullPath()).
-					Msgf("skipping binary file")
-
-				return nil
 			}
-		}
-
-		// Try to split chunks across large areas of whitespace, if possible.
-		peekBuf := bytes.NewBuffer(s.Buffer[:n])
-		if err := readUntilSafeBoundary(reader, n, maxPeekSize, peekBuf); err != nil {
-			return yield(
-				fragment,
-				fmt.Errorf("could not read file: could not read until safe boundary: %w", err),
-			)
-		}
 
-		fragment.Raw = peekBuf.String()
-		fragment.Bytes = peekBuf.Bytes()
-		fragment.StartLine = totalLines + 1
+			fragment.Raw = peekBuf.String()
+			fragment.Bytes = peekBuf.Bytes()
+			fragment.StartLine = totalLines + 1
 
-		// Count the number of newlines in this chunk
-		totalLines += strings.Count(fragment.Raw, "\n")
+			// Count the number of newlines in this chunk
+			totalLines += strings.Count(fragment.Raw, "\n")
 
-		if len(s.Symlink) > 0 {
-			fragment.SymlinkFile = s.Symlink
-		}
+			if len(s.Symlink) > 0 {
+				fragment.SymlinkFile = s.Symlink
+			}
 
-		if isWindows {
-			fragment.FilePath = filepath.ToSlash(fragment.FilePath)
-			fragment.SymlinkFile = filepath.ToSlash(s.Symlink)
-			fragment.WindowsFilePath = s.FullPath()
-		}
+			if isWindows {
+				fragment.FilePath = filepath.ToSlash(fragment.FilePath)
+				fragment.SymlinkFile = filepath.ToSlash(s.Symlink)
+				fragment.WindowsFilePath = s.FullPath()
+			}
 
-		// log errors but continue since there's content
-		if err != nil && err != io.EOF {
-			logging.Warn().Err(err).Msgf("issue reading file")
-		}
+			// log errors but continue since there's content
+			if err != nil && err != io.EOF {
+				logging.Warn().Err(err).Msgf("issue reading file")
+			}
 
-		// Done with the file!
-		if err == io.EOF {
-			return yield(fragment, nil)
-		}
+			// Done with the file!
+			if err == io.EOF {
+				return yield(fragment, nil)
+			}
 
-		if err := yield(fragment, err); err != nil {
-			return err
+			if err := yield(fragment, err); err != nil {
+				return err
+			}
 		}
 	}
 }

+ 43 - 32
sources/files.go

@@ -34,7 +34,8 @@ func DirectoryTargets(sourcePath string, s *semgroup.Group, followSymlinks bool,
 	}
 
 	s.Go(func() error {
-		err := files.scanTargets(func(scanTarget ScanTarget, err error) error {
+		ctx := context.Background()
+		err := files.scanTargets(ctx, func(scanTarget ScanTarget, err error) error {
 			paths <- scanTarget
 			return nil
 		})
@@ -56,7 +57,7 @@ type Files struct {
 }
 
 // scanTargets yields scan targets to a callback func
-func (s *Files) scanTargets(yield func(ScanTarget, error) error) error {
+func (s *Files) scanTargets(ctx context.Context, yield func(ScanTarget, error) error) error {
 	return filepath.WalkDir(s.Path, func(path string, d fs.DirEntry, err error) error {
 		scanTarget := ScanTarget{Path: path}
 		logger := logging.With().Str("path", path).Logger()
@@ -141,40 +142,50 @@ func (s *Files) scanTargets(yield func(ScanTarget, error) error) error {
 func (s *Files) Fragments(ctx context.Context, yield FragmentsFunc) error {
 	var wg sync.WaitGroup
 
-	err := s.scanTargets(func(scanTarget ScanTarget, err error) error {
-		wg.Add(1)
-		s.Sema.Go(func() error {
-			logger := logging.With().Str("path", scanTarget.Path).Logger()
-			logger.Trace().Msg("scanning path")
-
-			f, err := os.Open(scanTarget.Path)
-			if err != nil {
-				if os.IsPermission(err) {
-					logger.Warn().Msg("skipping file: permission denied")
+	err := s.scanTargets(ctx, func(scanTarget ScanTarget, err error) error {
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		default:
+			wg.Add(1)
+			s.Sema.Go(func() error {
+				logger := logging.With().Str("path", scanTarget.Path).Logger()
+				logger.Trace().Msg("scanning path")
+
+				f, err := os.Open(scanTarget.Path)
+				if err != nil {
+					if os.IsPermission(err) {
+						logger.Warn().Msg("skipping file: permission denied")
+					}
+					wg.Done()
+					return nil
 				}
-				wg.Done()
-				return nil
-			}
 
-			// Convert this to a file source
-			file := File{
-				Content:         f,
-				Path:            scanTarget.Path,
-				Symlink:         scanTarget.Symlink,
-				Config:          s.Config,
-				MaxArchiveDepth: s.MaxArchiveDepth,
-			}
+				// Convert this to a file source
+				file := File{
+					Content:         f,
+					Path:            scanTarget.Path,
+					Symlink:         scanTarget.Symlink,
+					Config:          s.Config,
+					MaxArchiveDepth: s.MaxArchiveDepth,
+				}
 
-			err = file.Fragments(ctx, yield)
-			// Avoiding a defer in a hot loop
-			_ = f.Close()
-			wg.Done()
-			return err
-		})
+				err = file.Fragments(ctx, yield)
+				// Avoiding a defer in a hot loop
+				_ = f.Close()
+				wg.Done()
+				return err
+			})
 
-		return nil
+			return nil
+		}
 	})
 
-	wg.Wait()
-	return err
+	select {
+	case <-ctx.Done():
+		return ctx.Err()
+	default:
+		wg.Wait()
+		return err
+	}
 }

+ 50 - 12
sources/git.go

@@ -63,6 +63,12 @@ func (br *blobReader) Close() error {
 // Caller should read everything from the channels until receiving a signal about their closure,
 // then call `func (*GitCmd) Wait()` and check its error in order to release resources.
 func NewGitLogCmd(source string, logOpts string) (*GitCmd, error) {
+	return NewGitLogCmdContext(context.Background(), source, logOpts)
+}
+
+// NewGitLogCmdContext is the same as NewGitLogCmd but supports passing in a
+// context to use for timeouts and cancellation.
+func NewGitLogCmdContext(ctx context.Context, source string, logOpts string) (*GitCmd, error) {
 	sourceClean := filepath.Clean(source)
 	var cmd *exec.Cmd
 	if logOpts != "" {
@@ -82,9 +88,9 @@ func NewGitLogCmd(source string, logOpts string) (*GitCmd, error) {
 		}
 
 		args = append(args, userArgs...)
-		cmd = exec.Command("git", args...)
+		cmd = exec.CommandContext(ctx, "git", args...)
 	} else {
-		cmd = exec.Command("git", "-C", sourceClean, "log", "-p", "-U0",
+		cmd = exec.CommandContext(ctx, "git", "-C", sourceClean, "log", "-p", "-U0",
 			"--full-history", "--all", "--diff-filter=tuxdb")
 	}
 
@@ -122,11 +128,17 @@ func NewGitLogCmd(source string, logOpts string) (*GitCmd, error) {
 // Caller should read everything from the channels until receiving a signal about their closure,
 // then call `func (*GitCmd) Wait()` and check its error in order to release resources.
 func NewGitDiffCmd(source string, staged bool) (*GitCmd, error) {
+	return NewGitDiffCmdContext(context.Background(), source, staged)
+}
+
+// NewGitDiffCmdContext is the same as NewGitDiffCmd but supports passing in a
+// context to use for timeouts and cancellation.
+func NewGitDiffCmdContext(ctx context.Context, source string, staged bool) (*GitCmd, error) {
 	sourceClean := filepath.Clean(source)
 	var cmd *exec.Cmd
-	cmd = exec.Command("git", "-C", sourceClean, "diff", "-U0", "--no-ext-diff", ".")
+	cmd = exec.CommandContext(ctx, "git", "-C", sourceClean, "diff", "-U0", "--no-ext-diff", ".")
 	if staged {
-		cmd = exec.Command("git", "-C", sourceClean, "diff", "-U0", "--no-ext-diff",
+		cmd = exec.CommandContext(ctx, "git", "-C", sourceClean, "diff", "-U0", "--no-ext-diff",
 			"--staged", ".")
 	}
 	logging.Debug().Msgf("executing: %s", cmd.String())
@@ -177,13 +189,24 @@ func (c *GitCmd) Wait() error {
 	return c.cmd.Wait()
 }
 
+// String returns a human-readable description of the command used by GitCmd.
+func (c *GitCmd) String() string {
+	return c.cmd.String()
+}
+
 // NewBlobReader returns an io.ReadCloser that can be used to read a blob
 // within the git repo used to create the GitCmd.
 //
 // The caller is responsible for closing the reader.
 func (c *GitCmd) NewBlobReader(commit, path string) (io.ReadCloser, error) {
+	return c.NewBlobReaderContext(context.Background(), commit, path)
+}
+
+// NewBlobReaderContext is the same as NewBlobReader but supports passing in a
+// context to use for timeouts and cancellation.
+func (c *GitCmd) NewBlobReaderContext(ctx context.Context, commit, path string) (io.ReadCloser, error) {
 	gitArgs := []string{"-C", c.repoPath, "cat-file", "blob", commit + ":" + path}
-	cmd := exec.Command("git", gitArgs...)
+	cmd := exec.CommandContext(ctx, "git", gitArgs...)
 	cmd.Stderr = io.Discard
 	stdout, err := cmd.StdoutPipe()
 	if err != nil {
@@ -275,7 +298,9 @@ type CommitInfo struct {
 // Fragments yields fragments from a git repo
 func (s *Git) Fragments(ctx context.Context, yield FragmentsFunc) error {
 	defer func() {
-		_ = s.Cmd.Wait()
+		if err := s.Cmd.Wait(); err != nil {
+			logging.Debug().Err(err).Str("cmd", s.Cmd.String()).Msg("command aborted")
+		}
 	}()
 
 	var (
@@ -287,6 +312,8 @@ func (s *Git) Fragments(ctx context.Context, yield FragmentsFunc) error {
 	// loop to range over both DiffFiles (stdout) and ErrCh (stderr)
 	for diffFilesCh != nil || errCh != nil {
 		select {
+		case <-ctx.Done():
+			return ctx.Err()
 		case gitdiffFile, open := <-diffFilesCh:
 			if !open {
 				diffFilesCh = nil
@@ -336,7 +363,7 @@ func (s *Git) Fragments(ctx context.Context, yield FragmentsFunc) error {
 				defer wg.Done()
 
 				if yieldAsArchive {
-					blob, err := s.Cmd.NewBlobReader(commitSHA, gitdiffFile.NewName)
+					blob, err := s.Cmd.NewBlobReaderContext(ctx, commitSHA, gitdiffFile.NewName)
 					if err != nil {
 						logging.Error().Err(err).Msg("could not read archive blob")
 						return nil
@@ -394,17 +421,28 @@ func (s *Git) Fragments(ctx context.Context, yield FragmentsFunc) error {
 		}
 	}
 
-	wg.Wait()
-	return nil
+	select {
+	case <-ctx.Done():
+		return ctx.Err()
+	default:
+		wg.Wait()
+		return nil
+	}
 }
 
 // NewRemoteInfo builds a new RemoteInfo for generating finding links
 func NewRemoteInfo(platform scm.Platform, source string) *RemoteInfo {
+	return NewRemoteInfoContext(context.Background(), platform, source)
+}
+
+// NewRemoteInfoContext is the same as NewRemoteInfo but supports passing in a
+// context to use for timeouts and cancellation.
+func NewRemoteInfoContext(ctx context.Context, platform scm.Platform, source string) *RemoteInfo {
 	if platform == scm.NoPlatform {
 		return &RemoteInfo{Platform: platform}
 	}
 
-	remoteUrl, err := getRemoteUrl(source)
+	remoteUrl, err := getRemoteUrl(ctx, source)
 	if err != nil {
 		if strings.Contains(err.Error(), "No remote configured") {
 			logging.Debug().Msg("skipping finding links: repository has no configured remote.")
@@ -442,9 +480,9 @@ End:
 
 var sshUrlpat = regexp.MustCompile(`^git@([a-zA-Z0-9.-]+):(?:\d{1,5}/)?([\w/.-]+?)(?:\.git)?$`)
 
-func getRemoteUrl(source string) (*url.URL, error) {
+func getRemoteUrl(ctx context.Context, source string) (*url.URL, error) {
 	// This will return the first remote — typically, "origin".
-	cmd := exec.Command("git", "ls-remote", "--quiet", "--get-url")
+	cmd := exec.CommandContext(ctx, "git", "ls-remote", "--quiet", "--get-url")
 	if source != "." {
 		cmd.Dir = source
 	}