Richard Gomez 1 год назад
Родитель
Commit
0bf13fc25a
1 измененных файлов с 79 добавлено и 29 удалено
  1. 79 29
      detect/directory.go

+ 79 - 29
detect/directory.go

@@ -1,16 +1,20 @@
 package detect
 
 import (
+	"bytes"
 	"io"
 	"os"
 	"strings"
 
 	"github.com/h2non/filetype"
 	"github.com/rs/zerolog/log"
+
 	"github.com/zricethezav/gitleaks/v8/report"
 	"github.com/zricethezav/gitleaks/v8/sources"
 )
 
+const maxPeekSize = 25 * 1_000 // 25kb
+
 func (d *Detector) DetectFiles(paths <-chan sources.ScanTarget) ([]report.Finding, error) {
 	for pa := range paths {
 		d.Sema.Go(func() error {
@@ -50,41 +54,87 @@ func (d *Detector) DetectFiles(paths <-chan sources.ScanTarget) ([]report.Findin
 			totalLines := 0
 			for {
 				n, err := f.Read(buf)
-				if err != nil && err != io.EOF {
-					return err
-				}
-				if n == 0 {
-					break
+				if n > 0 {
+					// TODO: optimization could be introduced here
+					if mimetype, err := filetype.Match(buf[:n]); err != nil {
+						return nil
+					} else if mimetype.MIME.Type == "application" {
+						return nil // skip binary files
+					}
+
+					// If the chunk doesn't end in a newline, peek up to |maxPeekSize|
+					// additional bytes until we find one. This hopefully avoids splitting
+					// a line (and any secret within it) across two chunks.
+					// See: https://github.com/gitleaks/gitleaks/issues/1651
+					var (
+						peekBuf      = bytes.NewBuffer(buf[:n])
+						tempBuf      = make([]byte, 1)
+						newlineCount = 0 // Tracks consecutive newlines
+					)
+					for {
+						data := peekBuf.Bytes()
+						if len(data) == 0 {
+							break
+						}
+
+						// Check if the last character is a newline.
+						lastChar := data[len(data)-1]
+						if lastChar == '\n' || lastChar == '\r' {
+							newlineCount++
+
+							// Stop if two consecutive newlines are found
+							if newlineCount >= 2 {
+								break
+							}
+						} else {
+							newlineCount = 0 // Reset if a non-newline character is found
+						}
+
+						// Stop growing the buffer once it holds maxPeekSize bytes beyond the original chunk
+						if (peekBuf.Len() - n) >= maxPeekSize {
+							break
+						}
+
+						// Read additional data into a temporary buffer
+						m, readErr := f.Read(tempBuf)
+						if m > 0 {
+							peekBuf.Write(tempBuf[:m])
+						}
+
+						// Stop if EOF is reached
+						if readErr != nil {
+							if readErr == io.EOF {
+								break
+							}
+							return readErr
+						}
+					}
+
+					// Count the number of newlines in this chunk
+					chunk := string(peekBuf.Bytes())
+					linesInChunk := strings.Count(chunk, "\n")
+					totalLines += linesInChunk
+					fragment := Fragment{
+						Raw:      chunk,
+						FilePath: pa.Path,
+					}
+					if pa.Symlink != "" {
+						fragment.SymlinkFile = pa.Symlink
+					}
+					for _, finding := range d.Detect(fragment) {
+						// need to add 1 since line counting starts at 1
+						finding.StartLine += (totalLines - linesInChunk) + 1
+						finding.EndLine += (totalLines - linesInChunk) + 1
+						d.addFinding(finding)
+					}
 				}
 
-				// TODO: optimization could be introduced here
-				mimetype, err := filetype.Match(buf[:n])
 				if err != nil {
+					if err == io.EOF {
+						return nil
+					}
 					return err
 				}
-				if mimetype.MIME.Type == "application" {
-					return nil // skip binary files
-				}
-
-				// Count the number of newlines in this chunk
-				linesInChunk := strings.Count(string(buf[:n]), "\n")
-				totalLines += linesInChunk
-				fragment := Fragment{
-					Raw:      string(buf[:n]),
-					FilePath: pa.Path,
-				}
-				if pa.Symlink != "" {
-					fragment.SymlinkFile = pa.Symlink
-				}
-				for _, finding := range d.Detect(fragment) {
-					// need to add 1 since line counting starts at 1
-					finding.StartLine += (totalLines - linesInChunk) + 1
-					finding.EndLine += (totalLines - linesInChunk) + 1
-					d.addFinding(finding)
-				}
 			}
-
-			return nil
 		})
 	}