|
|
@@ -1,16 +1,20 @@
|
|
|
package detect
|
|
|
|
|
|
import (
|
|
|
+ "bytes"
|
|
|
"io"
|
|
|
"os"
|
|
|
"strings"
|
|
|
|
|
|
"github.com/h2non/filetype"
|
|
|
"github.com/rs/zerolog/log"
|
|
|
+
|
|
|
"github.com/zricethezav/gitleaks/v8/report"
|
|
|
"github.com/zricethezav/gitleaks/v8/sources"
|
|
|
)
|
|
|
|
|
|
+const maxPeekSize = 25 * 1_000 // 10kb
|
|
|
+
|
|
|
func (d *Detector) DetectFiles(paths <-chan sources.ScanTarget) ([]report.Finding, error) {
|
|
|
for pa := range paths {
|
|
|
d.Sema.Go(func() error {
|
|
|
@@ -50,41 +54,87 @@ func (d *Detector) DetectFiles(paths <-chan sources.ScanTarget) ([]report.Findin
|
|
|
totalLines := 0
|
|
|
for {
|
|
|
n, err := f.Read(buf)
|
|
|
- if err != nil && err != io.EOF {
|
|
|
- return err
|
|
|
- }
|
|
|
- if n == 0 {
|
|
|
- break
|
|
|
+ if n > 0 {
|
|
|
+ // TODO: optimization could be introduced here
|
|
|
+ if mimetype, err := filetype.Match(buf[:n]); err != nil {
|
|
|
+ return nil
|
|
|
+ } else if mimetype.MIME.Type == "application" {
|
|
|
+ return nil // skip binary files
|
|
|
+ }
|
|
|
+
|
|
|
+ // If the chunk doesn't end in a newline, peek |maxPeekSize| until we find one.
|
|
|
+ // This hopefully avoids splitting
|
|
|
+ // See: https://github.com/gitleaks/gitleaks/issues/1651
|
|
|
+ var (
|
|
|
+ peekBuf = bytes.NewBuffer(buf[:n])
|
|
|
+ tempBuf = make([]byte, 1)
|
|
|
+ newlineCount = 0 // Tracks consecutive newlines
|
|
|
+ )
|
|
|
+ for {
|
|
|
+ data := peekBuf.Bytes()
|
|
|
+ if len(data) == 0 {
|
|
|
+ break
|
|
|
+ }
|
|
|
+
|
|
|
+ // Check if the last character is a newline.
|
|
|
+ lastChar := data[len(data)-1]
|
|
|
+ if lastChar == '\n' || lastChar == '\r' {
|
|
|
+ newlineCount++
|
|
|
+
|
|
|
+ // Stop if two consecutive newlines are found
|
|
|
+ if newlineCount >= 2 {
|
|
|
+ break
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ newlineCount = 0 // Reset if a non-newline character is found
|
|
|
+ }
|
|
|
+
|
|
|
+ // Stop growing the buffer if it reaches maxSize
|
|
|
+ if (peekBuf.Len() - n) >= maxPeekSize {
|
|
|
+ break
|
|
|
+ }
|
|
|
+
|
|
|
+ // Read additional data into a temporary buffer
|
|
|
+ m, readErr := f.Read(tempBuf)
|
|
|
+ if m > 0 {
|
|
|
+ peekBuf.Write(tempBuf[:m])
|
|
|
+ }
|
|
|
+
|
|
|
+ // Stop if EOF is reached
|
|
|
+ if readErr != nil {
|
|
|
+ if readErr == io.EOF {
|
|
|
+ break
|
|
|
+ }
|
|
|
+ return readErr
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // Count the number of newlines in this chunk
|
|
|
+ chunk := string(peekBuf.Bytes())
|
|
|
+ linesInChunk := strings.Count(chunk, "\n")
|
|
|
+ totalLines += linesInChunk
|
|
|
+ fragment := Fragment{
|
|
|
+ Raw: chunk,
|
|
|
+ FilePath: pa.Path,
|
|
|
+ }
|
|
|
+ if pa.Symlink != "" {
|
|
|
+ fragment.SymlinkFile = pa.Symlink
|
|
|
+ }
|
|
|
+ for _, finding := range d.Detect(fragment) {
|
|
|
+ // need to add 1 since line counting starts at 1
|
|
|
+ finding.StartLine += (totalLines - linesInChunk) + 1
|
|
|
+ finding.EndLine += (totalLines - linesInChunk) + 1
|
|
|
+ d.addFinding(finding)
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
- // TODO: optimization could be introduced here
|
|
|
- mimetype, err := filetype.Match(buf[:n])
|
|
|
if err != nil {
|
|
|
+ if err == io.EOF {
|
|
|
+ return nil
|
|
|
+ }
|
|
|
return err
|
|
|
}
|
|
|
- if mimetype.MIME.Type == "application" {
|
|
|
- return nil // skip binary files
|
|
|
- }
|
|
|
-
|
|
|
- // Count the number of newlines in this chunk
|
|
|
- linesInChunk := strings.Count(string(buf[:n]), "\n")
|
|
|
- totalLines += linesInChunk
|
|
|
- fragment := Fragment{
|
|
|
- Raw: string(buf[:n]),
|
|
|
- FilePath: pa.Path,
|
|
|
- }
|
|
|
- if pa.Symlink != "" {
|
|
|
- fragment.SymlinkFile = pa.Symlink
|
|
|
- }
|
|
|
- for _, finding := range d.Detect(fragment) {
|
|
|
- // need to add 1 since line counting starts at 1
|
|
|
- finding.StartLine += (totalLines - linesInChunk) + 1
|
|
|
- finding.EndLine += (totalLines - linesInChunk) + 1
|
|
|
- d.addFinding(finding)
|
|
|
- }
|
|
|
}
|
|
|
-
|
|
|
- return nil
|
|
|
})
|
|
|
}
|
|
|
|