| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257 |
- package detect
- import (
- "bufio"
- "bytes"
- "io"
- "os"
- "path/filepath"
- "strings"
- "github.com/h2non/filetype"
- "github.com/zricethezav/gitleaks/v8/logging"
- "github.com/zricethezav/gitleaks/v8/report"
- "github.com/zricethezav/gitleaks/v8/sources"
- )
// maxPeekSize is the read-ahead cap, in bytes, passed to
// readUntilSafeBoundary when a chunk is extended to a safer split point.
// 25 * 1_000 is 25,000 bytes (25 kB, decimal).
const maxPeekSize = 25 * 1_000
- // DetectFiles schedules each ScanTarget—file or archive—for concurrent scanning.
- func (d *Detector) DetectFiles(paths <-chan sources.ScanTarget) ([]report.Finding, error) {
- for pa := range paths {
- d.Sema.Go(func() error {
- return d.detectScanTarget(pa)
- })
- }
- if err := d.Sema.Wait(); err != nil {
- return d.findings, err
- }
- return d.findings, nil
- }
- // detectScanTarget handles one ScanTarget: it unpacks archives recursively
- // or scans a regular file, always using VirtualPath for reporting.
- func (d *Detector) detectScanTarget(scanTarget sources.ScanTarget) error {
- // Choose display path: either VirtualPath (archive chain) or on-disk path.
- display := scanTarget.Path
- if scanTarget.VirtualPath != "" {
- display = scanTarget.VirtualPath
- }
- logger := logging.With().Str("path", display).Logger()
- logger.Trace().Msg("Scanning path")
- if isArchive(scanTarget.Path) {
- logger.Debug().Msg("Found archive")
- targets, tmpArchiveDir, err := extractArchive(scanTarget.Path)
- if err != nil {
- logger.Warn().Err(err).Msg("Failed to extract archive")
- return nil
- }
- // Schedule each extracted file for its own scan, carrying forward VirtualPath.
- for _, t := range targets {
- t := t
- // compute path INSIDE this archive
- rel, rerr := filepath.Rel(tmpArchiveDir, t.Path)
- if rerr != nil {
- rel = filepath.Base(t.Path)
- }
- rel = filepath.ToSlash(rel)
- // prepend existing chain or archive base name
- if scanTarget.VirtualPath != "" {
- t.VirtualPath = scanTarget.VirtualPath + "/" + rel
- } else {
- t.VirtualPath = filepath.Base(scanTarget.Path) + "/" + rel
- }
- d.Sema.Go(func() error {
- return d.detectScanTarget(t)
- })
- }
- return nil
- }
- // --- Regular file branch ---
- f, err := os.Open(scanTarget.Path)
- if err != nil {
- if os.IsPermission(err) {
- logger.Warn().Msg("Skipping file: permission denied")
- return nil
- }
- return err
- }
- defer f.Close()
- // Get file size
- fileInfo, err := f.Stat()
- if err != nil {
- return err
- }
- fileSize := fileInfo.Size()
- if d.MaxTargetMegaBytes > 0 {
- rawLength := fileSize / 1000000
- if rawLength > int64(d.MaxTargetMegaBytes) {
- logger.Debug().
- Int64("size", rawLength).
- Msg("Skipping file: exceeds --max-target-megabytes")
- return nil
- }
- }
- // Skip binary files by sniffing header
- head := make([]byte, 261)
- if n, _ := io.ReadFull(f, head); n > 0 {
- if kind, _ := filetype.Match(head[:n]); kind != filetype.Unknown {
- logger.Debug().Str("kind", kind.Extension).Msg("Skipping binary")
- return nil
- }
- }
- if _, err := f.Seek(0, io.SeekStart); err != nil {
- return err
- }
- reader := bufio.NewReader(f)
- buf := make([]byte, chunkSize)
- totalLines := 0
- for {
- n, err := reader.Read(buf)
- if n > 0 {
- // Only check the filetype at the start of file.
- if totalLines == 0 {
- // TODO: could other optimizations be introduced here?
- if mimetype, err := filetype.Match(buf[:n]); err != nil {
- return nil
- } else if mimetype.MIME.Type == "application" {
- return nil // skip binary files
- }
- }
- peekBuf := bytes.NewBuffer(buf[:n])
- if readErr := readUntilSafeBoundary(reader, n, maxPeekSize, peekBuf); readErr != nil {
- return readErr
- }
- chunk := peekBuf.String()
- linesInChunk := strings.Count(chunk, "\n")
- // build fragment and set FilePath to our display chain
- fragment := Fragment{
- Raw: chunk,
- Bytes: peekBuf.Bytes(),
- }
- fragment.FilePath = display
- // if this file was itself a symlink
- if scanTarget.Symlink != "" {
- fragment.SymlinkFile = scanTarget.Symlink
- }
- if isWindows {
- fragment.FilePath = filepath.ToSlash(scanTarget.Path)
- fragment.SymlinkFile = filepath.ToSlash(fragment.SymlinkFile)
- fragment.WindowsFilePath = scanTarget.Path
- }
- // run detection and adjust line numbers
- for _, finding := range d.Detect(fragment) {
- finding.StartLine += totalLines + 1
- finding.EndLine += totalLines + 1
- // We have to augment the finding if the source is coming
- // from a archive committed in Git
- if scanTarget.Source == "github-archive" {
- finding.Author = scanTarget.GitInfo.Author
- finding.Commit = scanTarget.GitInfo.Commit
- finding.Email = scanTarget.GitInfo.Email
- finding.Date = scanTarget.GitInfo.Date
- finding.Message = scanTarget.GitInfo.Message
- }
- d.AddFinding(finding)
- }
- totalLines += linesInChunk
- }
- if err != nil {
- if err == io.EOF {
- return nil
- }
- return err
- }
- }
- }
- // readUntilSafeBoundary consumes |f| until it finds two consecutive `\n` characters, up to |maxPeekSize|.
- // This hopefully avoids splitting. (https://github.com/gitleaks/gitleaks/issues/1651)
- func readUntilSafeBoundary(r *bufio.Reader, n int, maxPeekSize int, peekBuf *bytes.Buffer) error {
- if peekBuf.Len() == 0 {
- return nil
- }
- // Does the buffer end in consecutive newlines?
- var (
- data = peekBuf.Bytes()
- lastChar = data[len(data)-1]
- newlineCount = 0 // Tracks consecutive newlines
- )
- if isWhitespace(lastChar) {
- for i := len(data) - 1; i >= 0; i-- {
- lastChar = data[i]
- if lastChar == '\n' {
- newlineCount++
- // Stop if two consecutive newlines are found
- if newlineCount >= 2 {
- return nil
- }
- } else if lastChar == '\r' || lastChar == ' ' || lastChar == '\t' {
- // The presence of other whitespace characters (`\r`, ` `, `\t`) shouldn't reset the count.
- // (Intentionally do nothing.)
- } else {
- break
- }
- }
- }
- // If not, read ahead until we (hopefully) find some.
- newlineCount = 0
- for {
- data = peekBuf.Bytes()
- // Check if the last character is a newline.
- lastChar = data[len(data)-1]
- if lastChar == '\n' {
- newlineCount++
- // Stop if two consecutive newlines are found
- if newlineCount >= 2 {
- break
- }
- } else if lastChar == '\r' || lastChar == ' ' || lastChar == '\t' {
- // The presence of other whitespace characters (`\r`, ` `, `\t`) shouldn't reset the count.
- // (Intentionally do nothing.)
- } else {
- newlineCount = 0 // Reset if a non-newline character is found
- }
- // Stop growing the buffer if it reaches maxSize
- if (peekBuf.Len() - n) >= maxPeekSize {
- break
- }
- // Read additional data into a temporary buffer
- b, err := r.ReadByte()
- if err != nil {
- if err == io.EOF {
- break
- }
- return err
- }
- peekBuf.WriteByte(b)
- }
- return nil
- }
|