package detect

import (
	"bufio"
	"bytes"
	"io"
	"os"
	"path/filepath"
	"strings"

	"github.com/h2non/filetype"

	"github.com/zricethezav/gitleaks/v8/logging"
	"github.com/zricethezav/gitleaks/v8/report"
	"github.com/zricethezav/gitleaks/v8/sources"
)

const maxPeekSize = 25 * 1_000 // 25kb

// DetectFiles schedules each ScanTarget (file or archive) for concurrent scanning.
func (d *Detector) DetectFiles(paths <-chan sources.ScanTarget) ([]report.Finding, error) {
	for pa := range paths {
		pa := pa // capture the loop variable for the goroutine
		d.Sema.Go(func() error {
			return d.detectScanTarget(pa)
		})
	}
	if err := d.Sema.Wait(); err != nil {
		return d.findings, err
	}
	return d.findings, nil
}

// detectScanTarget handles one ScanTarget: it unpacks archives recursively
// or scans a regular file, always using VirtualPath for reporting.
func (d *Detector) detectScanTarget(scanTarget sources.ScanTarget) error {
	// Choose the display path: either VirtualPath (archive chain) or the on-disk path.
	display := scanTarget.Path
	if scanTarget.VirtualPath != "" {
		display = scanTarget.VirtualPath
	}
	logger := logging.With().Str("path", display).Logger()
	logger.Trace().Msg("Scanning path")

	if isArchive(scanTarget.Path) {
		logger.Debug().Msg("Found archive")
		targets, tmpArchiveDir, err := extractArchive(scanTarget.Path)
		if err != nil {
			logger.Warn().Err(err).Msg("Failed to extract archive")
			return nil
		}

		// Schedule each extracted file for its own scan, carrying forward VirtualPath.
		for _, t := range targets {
			t := t

			// Compute the path INSIDE this archive.
			rel, rerr := filepath.Rel(tmpArchiveDir, t.Path)
			if rerr != nil {
				rel = filepath.Base(t.Path)
			}
			rel = filepath.ToSlash(rel)

			// Prepend the existing chain, or the archive's base name.
			if scanTarget.VirtualPath != "" {
				t.VirtualPath = scanTarget.VirtualPath + "/" + rel
			} else {
				t.VirtualPath = filepath.Base(scanTarget.Path) + "/" + rel
			}

			d.Sema.Go(func() error {
				return d.detectScanTarget(t)
			})
		}
		return nil
	}

	// --- Regular file branch ---
	f, err := os.Open(scanTarget.Path)
	if err != nil {
		if os.IsPermission(err) {
			logger.Warn().Msg("Skipping file: permission denied")
			return nil
		}
		return err
	}
	defer f.Close()

	// Get the file size.
	fileInfo, err := f.Stat()
	if err != nil {
		return err
	}
	fileSize := fileInfo.Size()
	if d.MaxTargetMegaBytes > 0 {
		rawLength := fileSize / 1000000
		if rawLength > int64(d.MaxTargetMegaBytes) {
			logger.Debug().
				Int64("size", rawLength).
				Msg("Skipping file: exceeds --max-target-megabytes")
			return nil
		}
	}

	// Skip binary files by sniffing the header (261 bytes is enough for filetype).
	head := make([]byte, 261)
	if n, _ := io.ReadFull(f, head); n > 0 {
		if kind, _ := filetype.Match(head[:n]); kind != filetype.Unknown {
			logger.Debug().Str("kind", kind.Extension).Msg("Skipping binary")
			return nil
		}
	}
	if _, err := f.Seek(0, io.SeekStart); err != nil {
		return err
	}

	reader := bufio.NewReader(f)
	buf := make([]byte, chunkSize)
	totalLines := 0
	for {
		n, err := reader.Read(buf)
		if n > 0 {
			// Only check the filetype at the start of the file.
			if totalLines == 0 {
				// TODO: could other optimizations be introduced here?
				if mimetype, err := filetype.Match(buf[:n]); err != nil {
					return nil
				} else if mimetype.MIME.Type == "application" {
					return nil // skip binary files
				}
			}

			peekBuf := bytes.NewBuffer(buf[:n])
			if readErr := readUntilSafeBoundary(reader, n, maxPeekSize, peekBuf); readErr != nil {
				return readErr
			}

			chunk := peekBuf.String()
			linesInChunk := strings.Count(chunk, "\n")

			// Build the fragment and set FilePath to our display chain.
			fragment := Fragment{
				Raw:   chunk,
				Bytes: peekBuf.Bytes(),
			}
			fragment.FilePath = display

			// If this file was itself a symlink.
			if scanTarget.Symlink != "" {
				fragment.SymlinkFile = scanTarget.Symlink
			}
			if isWindows {
				fragment.FilePath = filepath.ToSlash(scanTarget.Path)
				fragment.SymlinkFile = filepath.ToSlash(fragment.SymlinkFile)
				fragment.WindowsFilePath = scanTarget.Path
			}

			// Run detection and adjust line numbers.
			for _, finding := range d.Detect(fragment) {
				finding.StartLine += totalLines + 1
				finding.EndLine += totalLines + 1

				// We have to augment the finding if the source is coming
				// from an archive committed in Git.
				if scanTarget.Source == "github-archive" {
					finding.Author = scanTarget.GitInfo.Author
					finding.Commit = scanTarget.GitInfo.Commit
					finding.Email = scanTarget.GitInfo.Email
					finding.Date = scanTarget.GitInfo.Date
					finding.Message = scanTarget.GitInfo.Message
				}

				d.AddFinding(finding)
			}
			totalLines += linesInChunk
		}

		if err != nil {
			if err == io.EOF {
				return nil
			}
			return err
		}
	}
}

// readUntilSafeBoundary consumes |r| until it finds two consecutive `\n` characters, up to |maxPeekSize|.
// This hopefully avoids splitting a multi-line secret across chunk boundaries.
// (https://github.com/gitleaks/gitleaks/issues/1651)
func readUntilSafeBoundary(r *bufio.Reader, n int, maxPeekSize int, peekBuf *bytes.Buffer) error {
	if peekBuf.Len() == 0 {
		return nil
	}

	// Does the buffer already end in consecutive newlines?
	var (
		data         = peekBuf.Bytes()
		lastChar     = data[len(data)-1]
		newlineCount = 0 // Tracks consecutive newlines
	)
	if isWhitespace(lastChar) {
		for i := len(data) - 1; i >= 0; i-- {
			lastChar = data[i]
			if lastChar == '\n' {
				newlineCount++

				// Stop if two consecutive newlines are found.
				if newlineCount >= 2 {
					return nil
				}
			} else if lastChar == '\r' || lastChar == ' ' || lastChar == '\t' {
				// The presence of other whitespace characters (`\r`, ` `, `\t`) shouldn't reset the count.
				// (Intentionally do nothing.)
			} else {
				break
			}
		}
	}

	// If not, read ahead until we (hopefully) find some.
	newlineCount = 0
	for {
		data = peekBuf.Bytes()
		// Check if the last character is a newline.
		lastChar = data[len(data)-1]
		if lastChar == '\n' {
			newlineCount++

			// Stop if two consecutive newlines are found.
			if newlineCount >= 2 {
				break
			}
		} else if lastChar == '\r' || lastChar == ' ' || lastChar == '\t' {
			// The presence of other whitespace characters (`\r`, ` `, `\t`) shouldn't reset the count.
			// (Intentionally do nothing.)
		} else {
			newlineCount = 0 // Reset if a non-newline character is found.
		}

		// Stop growing the buffer once it has grown by maxPeekSize bytes.
		if (peekBuf.Len() - n) >= maxPeekSize {
			break
		}

		// Read one more byte into the peek buffer.
		b, err := r.ReadByte()
		if err != nil {
			if err == io.EOF {
				break
			}
			return err
		}
		peekBuf.WriteByte(b)
	}
	return nil
}
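
// readUntilSafeBoundaryExample is an illustrative sketch, not part of the
// upstream gitleaks API: it shows the intent of readUntilSafeBoundary. When a
// fixed-size chunk ends mid-line, the peek buffer grows byte by byte until a
// blank line ("\n\n") is reached or maxPeekSize extra bytes have been read,
// so a multi-line block is less likely to be split across fragments.
func readUntilSafeBoundaryExample() (string, error) {
	// Pretend the first chunk ended in the middle of a line.
	const chunk = "token = example\npartial li"
	// The rest of the file, as the buffered reader would see it.
	rest := "ne continues\nsecret = hunter2\n\nnext paragraph\n"

	r := bufio.NewReader(strings.NewReader(rest))
	peekBuf := bytes.NewBufferString(chunk)
	if err := readUntilSafeBoundary(r, len(chunk), maxPeekSize, peekBuf); err != nil {
		return "", err
	}
	// peekBuf now ends at the blank line after "secret = hunter2", so that
	// whole logical block lands in a single Fragment; "next paragraph" stays
	// in the reader for the next chunk.
	return peekBuf.String(), nil
}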