| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147 |
- package detect
- import (
- "bytes"
- "io"
- "os"
- "strings"
- "github.com/h2non/filetype"
- "github.com/rs/zerolog/log"
- "github.com/zricethezav/gitleaks/v8/report"
- "github.com/zricethezav/gitleaks/v8/sources"
- )
// maxPeekSize caps how many extra bytes DetectFiles may read past the end of
// a chunk while looking for a run of line breaks to end the chunk on.
const maxPeekSize = 25 * 1_000 // 25 kB (decimal)
- func (d *Detector) DetectFiles(paths <-chan sources.ScanTarget) ([]report.Finding, error) {
- for pa := range paths {
- d.Sema.Go(func() error {
- logger := log.With().Str("path", pa.Path).Logger()
- logger.Trace().Msg("Scanning path")
- f, err := os.Open(pa.Path)
- if err != nil {
- if os.IsPermission(err) {
- logger.Warn().Msg("Skipping file: permission denied")
- return nil
- }
- return err
- }
- defer func() {
- _ = f.Close()
- }()
- // Get file size
- fileInfo, err := f.Stat()
- if err != nil {
- return err
- }
- fileSize := fileInfo.Size()
- if d.MaxTargetMegaBytes > 0 {
- rawLength := fileSize / 1000000
- if rawLength > int64(d.MaxTargetMegaBytes) {
- logger.Debug().
- Int64("size", rawLength).
- Msg("Skipping file: exceeds --max-target-megabytes")
- return nil
- }
- }
- // Buffer to hold file chunks
- buf := make([]byte, chunkSize)
- totalLines := 0
- for {
- n, err := f.Read(buf)
- if n > 0 {
- // TODO: optimization could be introduced here
- if mimetype, err := filetype.Match(buf[:n]); err != nil {
- return nil
- } else if mimetype.MIME.Type == "application" {
- return nil // skip binary files
- }
- // If the chunk doesn't end in a newline, peek |maxPeekSize| until we find one.
- // This hopefully avoids splitting
- // See: https://github.com/gitleaks/gitleaks/issues/1651
- var (
- peekBuf = bytes.NewBuffer(buf[:n])
- tempBuf = make([]byte, 1)
- newlineCount = 0 // Tracks consecutive newlines
- )
- for {
- data := peekBuf.Bytes()
- if len(data) == 0 {
- break
- }
- // Check if the last character is a newline.
- lastChar := data[len(data)-1]
- if lastChar == '\n' || lastChar == '\r' {
- newlineCount++
- // Stop if two consecutive newlines are found
- if newlineCount >= 2 {
- break
- }
- } else {
- newlineCount = 0 // Reset if a non-newline character is found
- }
- // Stop growing the buffer if it reaches maxSize
- if (peekBuf.Len() - n) >= maxPeekSize {
- break
- }
- // Read additional data into a temporary buffer
- m, readErr := f.Read(tempBuf)
- if m > 0 {
- peekBuf.Write(tempBuf[:m])
- }
- // Stop if EOF is reached
- if readErr != nil {
- if readErr == io.EOF {
- break
- }
- return readErr
- }
- }
- // Count the number of newlines in this chunk
- chunk := peekBuf.String()
- linesInChunk := strings.Count(chunk, "\n")
- totalLines += linesInChunk
- fragment := Fragment{
- Raw: chunk,
- Bytes: peekBuf.Bytes(),
- FilePath: pa.Path,
- }
- if pa.Symlink != "" {
- fragment.SymlinkFile = pa.Symlink
- }
- for _, finding := range d.Detect(fragment) {
- // need to add 1 since line counting starts at 1
- finding.StartLine += (totalLines - linesInChunk) + 1
- finding.EndLine += (totalLines - linesInChunk) + 1
- d.addFinding(finding)
- }
- }
- if err != nil {
- if err == io.EOF {
- return nil
- }
- return err
- }
- }
- })
- }
- if err := d.Sema.Wait(); err != nil {
- return d.findings, err
- }
- return d.findings, nil
- }
|