directory.go 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147
  1. package detect
  2. import (
  3. "bytes"
  4. "io"
  5. "os"
  6. "strings"
  7. "github.com/h2non/filetype"
  8. "github.com/rs/zerolog/log"
  9. "github.com/zricethezav/gitleaks/v8/report"
  10. "github.com/zricethezav/gitleaks/v8/sources"
  11. )
// maxPeekSize is the maximum number of extra bytes DetectFiles reads past the
// end of a chunk while looking for a newline boundary to end the fragment on.
const maxPeekSize = 25 * 1_000 // 25 kB (25,000 bytes)
  13. func (d *Detector) DetectFiles(paths <-chan sources.ScanTarget) ([]report.Finding, error) {
  14. for pa := range paths {
  15. d.Sema.Go(func() error {
  16. logger := log.With().Str("path", pa.Path).Logger()
  17. logger.Trace().Msg("Scanning path")
  18. f, err := os.Open(pa.Path)
  19. if err != nil {
  20. if os.IsPermission(err) {
  21. logger.Warn().Msg("Skipping file: permission denied")
  22. return nil
  23. }
  24. return err
  25. }
  26. defer func() {
  27. _ = f.Close()
  28. }()
  29. // Get file size
  30. fileInfo, err := f.Stat()
  31. if err != nil {
  32. return err
  33. }
  34. fileSize := fileInfo.Size()
  35. if d.MaxTargetMegaBytes > 0 {
  36. rawLength := fileSize / 1000000
  37. if rawLength > int64(d.MaxTargetMegaBytes) {
  38. logger.Debug().
  39. Int64("size", rawLength).
  40. Msg("Skipping file: exceeds --max-target-megabytes")
  41. return nil
  42. }
  43. }
  44. // Buffer to hold file chunks
  45. buf := make([]byte, chunkSize)
  46. totalLines := 0
  47. for {
  48. n, err := f.Read(buf)
  49. if n > 0 {
  50. // TODO: optimization could be introduced here
  51. if mimetype, err := filetype.Match(buf[:n]); err != nil {
  52. return nil
  53. } else if mimetype.MIME.Type == "application" {
  54. return nil // skip binary files
  55. }
  56. // If the chunk doesn't end in a newline, peek |maxPeekSize| until we find one.
  57. // This hopefully avoids splitting
  58. // See: https://github.com/gitleaks/gitleaks/issues/1651
  59. var (
  60. peekBuf = bytes.NewBuffer(buf[:n])
  61. tempBuf = make([]byte, 1)
  62. newlineCount = 0 // Tracks consecutive newlines
  63. )
  64. for {
  65. data := peekBuf.Bytes()
  66. if len(data) == 0 {
  67. break
  68. }
  69. // Check if the last character is a newline.
  70. lastChar := data[len(data)-1]
  71. if lastChar == '\n' || lastChar == '\r' {
  72. newlineCount++
  73. // Stop if two consecutive newlines are found
  74. if newlineCount >= 2 {
  75. break
  76. }
  77. } else {
  78. newlineCount = 0 // Reset if a non-newline character is found
  79. }
  80. // Stop growing the buffer if it reaches maxSize
  81. if (peekBuf.Len() - n) >= maxPeekSize {
  82. break
  83. }
  84. // Read additional data into a temporary buffer
  85. m, readErr := f.Read(tempBuf)
  86. if m > 0 {
  87. peekBuf.Write(tempBuf[:m])
  88. }
  89. // Stop if EOF is reached
  90. if readErr != nil {
  91. if readErr == io.EOF {
  92. break
  93. }
  94. return readErr
  95. }
  96. }
  97. // Count the number of newlines in this chunk
  98. chunk := peekBuf.String()
  99. linesInChunk := strings.Count(chunk, "\n")
  100. totalLines += linesInChunk
  101. fragment := Fragment{
  102. Raw: chunk,
  103. Bytes: peekBuf.Bytes(),
  104. FilePath: pa.Path,
  105. }
  106. if pa.Symlink != "" {
  107. fragment.SymlinkFile = pa.Symlink
  108. }
  109. for _, finding := range d.Detect(fragment) {
  110. // need to add 1 since line counting starts at 1
  111. finding.StartLine += (totalLines - linesInChunk) + 1
  112. finding.EndLine += (totalLines - linesInChunk) + 1
  113. d.addFinding(finding)
  114. }
  115. }
  116. if err != nil {
  117. if err == io.EOF {
  118. return nil
  119. }
  120. return err
  121. }
  122. }
  123. })
  124. }
  125. if err := d.Sema.Wait(); err != nil {
  126. return d.findings, err
  127. }
  128. return d.findings, nil
  129. }