directory.go 5.1 KB


  1. package detect
  2. import (
  3. "bufio"
  4. "bytes"
  5. "io"
  6. "os"
  7. "path/filepath"
  8. "strings"
  9. "time"
  10. "github.com/h2non/filetype"
  11. "github.com/zricethezav/gitleaks/v8/logging"
  12. "github.com/zricethezav/gitleaks/v8/report"
  13. "github.com/zricethezav/gitleaks/v8/sources"
  14. )
  // maxPeekSize is the largest number of bytes (25 kB) that may be read past
  // the end of a chunk while looking for a safe place to split it.
  const maxPeekSize = 25_000
  16. func (d *Detector) DetectFiles(paths <-chan sources.ScanTarget) ([]report.Finding, error) {
  17. for pa := range paths {
  18. d.Sema.Go(func() error {
  19. logger := logging.With().Str("path", pa.Path).Logger()
  20. logger.Trace().Msg("Scanning path")
  21. f, err := os.Open(pa.Path)
  22. if err != nil {
  23. if os.IsPermission(err) {
  24. logger.Warn().Msg("Skipping file: permission denied")
  25. return nil
  26. }
  27. return err
  28. }
  29. defer func() {
  30. _ = f.Close()
  31. }()
  32. // Get file size
  33. fileInfo, err := f.Stat()
  34. if err != nil {
  35. return err
  36. }
  37. fileSize := fileInfo.Size()
  38. if d.MaxTargetMegaBytes > 0 {
  39. rawLength := fileSize / 1000000
  40. if rawLength > int64(d.MaxTargetMegaBytes) {
  41. logger.Debug().
  42. Int64("size", rawLength).
  43. Msg("Skipping file: exceeds --max-target-megabytes")
  44. return nil
  45. }
  46. }
  47. var (
  48. // Buffer to hold file chunks
  49. reader = bufio.NewReaderSize(f, chunkSize)
  50. buf = make([]byte, chunkSize)
  51. totalLines = 0
  52. )
  53. for {
  54. n, err := reader.Read(buf)
  55. // "Callers should always process the n > 0 bytes returned before considering the error err."
  56. // https://pkg.go.dev/io#Reader
  57. if n > 0 {
  58. // Only check the filetype at the start of file.
  59. if totalLines == 0 {
  60. // TODO: could other optimizations be introduced here?
  61. if mimetype, err := filetype.Match(buf[:n]); err != nil {
  62. return nil
  63. } else if mimetype.MIME.Type == "application" {
  64. return nil // skip binary files
  65. }
  66. }
  67. // Try to split chunks across large areas of whitespace, if possible.
  68. peekBuf := bytes.NewBuffer(buf[:n])
  69. if readErr := readUntilSafeBoundary(reader, n, maxPeekSize, peekBuf); readErr != nil {
  70. return readErr
  71. }
  72. // Count the number of newlines in this chunk
  73. chunk := peekBuf.String()
  74. linesInChunk := strings.Count(chunk, "\n")
  75. totalLines += linesInChunk
  76. fragment := Fragment{
  77. Raw: chunk,
  78. Bytes: peekBuf.Bytes(),
  79. }
  80. if pa.Symlink != "" {
  81. fragment.SymlinkFile = pa.Symlink
  82. }
  83. if isWindows {
  84. fragment.FilePath = filepath.ToSlash(pa.Path)
  85. fragment.SymlinkFile = filepath.ToSlash(fragment.SymlinkFile)
  86. fragment.WindowsFilePath = pa.Path
  87. } else {
  88. fragment.FilePath = pa.Path
  89. }
  90. timer := time.AfterFunc(SlowWarningThreshold, func() {
  91. logger.Debug().Msgf("Taking longer than %s to inspect fragment", SlowWarningThreshold.String())
  92. })
  93. for _, finding := range d.Detect(fragment) {
  94. // need to add 1 since line counting starts at 1
  95. finding.StartLine += (totalLines - linesInChunk) + 1
  96. finding.EndLine += (totalLines - linesInChunk) + 1
  97. d.AddFinding(finding)
  98. }
  99. if timer != nil {
  100. timer.Stop()
  101. timer = nil
  102. }
  103. }
  104. if err != nil {
  105. if err == io.EOF {
  106. return nil
  107. }
  108. return err
  109. }
  110. }
  111. })
  112. }
  113. if err := d.Sema.Wait(); err != nil {
  114. return d.findings, err
  115. }
  116. return d.findings, nil
  117. }
  // readUntilSafeBoundary extends peekBuf with bytes read from r until the
  // buffer ends on a blank line: two `\n` characters separated by nothing but
  // `\r`, ` ` or `\t`. Splitting there hopefully avoids cutting through the
  // middle of a secret. (https://github.com/gitleaks/gitleaks/issues/1651)
  //
  // Parameters:
  //   - r: the reader positioned right after the bytes already in peekBuf.
  //   - n: the size of the original chunk; the amount read ahead is measured
  //     as peekBuf.Len() - n.
  //   - maxPeekSize: the maximum number of bytes to read ahead.
  //   - peekBuf: the chunk to extend; it is modified in place.
  //
  // The function returns without reading when peekBuf is empty or already ends
  // on a blank line. It stops silently at io.EOF or once maxPeekSize bytes have
  // been read ahead; any other read error is returned.
  func readUntilSafeBoundary(r *bufio.Reader, n int, maxPeekSize int, peekBuf *bytes.Buffer) error {
  	if peekBuf.Len() == 0 {
  		return nil
  	}

  	// Count the newlines in the run of whitespace that ends the buffer. The
  	// count is carried into the read-ahead loop so that a blank line which
  	// straddles the chunk boundary (e.g. the buffer ends in "\n " and the next
  	// byte is "\n") is still recognised.
  	newlineCount := 0
  	data := peekBuf.Bytes()
  trailing:
  	for i := len(data) - 1; i >= 0; i-- {
  		switch data[i] {
  		case '\n':
  			newlineCount++
  			if newlineCount >= 2 {
  				// Already on a safe boundary.
  				return nil
  			}
  		case '\r', ' ', '\t':
  			// Other whitespace neither adds to nor resets the count.
  		default:
  			break trailing
  		}
  	}

  	// Otherwise read ahead, one byte at a time, until a blank line shows up,
  	// the peek budget is exhausted, or the input ends.
  	for (peekBuf.Len() - n) < maxPeekSize {
  		b, err := r.ReadByte()
  		if err != nil {
  			if err == io.EOF {
  				return nil
  			}
  			return err
  		}
  		peekBuf.WriteByte(b)

  		switch b {
  		case '\n':
  			newlineCount++
  			if newlineCount >= 2 {
  				return nil
  			}
  		case '\r', ' ', '\t':
  			// Other whitespace neither adds to nor resets the count.
  		default:
  			newlineCount = 0 // a non-whitespace byte ends the candidate blank line
  		}
  	}
  	return nil
  }