directory.go 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194
  1. package detect
  2. import (
  3. "bufio"
  4. "bytes"
  5. "io"
  6. "os"
  7. "path/filepath"
  8. "strings"
  9. "github.com/h2non/filetype"
  10. "github.com/zricethezav/gitleaks/v8/logging"
  11. "github.com/zricethezav/gitleaks/v8/report"
  12. "github.com/zricethezav/gitleaks/v8/sources"
  13. )
const maxPeekSize = 25 * 1_000 // 25kb
  15. func (d *Detector) DetectFiles(paths <-chan sources.ScanTarget) ([]report.Finding, error) {
  16. for pa := range paths {
  17. d.Sema.Go(func() error {
  18. logger := logging.With().Str("path", pa.Path).Logger()
  19. logger.Trace().Msg("Scanning path")
  20. f, err := os.Open(pa.Path)
  21. if err != nil {
  22. if os.IsPermission(err) {
  23. logger.Warn().Msg("Skipping file: permission denied")
  24. return nil
  25. }
  26. return err
  27. }
  28. defer func() {
  29. _ = f.Close()
  30. }()
  31. // Get file size
  32. fileInfo, err := f.Stat()
  33. if err != nil {
  34. return err
  35. }
  36. fileSize := fileInfo.Size()
  37. if d.MaxTargetMegaBytes > 0 {
  38. rawLength := fileSize / 1000000
  39. if rawLength > int64(d.MaxTargetMegaBytes) {
  40. logger.Debug().
  41. Int64("size", rawLength).
  42. Msg("Skipping file: exceeds --max-target-megabytes")
  43. return nil
  44. }
  45. }
  46. var (
  47. // Buffer to hold file chunks
  48. reader = bufio.NewReaderSize(f, chunkSize)
  49. buf = make([]byte, chunkSize)
  50. totalLines = 0
  51. )
  52. for {
  53. n, err := reader.Read(buf)
  54. // "Callers should always process the n > 0 bytes returned before considering the error err."
  55. // https://pkg.go.dev/io#Reader
  56. if n > 0 {
  57. // Only check the filetype at the start of file.
  58. if totalLines == 0 {
  59. // TODO: could other optimizations be introduced here?
  60. if mimetype, err := filetype.Match(buf[:n]); err != nil {
  61. return nil
  62. } else if mimetype.MIME.Type == "application" {
  63. return nil // skip binary files
  64. }
  65. }
  66. // Try to split chunks across large areas of whitespace, if possible.
  67. peekBuf := bytes.NewBuffer(buf[:n])
  68. if readErr := readUntilSafeBoundary(reader, n, maxPeekSize, peekBuf); readErr != nil {
  69. return readErr
  70. }
  71. // Count the number of newlines in this chunk
  72. chunk := peekBuf.String()
  73. linesInChunk := strings.Count(chunk, "\n")
  74. totalLines += linesInChunk
  75. fragment := Fragment{
  76. Raw: chunk,
  77. Bytes: peekBuf.Bytes(),
  78. }
  79. if pa.Symlink != "" {
  80. fragment.SymlinkFile = pa.Symlink
  81. }
  82. if isWindows {
  83. fragment.FilePath = filepath.ToSlash(pa.Path)
  84. fragment.SymlinkFile = filepath.ToSlash(fragment.SymlinkFile)
  85. fragment.WindowsFilePath = pa.Path
  86. } else {
  87. fragment.FilePath = pa.Path
  88. }
  89. for _, finding := range d.Detect(fragment) {
  90. // need to add 1 since line counting starts at 1
  91. finding.StartLine += (totalLines - linesInChunk) + 1
  92. finding.EndLine += (totalLines - linesInChunk) + 1
  93. d.addFinding(finding)
  94. }
  95. }
  96. if err != nil {
  97. if err == io.EOF {
  98. return nil
  99. }
  100. return err
  101. }
  102. }
  103. })
  104. }
  105. if err := d.Sema.Wait(); err != nil {
  106. return d.findings, err
  107. }
  108. return d.findings, nil
  109. }
// readUntilSafeBoundary consumes |r| until it finds two consecutive `\n` characters, up to |maxPeekSize|.
// This hopefully avoids splitting a secret across two chunks. (https://github.com/gitleaks/gitleaks/issues/1651)
//
// |n| is the number of bytes already in |peekBuf| from the initial chunk read;
// the peek budget is measured relative to that, so the buffer grows by at most
// |maxPeekSize| bytes beyond the original chunk. Other whitespace (`\r`, ` `,
// `\t`) between the two newlines does not reset the count.
func readUntilSafeBoundary(r *bufio.Reader, n int, maxPeekSize int, peekBuf *bytes.Buffer) error {
	// Nothing was read; there is no boundary to extend.
	if peekBuf.Len() == 0 {
		return nil
	}

	// Fast path: does the buffer already end in consecutive newlines
	// (possibly separated by other whitespace)? Then no read-ahead is needed.
	var (
		data         = peekBuf.Bytes()
		lastChar     = data[len(data)-1]
		newlineCount = 0 // Tracks consecutive newlines
	)
	if isWhitespace(lastChar) {
		// Walk backwards through the trailing whitespace run.
		for i := len(data) - 1; i >= 0; i-- {
			lastChar = data[i]
			if lastChar == '\n' {
				newlineCount++
				// Stop if two consecutive newlines are found
				if newlineCount >= 2 {
					return nil
				}
			} else if lastChar == '\r' || lastChar == ' ' || lastChar == '\t' {
				// The presence of other whitespace characters (`\r`, ` `, `\t`) shouldn't reset the count.
				// (Intentionally do nothing.)
			} else {
				break
			}
		}
	}

	// If not, read ahead byte-by-byte until we (hopefully) find some.
	newlineCount = 0
	for {
		data = peekBuf.Bytes()
		// Check if the last character is a newline.
		// (On the first iteration this re-examines the existing last byte;
		// on later iterations it examines the byte just appended below.)
		lastChar = data[len(data)-1]
		if lastChar == '\n' {
			newlineCount++
			// Stop if two consecutive newlines are found
			if newlineCount >= 2 {
				break
			}
		} else if lastChar == '\r' || lastChar == ' ' || lastChar == '\t' {
			// The presence of other whitespace characters (`\r`, ` `, `\t`) shouldn't reset the count.
			// (Intentionally do nothing.)
		} else {
			newlineCount = 0 // Reset if a non-newline character is found
		}

		// Stop growing the buffer if it reaches maxSize
		if (peekBuf.Len() - n) >= maxPeekSize {
			break
		}

		// Read additional data into a temporary buffer
		b, err := r.ReadByte()
		if err != nil {
			if err == io.EOF {
				// EOF is a natural boundary, not an error.
				break
			}
			return err
		}
		peekBuf.WriteByte(b)
	}
	return nil
}