directory.go

package detect

import (
	"bufio"
	"bytes"
	"io"
	"os"
	"strings"

	"github.com/h2non/filetype"
	"github.com/rs/zerolog/log"
	"github.com/zricethezav/gitleaks/v8/report"
	"github.com/zricethezav/gitleaks/v8/sources"
)

const maxPeekSize = 25 * 1_000 // 25kb
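
// Note: chunkSize is not declared in this file; it is shared with the rest of
// the detect package and sets how many bytes each read below can return.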
func (d *Detector) DetectFiles(paths <-chan sources.ScanTarget) ([]report.Finding, error) {
	for pa := range paths {
		d.Sema.Go(func() error {
			logger := log.With().Str("path", pa.Path).Logger()
			logger.Trace().Msg("Scanning path")

			f, err := os.Open(pa.Path)
			if err != nil {
				if os.IsPermission(err) {
					logger.Warn().Msg("Skipping file: permission denied")
					return nil
				}
				return err
			}
			defer func() {
				_ = f.Close()
			}()

			// Get file size
			fileInfo, err := f.Stat()
			if err != nil {
				return err
			}
			fileSize := fileInfo.Size()
			if d.MaxTargetMegaBytes > 0 {
				rawLength := fileSize / 1000000
				if rawLength > int64(d.MaxTargetMegaBytes) {
					logger.Debug().
						Int64("size", rawLength).
						Msg("Skipping file: exceeds --max-target-megabytes")
					return nil
				}
			}

			var (
				// Buffer to hold file chunks
				reader     = bufio.NewReaderSize(f, chunkSize)
				buf        = make([]byte, chunkSize)
				totalLines = 0
			)
			for {
				n, err := reader.Read(buf)
				// "Callers should always process the n > 0 bytes returned before considering the error err."
				// https://pkg.go.dev/io#Reader
				if n > 0 {
					// Only check the filetype at the start of file.
					if totalLines == 0 {
						// TODO: could other optimizations be introduced here?
						if mimetype, err := filetype.Match(buf[:n]); err != nil {
							return nil
						} else if mimetype.MIME.Type == "application" {
							return nil // skip binary files
						}
					}

					// Try to split chunks across large areas of whitespace, if possible.
					peekBuf := bytes.NewBuffer(buf[:n])
					if readErr := readUntilSafeBoundary(reader, n, maxPeekSize, peekBuf); readErr != nil {
						return readErr
					}

					// Count the number of newlines in this chunk
					chunk := peekBuf.String()
					linesInChunk := strings.Count(chunk, "\n")
					totalLines += linesInChunk

					fragment := Fragment{
						Raw:      chunk,
						Bytes:    peekBuf.Bytes(),
						FilePath: pa.Path,
					}
					if pa.Symlink != "" {
						fragment.SymlinkFile = pa.Symlink
					}
					for _, finding := range d.Detect(fragment) {
						// need to add 1 since line counting starts at 1
						finding.StartLine += (totalLines - linesInChunk) + 1
						finding.EndLine += (totalLines - linesInChunk) + 1
						d.addFinding(finding)
					}
				}

				if err != nil {
					if err == io.EOF {
						return nil
					}
					return err
				}
			}
		})
	}

	if err := d.Sema.Wait(); err != nil {
		return d.findings, err
	}
	return d.findings, nil
}
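
// Usage sketch (not part of the upstream file): a hypothetical caller could feed
// DetectFiles from a directory walk. The variables root and detector, and the
// minimal error handling below, are illustrative assumptions, not gitleaks API.
//
//	targets := make(chan sources.ScanTarget)
//	go func() {
//		defer close(targets)
//		_ = filepath.WalkDir(root, func(path string, entry fs.DirEntry, err error) error {
//			if err != nil || entry.IsDir() {
//				return err
//			}
//			targets <- sources.ScanTarget{Path: path}
//			return nil
//		})
//	}()
//	findings, err := detector.DetectFiles(targets)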
// readUntilSafeBoundary consumes |r| until it finds two consecutive `\n` characters, up to |maxPeekSize|.
// This hopefully avoids splitting a secret across chunk boundaries. (https://github.com/gitleaks/gitleaks/issues/1651)
func readUntilSafeBoundary(r *bufio.Reader, n int, maxPeekSize int, peekBuf *bytes.Buffer) error {
	if peekBuf.Len() == 0 {
		return nil
	}

	// Does the buffer already end in consecutive newlines?
	var (
		data         = peekBuf.Bytes()
		lastChar     = data[len(data)-1]
		newlineCount = 0 // Tracks consecutive newlines
	)
	if isWhitespace(lastChar) {
		for i := len(data) - 1; i >= 0; i-- {
			lastChar = data[i]
			if lastChar == '\n' {
				newlineCount++

				// Stop if two consecutive newlines are found
				if newlineCount >= 2 {
					return nil
				}
			} else if lastChar == '\r' || lastChar == ' ' || lastChar == '\t' {
				// The presence of other whitespace characters (`\r`, ` `, `\t`) shouldn't reset the count.
				// (Intentionally do nothing.)
			} else {
				break
			}
		}
	}

	// If not, read ahead until we (hopefully) find some.
	newlineCount = 0
	for {
		data = peekBuf.Bytes()
		// Check if the last character is a newline.
		lastChar = data[len(data)-1]
		if lastChar == '\n' {
			newlineCount++

			// Stop if two consecutive newlines are found
			if newlineCount >= 2 {
				break
			}
		} else if lastChar == '\r' || lastChar == ' ' || lastChar == '\t' {
			// The presence of other whitespace characters (`\r`, ` `, `\t`) shouldn't reset the count.
			// (Intentionally do nothing.)
		} else {
			newlineCount = 0 // Reset if a non-whitespace character is found
		}

		// Stop growing the buffer once it has peeked maxPeekSize bytes past the original chunk.
		if (peekBuf.Len() - n) >= maxPeekSize {
			break
		}

		// Read one more byte into the peek buffer.
		b, err := r.ReadByte()
		if err != nil {
			if err == io.EOF {
				break
			}
			return err
		}
		peekBuf.WriteByte(b)
	}

	return nil
}
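
// Illustrative example (an assumption for this sketch, not upstream code): when a
// chunk ends mid-line, readUntilSafeBoundary keeps pulling single bytes until it
// sees a blank line ("\n\n") or has peeked maxPeekSize extra bytes, so a secret
// straddling the chunk boundary is more likely to stay in one fragment:
//
//	reader := bufio.NewReader(strings.NewReader("key=abc123\n\nnext"))
//	buf := make([]byte, 4)
//	n, _ := reader.Read(buf) // buf[:n] == "key="
//	peekBuf := bytes.NewBuffer(buf[:n])
//	_ = readUntilSafeBoundary(reader, n, maxPeekSize, peekBuf)
//	// peekBuf.String() == "key=abc123\n\n"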