directory.go 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257
  1. package detect
  2. import (
  3. "bufio"
  4. "bytes"
  5. "io"
  6. "os"
  7. "path/filepath"
  8. "strings"
  9. "github.com/h2non/filetype"
  10. "github.com/zricethezav/gitleaks/v8/logging"
  11. "github.com/zricethezav/gitleaks/v8/report"
  12. "github.com/zricethezav/gitleaks/v8/sources"
  13. )
  14. const maxPeekSize = 25 * 1_000 // 10kb
  15. // DetectFiles schedules each ScanTarget—file or archive—for concurrent scanning.
  16. func (d *Detector) DetectFiles(paths <-chan sources.ScanTarget) ([]report.Finding, error) {
  17. for pa := range paths {
  18. d.Sema.Go(func() error {
  19. return d.detectScanTarget(pa)
  20. })
  21. }
  22. if err := d.Sema.Wait(); err != nil {
  23. return d.findings, err
  24. }
  25. return d.findings, nil
  26. }
  27. // detectScanTarget handles one ScanTarget: it unpacks archives recursively
  28. // or scans a regular file, always using VirtualPath for reporting.
  29. func (d *Detector) detectScanTarget(scanTarget sources.ScanTarget) error {
  30. // Choose display path: either VirtualPath (archive chain) or on-disk path.
  31. display := scanTarget.Path
  32. if scanTarget.VirtualPath != "" {
  33. display = scanTarget.VirtualPath
  34. }
  35. logger := logging.With().Str("path", display).Logger()
  36. logger.Trace().Msg("Scanning path")
  37. if isArchive(scanTarget.Path) {
  38. logger.Debug().Msg("Found archive")
  39. targets, tmpArchiveDir, err := extractArchive(scanTarget.Path)
  40. if err != nil {
  41. logger.Warn().Err(err).Msg("Failed to extract archive")
  42. return nil
  43. }
  44. // Schedule each extracted file for its own scan, carrying forward VirtualPath.
  45. for _, t := range targets {
  46. t := t
  47. // compute path INSIDE this archive
  48. rel, rerr := filepath.Rel(tmpArchiveDir, t.Path)
  49. if rerr != nil {
  50. rel = filepath.Base(t.Path)
  51. }
  52. rel = filepath.ToSlash(rel)
  53. // prepend existing chain or archive base name
  54. if scanTarget.VirtualPath != "" {
  55. t.VirtualPath = scanTarget.VirtualPath + "/" + rel
  56. } else {
  57. t.VirtualPath = filepath.Base(scanTarget.Path) + "/" + rel
  58. }
  59. d.Sema.Go(func() error {
  60. return d.detectScanTarget(t)
  61. })
  62. }
  63. return nil
  64. }
  65. // --- Regular file branch ---
  66. f, err := os.Open(scanTarget.Path)
  67. if err != nil {
  68. if os.IsPermission(err) {
  69. logger.Warn().Msg("Skipping file: permission denied")
  70. return nil
  71. }
  72. return err
  73. }
  74. defer f.Close()
  75. // Get file size
  76. fileInfo, err := f.Stat()
  77. if err != nil {
  78. return err
  79. }
  80. fileSize := fileInfo.Size()
  81. if d.MaxTargetMegaBytes > 0 {
  82. rawLength := fileSize / 1000000
  83. if rawLength > int64(d.MaxTargetMegaBytes) {
  84. logger.Debug().
  85. Int64("size", rawLength).
  86. Msg("Skipping file: exceeds --max-target-megabytes")
  87. return nil
  88. }
  89. }
  90. // Skip binary files by sniffing header
  91. head := make([]byte, 261)
  92. if n, _ := io.ReadFull(f, head); n > 0 {
  93. if kind, _ := filetype.Match(head[:n]); kind != filetype.Unknown {
  94. logger.Debug().Str("kind", kind.Extension).Msg("Skipping binary")
  95. return nil
  96. }
  97. }
  98. if _, err := f.Seek(0, io.SeekStart); err != nil {
  99. return err
  100. }
  101. reader := bufio.NewReader(f)
  102. buf := make([]byte, chunkSize)
  103. totalLines := 0
  104. for {
  105. n, err := reader.Read(buf)
  106. if n > 0 {
  107. // Only check the filetype at the start of file.
  108. if totalLines == 0 {
  109. // TODO: could other optimizations be introduced here?
  110. if mimetype, err := filetype.Match(buf[:n]); err != nil {
  111. return nil
  112. } else if mimetype.MIME.Type == "application" {
  113. return nil // skip binary files
  114. }
  115. }
  116. peekBuf := bytes.NewBuffer(buf[:n])
  117. if readErr := readUntilSafeBoundary(reader, n, maxPeekSize, peekBuf); readErr != nil {
  118. return readErr
  119. }
  120. chunk := peekBuf.String()
  121. linesInChunk := strings.Count(chunk, "\n")
  122. // build fragment and set FilePath to our display chain
  123. fragment := Fragment{
  124. Raw: chunk,
  125. Bytes: peekBuf.Bytes(),
  126. }
  127. fragment.FilePath = display
  128. // if this file was itself a symlink
  129. if scanTarget.Symlink != "" {
  130. fragment.SymlinkFile = scanTarget.Symlink
  131. }
  132. if isWindows {
  133. fragment.FilePath = filepath.ToSlash(scanTarget.Path)
  134. fragment.SymlinkFile = filepath.ToSlash(fragment.SymlinkFile)
  135. fragment.WindowsFilePath = scanTarget.Path
  136. }
  137. // run detection and adjust line numbers
  138. for _, finding := range d.Detect(fragment) {
  139. finding.StartLine += totalLines + 1
  140. finding.EndLine += totalLines + 1
  141. // We have to augment the finding if the source is coming
  142. // from a archive committed in Git
  143. if scanTarget.Source == "github-archive" {
  144. finding.Author = scanTarget.GitInfo.Author
  145. finding.Commit = scanTarget.GitInfo.Commit
  146. finding.Email = scanTarget.GitInfo.Email
  147. finding.Date = scanTarget.GitInfo.Date
  148. finding.Message = scanTarget.GitInfo.Message
  149. }
  150. d.AddFinding(finding)
  151. }
  152. totalLines += linesInChunk
  153. }
  154. if err != nil {
  155. if err == io.EOF {
  156. return nil
  157. }
  158. return err
  159. }
  160. }
  161. }
  162. // readUntilSafeBoundary consumes |f| until it finds two consecutive `\n` characters, up to |maxPeekSize|.
  163. // This hopefully avoids splitting. (https://github.com/gitleaks/gitleaks/issues/1651)
  164. func readUntilSafeBoundary(r *bufio.Reader, n int, maxPeekSize int, peekBuf *bytes.Buffer) error {
  165. if peekBuf.Len() == 0 {
  166. return nil
  167. }
  168. // Does the buffer end in consecutive newlines?
  169. var (
  170. data = peekBuf.Bytes()
  171. lastChar = data[len(data)-1]
  172. newlineCount = 0 // Tracks consecutive newlines
  173. )
  174. if isWhitespace(lastChar) {
  175. for i := len(data) - 1; i >= 0; i-- {
  176. lastChar = data[i]
  177. if lastChar == '\n' {
  178. newlineCount++
  179. // Stop if two consecutive newlines are found
  180. if newlineCount >= 2 {
  181. return nil
  182. }
  183. } else if lastChar == '\r' || lastChar == ' ' || lastChar == '\t' {
  184. // The presence of other whitespace characters (`\r`, ` `, `\t`) shouldn't reset the count.
  185. // (Intentionally do nothing.)
  186. } else {
  187. break
  188. }
  189. }
  190. }
  191. // If not, read ahead until we (hopefully) find some.
  192. newlineCount = 0
  193. for {
  194. data = peekBuf.Bytes()
  195. // Check if the last character is a newline.
  196. lastChar = data[len(data)-1]
  197. if lastChar == '\n' {
  198. newlineCount++
  199. // Stop if two consecutive newlines are found
  200. if newlineCount >= 2 {
  201. break
  202. }
  203. } else if lastChar == '\r' || lastChar == ' ' || lastChar == '\t' {
  204. // The presence of other whitespace characters (`\r`, ` `, `\t`) shouldn't reset the count.
  205. // (Intentionally do nothing.)
  206. } else {
  207. newlineCount = 0 // Reset if a non-newline character is found
  208. }
  209. // Stop growing the buffer if it reaches maxSize
  210. if (peekBuf.Len() - n) >= maxPeekSize {
  211. break
  212. }
  213. // Read additional data into a temporary buffer
  214. b, err := r.ReadByte()
  215. if err != nil {
  216. if err == io.EOF {
  217. break
  218. }
  219. return err
  220. }
  221. peekBuf.WriteByte(b)
  222. }
  223. return nil
  224. }