directory.go

package detect

import (
	"bufio"
	"bytes"
	"io"
	"os"
	"path/filepath"
	"strings"

	"github.com/h2non/filetype"

	"github.com/zricethezav/gitleaks/v8/logging"
	"github.com/zricethezav/gitleaks/v8/report"
	"github.com/zricethezav/gitleaks/v8/sources"
)

const maxPeekSize = 25 * 1_000 // 25kb

// DetectFiles schedules each ScanTarget (file or archive) for concurrent scanning.
func (d *Detector) DetectFiles(paths <-chan sources.ScanTarget) ([]report.Finding, error) {
	for pa := range paths {
		pa := pa // capture
		d.Sema.Go(func() error {
			return d.DetectScanTarget(pa)
		})
	}
	if err := d.Sema.Wait(); err != nil {
		return d.findings, err
	}
	return d.findings, nil
}

// DetectScanTarget handles one ScanTarget: it unpacks archives recursively
// or scans a regular file, always using VirtualPath for reporting.
// TODO maybe find a better solution for this? relying on `scanTarget` seems off.
func (d *Detector) DetectScanTarget(scanTarget sources.ScanTarget) error {
	// Choose display path: either VirtualPath (archive chain) or on-disk path.
	display := scanTarget.Path
	if scanTarget.VirtualPath != "" {
		display = scanTarget.VirtualPath
	}
	logger := logging.With().Str("path", display).Logger()
	logger.Trace().Msg("Scanning path")

	// --- Archive branch: extract and reschedule children ---
	if IsArchive(scanTarget.Path) {
		logger.Info().Msg("Found archive")
		targets, tmpdir, err := ExtractArchive(scanTarget.Path)
		if err != nil {
			logger.Warn().Err(err).Msg("Failed to extract archive")
			return nil
		}
		// Schedule each extracted file for its own scan, carrying forward VirtualPath.
		for _, t := range targets {
			t := t
			// compute path INSIDE this archive
			rel, rerr := filepath.Rel(tmpdir, t.Path)
			if rerr != nil {
				rel = filepath.Base(t.Path)
			}
			rel = filepath.ToSlash(rel)
			// prepend existing chain or archive base name
			if scanTarget.VirtualPath != "" {
				t.VirtualPath = scanTarget.VirtualPath + "/" + rel
			} else {
				t.VirtualPath = filepath.Base(scanTarget.Path) + "/" + rel
			}
			d.Sema.Go(func() error {
				return d.DetectScanTarget(t)
			})
		}
		// cleanup extraction directory
		// if err := os.RemoveAll(tmpdir); err != nil {
		// 	logger.Warn().Err(err).Msg("Failed to remove tempdir")
		// }
		return nil
	}

	// --- Regular file branch ---
	f, err := os.Open(scanTarget.Path)
	if err != nil {
		if os.IsPermission(err) {
			logger.Warn().Msg("Skipping file: permission denied")
			return nil
		}
		return err
	}
	defer f.Close()

	// Skip binary files by sniffing the header.
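	// (261 bytes covers the longest file signature the filetype matchers inspect.)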
	head := make([]byte, 261)
	if n, _ := io.ReadFull(f, head); n > 0 {
		if kind, _ := filetype.Match(head[:n]); kind != filetype.Unknown {
			logger.Debug().Str("kind", kind.Extension).Msg("Skipping binary")
			return nil
		}
	}
	if _, err := f.Seek(0, io.SeekStart); err != nil {
		return err
	}

	reader := bufio.NewReader(f)
	buf := make([]byte, chunkSize)
	totalLines := 0
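	// Read the file in chunkSize pieces, extending each chunk to a "safe"
	// boundary so a secret is less likely to be split across fragments.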
	for {
		n, err := reader.Read(buf)
		if n > 0 {
			peekBuf := bytes.NewBuffer(buf[:n])
			if readErr := readUntilSafeBoundary(reader, n, maxPeekSize, peekBuf); readErr != nil {
				return readErr
			}
			chunk := peekBuf.String()
			linesInChunk := strings.Count(chunk, "\n")
			// build fragment and set FilePath to our display chain
			fragment := Fragment{
				Raw:   chunk,
				Bytes: peekBuf.Bytes(),
			}
			fragment.FilePath = display
			// if this file was itself a symlink
			if scanTarget.Symlink != "" {
				fragment.SymlinkFile = scanTarget.Symlink
			}
			if isWindows {
				fragment.WindowsFilePath = scanTarget.Path
			}
			// run detection and adjust line numbers
			for _, finding := range d.Detect(fragment) {
				finding.StartLine += totalLines + 1
				finding.EndLine += totalLines + 1
				// We have to augment the finding if the source is coming
				// from an archive committed in Git.
				if scanTarget.Source == "github-archive" {
					finding.Author = scanTarget.GitInfo.Author
					finding.Commit = scanTarget.GitInfo.Commit
					finding.Email = scanTarget.GitInfo.Email
					finding.Date = scanTarget.GitInfo.Date
					finding.Message = scanTarget.GitInfo.Message
				}
				d.AddFinding(finding)
			}
			totalLines += linesInChunk
		}
		if err != nil {
			if err == io.EOF {
				return nil
			}
			return err
		}
	}
}

// readUntilSafeBoundary consumes |r| until it finds two consecutive `\n` characters, up to |maxPeekSize|.
// This hopefully avoids splitting a secret across chunk boundaries. (https://github.com/gitleaks/gitleaks/issues/1651)
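// |n| is the size of the original chunk already in |peekBuf|; at most |maxPeekSize| additional bytes are read beyond it.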
func readUntilSafeBoundary(r *bufio.Reader, n int, maxPeekSize int, peekBuf *bytes.Buffer) error {
	if peekBuf.Len() == 0 {
		return nil
	}

	// Does the buffer end in consecutive newlines?
	var (
		data         = peekBuf.Bytes()
		lastChar     = data[len(data)-1]
		newlineCount = 0 // Tracks consecutive newlines
	)
	if isWhitespace(lastChar) {
		for i := len(data) - 1; i >= 0; i-- {
			lastChar = data[i]
			if lastChar == '\n' {
				newlineCount++
				// Stop if two consecutive newlines are found
				if newlineCount >= 2 {
					return nil
				}
			} else if lastChar == '\r' || lastChar == ' ' || lastChar == '\t' {
				// The presence of other whitespace characters (`\r`, ` `, `\t`) shouldn't reset the count.
				// (Intentionally do nothing.)
			} else {
				break
			}
		}
	}

	// If not, read ahead until we (hopefully) find some.
	newlineCount = 0
	for {
		data = peekBuf.Bytes()
		// Check if the last character is a newline.
		lastChar = data[len(data)-1]
		if lastChar == '\n' {
			newlineCount++
			// Stop if two consecutive newlines are found
			if newlineCount >= 2 {
				break
			}
		} else if lastChar == '\r' || lastChar == ' ' || lastChar == '\t' {
			// The presence of other whitespace characters (`\r`, ` `, `\t`) shouldn't reset the count.
			// (Intentionally do nothing.)
		} else {
			newlineCount = 0 // Reset if a non-newline character is found
		}

		// Stop growing the buffer once it exceeds the original chunk by maxPeekSize.
		if (peekBuf.Len() - n) >= maxPeekSize {
			break
		}
		// Read one more byte and append it to the peek buffer.
		b, err := r.ReadByte()
		if err != nil {
			if err == io.EOF {
				break
			}
			return err
		}
		peekBuf.WriteByte(b)
	}
	return nil
}