  1. package sources
  2. import (
  3. "bufio"
  4. "bytes"
  5. "context"
  6. "fmt"
  7. "io"
  8. "os"
  9. "path/filepath"
  10. "strings"
  11. "github.com/h2non/filetype"
  12. "github.com/mholt/archives"
  13. "github.com/zricethezav/gitleaks/v8/config"
  14. "github.com/zricethezav/gitleaks/v8/logging"
  15. )
  16. const defaultBufferSize = 100 * 1_000 // 100kb
  17. const InnerPathSeparator = "!"
  18. type seekReaderAt interface {
  19. io.ReaderAt
  20. io.Seeker
  21. }
// File is a source for yielding fragments from a file or other reader.
type File struct {
	// Content provides a reader to the file's content.
	Content io.Reader
	// Path is the resolved real path of the file.
	Path string
	// Symlink represents a symlink to the file if that's how it was discovered.
	Symlink string
	// Buffer is used for reading the content in chunks. If nil, a buffer of
	// defaultBufferSize is allocated on first use.
	Buffer []byte
	// Config is the gitleaks config used for shouldSkipPath. If not set, then
	// shouldSkipPath is ignored.
	Config *config.Config
	// outerPaths is the list of container paths (e.g. archives) that lead to
	// this file. FullPath joins them with InnerPathSeparator.
	outerPaths []string
	// MaxArchiveDepth limits how deep the sources will explore nested
	// archives. When 0, archives are skipped rather than expanded.
	MaxArchiveDepth int
	// archiveDepth is the current archive nesting depth.
	archiveDepth int
}
  43. // Fragments yields fragments for the this source
  44. func (s *File) Fragments(ctx context.Context, yield FragmentsFunc) error {
  45. format, _, err := archives.Identify(ctx, s.Path, nil)
  46. // Process the file as an archive if there's no error && Identify returns
  47. // a format; but if there's an error or no format, just swallow the error
  48. // and fall back on treating it like a normal file and let fileFragments
  49. // decide what to do with it.
  50. if err == nil && format != nil {
  51. if s.archiveDepth+1 > s.MaxArchiveDepth {
  52. // Only warn when the feature is enabled
  53. if s.MaxArchiveDepth != 0 {
  54. logging.Warn().Str(
  55. "path", s.FullPath(),
  56. ).Int(
  57. "max_archive_depth", s.MaxArchiveDepth,
  58. ).Msg("skipping archive: exceeds max archive depth")
  59. }
  60. return nil
  61. }
  62. if extractor, ok := format.(archives.Extractor); ok {
  63. return s.extractorFragments(ctx, extractor, s.Content, yield)
  64. }
  65. if decompressor, ok := format.(archives.Decompressor); ok {
  66. return s.decompressorFragments(ctx, decompressor, s.Content, yield)
  67. }
  68. logging.Warn().Str("path", s.FullPath()).Msg("skipping unknown archive type")
  69. }
  70. return s.fileFragments(ctx, bufio.NewReader(s.Content), yield)
  71. }
  72. // extractorFragments recursively crawls archives and yields fragments
  73. func (s *File) extractorFragments(ctx context.Context, extractor archives.Extractor, reader io.Reader, yield FragmentsFunc) error {
  74. if _, isSeekReaderAt := reader.(seekReaderAt); !isSeekReaderAt {
  75. switch extractor.(type) {
  76. case archives.SevenZip, archives.Zip:
  77. tmpfile, err := os.CreateTemp("", "gitleaks-archive-")
  78. if err != nil {
  79. logging.Error().Str("path", s.FullPath()).Msg("could not create tmp file")
  80. return nil
  81. }
  82. defer func() {
  83. _ = tmpfile.Close()
  84. _ = os.Remove(tmpfile.Name())
  85. }()
  86. _, err = io.Copy(tmpfile, reader)
  87. if err != nil {
  88. logging.Error().Str("path", s.FullPath()).Msg("could not copy archive file")
  89. return nil
  90. }
  91. reader = tmpfile
  92. }
  93. }
  94. return extractor.Extract(ctx, reader, func(_ context.Context, d archives.FileInfo) error {
  95. if d.IsDir() {
  96. return nil
  97. }
  98. innerReader, err := d.Open()
  99. if err != nil {
  100. logging.Error().Err(err).Str("path", s.FullPath()).Msg("could not open archive inner file")
  101. return nil
  102. }
  103. defer innerReader.Close()
  104. path := filepath.Clean(d.NameInArchive)
  105. if s.Config != nil && shouldSkipPath(s.Config, path) {
  106. logging.Debug().Str("path", s.FullPath()).Msg("skipping file: global allowlist")
  107. return nil
  108. }
  109. file := &File{
  110. Content: innerReader,
  111. Path: path,
  112. Symlink: s.Symlink,
  113. outerPaths: append(s.outerPaths, filepath.ToSlash(s.Path)),
  114. MaxArchiveDepth: s.MaxArchiveDepth,
  115. archiveDepth: s.archiveDepth + 1,
  116. }
  117. if err := file.Fragments(ctx, yield); err != nil {
  118. return err
  119. }
  120. return nil
  121. })
  122. }
  123. // decompressorFragments recursively crawls archives and yields fragments
  124. func (s *File) decompressorFragments(ctx context.Context, decompressor archives.Decompressor, reader io.Reader, yield FragmentsFunc) error {
  125. innerReader, err := decompressor.OpenReader(reader)
  126. if err != nil {
  127. logging.Error().Str("path", s.FullPath()).Msg("could read compressed file")
  128. return nil
  129. }
  130. if err := s.fileFragments(ctx, bufio.NewReader(innerReader), yield); err != nil {
  131. _ = innerReader.Close()
  132. return err
  133. }
  134. _ = innerReader.Close()
  135. return nil
  136. }
// fileFragments reads the file into fragments to yield.
//
// Content is consumed in chunks of len(s.Buffer) bytes. The first chunk is
// sniffed with filetype.Match and files with an "application/*" MIME type
// are skipped as binary. Each chunk is then extended up to a safe split
// point via readUntilSafeBoundary (presumably so a secret is less likely to
// straddle two fragments — confirm against that helper). Reading stops on
// EOF, on an unreadable file, or when ctx is cancelled.
func (s *File) fileFragments(ctx context.Context, reader *bufio.Reader, yield FragmentsFunc) error {
	// Create a buffer if the caller hasn't provided one
	if s.Buffer == nil {
		s.Buffer = make([]byte, defaultBufferSize)
	}
	totalLines := 0
	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
			fragment := Fragment{
				FilePath: s.FullPath(),
			}
			n, err := reader.Read(s.Buffer)
			if n == 0 {
				// Nothing read: surface real errors through yield;
				// a bare EOF is a clean end of file.
				if err != nil && err != io.EOF {
					return yield(fragment, fmt.Errorf("could not read file: %w", err))
				}
				return nil
			}
			// Only check the filetype at the start of file.
			if totalLines == 0 {
				// TODO: could other optimizations be introduced here?
				if mimetype, err := filetype.Match(s.Buffer[:n]); err != nil {
					return yield(
						fragment,
						fmt.Errorf("could not read file: could not determine type: %w", err),
					)
				} else if mimetype.MIME.Type == "application" {
					logging.Debug().
						Str("mime_type", mimetype.MIME.Value).
						Str("path", s.FullPath()).
						Msgf("skipping binary file")
					return nil
				}
			}
			// Try to split chunks across large areas of whitespace, if possible.
			peekBuf := bytes.NewBuffer(s.Buffer[:n])
			if err := readUntilSafeBoundary(reader, n, maxPeekSize, peekBuf); err != nil {
				return yield(
					fragment,
					fmt.Errorf("could not read file: could not read until safe boundary: %w", err),
				)
			}
			fragment.Raw = peekBuf.String()
			fragment.Bytes = peekBuf.Bytes()
			// StartLine is 1-based: the first line number of this chunk.
			fragment.StartLine = totalLines + 1
			// Count the number of newlines in this chunk
			totalLines += strings.Count(fragment.Raw, "\n")
			if len(s.Symlink) > 0 {
				fragment.SymlinkFile = s.Symlink
			}
			if isWindows {
				// Normalize to forward slashes, keeping the native path
				// available separately in WindowsFilePath.
				fragment.FilePath = filepath.ToSlash(fragment.FilePath)
				fragment.SymlinkFile = filepath.ToSlash(s.Symlink)
				fragment.WindowsFilePath = s.FullPath()
			}
			// log errors but continue since there's content
			if err != nil && err != io.EOF {
				logging.Warn().Err(err).Msgf("issue reading file")
			}
			// Done with the file!
			if err == io.EOF {
				return yield(fragment, nil)
			}
			if err := yield(fragment, err); err != nil {
				return err
			}
		}
	}
}
  210. // FullPath returns the File.Path with any preceding outer paths
  211. func (s *File) FullPath() string {
  212. if len(s.outerPaths) > 0 {
  213. return strings.Join(
  214. // outerPaths have already been normalized to slash
  215. append(s.outerPaths, s.Path),
  216. InnerPathSeparator,
  217. )
  218. }
  219. return s.Path
  220. }