file.go

package sources

import (
	"bufio"
	"bytes"
	"context"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"strings"

	"github.com/h2non/filetype"
	"github.com/mholt/archives"

	"github.com/zricethezav/gitleaks/v8/config"
	"github.com/zricethezav/gitleaks/v8/logging"
)

const defaultBufferSize = 100 * 1_000 // 100kb
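
// InnerPathSeparator joins outer (container) paths to inner file paths in
// FullPath, e.g. "archive.zip!dir/file.txt".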
const InnerPathSeparator = "!"
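
// seekReaderAt is satisfied by readers that support random access, such as
// *os.File; some extractors require it (see extractorFragments).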
type seekReaderAt interface {
	io.ReaderAt
	io.Seeker
}

// File is a source for yielding fragments from a file or other reader
type File struct {
	// Content provides a reader to the file's content
	Content io.Reader
	// Path is the resolved real path of the file
	Path string
	// Symlink represents a symlink to the file if that's how it was discovered
	Symlink string
	// Buffer is used for reading the content in chunks
	Buffer []byte
	// Config is the gitleaks config used for shouldSkipPath. If not set, then
	// shouldSkipPath is ignored
	Config *config.Config
	// outerPaths is the list of container paths (e.g. archives) that lead to
	// this file
	outerPaths []string
	// MaxArchiveDepth limits how deep the sources will explore nested archives
	MaxArchiveDepth int
	// archiveDepth is the current archive nesting depth
	archiveDepth int
}
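
// A minimal usage sketch (hypothetical path and depth; FragmentsFunc is this
// package's yield callback, invoked once per chunk):
//
//	f, _ := os.Open("example.txt")
//	src := &File{Content: f, Path: "example.txt", MaxArchiveDepth: 8}
//	_ = src.Fragments(context.Background(), func(frag Fragment, err error) error {
//		// inspect frag.Raw and frag.StartLine here
//		return err
//	})
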
// Fragments yields fragments for this source
func (s *File) Fragments(ctx context.Context, yield FragmentsFunc) error {
	format, _, err := archives.Identify(ctx, s.Path, nil)
	// Process the file as an archive if Identify succeeds and returns a
	// format. Otherwise, swallow the error, fall back to treating it as a
	// normal file, and let fileFragments decide what to do with it.
	if err == nil && format != nil {
		if s.archiveDepth+1 > s.MaxArchiveDepth {
			logging.Warn().Str(
				"path", s.FullPath(),
			).Int(
				"max_archive_depth", s.MaxArchiveDepth,
			).Msg("skipping archive: exceeds max archive depth")
			return nil
		}
		if extractor, ok := format.(archives.Extractor); ok {
			return s.extractorFragments(ctx, extractor, s.Content, yield)
		}
		if decompressor, ok := format.(archives.Decompressor); ok {
			return s.decompressorFragments(decompressor, s.Content, yield)
		}
		logging.Warn().Str("path", s.FullPath()).Msg("skipping unknown archive type")
	}
	return s.fileFragments(bufio.NewReader(s.Content), yield)
}

// extractorFragments recursively crawls archives and yields fragments
func (s *File) extractorFragments(ctx context.Context, extractor archives.Extractor, reader io.Reader, yield FragmentsFunc) error {
	if _, isSeekReaderAt := reader.(seekReaderAt); !isSeekReaderAt {
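		// Zip and 7z need random access (e.g. the zip central directory sits
		// at the end of the file), so spill streaming input to a temporary
		// file that supports ReadAt/Seek.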
		switch extractor.(type) {
		case archives.SevenZip, archives.Zip:
			tmpfile, err := os.CreateTemp("", "gitleaks-archive-")
			if err != nil {
				logging.Error().Err(err).Str("path", s.FullPath()).Msg("could not create tmp file")
				return nil
			}
			defer func() {
				_ = tmpfile.Close()
				_ = os.Remove(tmpfile.Name())
			}()
			if _, err := io.Copy(tmpfile, reader); err != nil {
				logging.Error().Err(err).Str("path", s.FullPath()).Msg("could not copy archive file")
				return nil
			}
			reader = tmpfile
		}
	}
	return extractor.Extract(ctx, reader, func(_ context.Context, d archives.FileInfo) error {
		if d.IsDir() {
			return nil
		}
		innerReader, err := d.Open()
		if err != nil {
			logging.Error().Err(err).Str("path", s.FullPath()).Msg("could not open archive inner file")
			return nil
		}
		defer innerReader.Close()

		path := filepath.Clean(d.NameInArchive)
		if s.Config != nil && shouldSkipPath(s.Config, path) {
			logging.Debug().Str("path", s.FullPath()).Msg("skipping file: global allowlist")
			return nil
		}
		file := &File{
			Content:         innerReader,
			Path:            path,
			Symlink:         s.Symlink,
			outerPaths:      append(s.outerPaths, filepath.ToSlash(s.Path)),
			MaxArchiveDepth: s.MaxArchiveDepth,
			archiveDepth:    s.archiveDepth + 1,
		}
		return file.Fragments(ctx, yield)
	})
}

// decompressorFragments decompresses single-file formats (e.g. gzip) and
// yields fragments
func (s *File) decompressorFragments(decompressor archives.Decompressor, reader io.Reader, yield FragmentsFunc) error {
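	// A Decompressor wraps exactly one inner stream (unlike an Extractor),
	// so the decompressed content is scanned as a regular file.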
	innerReader, err := decompressor.OpenReader(reader)
	if err != nil {
		logging.Error().Err(err).Str("path", s.FullPath()).Msg("could not read compressed file")
		return nil
	}
	defer func() { _ = innerReader.Close() }()
	return s.fileFragments(bufio.NewReader(innerReader), yield)
}

// fileFragments reads the file in chunks and yields each chunk as a fragment
func (s *File) fileFragments(reader *bufio.Reader, yield FragmentsFunc) error {
	// Create a buffer if the caller hasn't provided one
	if s.Buffer == nil {
		s.Buffer = make([]byte, defaultBufferSize)
	}
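	// totalLines tracks how many newlines have been consumed so far, so each
	// fragment's StartLine is absolute within the file.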
	totalLines := 0
	for {
		fragment := Fragment{
			FilePath: s.FullPath(),
		}
		n, err := reader.Read(s.Buffer)
		if n == 0 {
			if err != nil && err != io.EOF {
				return yield(fragment, fmt.Errorf("could not read file: %w", err))
			}
			return nil
		}
		// Only check the filetype at the start of the file.
		if totalLines == 0 {
			// TODO: could other optimizations be introduced here?
			if mimetype, err := filetype.Match(s.Buffer[:n]); err != nil {
				return yield(
					fragment,
					fmt.Errorf("could not read file: could not determine type: %w", err),
				)
			} else if mimetype.MIME.Type == "application" {
				logging.Debug().
					Str("mime_type", mimetype.MIME.Value).
					Str("path", s.FullPath()).
					Msg("skipping binary file")
				return nil
			}
		}
		// Try to split chunks across large areas of whitespace, if possible.
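		// readUntilSafeBoundary (defined elsewhere in this package) peeks up
		// to maxPeekSize additional bytes so a potential match isn't cut in
		// half at a chunk boundary.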
		peekBuf := bytes.NewBuffer(s.Buffer[:n])
		if err := readUntilSafeBoundary(reader, n, maxPeekSize, peekBuf); err != nil {
			return yield(
				fragment,
				fmt.Errorf("could not read file: could not read until safe boundary: %w", err),
			)
		}
		fragment.Raw = peekBuf.String()
		fragment.Bytes = peekBuf.Bytes()
		fragment.StartLine = totalLines + 1
		// Count the number of newlines in this chunk
		totalLines += strings.Count(fragment.Raw, "\n")
		if len(s.Symlink) > 0 {
			fragment.SymlinkFile = s.Symlink
		}
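		// On Windows, report slash-separated paths and preserve the original
		// path in WindowsFilePath.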
		if isWindows {
			fragment.FilePath = filepath.ToSlash(fragment.FilePath)
			fragment.SymlinkFile = filepath.ToSlash(s.Symlink)
			fragment.WindowsFilePath = s.FullPath()
		}
		// Log errors but continue, since there's content to yield
		if err != nil && err != io.EOF {
			logging.Warn().Err(err).Msg("issue reading file")
		}
		// Done with the file!
		if err == io.EOF {
			return yield(fragment, nil)
		}
		if err := yield(fragment, err); err != nil {
			return err
		}
	}
}

// FullPath returns the File.Path with any preceding outer paths,
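// joined by InnerPathSeparator, e.g. "outer.zip!inner.tar!secrets.txt"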
func (s *File) FullPath() string {
	if len(s.outerPaths) > 0 {
		return strings.Join(
			// outerPaths have already been normalized to slashes
			append(s.outerPaths, s.Path),
			InnerPathSeparator,
		)
	}
	return s.Path
}