file.go

package sources

import (
	"bufio"
	"bytes"
	"context"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"strings"

	"github.com/h2non/filetype"
	"github.com/mholt/archives"
	"github.com/zricethezav/gitleaks/v8/config"
	"github.com/zricethezav/gitleaks/v8/logging"
)

const defaultBufferSize = 100 * 1_000 // 100 KB
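
// InnerPathSeparator joins each containing archive path with the inner file
// path in FullPath, e.g. "outer.zip!inner/file.txt" (illustrative names).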
const InnerPathSeparator = "!"
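
// seekReaderAt is satisfied by readers that support random access, such as
// *os.File, which some extractors require.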
type seekReaderAt interface {
	io.ReaderAt
	io.Seeker
}

// File is a source for yielding fragments from a file or other reader
type File struct {
	// Content provides a reader to the file's content
	Content io.Reader
	// Path is the resolved real path of the file
	Path string
	// Symlink represents a symlink to the file if that's how it was discovered
	Symlink string
	// Buffer is used for reading the content in chunks
	Buffer []byte
	// Config is the gitleaks config used for shouldSkipPath. If not set,
	// shouldSkipPath is ignored.
	Config *config.Config
	// outerPaths is the list of container paths (e.g. archives) that lead to
	// this file
	outerPaths []string
	// MaxArchiveDepth limits how deep the source will explore nested archives
	MaxArchiveDepth int
	// archiveDepth is the current archive nesting depth
	archiveDepth int
}
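
// A minimal usage sketch (illustrative only; assumes a context.Context in
// scope and the Fragment/FragmentsFunc definitions found elsewhere in this
// package):
//
//	f, _ := os.Open("example.txt")
//	src := &File{Content: f, Path: "example.txt"}
//	_ = src.Fragments(ctx, func(fragment Fragment, err error) error {
//		// inspect fragment.Raw / fragment.StartLine here
//		return err
//	})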

// Fragments yields fragments for this source
func (s *File) Fragments(ctx context.Context, yield FragmentsFunc) error {
	format, _, err := archives.Identify(ctx, s.Path, nil)
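	// Note: with a nil stream, archives.Identify matches on the file name
	// only; the content itself is not sniffed here.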
	// Process the file as an archive if there's no error && Identify returns
	// a format; but if there's an error or no format, just swallow the error
	// and fall back on treating it like a normal file and let fileFragments
	// decide what to do with it.
	if err == nil && format != nil {
		if s.archiveDepth+1 > s.MaxArchiveDepth {
			// Only warn when the feature is enabled
			if s.MaxArchiveDepth != 0 {
				logging.Warn().Str(
					"path", s.FullPath(),
				).Int(
					"max_archive_depth", s.MaxArchiveDepth,
				).Msg("skipping archive: exceeds max archive depth")
			}
			return nil
		}
		if extractor, ok := format.(archives.Extractor); ok {
			return s.extractorFragments(ctx, extractor, s.Content, yield)
		}
		if decompressor, ok := format.(archives.Decompressor); ok {
			return s.decompressorFragments(decompressor, s.Content, yield)
		}
		logging.Warn().Str("path", s.FullPath()).Msg("skipping unknown archive type")
	}
	return s.fileFragments(bufio.NewReader(s.Content), yield)
}

// extractorFragments recursively crawls archives and yields fragments
func (s *File) extractorFragments(ctx context.Context, extractor archives.Extractor, reader io.Reader, yield FragmentsFunc) error {
	if _, isSeekReaderAt := reader.(seekReaderAt); !isSeekReaderAt {
		switch extractor.(type) {
		case archives.SevenZip, archives.Zip:
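			// Zip and 7z keep their file index at the end of the archive, so
			// extraction needs random access. If the reader can't seek, spill
			// the stream to a temporary file first.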
			tmpfile, err := os.CreateTemp("", "gitleaks-archive-")
			if err != nil {
				logging.Error().Str("path", s.FullPath()).Msg("could not create tmp file")
				return nil
			}
			defer func() {
				_ = tmpfile.Close()
				_ = os.Remove(tmpfile.Name())
			}()

			_, err = io.Copy(tmpfile, reader)
			if err != nil {
				logging.Error().Str("path", s.FullPath()).Msg("could not copy archive file")
				return nil
			}
			reader = tmpfile
		}
	}
	return extractor.Extract(ctx, reader, func(_ context.Context, d archives.FileInfo) error {
		if d.IsDir() {
			return nil
		}

		innerReader, err := d.Open()
		if err != nil {
			logging.Error().Err(err).Str("path", s.FullPath()).Msg("could not open archive inner file")
			return nil
		}
		defer innerReader.Close()

		path := filepath.Clean(d.NameInArchive)
		if s.Config != nil && shouldSkipPath(s.Config, path) {
			logging.Debug().Str("path", s.FullPath()).Msg("skipping file: global allowlist")
			return nil
		}

		file := &File{
			Content:         innerReader,
			Path:            path,
			Symlink:         s.Symlink,
			outerPaths:      append(s.outerPaths, filepath.ToSlash(s.Path)),
			MaxArchiveDepth: s.MaxArchiveDepth,
			archiveDepth:    s.archiveDepth + 1,
		}
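		// Recurse: the inner entry may itself be an archive; Fragments will
		// re-identify it with the incremented archiveDepth.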
		return file.Fragments(ctx, yield)
	})
}

// decompressorFragments decompresses single-stream compression formats (e.g.
// gzip) and yields fragments from the decompressed content
func (s *File) decompressorFragments(decompressor archives.Decompressor, reader io.Reader, yield FragmentsFunc) error {
	innerReader, err := decompressor.OpenReader(reader)
	if err != nil {
		logging.Error().Str("path", s.FullPath()).Msg("could not read compressed file")
		return nil
	}

	if err := s.fileFragments(bufio.NewReader(innerReader), yield); err != nil {
		_ = innerReader.Close()
		return err
	}
	_ = innerReader.Close()
	return nil
}

// fileFragments reads the file into fragments to yield
func (s *File) fileFragments(reader *bufio.Reader, yield FragmentsFunc) error {
	// Create a buffer if the caller hasn't provided one
	if s.Buffer == nil {
		s.Buffer = make([]byte, defaultBufferSize)
	}

	totalLines := 0
	for {
		fragment := Fragment{
			FilePath: s.FullPath(),
		}

		n, err := reader.Read(s.Buffer)
		if n == 0 {
			if err != nil && err != io.EOF {
				return yield(fragment, fmt.Errorf("could not read file: %w", err))
			}
			return nil
		}
		// Only check the filetype at the start of the file.
		if totalLines == 0 {
			// TODO: could other optimizations be introduced here?
			if mimetype, err := filetype.Match(s.Buffer[:n]); err != nil {
				return yield(
					fragment,
					fmt.Errorf("could not read file: could not determine type: %w", err),
				)
			} else if mimetype.MIME.Type == "application" {
				logging.Debug().
					Str("mime_type", mimetype.MIME.Value).
					Str("path", s.FullPath()).
					Msg("skipping binary file")
				return nil
			}
		}

		// Try to split chunks across large areas of whitespace, if possible.
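		// (readUntilSafeBoundary peeks further into the reader so a chunk
		// boundary is less likely to land in the middle of a secret.)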
		peekBuf := bytes.NewBuffer(s.Buffer[:n])
		if err := readUntilSafeBoundary(reader, n, maxPeekSize, peekBuf); err != nil {
			return yield(
				fragment,
				fmt.Errorf("could not read file: could not read until safe boundary: %w", err),
			)
		}
		fragment.Raw = peekBuf.String()
		fragment.Bytes = peekBuf.Bytes()
		fragment.StartLine = totalLines + 1
		// Count the number of newlines in this chunk
		totalLines += strings.Count(fragment.Raw, "\n")

		if len(s.Symlink) > 0 {
			fragment.SymlinkFile = s.Symlink
		}
		if isWindows {
			fragment.FilePath = filepath.ToSlash(fragment.FilePath)
			fragment.SymlinkFile = filepath.ToSlash(s.Symlink)
			fragment.WindowsFilePath = s.FullPath()
		}

		// Log errors but continue since there's content
		if err != nil && err != io.EOF {
			logging.Warn().Err(err).Msg("issue reading file")
		}
		// Done with the file!
		if err == io.EOF {
			return yield(fragment, nil)
		}
		if err := yield(fragment, err); err != nil {
			return err
		}
	}
}

// FullPath returns the File.Path with any preceding outer paths
func (s *File) FullPath() string {
	if len(s.outerPaths) > 0 {
		return strings.Join(
			// outerPaths have already been normalized to slash
			append(s.outerPaths, s.Path),
			InnerPathSeparator,
		)
	}
	return s.Path
}