4
0

file.go 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264
  1. package sources
  2. import (
  3. "bufio"
  4. "bytes"
  5. "context"
  6. "fmt"
  7. "io"
  8. "os"
  9. "path/filepath"
  10. "strings"
  11. "github.com/h2non/filetype"
  12. "github.com/mholt/archives"
  13. "github.com/rs/zerolog"
  14. "github.com/zricethezav/gitleaks/v8/config"
  15. "github.com/zricethezav/gitleaks/v8/logging"
  16. )
  17. const defaultBufferSize = 100 * 1_000 // 100kb
  18. const InnerPathSeparator = "!"
// seekReaderAt is the random-access capability required by some extractors
// (zip, 7z). Readers that don't satisfy it are spooled to a temp file before
// extraction — see extractorFragments.
type seekReaderAt interface {
	io.ReaderAt
	io.Seeker
}
// File is a source for yielding fragments from a file or other reader
type File struct {
	// Content provides a reader to the file's content
	Content io.Reader
	// Path is the resolved real path of the file
	Path string
	// Symlink represents a symlink to the file if that's how it was discovered
	Symlink string
	// Buffer is used for reading the content in chunks; if nil, one of
	// defaultBufferSize bytes is allocated on first use
	Buffer []byte
	// Config is the gitleaks config used for shouldSkipPath. If not set, then
	// shouldSkipPath is ignored
	Config *config.Config
	// outerPaths is the list of container paths (e.g. archives) that lead to
	// this file; entries are slash-normalized and joined with
	// InnerPathSeparator by FullPath
	outerPaths []string
	// MaxArchiveDepth limits how deep the sources will explore nested
	// archives; 0 means archives are never descended into
	MaxArchiveDepth int
	// archiveDepth is the current archive nesting depth
	archiveDepth int
}
  44. // Fragments yields fragments for the this source
  45. func (s *File) Fragments(ctx context.Context, yield FragmentsFunc) error {
  46. format, _, err := archives.Identify(ctx, s.Path, nil)
  47. // Process the file as an archive if there's no error && Identify returns
  48. // a format; but if there's an error or no format, just swallow the error
  49. // and fall back on treating it like a normal file and let fileFragments
  50. // decide what to do with it.
  51. if err == nil && format != nil {
  52. if s.archiveDepth+1 > s.MaxArchiveDepth {
  53. var event *zerolog.Event
  54. // Warn if the feature is enabled; else emit a trace log.
  55. if s.MaxArchiveDepth != 0 {
  56. event = logging.Warn()
  57. } else {
  58. event = logging.Trace()
  59. }
  60. event.Str(
  61. "path", s.FullPath(),
  62. ).Int(
  63. "max_archive_depth", s.MaxArchiveDepth,
  64. ).Msg("skipping archive: exceeds max archive depth")
  65. return nil
  66. }
  67. if extractor, ok := format.(archives.Extractor); ok {
  68. return s.extractorFragments(ctx, extractor, s.Content, yield)
  69. }
  70. if decompressor, ok := format.(archives.Decompressor); ok {
  71. return s.decompressorFragments(ctx, decompressor, s.Content, yield)
  72. }
  73. logging.Warn().Str("path", s.FullPath()).Msg("skipping unknown archive type")
  74. }
  75. return s.fileFragments(ctx, bufio.NewReader(s.Content), yield)
  76. }
  77. // extractorFragments recursively crawls archives and yields fragments
  78. func (s *File) extractorFragments(ctx context.Context, extractor archives.Extractor, reader io.Reader, yield FragmentsFunc) error {
  79. if _, isSeekReaderAt := reader.(seekReaderAt); !isSeekReaderAt {
  80. switch extractor.(type) {
  81. case archives.SevenZip, archives.Zip:
  82. tmpfile, err := os.CreateTemp("", "gitleaks-archive-")
  83. if err != nil {
  84. logging.Error().Str("path", s.FullPath()).Msg("could not create tmp file")
  85. return nil
  86. }
  87. defer func() {
  88. _ = tmpfile.Close()
  89. _ = os.Remove(tmpfile.Name())
  90. }()
  91. _, err = io.Copy(tmpfile, reader)
  92. if err != nil {
  93. logging.Error().Str("path", s.FullPath()).Msg("could not copy archive file")
  94. return nil
  95. }
  96. reader = tmpfile
  97. }
  98. }
  99. return extractor.Extract(ctx, reader, func(_ context.Context, d archives.FileInfo) error {
  100. if d.IsDir() {
  101. return nil
  102. }
  103. innerReader, err := d.Open()
  104. if err != nil {
  105. logging.Error().Err(err).Str("path", s.FullPath()).Msg("could not open archive inner file")
  106. return nil
  107. }
  108. defer innerReader.Close()
  109. path := filepath.Clean(d.NameInArchive)
  110. if s.Config != nil && shouldSkipPath(s.Config, path) {
  111. logging.Debug().Str("path", s.FullPath()).Msg("skipping file: global allowlist")
  112. return nil
  113. }
  114. file := &File{
  115. Content: innerReader,
  116. Path: path,
  117. Symlink: s.Symlink,
  118. outerPaths: append(s.outerPaths, filepath.ToSlash(s.Path)),
  119. MaxArchiveDepth: s.MaxArchiveDepth,
  120. archiveDepth: s.archiveDepth + 1,
  121. }
  122. if err := file.Fragments(ctx, yield); err != nil {
  123. return err
  124. }
  125. return nil
  126. })
  127. }
  128. // decompressorFragments recursively crawls archives and yields fragments
  129. func (s *File) decompressorFragments(ctx context.Context, decompressor archives.Decompressor, reader io.Reader, yield FragmentsFunc) error {
  130. innerReader, err := decompressor.OpenReader(reader)
  131. if err != nil {
  132. logging.Error().Str("path", s.FullPath()).Msg("could read compressed file")
  133. return nil
  134. }
  135. if err := s.fileFragments(ctx, bufio.NewReader(innerReader), yield); err != nil {
  136. _ = innerReader.Close()
  137. return err
  138. }
  139. _ = innerReader.Close()
  140. return nil
  141. }
// fileFragments reads the file into fragments to yield.
//
// Content is consumed in Buffer-sized chunks; each chunk is extended to a
// "safe" boundary (via readUntilSafeBoundary) before being yielded as a
// Fragment so secrets are less likely to be split across chunks. The first
// chunk is sniffed for a MIME type and "application/*" content is skipped
// as binary. Returns ctx.Err() on cancellation, the first non-nil error
// returned by yield, or nil when the reader is exhausted.
func (s *File) fileFragments(ctx context.Context, reader *bufio.Reader, yield FragmentsFunc) error {
	// Create a buffer if the caller hasn't provided one
	if s.Buffer == nil {
		s.Buffer = make([]byte, defaultBufferSize)
	}
	// Running newline count across chunks; used to compute each fragment's
	// 1-based StartLine.
	totalLines := 0
	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
			fragment := Fragment{
				FilePath: s.FullPath(),
			}
			// NOTE: Read may return n > 0 together with a non-nil error;
			// the error is re-checked below only after the bytes read have
			// been processed.
			n, err := reader.Read(s.Buffer)
			if n == 0 {
				if err != nil && err != io.EOF {
					return yield(fragment, fmt.Errorf("could not read file: %w", err))
				}
				// Clean EOF with no pending bytes: done.
				return nil
			}
			// Only check the filetype at the start of file.
			if totalLines == 0 {
				// TODO: could other optimizations be introduced here?
				if mimetype, err := filetype.Match(s.Buffer[:n]); err != nil {
					return yield(
						fragment,
						fmt.Errorf("could not read file: could not determine type: %w", err),
					)
				} else if mimetype.MIME.Type == "application" {
					// "application/*" is treated as binary and not scanned.
					logging.Debug().
						Str("mime_type", mimetype.MIME.Value).
						Str("path", s.FullPath()).
						Msgf("skipping binary file")
					return nil
				}
			}
			// Try to split chunks across large areas of whitespace, if possible.
			peekBuf := bytes.NewBuffer(s.Buffer[:n])
			if err := readUntilSafeBoundary(reader, n, maxPeekSize, peekBuf); err != nil {
				return yield(
					fragment,
					fmt.Errorf("could not read file: could not read until safe boundary: %w", err),
				)
			}
			fragment.Raw = peekBuf.String()
			fragment.Bytes = peekBuf.Bytes()
			fragment.StartLine = totalLines + 1
			// Count the number of newlines in this chunk
			totalLines += strings.Count(fragment.Raw, "\n")
			if len(s.Symlink) > 0 {
				fragment.SymlinkFile = s.Symlink
			}
			if isWindows {
				// Normalize paths to forward slashes, preserving the
				// original OS path in WindowsFilePath.
				fragment.FilePath = filepath.ToSlash(fragment.FilePath)
				fragment.SymlinkFile = filepath.ToSlash(s.Symlink)
				fragment.WindowsFilePath = s.FullPath()
			}
			// log errors but continue since there's content
			if err != nil && err != io.EOF {
				logging.Warn().Err(err).Msgf("issue reading file")
			}
			// Done with the file!
			if err == io.EOF {
				return yield(fragment, nil)
			}
			if err := yield(fragment, err); err != nil {
				return err
			}
		}
	}
}
  215. // FullPath returns the File.Path with any preceding outer paths
  216. func (s *File) FullPath() string {
  217. if len(s.outerPaths) > 0 {
  218. return strings.Join(
  219. // outerPaths have already been normalized to slash
  220. append(s.outerPaths, s.Path),
  221. InnerPathSeparator,
  222. )
  223. }
  224. return s.Path
  225. }