scan.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462
  1. package scan
  2. import (
  3. "bytes"
  4. "fmt"
  5. "io"
  6. "sync"
  7. "time"
  8. "github.com/zricethezav/gitleaks/v6/manager"
  9. "github.com/go-git/go-git/v5"
  10. "github.com/go-git/go-git/v5/plumbing"
  11. fdiff "github.com/go-git/go-git/v5/plumbing/format/diff"
  12. "github.com/go-git/go-git/v5/plumbing/object"
  13. "github.com/go-git/go-git/v5/plumbing/storer"
  14. "github.com/sergi/go-diff/diffmatchpatch"
  15. log "github.com/sirupsen/logrus"
  16. )
// Bundle contains various git information for scans. The scan functions in
// this file build a Bundle for each piece of content and pass it to
// repo.CheckRules.
type Bundle struct {
	// Commit is the commit the content is attributed to. Uncommitted scans
	// use a placeholder commit with an empty hash.
	Commit *object.Commit
	// Patch is the rendered text of the whole patch. Within this file only
	// scanPatch sets it.
	Patch string
	// Content is the text that is checked against the rules.
	Content string
	// FilePath is the path of the file that Content was taken from.
	FilePath string
	// Operation is the diff operation associated with Content.
	Operation fdiff.Operation
	// reader is not set by any scan function in this file.
	// NOTE(review): presumably used when resolving line numbers — confirm.
	reader io.Reader
	// lineLookup is not set by any scan function in this file.
	// NOTE(review): presumably tracks lines already reported — confirm.
	lineLookup map[string]bool
	// scanType is one of patchScan, uncommittedScan or commitScan.
	scanType int
}
// commitScanner is the signature shared by the functions that scan a single
// commit. Scanning all files at a commit (--files-at-commit=) and scanning the
// patches generated by a commit (--commit=) need the same preparatory work.
// scanCommit performs that shared work and then calls the commitScanner it was
// given for the logic that differs between the two cases.
type commitScanner func(c *object.Commit, repo *Repo) error
const (
	// We need to differentiate between scans as the logic for line searching is different between
	// scanning patches, commits, and uncommitted files.

	// patchScan marks content that came from a chunk of a commit patch.
	patchScan int = iota + 1
	// uncommittedScan marks content read from the working tree.
	uncommittedScan
	// commitScan marks the full contents of a file at a commit.
	commitScan
)
  40. // Scan is responsible for scanning the entire history (default behavior) of a
  41. // git repo. Options that can change the behavior of this function include: --Commit, --depth, --branch.
  42. // See options/options.go for an explanation on these options.
  43. func (repo *Repo) Scan() error {
  44. if err := repo.setupTimeout(); err != nil {
  45. return err
  46. }
  47. if repo.cancel != nil {
  48. defer repo.cancel()
  49. }
  50. if repo.Repository == nil {
  51. return fmt.Errorf("%s repo is empty", repo.Name)
  52. }
  53. // load up alternative config if possible, if not use manager's config
  54. if repo.Manager.Opts.RepoConfig {
  55. cfg, err := repo.loadRepoConfig()
  56. if err != nil {
  57. return err
  58. }
  59. repo.config = cfg
  60. }
  61. scanTimeStart := time.Now()
  62. // scan Commit patches OR all files at Commit. See https://github.com/zricethezav/gitleaks/issues/326
  63. if repo.Manager.Opts.Commit != "" {
  64. return scanCommit(repo.Manager.Opts.Commit, repo, scanCommitPatches)
  65. } else if repo.Manager.Opts.FilesAtCommit != "" {
  66. return scanCommit(repo.Manager.Opts.FilesAtCommit, repo, scanFilesAtCommit)
  67. }
  68. logOpts, err := getLogOptions(repo)
  69. if err != nil {
  70. return err
  71. }
  72. cIter, err := repo.Log(logOpts)
  73. if err != nil {
  74. return err
  75. }
  76. cc := 0
  77. semaphore := make(chan bool, howManyThreads(repo.Manager.Opts.Threads))
  78. wg := sync.WaitGroup{}
  79. err = cIter.ForEach(func(c *object.Commit) error {
  80. if c == nil || repo.timeoutReached() || repo.depthReached(cc) {
  81. return storer.ErrStop
  82. }
  83. // Check if Commit is allowlisted
  84. if isCommitAllowListed(c.Hash.String(), repo.config.Allowlist.Commits) {
  85. return nil
  86. }
  87. // Check if at root
  88. if len(c.ParentHashes) == 0 {
  89. cc++
  90. err = scanFilesAtCommit(c, repo)
  91. if err != nil {
  92. return err
  93. }
  94. return nil
  95. }
  96. // increase Commit counter
  97. cc++
  98. // inspect first parent only as all other parents will be eventually reached
  99. // (they exist as the tip of other branches, etc)
  100. // See https://github.com/zricethezav/gitleaks/issues/413 for details
  101. parent, err := c.Parent(0)
  102. if err != nil {
  103. return err
  104. }
  105. defer func() {
  106. if err := recover(); err != nil {
  107. // sometimes the Patch generation will fail due to a known bug in
  108. // sergi's go-diff: https://github.com/sergi/go-diff/issues/89.
  109. // Once a fix has been merged I will remove this recover.
  110. return
  111. }
  112. }()
  113. if repo.timeoutReached() {
  114. return nil
  115. }
  116. if parent == nil {
  117. // shouldn't reach this point but just in case
  118. return nil
  119. }
  120. start := time.Now()
  121. patch, err := parent.Patch(c)
  122. if err != nil {
  123. log.Errorf("could not generate Patch")
  124. }
  125. repo.Manager.RecordTime(manager.PatchTime(howLong(start)))
  126. wg.Add(1)
  127. semaphore <- true
  128. go func(c *object.Commit, patch *object.Patch) {
  129. defer func() {
  130. <-semaphore
  131. wg.Done()
  132. }()
  133. scanPatch(patch, c, repo)
  134. }(c, patch)
  135. if c.Hash.String() == repo.Manager.Opts.CommitTo {
  136. return storer.ErrStop
  137. }
  138. return nil
  139. })
  140. wg.Wait()
  141. repo.Manager.RecordTime(manager.ScanTime(howLong(scanTimeStart)))
  142. repo.Manager.IncrementCommits(cc)
  143. return nil
  144. }
  145. // scanEmpty scans an empty repo without any commits. See https://github.com/zricethezav/gitleaks/issues/352
  146. func (repo *Repo) scanEmpty() error {
  147. scanTimeStart := time.Now()
  148. wt, err := repo.Worktree()
  149. if err != nil {
  150. return err
  151. }
  152. status, err := wt.Status()
  153. for fn := range status {
  154. workTreeBuf := bytes.NewBuffer(nil)
  155. workTreeFile, err := wt.Filesystem.Open(fn)
  156. if err != nil {
  157. continue
  158. }
  159. if _, err := io.Copy(workTreeBuf, workTreeFile); err != nil {
  160. return err
  161. }
  162. repo.CheckRules(&Bundle{
  163. Content: workTreeBuf.String(),
  164. FilePath: workTreeFile.Name(),
  165. Commit: emptyCommit(),
  166. scanType: uncommittedScan,
  167. })
  168. }
  169. repo.Manager.RecordTime(manager.ScanTime(howLong(scanTimeStart)))
  170. return nil
  171. }
  172. // scanUncommitted will do a `git diff` and scan changed files that are being tracked. This is useful functionality
  173. // for a pre-Commit hook so you can make sure your code does not have any leaks before committing.
  174. func (repo *Repo) scanUncommitted() error {
  175. // load up alternative config if possible, if not use manager's config
  176. if repo.Manager.Opts.RepoConfig {
  177. cfg, err := repo.loadRepoConfig()
  178. if err != nil {
  179. return err
  180. }
  181. repo.config = cfg
  182. }
  183. if err := repo.setupTimeout(); err != nil {
  184. return err
  185. }
  186. r, err := repo.Head()
  187. if err == plumbing.ErrReferenceNotFound {
  188. // possibly an empty repo, or maybe its not, either way lets scan all the files in the directory
  189. return repo.scanEmpty()
  190. } else if err != nil {
  191. return err
  192. }
  193. scanTimeStart := time.Now()
  194. c, err := repo.CommitObject(r.Hash())
  195. if err != nil {
  196. return err
  197. }
  198. // Staged change so the Commit details do not yet exist. Insert empty defaults.
  199. c.Hash = plumbing.Hash{}
  200. c.Message = "***STAGED CHANGES***"
  201. c.Author.Name = ""
  202. c.Author.Email = ""
  203. c.Author.When = time.Unix(0, 0).UTC()
  204. prevTree, err := c.Tree()
  205. if err != nil {
  206. return err
  207. }
  208. wt, err := repo.Worktree()
  209. if err != nil {
  210. return err
  211. }
  212. status, err := wt.Status()
  213. for fn, state := range status {
  214. var (
  215. prevFileContents string
  216. currFileContents string
  217. filename string
  218. )
  219. if state.Staging != git.Untracked {
  220. if state.Staging == git.Deleted {
  221. // file in staging has been deleted, aka it is not on the filesystem
  222. // so the contents of the file are ""
  223. currFileContents = ""
  224. } else {
  225. workTreeBuf := bytes.NewBuffer(nil)
  226. workTreeFile, err := wt.Filesystem.Open(fn)
  227. if err != nil {
  228. continue
  229. }
  230. if _, err := io.Copy(workTreeBuf, workTreeFile); err != nil {
  231. return err
  232. }
  233. currFileContents = workTreeBuf.String()
  234. filename = workTreeFile.Name()
  235. }
  236. // get files at HEAD state
  237. prevFile, err := prevTree.File(fn)
  238. if err != nil {
  239. prevFileContents = ""
  240. } else {
  241. prevFileContents, err = prevFile.Contents()
  242. if err != nil {
  243. return err
  244. }
  245. if filename == "" {
  246. filename = prevFile.Name
  247. }
  248. }
  249. dmp := diffmatchpatch.New()
  250. diffs := dmp.DiffCleanupSemantic(dmp.DiffMain(prevFileContents, currFileContents, false))
  251. var diffContents string
  252. for _, d := range diffs {
  253. if d.Type == diffmatchpatch.DiffInsert {
  254. diffContents += fmt.Sprintf("%s\n", d.Text)
  255. }
  256. }
  257. repo.CheckRules(&Bundle{
  258. Content: diffContents,
  259. FilePath: filename,
  260. Commit: c,
  261. scanType: uncommittedScan,
  262. })
  263. }
  264. }
  265. if err != nil {
  266. return err
  267. }
  268. repo.Manager.RecordTime(manager.ScanTime(howLong(scanTimeStart)))
  269. return nil
  270. }
  271. // scan accepts a Patch, Commit, and repo. If the patches contains files that are
  272. // binary, then gitleaks will skip scanning that file OR if a file is matched on
  273. // allowlisted files set in the configuration. If a global rule for files is defined and a filename
  274. // matches said global rule, then a leak is sent to the manager.
  275. // After that, file chunks are created which are then inspected by InspectString()
  276. func scanPatch(patch *object.Patch, c *object.Commit, repo *Repo) {
  277. bundle := Bundle{
  278. Commit: c,
  279. Patch: patch.String(),
  280. scanType: patchScan,
  281. }
  282. for _, f := range patch.FilePatches() {
  283. if repo.timeoutReached() {
  284. return
  285. }
  286. if f.IsBinary() {
  287. continue
  288. }
  289. for _, chunk := range f.Chunks() {
  290. if chunk.Type() == fdiff.Add || (repo.Manager.Opts.Deletion && chunk.Type() == fdiff.Delete) {
  291. bundle.Content = chunk.Content()
  292. bundle.Operation = chunk.Type()
  293. // get filepath
  294. from, to := f.Files()
  295. if from != nil {
  296. bundle.FilePath = from.Path()
  297. } else if to != nil {
  298. bundle.FilePath = to.Path()
  299. } else {
  300. bundle.FilePath = "???"
  301. }
  302. repo.CheckRules(&bundle)
  303. }
  304. }
  305. }
  306. }
  307. // scanCommit accepts a Commit hash, repo, and commit scanning function. A new Commit
  308. // object will be created from the hash which will be passed into either scanCommitPatches
  309. // or scanFilesAtCommit depending on the options set.
  310. func scanCommit(commit string, repo *Repo, f commitScanner) error {
  311. if commit == "latest" {
  312. ref, err := repo.Repository.Head()
  313. if err != nil {
  314. return err
  315. }
  316. commit = ref.Hash().String()
  317. }
  318. repo.Manager.IncrementCommits(1)
  319. h := plumbing.NewHash(commit)
  320. c, err := repo.CommitObject(h)
  321. if err != nil {
  322. return err
  323. }
  324. return f(c, repo)
  325. }
  326. // scanCommitPatches accepts a Commit object and a repo. This function is only called when the --Commit=
  327. // option has been set. That option tells gitleaks to look only at a single Commit and check the contents
  328. // of said Commit. Similar to scan(), if the files contained in the Commit are a binaries or if they are
  329. // allowlisted then those files will be skipped.
  330. func scanCommitPatches(c *object.Commit, repo *Repo) error {
  331. if len(c.ParentHashes) == 0 {
  332. err := scanFilesAtCommit(c, repo)
  333. if err != nil {
  334. return err
  335. }
  336. }
  337. return c.Parents().ForEach(func(parent *object.Commit) error {
  338. defer func() {
  339. if err := recover(); err != nil {
  340. // sometimes the Patch generation will fail due to a known bug in
  341. // sergi's go-diff: https://github.com/sergi/go-diff/issues/89.
  342. // Once a fix has been merged I will remove this recover.
  343. return
  344. }
  345. }()
  346. if repo.timeoutReached() {
  347. return nil
  348. }
  349. if parent == nil {
  350. return nil
  351. }
  352. start := time.Now()
  353. patch, err := parent.Patch(c)
  354. if err != nil {
  355. return fmt.Errorf("could not generate Patch")
  356. }
  357. repo.Manager.RecordTime(manager.PatchTime(howLong(start)))
  358. scanPatch(patch, c, repo)
  359. return nil
  360. })
  361. }
  362. // scanFilesAtCommit accepts a Commit object and a repo. This function is only called when the --files-at-Commit=
  363. // option has been set. That option tells gitleaks to look only at ALL the files at a Commit and check the contents
  364. // of said Commit. Similar to scan(), if the files contained in the Commit are a binaries or if they are
  365. // allowlisted then those files will be skipped.
  366. func scanFilesAtCommit(c *object.Commit, repo *Repo) error {
  367. fIter, err := c.Files()
  368. if err != nil {
  369. return err
  370. }
  371. err = fIter.ForEach(func(f *object.File) error {
  372. bin, err := f.IsBinary()
  373. if bin || repo.timeoutReached() {
  374. return nil
  375. } else if err != nil {
  376. return err
  377. }
  378. content, err := f.Contents()
  379. if err != nil {
  380. return err
  381. }
  382. repo.CheckRules(&Bundle{
  383. Content: content,
  384. FilePath: f.Name,
  385. Commit: c,
  386. scanType: commitScan,
  387. Operation: fdiff.Add,
  388. })
  389. return nil
  390. })
  391. return err
  392. }
  393. // depthReached checks if i meets the depth (--depth=) if set
  394. func (repo *Repo) depthReached(i int) bool {
  395. if repo.Manager.Opts.Depth != 0 && repo.Manager.Opts.Depth == i {
  396. log.Warnf("Exceeded depth limit (%d)", i)
  397. return true
  398. }
  399. return false
  400. }
  401. // emptyCommit generates an empty commit used for scanning uncommitted changes
  402. func emptyCommit() *object.Commit {
  403. return &object.Commit{
  404. Hash: plumbing.Hash{},
  405. Message: "***STAGED CHANGES***",
  406. Author: object.Signature{
  407. Name: "",
  408. Email: "",
  409. When: time.Unix(0, 0).UTC(),
  410. },
  411. }
  412. }