scan.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465
  1. package scan
  2. import (
  3. "bytes"
  4. "fmt"
  5. "io"
  6. "sync"
  7. "time"
  8. "github.com/zricethezav/gitleaks/v6/manager"
  9. "github.com/go-git/go-git/v5"
  10. "github.com/go-git/go-git/v5/plumbing"
  11. fdiff "github.com/go-git/go-git/v5/plumbing/format/diff"
  12. "github.com/go-git/go-git/v5/plumbing/object"
  13. "github.com/go-git/go-git/v5/plumbing/storer"
  14. "github.com/sergi/go-diff/diffmatchpatch"
  15. log "github.com/sirupsen/logrus"
  16. )
// Bundle contains various git information for scans. It is the unit of work
// that every scanner in this file hands to Repo.CheckRules.
type Bundle struct {
	// Commit is the commit the scanned content is attributed to. For
	// uncommitted scans this is a placeholder commit with empty metadata.
	Commit *object.Commit
	// Patch is the textual form of the whole patch; in this file it is only
	// populated by scanPatch.
	Patch string
	// Content is the text that gets inspected: a diff chunk, inserted diff
	// text, or a whole file, depending on the scanner.
	Content string
	// FilePath is the path of the file Content was taken from.
	FilePath string
	// Operation is the diff operation that produced Content.
	Operation fdiff.Operation
	// reader is not assigned anywhere in this file.
	// NOTE(review): presumably used by other scanners in the package — confirm.
	reader io.Reader
	// lineLookup is not assigned anywhere in this file.
	// NOTE(review): presumably populated by the rule-checking code — confirm.
	lineLookup map[string]bool
	// scanType is one of patchScan, uncommittedScan or commitScan.
	scanType int
}
// commitScanner is the function signature for scanning a single commit. Scanning
// all files at a commit (--files-at-commit=) and scanning the patches generated
// by a commit (--commit=) share the same setup work. scanCommit performs that
// shared work and then delegates to a commitScanner (scanFilesAtCommit or
// scanCommitPatches) for the logic that differs between the two cases.
type commitScanner func(c *object.Commit, repo *Repo) error
const (
	// We need to differentiate between scans as the logic for line searching is different between
	// scanning patches, commits, and uncommitted files.
	// Values start at 1 so the zero value of Bundle.scanType means "not set".

	// patchScan marks bundles built from diff chunks (see scanPatch).
	patchScan int = iota + 1
	// uncommittedScan marks bundles built from worktree files
	// (see scanEmpty and scanUncommitted).
	uncommittedScan
	// commitScan marks bundles built from whole files at a commit
	// (see scanFilesAtCommit).
	commitScan
)
  40. // Scan is responsible for scanning the entire history (default behavior) of a
  41. // git repo. Options that can change the behavior of this function include: --Commit, --depth, --branch.
  42. // See options/options.go for an explanation on these options.
  43. func (repo *Repo) Scan() error {
  44. if err := repo.setupTimeout(); err != nil {
  45. return err
  46. }
  47. if repo.cancel != nil {
  48. defer repo.cancel()
  49. }
  50. if repo.Repository == nil {
  51. return fmt.Errorf("%s repo is empty", repo.Name)
  52. }
  53. // load up alternative config if possible, if not use manager's config
  54. if repo.Manager.Opts.RepoConfig {
  55. cfg, err := repo.loadRepoConfig()
  56. if err != nil {
  57. return err
  58. }
  59. repo.config = cfg
  60. }
  61. scanTimeStart := time.Now()
  62. // scan Commit patches OR all files at Commit. See https://github.com/zricethezav/gitleaks/issues/326
  63. if repo.Manager.Opts.Commit != "" {
  64. return scanCommit(repo.Manager.Opts.Commit, repo, scanCommitPatches)
  65. } else if repo.Manager.Opts.FilesAtCommit != "" {
  66. return scanCommit(repo.Manager.Opts.FilesAtCommit, repo, scanFilesAtCommit)
  67. }
  68. logOpts, err := getLogOptions(repo)
  69. if err != nil {
  70. return err
  71. }
  72. cIter, err := repo.Log(logOpts)
  73. if err != nil {
  74. return err
  75. }
  76. cc := 0
  77. semaphore := make(chan bool, howManyThreads(repo.Manager.Opts.Threads))
  78. wg := sync.WaitGroup{}
  79. err = cIter.ForEach(func(c *object.Commit) error {
  80. if c == nil || repo.timeoutReached() || repo.depthReached(cc) {
  81. return storer.ErrStop
  82. }
  83. // Check if Commit is allowlisted
  84. if isCommitAllowListed(c.Hash.String(), repo.config.Allowlist.Commits) {
  85. return nil
  86. }
  87. // Check if at root
  88. if len(c.ParentHashes) == 0 {
  89. cc++
  90. err = scanFilesAtCommit(c, repo)
  91. if err != nil {
  92. return err
  93. }
  94. return nil
  95. }
  96. // increase Commit counter
  97. cc++
  98. // inspect first parent only as all other parents will be eventually reached
  99. // (they exist as the tip of other branches, etc)
  100. // See https://github.com/zricethezav/gitleaks/issues/413 for details
  101. parent, err := c.Parent(0)
  102. if err != nil {
  103. return err
  104. }
  105. defer func() {
  106. if err := recover(); err != nil {
  107. // sometimes the Patch generation will fail due to a known bug in
  108. // sergi's go-diff: https://github.com/sergi/go-diff/issues/89.
  109. // Once a fix has been merged I will remove this recover.
  110. return
  111. }
  112. }()
  113. if repo.timeoutReached() {
  114. return nil
  115. }
  116. if parent == nil {
  117. // shouldn't reach this point but just in case
  118. return nil
  119. }
  120. start := time.Now()
  121. patch, err := parent.Patch(c)
  122. if err != nil {
  123. log.Errorf("could not generate Patch")
  124. }
  125. repo.Manager.RecordTime(manager.PatchTime(howLong(start)))
  126. wg.Add(1)
  127. semaphore <- true
  128. go func(c *object.Commit, patch *object.Patch) {
  129. defer func() {
  130. <-semaphore
  131. wg.Done()
  132. }()
  133. scanPatch(patch, c, repo)
  134. }(c, patch)
  135. if c.Hash.String() == repo.Manager.Opts.CommitTo {
  136. return storer.ErrStop
  137. }
  138. return nil
  139. })
  140. wg.Wait()
  141. repo.Manager.RecordTime(manager.ScanTime(howLong(scanTimeStart)))
  142. repo.Manager.IncrementCommits(cc)
  143. return nil
  144. }
  145. // scanEmpty scans an empty repo without any commits. See https://github.com/zricethezav/gitleaks/issues/352
  146. func (repo *Repo) scanEmpty() error {
  147. scanTimeStart := time.Now()
  148. wt, err := repo.Worktree()
  149. if err != nil {
  150. return err
  151. }
  152. status, err := wt.Status()
  153. if err != nil {
  154. return err
  155. }
  156. for fn := range status {
  157. workTreeBuf := bytes.NewBuffer(nil)
  158. workTreeFile, err := wt.Filesystem.Open(fn)
  159. if err != nil {
  160. continue
  161. }
  162. if _, err := io.Copy(workTreeBuf, workTreeFile); err != nil {
  163. return err
  164. }
  165. repo.CheckRules(&Bundle{
  166. Content: workTreeBuf.String(),
  167. FilePath: workTreeFile.Name(),
  168. Commit: emptyCommit(),
  169. scanType: uncommittedScan,
  170. })
  171. }
  172. repo.Manager.RecordTime(manager.ScanTime(howLong(scanTimeStart)))
  173. return nil
  174. }
  175. // scanUncommitted will do a `git diff` and scan changed files that are being tracked. This is useful functionality
  176. // for a pre-Commit hook so you can make sure your code does not have any leaks before committing.
  177. func (repo *Repo) scanUncommitted() error {
  178. // load up alternative config if possible, if not use manager's config
  179. if repo.Manager.Opts.RepoConfig {
  180. cfg, err := repo.loadRepoConfig()
  181. if err != nil {
  182. return err
  183. }
  184. repo.config = cfg
  185. }
  186. if err := repo.setupTimeout(); err != nil {
  187. return err
  188. }
  189. r, err := repo.Head()
  190. if err == plumbing.ErrReferenceNotFound {
  191. // possibly an empty repo, or maybe its not, either way lets scan all the files in the directory
  192. return repo.scanEmpty()
  193. } else if err != nil {
  194. return err
  195. }
  196. scanTimeStart := time.Now()
  197. c, err := repo.CommitObject(r.Hash())
  198. if err != nil {
  199. return err
  200. }
  201. // Staged change so the Commit details do not yet exist. Insert empty defaults.
  202. c.Hash = plumbing.Hash{}
  203. c.Message = "***STAGED CHANGES***"
  204. c.Author.Name = ""
  205. c.Author.Email = ""
  206. c.Author.When = time.Unix(0, 0).UTC()
  207. prevTree, err := c.Tree()
  208. if err != nil {
  209. return err
  210. }
  211. wt, err := repo.Worktree()
  212. if err != nil {
  213. return err
  214. }
  215. status, err := wt.Status()
  216. for fn, state := range status {
  217. var (
  218. prevFileContents string
  219. currFileContents string
  220. filename string
  221. )
  222. if state.Staging != git.Untracked {
  223. if state.Staging == git.Deleted {
  224. // file in staging has been deleted, aka it is not on the filesystem
  225. // so the contents of the file are ""
  226. currFileContents = ""
  227. } else {
  228. workTreeBuf := bytes.NewBuffer(nil)
  229. workTreeFile, err := wt.Filesystem.Open(fn)
  230. if err != nil {
  231. continue
  232. }
  233. if _, err := io.Copy(workTreeBuf, workTreeFile); err != nil {
  234. return err
  235. }
  236. currFileContents = workTreeBuf.String()
  237. filename = workTreeFile.Name()
  238. }
  239. // get files at HEAD state
  240. prevFile, err := prevTree.File(fn)
  241. if err != nil {
  242. prevFileContents = ""
  243. } else {
  244. prevFileContents, err = prevFile.Contents()
  245. if err != nil {
  246. return err
  247. }
  248. if filename == "" {
  249. filename = prevFile.Name
  250. }
  251. }
  252. dmp := diffmatchpatch.New()
  253. diffs := dmp.DiffCleanupSemantic(dmp.DiffMain(prevFileContents, currFileContents, false))
  254. var diffContents string
  255. for _, d := range diffs {
  256. if d.Type == diffmatchpatch.DiffInsert {
  257. diffContents += fmt.Sprintf("%s\n", d.Text)
  258. }
  259. }
  260. repo.CheckRules(&Bundle{
  261. Content: diffContents,
  262. FilePath: filename,
  263. Commit: c,
  264. scanType: uncommittedScan,
  265. })
  266. }
  267. }
  268. if err != nil {
  269. return err
  270. }
  271. repo.Manager.RecordTime(manager.ScanTime(howLong(scanTimeStart)))
  272. return nil
  273. }
  274. // scan accepts a Patch, Commit, and repo. If the patches contains files that are
  275. // binary, then gitleaks will skip scanning that file OR if a file is matched on
  276. // allowlisted files set in the configuration. If a global rule for files is defined and a filename
  277. // matches said global rule, then a leak is sent to the manager.
  278. // After that, file chunks are created which are then inspected by InspectString()
  279. func scanPatch(patch *object.Patch, c *object.Commit, repo *Repo) {
  280. bundle := Bundle{
  281. Commit: c,
  282. Patch: patch.String(),
  283. scanType: patchScan,
  284. }
  285. for _, f := range patch.FilePatches() {
  286. if repo.timeoutReached() {
  287. return
  288. }
  289. if f.IsBinary() {
  290. continue
  291. }
  292. for _, chunk := range f.Chunks() {
  293. if chunk.Type() == fdiff.Add || (repo.Manager.Opts.Deletion && chunk.Type() == fdiff.Delete) {
  294. bundle.Content = chunk.Content()
  295. bundle.Operation = chunk.Type()
  296. // get filepath
  297. from, to := f.Files()
  298. if from != nil {
  299. bundle.FilePath = from.Path()
  300. } else if to != nil {
  301. bundle.FilePath = to.Path()
  302. } else {
  303. bundle.FilePath = "???"
  304. }
  305. repo.CheckRules(&bundle)
  306. }
  307. }
  308. }
  309. }
  310. // scanCommit accepts a Commit hash, repo, and commit scanning function. A new Commit
  311. // object will be created from the hash which will be passed into either scanCommitPatches
  312. // or scanFilesAtCommit depending on the options set.
  313. func scanCommit(commit string, repo *Repo, f commitScanner) error {
  314. if commit == "latest" {
  315. ref, err := repo.Repository.Head()
  316. if err != nil {
  317. return err
  318. }
  319. commit = ref.Hash().String()
  320. }
  321. repo.Manager.IncrementCommits(1)
  322. h := plumbing.NewHash(commit)
  323. c, err := repo.CommitObject(h)
  324. if err != nil {
  325. return err
  326. }
  327. return f(c, repo)
  328. }
  329. // scanCommitPatches accepts a Commit object and a repo. This function is only called when the --Commit=
  330. // option has been set. That option tells gitleaks to look only at a single Commit and check the contents
  331. // of said Commit. Similar to scan(), if the files contained in the Commit are a binaries or if they are
  332. // allowlisted then those files will be skipped.
  333. func scanCommitPatches(c *object.Commit, repo *Repo) error {
  334. if len(c.ParentHashes) == 0 {
  335. err := scanFilesAtCommit(c, repo)
  336. if err != nil {
  337. return err
  338. }
  339. }
  340. return c.Parents().ForEach(func(parent *object.Commit) error {
  341. defer func() {
  342. if err := recover(); err != nil {
  343. // sometimes the Patch generation will fail due to a known bug in
  344. // sergi's go-diff: https://github.com/sergi/go-diff/issues/89.
  345. // Once a fix has been merged I will remove this recover.
  346. return
  347. }
  348. }()
  349. if repo.timeoutReached() {
  350. return nil
  351. }
  352. if parent == nil {
  353. return nil
  354. }
  355. start := time.Now()
  356. patch, err := parent.Patch(c)
  357. if err != nil {
  358. return fmt.Errorf("could not generate Patch")
  359. }
  360. repo.Manager.RecordTime(manager.PatchTime(howLong(start)))
  361. scanPatch(patch, c, repo)
  362. return nil
  363. })
  364. }
  365. // scanFilesAtCommit accepts a Commit object and a repo. This function is only called when the --files-at-Commit=
  366. // option has been set. That option tells gitleaks to look only at ALL the files at a Commit and check the contents
  367. // of said Commit. Similar to scan(), if the files contained in the Commit are a binaries or if they are
  368. // allowlisted then those files will be skipped.
  369. func scanFilesAtCommit(c *object.Commit, repo *Repo) error {
  370. fIter, err := c.Files()
  371. if err != nil {
  372. return err
  373. }
  374. err = fIter.ForEach(func(f *object.File) error {
  375. bin, err := f.IsBinary()
  376. if bin || repo.timeoutReached() {
  377. return nil
  378. } else if err != nil {
  379. return err
  380. }
  381. content, err := f.Contents()
  382. if err != nil {
  383. return err
  384. }
  385. repo.CheckRules(&Bundle{
  386. Content: content,
  387. FilePath: f.Name,
  388. Commit: c,
  389. scanType: commitScan,
  390. Operation: fdiff.Add,
  391. })
  392. return nil
  393. })
  394. return err
  395. }
  396. // depthReached checks if i meets the depth (--depth=) if set
  397. func (repo *Repo) depthReached(i int) bool {
  398. if repo.Manager.Opts.Depth != 0 && repo.Manager.Opts.Depth == i {
  399. log.Warnf("Exceeded depth limit (%d)", i)
  400. return true
  401. }
  402. return false
  403. }
  404. // emptyCommit generates an empty commit used for scanning uncommitted changes
  405. func emptyCommit() *object.Commit {
  406. return &object.Commit{
  407. Hash: plumbing.Hash{},
  408. Message: "***STAGED CHANGES***",
  409. Author: object.Signature{
  410. Name: "",
  411. Email: "",
  412. When: time.Unix(0, 0).UTC(),
  413. },
  414. }
  415. }