scan.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460
  1. package scan
  2. import (
  3. "bytes"
  4. "fmt"
  5. "io"
  6. "sync"
  7. "time"
  8. "github.com/zricethezav/gitleaks/v5/manager"
  9. "github.com/go-git/go-git/v5"
  10. "github.com/go-git/go-git/v5/plumbing"
  11. fdiff "github.com/go-git/go-git/v5/plumbing/format/diff"
  12. "github.com/go-git/go-git/v5/plumbing/object"
  13. "github.com/go-git/go-git/v5/plumbing/storer"
  14. "github.com/sergi/go-diff/diffmatchpatch"
  15. log "github.com/sirupsen/logrus"
  16. )
  17. // Bundle contains various git information for scans.
  18. type Bundle struct {
  19. Commit *object.Commit
  20. Patch string
  21. Content string
  22. FilePath string
  23. Operation fdiff.Operation
  24. reader io.Reader
  25. lineLookup map[string]bool
  26. scanType int
  27. }
  28. // commitScanner is a function signature for scanning commits. There is some
  29. // redundant work needed by scanning all files at a commit (--files-at-commit=) and scanning
  30. // the patches generated by a commit (--commit=). The function scanCommit wraps that redundant work
  31. // and accepts a commitScanner for the different logic needed between the two cases described above.
  32. type commitScanner func(c *object.Commit, repo *Repo) error
  33. const (
  34. // We need to differentiate between scans as the logic for line searching is different between
  35. // scanning patches, commits, and uncommitted files.
  36. patchScan int = iota + 1
  37. uncommittedScan
  38. commitScan
  39. )
  40. // Scan is responsible for scanning the entire history (default behavior) of a
  41. // git repo. Options that can change the behavior of this function include: --Commit, --depth, --branch.
  42. // See options/options.go for an explanation on these options.
  43. func (repo *Repo) Scan() error {
  44. if err := repo.setupTimeout(); err != nil {
  45. return err
  46. }
  47. if repo.cancel != nil {
  48. defer repo.cancel()
  49. }
  50. if repo.Repository == nil {
  51. return fmt.Errorf("%s repo is empty", repo.Name)
  52. }
  53. // load up alternative config if possible, if not use manager's config
  54. if repo.Manager.Opts.RepoConfig {
  55. cfg, err := repo.loadRepoConfig()
  56. if err != nil {
  57. return err
  58. }
  59. repo.config = cfg
  60. }
  61. scanTimeStart := time.Now()
  62. // scan Commit patches OR all files at Commit. See https://github.com/zricethezav/gitleaks/issues/326
  63. if repo.Manager.Opts.Commit != "" {
  64. return scanCommit(repo.Manager.Opts.Commit, repo, scanCommitPatches)
  65. } else if repo.Manager.Opts.FilesAtCommit != "" {
  66. return scanCommit(repo.Manager.Opts.FilesAtCommit, repo, scanFilesAtCommit)
  67. }
  68. logOpts, err := getLogOptions(repo)
  69. if err != nil {
  70. return err
  71. }
  72. cIter, err := repo.Log(logOpts)
  73. if err != nil {
  74. return err
  75. }
  76. cc := 0
  77. semaphore := make(chan bool, howManyThreads(repo.Manager.Opts.Threads))
  78. wg := sync.WaitGroup{}
  79. err = cIter.ForEach(func(c *object.Commit) error {
  80. if c == nil || repo.timeoutReached() || repo.depthReached(cc) {
  81. return storer.ErrStop
  82. }
  83. // Check if Commit is allowlisted
  84. if isCommitWhiteListed(c.Hash.String(), repo.config.Allowlist.Commits) {
  85. return nil
  86. }
  87. // Check if at root
  88. if len(c.ParentHashes) == 0 {
  89. cc++
  90. err = scanFilesAtCommit(c, repo)
  91. if err != nil {
  92. return err
  93. }
  94. return nil
  95. }
  96. // increase Commit counter
  97. cc++
  98. // inspect first parent only as all other parents will be eventually reached
  99. // (they exist as the tip of other branches, etc)
  100. // See https://github.com/zricethezav/gitleaks/issues/413 for details
  101. parent, err := c.Parent(0)
  102. if err != nil {
  103. return err
  104. }
  105. defer func() {
  106. if err := recover(); err != nil {
  107. // sometimes the Patch generation will fail due to a known bug in
  108. // sergi's go-diff: https://github.com/sergi/go-diff/issues/89.
  109. // Once a fix has been merged I will remove this recover.
  110. return
  111. }
  112. }()
  113. if repo.timeoutReached() {
  114. return nil
  115. }
  116. if parent == nil {
  117. // shouldn't reach this point but just in case
  118. return nil
  119. }
  120. start := time.Now()
  121. patch, err := parent.Patch(c)
  122. if err != nil {
  123. return fmt.Errorf("could not generate Patch")
  124. }
  125. repo.Manager.RecordTime(manager.PatchTime(howLong(start)))
  126. wg.Add(1)
  127. semaphore <- true
  128. go func(c *object.Commit, patch *object.Patch) {
  129. defer func() {
  130. <-semaphore
  131. wg.Done()
  132. }()
  133. scanPatch(patch, c, repo)
  134. }(c, patch)
  135. if c.Hash.String() == repo.Manager.Opts.CommitTo {
  136. return storer.ErrStop
  137. }
  138. return nil
  139. })
  140. wg.Wait()
  141. repo.Manager.RecordTime(manager.ScanTime(howLong(scanTimeStart)))
  142. repo.Manager.IncrementCommits(cc)
  143. return nil
  144. }
  145. // scanEmpty scans an empty repo without any commits. See https://github.com/zricethezav/gitleaks/issues/352
  146. func (repo *Repo) scanEmpty() error {
  147. scanTimeStart := time.Now()
  148. wt, err := repo.Worktree()
  149. if err != nil {
  150. return err
  151. }
  152. status, err := wt.Status()
  153. for fn := range status {
  154. workTreeBuf := bytes.NewBuffer(nil)
  155. workTreeFile, err := wt.Filesystem.Open(fn)
  156. if err != nil {
  157. continue
  158. }
  159. if _, err := io.Copy(workTreeBuf, workTreeFile); err != nil {
  160. return err
  161. }
  162. repo.CheckRules(&Bundle{
  163. Content: workTreeBuf.String(),
  164. FilePath: workTreeFile.Name(),
  165. Commit: emptyCommit(),
  166. scanType: uncommittedScan,
  167. })
  168. }
  169. repo.Manager.RecordTime(manager.ScanTime(howLong(scanTimeStart)))
  170. return nil
  171. }
  172. // scanUncommitted will do a `git diff` and scan changed files that are being tracked. This is useful functionality
  173. // for a pre-Commit hook so you can make sure your code does not have any leaks before committing.
  174. func (repo *Repo) scanUncommitted() error {
  175. // load up alternative config if possible, if not use manager's config
  176. if repo.Manager.Opts.RepoConfig {
  177. cfg, err := repo.loadRepoConfig()
  178. if err != nil {
  179. return err
  180. }
  181. repo.config = cfg
  182. }
  183. if err := repo.setupTimeout(); err != nil {
  184. return err
  185. }
  186. r, err := repo.Head()
  187. if err == plumbing.ErrReferenceNotFound {
  188. // possibly an empty repo, or maybe its not, either way lets scan all the files in the directory
  189. return repo.scanEmpty()
  190. } else if err != nil {
  191. return err
  192. }
  193. scanTimeStart := time.Now()
  194. c, err := repo.CommitObject(r.Hash())
  195. if err != nil {
  196. return err
  197. }
  198. // Staged change so the Commit details do not yet exist. Insert empty defaults.
  199. c.Hash = plumbing.Hash{}
  200. c.Message = "***STAGED CHANGES***"
  201. c.Author.Name = ""
  202. c.Author.Email = ""
  203. c.Author.When = time.Unix(0, 0).UTC()
  204. prevTree, err := c.Tree()
  205. if err != nil {
  206. return err
  207. }
  208. wt, err := repo.Worktree()
  209. if err != nil {
  210. return err
  211. }
  212. status, err := wt.Status()
  213. for fn, state := range status {
  214. var (
  215. prevFileContents string
  216. currFileContents string
  217. filename string
  218. )
  219. if state.Staging != git.Untracked {
  220. if state.Staging == git.Deleted {
  221. // file in staging has been deleted, aka it is not on the filesystem
  222. // so the contents of the file are ""
  223. currFileContents = ""
  224. } else {
  225. workTreeBuf := bytes.NewBuffer(nil)
  226. workTreeFile, err := wt.Filesystem.Open(fn)
  227. if err != nil {
  228. continue
  229. }
  230. if _, err := io.Copy(workTreeBuf, workTreeFile); err != nil {
  231. return err
  232. }
  233. currFileContents = workTreeBuf.String()
  234. filename = workTreeFile.Name()
  235. }
  236. // get files at HEAD state
  237. prevFile, err := prevTree.File(fn)
  238. if err != nil {
  239. prevFileContents = ""
  240. } else {
  241. prevFileContents, err = prevFile.Contents()
  242. if err != nil {
  243. return err
  244. }
  245. if filename == "" {
  246. filename = prevFile.Name
  247. }
  248. }
  249. diffs := diffmatchpatch.New().DiffMain(prevFileContents, currFileContents, false)
  250. var diffContents string
  251. for _, d := range diffs {
  252. if d.Type == diffmatchpatch.DiffInsert {
  253. diffContents += fmt.Sprintf("%s\n", d.Text)
  254. }
  255. }
  256. repo.CheckRules(&Bundle{
  257. Content: diffContents,
  258. FilePath: filename,
  259. Commit: c,
  260. scanType: uncommittedScan,
  261. })
  262. }
  263. }
  264. if err != nil {
  265. return err
  266. }
  267. repo.Manager.RecordTime(manager.ScanTime(howLong(scanTimeStart)))
  268. return nil
  269. }
  270. // scan accepts a Patch, Commit, and repo. If the patches contains files that are
  271. // binary, then gitleaks will skip scanning that file OR if a file is matched on
  272. // allowlisted files set in the configuration. If a global rule for files is defined and a filename
  273. // matches said global rule, then a leak is sent to the manager.
  274. // After that, file chunks are created which are then inspected by InspectString()
  275. func scanPatch(patch *object.Patch, c *object.Commit, repo *Repo) {
  276. bundle := Bundle{
  277. Commit: c,
  278. Patch: patch.String(),
  279. scanType: patchScan,
  280. }
  281. for _, f := range patch.FilePatches() {
  282. if repo.timeoutReached() {
  283. return
  284. }
  285. if f.IsBinary() {
  286. continue
  287. }
  288. for _, chunk := range f.Chunks() {
  289. if chunk.Type() == fdiff.Add || (repo.Manager.Opts.Deletion && chunk.Type() == fdiff.Delete) {
  290. bundle.Content = chunk.Content()
  291. bundle.Operation = chunk.Type()
  292. // get filepath
  293. from, to := f.Files()
  294. if from != nil {
  295. bundle.FilePath = from.Path()
  296. } else if to != nil {
  297. bundle.FilePath = to.Path()
  298. } else {
  299. bundle.FilePath = "???"
  300. }
  301. repo.CheckRules(&bundle)
  302. }
  303. }
  304. }
  305. }
  306. // scanCommit accepts a Commit hash, repo, and commit scanning function. A new Commit
  307. // object will be created from the hash which will be passed into either scanCommitPatches
  308. // or scanFilesAtCommit depending on the options set.
  309. func scanCommit(commit string, repo *Repo, f commitScanner) error {
  310. if commit == "latest" {
  311. ref, err := repo.Repository.Head()
  312. if err != nil {
  313. return err
  314. }
  315. commit = ref.Hash().String()
  316. }
  317. repo.Manager.IncrementCommits(1)
  318. h := plumbing.NewHash(commit)
  319. c, err := repo.CommitObject(h)
  320. if err != nil {
  321. return err
  322. }
  323. return f(c, repo)
  324. }
  325. // scanCommitPatches accepts a Commit object and a repo. This function is only called when the --Commit=
  326. // option has been set. That option tells gitleaks to look only at a single Commit and check the contents
  327. // of said Commit. Similar to scan(), if the files contained in the Commit are a binaries or if they are
  328. // allowlisted then those files will be skipped.
  329. func scanCommitPatches(c *object.Commit, repo *Repo) error {
  330. if len(c.ParentHashes) == 0 {
  331. err := scanFilesAtCommit(c, repo)
  332. if err != nil {
  333. return err
  334. }
  335. }
  336. return c.Parents().ForEach(func(parent *object.Commit) error {
  337. defer func() {
  338. if err := recover(); err != nil {
  339. // sometimes the Patch generation will fail due to a known bug in
  340. // sergi's go-diff: https://github.com/sergi/go-diff/issues/89.
  341. // Once a fix has been merged I will remove this recover.
  342. return
  343. }
  344. }()
  345. if repo.timeoutReached() {
  346. return nil
  347. }
  348. if parent == nil {
  349. return nil
  350. }
  351. start := time.Now()
  352. patch, err := parent.Patch(c)
  353. if err != nil {
  354. return fmt.Errorf("could not generate Patch")
  355. }
  356. repo.Manager.RecordTime(manager.PatchTime(howLong(start)))
  357. scanPatch(patch, c, repo)
  358. return nil
  359. })
  360. }
  361. // scanFilesAtCommit accepts a Commit object and a repo. This function is only called when the --files-at-Commit=
  362. // option has been set. That option tells gitleaks to look only at ALL the files at a Commit and check the contents
  363. // of said Commit. Similar to scan(), if the files contained in the Commit are a binaries or if they are
  364. // allowlisted then those files will be skipped.
  365. func scanFilesAtCommit(c *object.Commit, repo *Repo) error {
  366. fIter, err := c.Files()
  367. if err != nil {
  368. return err
  369. }
  370. err = fIter.ForEach(func(f *object.File) error {
  371. bin, err := f.IsBinary()
  372. if bin || repo.timeoutReached() {
  373. return nil
  374. } else if err != nil {
  375. return err
  376. }
  377. content, err := f.Contents()
  378. if err != nil {
  379. return err
  380. }
  381. repo.CheckRules(&Bundle{
  382. Content: content,
  383. FilePath: f.Name,
  384. Commit: c,
  385. scanType: commitScan,
  386. Operation: fdiff.Add,
  387. })
  388. return nil
  389. })
  390. return err
  391. }
  392. // depthReached checks if i meets the depth (--depth=) if set
  393. func (repo *Repo) depthReached(i int) bool {
  394. if repo.Manager.Opts.Depth != 0 && repo.Manager.Opts.Depth == i {
  395. log.Warnf("Exceeded depth limit (%d)", i)
  396. return true
  397. }
  398. return false
  399. }
  400. // emptyCommit generates an empty commit used for scanning uncommitted changes
  401. func emptyCommit() *object.Commit {
  402. return &object.Commit{
  403. Hash: plumbing.Hash{},
  404. Message: "***STAGED CHANGES***",
  405. Author: object.Signature{
  406. Name: "",
  407. Email: "",
  408. When: time.Unix(0, 0).UTC(),
  409. },
  410. }
  411. }