scan.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455
  1. package scan
  2. import (
  3. "bytes"
  4. "fmt"
  5. "io"
  6. "sync"
  7. "time"
  8. "github.com/zricethezav/gitleaks/v5/manager"
  9. "github.com/go-git/go-git/v5"
  10. "github.com/go-git/go-git/v5/plumbing"
  11. fdiff "github.com/go-git/go-git/v5/plumbing/format/diff"
  12. "github.com/go-git/go-git/v5/plumbing/object"
  13. "github.com/go-git/go-git/v5/plumbing/storer"
  14. "github.com/sergi/go-diff/diffmatchpatch"
  15. log "github.com/sirupsen/logrus"
  16. )
  17. // Bundle contains various git information for scans.
  18. type Bundle struct {
  19. Commit *object.Commit
  20. Patch string
  21. Content string
  22. FilePath string
  23. Operation fdiff.Operation
  24. reader io.Reader
  25. lineLookup map[string]bool
  26. scanType int
  27. }
  28. // commitScanner is a function signature for scanning commits. There is some
  29. // redundant work needed by scanning all files at a commit (--files-at-commit=) and scanning
  30. // the patches generated by a commit (--commit=). The function scanCommit wraps that redundant work
  31. // and accepts a commitScanner for the different logic needed between the two cases described above.
  32. type commitScanner func(c *object.Commit, repo *Repo) error
  // Scan kinds. Line searching works differently for patches, uncommitted
  // files and whole-commit scans, so every Bundle records which kind of scan
  // produced it.
  const (
  	patchScan       int = 1
  	uncommittedScan int = 2
  	commitScan      int = 3
  )
  40. // Scan is responsible for scanning the entire history (default behavior) of a
  41. // git repo. Options that can change the behavior of this function include: --Commit, --depth, --branch.
  42. // See options/options.go for an explanation on these options.
  43. func (repo *Repo) Scan() error {
  44. if err := repo.setupTimeout(); err != nil {
  45. return err
  46. }
  47. if repo.cancel != nil {
  48. defer repo.cancel()
  49. }
  50. if repo.Repository == nil {
  51. return fmt.Errorf("%s repo is empty", repo.Name)
  52. }
  53. // load up alternative config if possible, if not use manager's config
  54. if repo.Manager.Opts.RepoConfig {
  55. cfg, err := repo.loadRepoConfig()
  56. if err != nil {
  57. return err
  58. }
  59. repo.config = cfg
  60. }
  61. scanTimeStart := time.Now()
  62. // scan Commit patches OR all files at Commit. See https://github.com/zricethezav/gitleaks/issues/326
  63. if repo.Manager.Opts.Commit != "" {
  64. return scanCommit(repo.Manager.Opts.Commit, repo, scanCommitPatches)
  65. } else if repo.Manager.Opts.FilesAtCommit != "" {
  66. return scanCommit(repo.Manager.Opts.FilesAtCommit, repo, scanFilesAtCommit)
  67. }
  68. logOpts, err := getLogOptions(repo)
  69. if err != nil {
  70. return err
  71. }
  72. cIter, err := repo.Log(logOpts)
  73. if err != nil {
  74. return err
  75. }
  76. cc := 0
  77. semaphore := make(chan bool, howManyThreads(repo.Manager.Opts.Threads))
  78. wg := sync.WaitGroup{}
  79. err = cIter.ForEach(func(c *object.Commit) error {
  80. if c == nil || repo.timeoutReached() || repo.depthReached(cc) {
  81. return storer.ErrStop
  82. }
  83. // Check if Commit is whitelisted
  84. if isCommitWhiteListed(c.Hash.String(), repo.config.Whitelist.Commits) {
  85. return nil
  86. }
  87. // Check if at root
  88. if len(c.ParentHashes) == 0 {
  89. cc++
  90. err = scanFilesAtCommit(c, repo)
  91. if err != nil {
  92. return err
  93. }
  94. return nil
  95. }
  96. // increase Commit counter
  97. cc++
  98. err = c.Parents().ForEach(func(parent *object.Commit) error {
  99. defer func() {
  100. if err := recover(); err != nil {
  101. // sometimes the Patch generation will fail due to a known bug in
  102. // sergi's go-diff: https://github.com/sergi/go-diff/issues/89.
  103. // Once a fix has been merged I will remove this recover.
  104. return
  105. }
  106. }()
  107. if repo.timeoutReached() {
  108. return nil
  109. }
  110. if parent == nil {
  111. // shouldn't reach this point but just in case
  112. return nil
  113. }
  114. start := time.Now()
  115. patch, err := parent.Patch(c)
  116. if err != nil {
  117. return fmt.Errorf("could not generate Patch")
  118. }
  119. repo.Manager.RecordTime(manager.PatchTime(howLong(start)))
  120. wg.Add(1)
  121. semaphore <- true
  122. go func(c *object.Commit, patch *object.Patch) {
  123. defer func() {
  124. <-semaphore
  125. wg.Done()
  126. }()
  127. scanPatch(patch, c, repo)
  128. }(c, patch)
  129. return nil
  130. })
  131. if c.Hash.String() == repo.Manager.Opts.CommitTo {
  132. return storer.ErrStop
  133. }
  134. return nil
  135. })
  136. wg.Wait()
  137. repo.Manager.RecordTime(manager.ScanTime(howLong(scanTimeStart)))
  138. repo.Manager.IncrementCommits(cc)
  139. return nil
  140. }
  141. // scanEmpty scans an empty repo without any commits. See https://github.com/zricethezav/gitleaks/issues/352
  142. func (repo *Repo) scanEmpty() error {
  143. scanTimeStart := time.Now()
  144. wt, err := repo.Worktree()
  145. if err != nil {
  146. return err
  147. }
  148. status, err := wt.Status()
  149. for fn := range status {
  150. workTreeBuf := bytes.NewBuffer(nil)
  151. workTreeFile, err := wt.Filesystem.Open(fn)
  152. if err != nil {
  153. continue
  154. }
  155. if _, err := io.Copy(workTreeBuf, workTreeFile); err != nil {
  156. return err
  157. }
  158. repo.CheckRules(&Bundle{
  159. Content: workTreeBuf.String(),
  160. FilePath: workTreeFile.Name(),
  161. Commit: emptyCommit(),
  162. scanType: uncommittedScan,
  163. })
  164. }
  165. repo.Manager.RecordTime(manager.ScanTime(howLong(scanTimeStart)))
  166. return nil
  167. }
  168. // scanUncommitted will do a `git diff` and scan changed files that are being tracked. This is useful functionality
  169. // for a pre-Commit hook so you can make sure your code does not have any leaks before committing.
  170. func (repo *Repo) scanUncommitted() error {
  171. // load up alternative config if possible, if not use manager's config
  172. if repo.Manager.Opts.RepoConfig {
  173. cfg, err := repo.loadRepoConfig()
  174. if err != nil {
  175. return err
  176. }
  177. repo.config = cfg
  178. }
  179. if err := repo.setupTimeout(); err != nil {
  180. return err
  181. }
  182. r, err := repo.Head()
  183. if err == plumbing.ErrReferenceNotFound {
  184. // possibly an empty repo, or maybe its not, either way lets scan all the files in the directory
  185. return repo.scanEmpty()
  186. } else if err != nil {
  187. return err
  188. }
  189. scanTimeStart := time.Now()
  190. c, err := repo.CommitObject(r.Hash())
  191. if err != nil {
  192. return err
  193. }
  194. // Staged change so the Commit details do not yet exist. Insert empty defaults.
  195. c.Hash = plumbing.Hash{}
  196. c.Message = "***STAGED CHANGES***"
  197. c.Author.Name = ""
  198. c.Author.Email = ""
  199. c.Author.When = time.Unix(0, 0).UTC()
  200. prevTree, err := c.Tree()
  201. if err != nil {
  202. return err
  203. }
  204. wt, err := repo.Worktree()
  205. if err != nil {
  206. return err
  207. }
  208. status, err := wt.Status()
  209. for fn, state := range status {
  210. var (
  211. prevFileContents string
  212. currFileContents string
  213. filename string
  214. )
  215. if state.Staging != git.Untracked {
  216. if state.Staging == git.Deleted {
  217. // file in staging has been deleted, aka it is not on the filesystem
  218. // so the contents of the file are ""
  219. currFileContents = ""
  220. } else {
  221. workTreeBuf := bytes.NewBuffer(nil)
  222. workTreeFile, err := wt.Filesystem.Open(fn)
  223. if err != nil {
  224. continue
  225. }
  226. if _, err := io.Copy(workTreeBuf, workTreeFile); err != nil {
  227. return err
  228. }
  229. currFileContents = workTreeBuf.String()
  230. filename = workTreeFile.Name()
  231. }
  232. // get files at HEAD state
  233. prevFile, err := prevTree.File(fn)
  234. if err != nil {
  235. prevFileContents = ""
  236. } else {
  237. prevFileContents, err = prevFile.Contents()
  238. if err != nil {
  239. return err
  240. }
  241. if filename == "" {
  242. filename = prevFile.Name
  243. }
  244. }
  245. diffs := diffmatchpatch.New().DiffMain(prevFileContents, currFileContents, false)
  246. var diffContents string
  247. for _, d := range diffs {
  248. if d.Type == diffmatchpatch.DiffInsert {
  249. diffContents += fmt.Sprintf("%s\n", d.Text)
  250. }
  251. }
  252. repo.CheckRules(&Bundle{
  253. Content: diffContents,
  254. FilePath: filename,
  255. Commit: c,
  256. scanType: uncommittedScan,
  257. })
  258. }
  259. }
  260. if err != nil {
  261. return err
  262. }
  263. repo.Manager.RecordTime(manager.ScanTime(howLong(scanTimeStart)))
  264. return nil
  265. }
  266. // scan accepts a Patch, Commit, and repo. If the patches contains files that are
  267. // binary, then gitleaks will skip scanning that file OR if a file is matched on
  268. // whitelisted files set in the configuration. If a global rule for files is defined and a filename
  269. // matches said global rule, then a leak is sent to the manager.
  270. // After that, file chunks are created which are then inspected by InspectString()
  271. func scanPatch(patch *object.Patch, c *object.Commit, repo *Repo) {
  272. bundle := Bundle{
  273. Commit: c,
  274. Patch: patch.String(),
  275. scanType: patchScan,
  276. }
  277. for _, f := range patch.FilePatches() {
  278. if repo.timeoutReached() {
  279. return
  280. }
  281. if f.IsBinary() {
  282. continue
  283. }
  284. for _, chunk := range f.Chunks() {
  285. if chunk.Type() == fdiff.Add || (repo.Manager.Opts.Deletion && chunk.Type() == fdiff.Delete) {
  286. bundle.Content = chunk.Content()
  287. bundle.Operation = chunk.Type()
  288. // get filepath
  289. from, to := f.Files()
  290. if from != nil {
  291. bundle.FilePath = from.Path()
  292. } else if to != nil {
  293. bundle.FilePath = to.Path()
  294. } else {
  295. bundle.FilePath = "???"
  296. }
  297. repo.CheckRules(&bundle)
  298. }
  299. }
  300. }
  301. }
  302. // scanCommit accepts a Commit hash, repo, and commit scanning function. A new Commit
  303. // object will be created from the hash which will be passed into either scanCommitPatches
  304. // or scanFilesAtCommit depending on the options set.
  305. func scanCommit(commit string, repo *Repo, f commitScanner) error {
  306. if commit == "latest" {
  307. ref, err := repo.Repository.Head()
  308. if err != nil {
  309. return err
  310. }
  311. commit = ref.Hash().String()
  312. }
  313. repo.Manager.IncrementCommits(1)
  314. h := plumbing.NewHash(commit)
  315. c, err := repo.CommitObject(h)
  316. if err != nil {
  317. return err
  318. }
  319. return f(c, repo)
  320. }
  321. // scanCommitPatches accepts a Commit object and a repo. This function is only called when the --Commit=
  322. // option has been set. That option tells gitleaks to look only at a single Commit and check the contents
  323. // of said Commit. Similar to scan(), if the files contained in the Commit are a binaries or if they are
  324. // whitelisted then those files will be skipped.
  325. func scanCommitPatches(c *object.Commit, repo *Repo) error {
  326. if len(c.ParentHashes) == 0 {
  327. err := scanFilesAtCommit(c, repo)
  328. if err != nil {
  329. return err
  330. }
  331. }
  332. return c.Parents().ForEach(func(parent *object.Commit) error {
  333. defer func() {
  334. if err := recover(); err != nil {
  335. // sometimes the Patch generation will fail due to a known bug in
  336. // sergi's go-diff: https://github.com/sergi/go-diff/issues/89.
  337. // Once a fix has been merged I will remove this recover.
  338. return
  339. }
  340. }()
  341. if repo.timeoutReached() {
  342. return nil
  343. }
  344. if parent == nil {
  345. return nil
  346. }
  347. start := time.Now()
  348. patch, err := parent.Patch(c)
  349. if err != nil {
  350. return fmt.Errorf("could not generate Patch")
  351. }
  352. repo.Manager.RecordTime(manager.PatchTime(howLong(start)))
  353. scanPatch(patch, c, repo)
  354. return nil
  355. })
  356. }
  357. // scanFilesAtCommit accepts a Commit object and a repo. This function is only called when the --files-at-Commit=
  358. // option has been set. That option tells gitleaks to look only at ALL the files at a Commit and check the contents
  359. // of said Commit. Similar to scan(), if the files contained in the Commit are a binaries or if they are
  360. // whitelisted then those files will be skipped.
  361. func scanFilesAtCommit(c *object.Commit, repo *Repo) error {
  362. fIter, err := c.Files()
  363. if err != nil {
  364. return err
  365. }
  366. err = fIter.ForEach(func(f *object.File) error {
  367. bin, err := f.IsBinary()
  368. if bin || repo.timeoutReached() {
  369. return nil
  370. } else if err != nil {
  371. return err
  372. }
  373. content, err := f.Contents()
  374. if err != nil {
  375. return err
  376. }
  377. repo.CheckRules(&Bundle{
  378. Content: content,
  379. FilePath: f.Name,
  380. Commit: c,
  381. scanType: commitScan,
  382. Operation: fdiff.Add,
  383. })
  384. return nil
  385. })
  386. return err
  387. }
  388. // depthReached checks if i meets the depth (--depth=) if set
  389. func (repo *Repo) depthReached(i int) bool {
  390. if repo.Manager.Opts.Depth != 0 && repo.Manager.Opts.Depth == i {
  391. log.Warnf("Exceeded depth limit (%d)", i)
  392. return true
  393. }
  394. return false
  395. }
  396. // emptyCommit generates an empty commit used for scanning uncommitted changes
  397. func emptyCommit() *object.Commit {
  398. return &object.Commit{
  399. Hash: plumbing.Hash{},
  400. Message: "***STAGED CHANGES***",
  401. Author: object.Signature{
  402. Name: "",
  403. Email: "",
  404. When: time.Unix(0, 0).UTC(),
  405. },
  406. }
  407. }