scan.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493
  1. package scan
  2. import (
  3. "bufio"
  4. "bytes"
  5. "fmt"
  6. "io"
  7. "os"
  8. "strings"
  9. "sync"
  10. "time"
  11. "github.com/zricethezav/gitleaks/v6/manager"
  12. "github.com/go-git/go-git/v5"
  13. "github.com/go-git/go-git/v5/plumbing"
  14. fdiff "github.com/go-git/go-git/v5/plumbing/format/diff"
  15. "github.com/go-git/go-git/v5/plumbing/object"
  16. "github.com/go-git/go-git/v5/plumbing/storer"
  17. "github.com/sergi/go-diff/diffmatchpatch"
  18. log "github.com/sirupsen/logrus"
  19. )
  20. // Bundle contains various git information for scans.
  21. type Bundle struct {
  22. Commit *object.Commit
  23. Patch string
  24. Content string
  25. FilePath string
  26. Operation fdiff.Operation
  27. reader io.Reader
  28. lineLookup map[string]bool
  29. scanType int
  30. }
  31. // commitScanner is a function signature for scanning commits. There is some
  32. // redundant work needed by scanning all files at a commit (--files-at-commit=) and scanning
  33. // the patches generated by a commit (--commit=). The function scanCommit wraps that redundant work
  34. // and accepts a commitScanner for the different logic needed between the two cases described above.
  35. type commitScanner func(c *object.Commit, repo *Repo) error
  36. const (
  37. // We need to differentiate between scans as the logic for line searching is different between
  38. // scanning patches, commits, and uncommitted files.
  39. patchScan int = iota + 1
  40. uncommittedScan
  41. commitScan
  42. )
  43. // Scan is responsible for scanning the entire history (default behavior) of a
  44. // git repo. Options that can change the behavior of this function include: --Commit, --depth, --branch.
  45. // See options/options.go for an explanation on these options.
  46. func (repo *Repo) Scan() error {
  47. if err := repo.setupTimeout(); err != nil {
  48. return err
  49. }
  50. if repo.cancel != nil {
  51. defer repo.cancel()
  52. }
  53. if repo.Repository == nil {
  54. return fmt.Errorf("%s repo is empty", repo.Name)
  55. }
  56. // load up alternative config if possible, if not use manager's config
  57. if repo.Manager.Opts.RepoConfig {
  58. cfg, err := repo.loadRepoConfig()
  59. if err != nil {
  60. return err
  61. }
  62. repo.config = cfg
  63. }
  64. scanTimeStart := time.Now()
  65. // See https://github.com/zricethezav/gitleaks/issues/326
  66. // Scan commit patches, all files at a commit, or a range of commits
  67. if repo.Manager.Opts.Commit != "" {
  68. return scanCommit(repo.Manager.Opts.Commit, repo, scanCommitPatches)
  69. } else if repo.Manager.Opts.FilesAtCommit != "" {
  70. return scanCommit(repo.Manager.Opts.FilesAtCommit, repo, scanFilesAtCommit)
  71. } else if repo.Manager.Opts.Commits != "" {
  72. commits := strings.Split(repo.Manager.Opts.Commits, ",")
  73. for _, c := range commits {
  74. err := scanCommit(c, repo, scanCommitPatches)
  75. if err != nil {
  76. return err
  77. }
  78. }
  79. return nil
  80. } else if repo.Manager.Opts.CommitsFile != "" {
  81. file, err := os.Open(repo.Manager.Opts.CommitsFile)
  82. if err != nil {
  83. return err
  84. }
  85. defer file.Close()
  86. scanner := bufio.NewScanner(file)
  87. for scanner.Scan() {
  88. err := scanCommit(scanner.Text(), repo, scanCommitPatches)
  89. if err != nil {
  90. return err
  91. }
  92. }
  93. return nil
  94. }
  95. logOpts, err := getLogOptions(repo)
  96. if err != nil {
  97. return err
  98. }
  99. cIter, err := repo.Log(logOpts)
  100. if err != nil {
  101. return err
  102. }
  103. cc := 0
  104. semaphore := make(chan bool, howManyThreads(repo.Manager.Opts.Threads))
  105. wg := sync.WaitGroup{}
  106. err = cIter.ForEach(func(c *object.Commit) error {
  107. if c == nil || repo.timeoutReached() || repo.depthReached(cc) {
  108. return storer.ErrStop
  109. }
  110. // Check if Commit is allowlisted
  111. if isCommitAllowListed(c.Hash.String(), repo.config.Allowlist.Commits) {
  112. return nil
  113. }
  114. // Check if at root
  115. if len(c.ParentHashes) == 0 {
  116. cc++
  117. err = scanFilesAtCommit(c, repo)
  118. if err != nil {
  119. return err
  120. }
  121. return nil
  122. }
  123. // increase Commit counter
  124. cc++
  125. // inspect first parent only as all other parents will be eventually reached
  126. // (they exist as the tip of other branches, etc)
  127. // See https://github.com/zricethezav/gitleaks/issues/413 for details
  128. parent, err := c.Parent(0)
  129. if err != nil {
  130. return err
  131. }
  132. defer func() {
  133. if err := recover(); err != nil {
  134. // sometimes the Patch generation will fail due to a known bug in
  135. // sergi's go-diff: https://github.com/sergi/go-diff/issues/89.
  136. // Once a fix has been merged I will remove this recover.
  137. return
  138. }
  139. }()
  140. if repo.timeoutReached() {
  141. return nil
  142. }
  143. if parent == nil {
  144. // shouldn't reach this point but just in case
  145. return nil
  146. }
  147. start := time.Now()
  148. patch, err := parent.Patch(c)
  149. if err != nil {
  150. log.Errorf("could not generate Patch")
  151. }
  152. repo.Manager.RecordTime(manager.PatchTime(howLong(start)))
  153. wg.Add(1)
  154. semaphore <- true
  155. go func(c *object.Commit, patch *object.Patch) {
  156. defer func() {
  157. <-semaphore
  158. wg.Done()
  159. }()
  160. scanPatch(patch, c, repo)
  161. }(c, patch)
  162. if c.Hash.String() == repo.Manager.Opts.CommitTo {
  163. return storer.ErrStop
  164. }
  165. return nil
  166. })
  167. wg.Wait()
  168. repo.Manager.RecordTime(manager.ScanTime(howLong(scanTimeStart)))
  169. repo.Manager.IncrementCommits(cc)
  170. return nil
  171. }
  172. // scanEmpty scans an empty repo without any commits. See https://github.com/zricethezav/gitleaks/issues/352
  173. func (repo *Repo) scanEmpty() error {
  174. scanTimeStart := time.Now()
  175. wt, err := repo.Worktree()
  176. if err != nil {
  177. return err
  178. }
  179. status, err := wt.Status()
  180. if err != nil {
  181. return err
  182. }
  183. for fn := range status {
  184. workTreeBuf := bytes.NewBuffer(nil)
  185. workTreeFile, err := wt.Filesystem.Open(fn)
  186. if err != nil {
  187. continue
  188. }
  189. if _, err := io.Copy(workTreeBuf, workTreeFile); err != nil {
  190. return err
  191. }
  192. repo.CheckRules(&Bundle{
  193. Content: workTreeBuf.String(),
  194. FilePath: workTreeFile.Name(),
  195. Commit: emptyCommit(),
  196. scanType: uncommittedScan,
  197. })
  198. }
  199. repo.Manager.RecordTime(manager.ScanTime(howLong(scanTimeStart)))
  200. return nil
  201. }
  202. // scanUncommitted will do a `git diff` and scan changed files that are being tracked. This is useful functionality
  203. // for a pre-Commit hook so you can make sure your code does not have any leaks before committing.
  204. func (repo *Repo) scanUncommitted() error {
  205. // load up alternative config if possible, if not use manager's config
  206. if repo.Manager.Opts.RepoConfig {
  207. cfg, err := repo.loadRepoConfig()
  208. if err != nil {
  209. return err
  210. }
  211. repo.config = cfg
  212. }
  213. if err := repo.setupTimeout(); err != nil {
  214. return err
  215. }
  216. r, err := repo.Head()
  217. if err == plumbing.ErrReferenceNotFound {
  218. // possibly an empty repo, or maybe its not, either way lets scan all the files in the directory
  219. return repo.scanEmpty()
  220. } else if err != nil {
  221. return err
  222. }
  223. scanTimeStart := time.Now()
  224. c, err := repo.CommitObject(r.Hash())
  225. if err != nil {
  226. return err
  227. }
  228. // Staged change so the Commit details do not yet exist. Insert empty defaults.
  229. c.Hash = plumbing.Hash{}
  230. c.Message = "***STAGED CHANGES***"
  231. c.Author.Name = ""
  232. c.Author.Email = ""
  233. c.Author.When = time.Unix(0, 0).UTC()
  234. prevTree, err := c.Tree()
  235. if err != nil {
  236. return err
  237. }
  238. wt, err := repo.Worktree()
  239. if err != nil {
  240. return err
  241. }
  242. status, err := wt.Status()
  243. for fn, state := range status {
  244. var (
  245. prevFileContents string
  246. currFileContents string
  247. filename string
  248. )
  249. if state.Staging != git.Untracked {
  250. if state.Staging == git.Deleted {
  251. // file in staging has been deleted, aka it is not on the filesystem
  252. // so the contents of the file are ""
  253. currFileContents = ""
  254. } else {
  255. workTreeBuf := bytes.NewBuffer(nil)
  256. workTreeFile, err := wt.Filesystem.Open(fn)
  257. if err != nil {
  258. continue
  259. }
  260. if _, err := io.Copy(workTreeBuf, workTreeFile); err != nil {
  261. return err
  262. }
  263. currFileContents = workTreeBuf.String()
  264. filename = workTreeFile.Name()
  265. }
  266. // get files at HEAD state
  267. prevFile, err := prevTree.File(fn)
  268. if err != nil {
  269. prevFileContents = ""
  270. } else {
  271. prevFileContents, err = prevFile.Contents()
  272. if err != nil {
  273. return err
  274. }
  275. if filename == "" {
  276. filename = prevFile.Name
  277. }
  278. }
  279. dmp := diffmatchpatch.New()
  280. diffs := dmp.DiffCleanupSemantic(dmp.DiffMain(prevFileContents, currFileContents, false))
  281. var diffContents string
  282. for _, d := range diffs {
  283. if d.Type == diffmatchpatch.DiffInsert {
  284. diffContents += fmt.Sprintf("%s\n", d.Text)
  285. }
  286. }
  287. repo.CheckRules(&Bundle{
  288. Content: diffContents,
  289. FilePath: filename,
  290. Commit: c,
  291. scanType: uncommittedScan,
  292. })
  293. }
  294. }
  295. if err != nil {
  296. return err
  297. }
  298. repo.Manager.RecordTime(manager.ScanTime(howLong(scanTimeStart)))
  299. return nil
  300. }
  301. // scan accepts a Patch, Commit, and repo. If the patches contains files that are
  302. // binary, then gitleaks will skip scanning that file OR if a file is matched on
  303. // allowlisted files set in the configuration. If a global rule for files is defined and a filename
  304. // matches said global rule, then a leak is sent to the manager.
  305. // After that, file chunks are created which are then inspected by InspectString()
  306. func scanPatch(patch *object.Patch, c *object.Commit, repo *Repo) {
  307. bundle := Bundle{
  308. Commit: c,
  309. Patch: patch.String(),
  310. scanType: patchScan,
  311. }
  312. for _, f := range patch.FilePatches() {
  313. if repo.timeoutReached() {
  314. return
  315. }
  316. if f.IsBinary() {
  317. continue
  318. }
  319. for _, chunk := range f.Chunks() {
  320. if chunk.Type() == fdiff.Add || (repo.Manager.Opts.Deletion && chunk.Type() == fdiff.Delete) {
  321. bundle.Content = chunk.Content()
  322. bundle.Operation = chunk.Type()
  323. // get filepath
  324. from, to := f.Files()
  325. if from != nil {
  326. bundle.FilePath = from.Path()
  327. } else if to != nil {
  328. bundle.FilePath = to.Path()
  329. } else {
  330. bundle.FilePath = "???"
  331. }
  332. repo.CheckRules(&bundle)
  333. }
  334. }
  335. }
  336. }
  337. // scanCommit accepts a Commit hash, repo, and commit scanning function. A new Commit
  338. // object will be created from the hash which will be passed into either scanCommitPatches
  339. // or scanFilesAtCommit depending on the options set.
  340. func scanCommit(commit string, repo *Repo, f commitScanner) error {
  341. if commit == "latest" {
  342. ref, err := repo.Repository.Head()
  343. if err != nil {
  344. return err
  345. }
  346. commit = ref.Hash().String()
  347. }
  348. repo.Manager.IncrementCommits(1)
  349. h := plumbing.NewHash(commit)
  350. c, err := repo.CommitObject(h)
  351. if err != nil {
  352. return err
  353. }
  354. return f(c, repo)
  355. }
  356. // scanCommitPatches accepts a Commit object and a repo. This function is only called when the --Commit=
  357. // option has been set. That option tells gitleaks to look only at a single Commit and check the contents
  358. // of said Commit. Similar to scan(), if the files contained in the Commit are a binaries or if they are
  359. // allowlisted then those files will be skipped.
  360. func scanCommitPatches(c *object.Commit, repo *Repo) error {
  361. if len(c.ParentHashes) == 0 {
  362. err := scanFilesAtCommit(c, repo)
  363. if err != nil {
  364. return err
  365. }
  366. }
  367. return c.Parents().ForEach(func(parent *object.Commit) error {
  368. defer func() {
  369. if err := recover(); err != nil {
  370. // sometimes the Patch generation will fail due to a known bug in
  371. // sergi's go-diff: https://github.com/sergi/go-diff/issues/89.
  372. // Once a fix has been merged I will remove this recover.
  373. return
  374. }
  375. }()
  376. if repo.timeoutReached() {
  377. return nil
  378. }
  379. if parent == nil {
  380. return nil
  381. }
  382. start := time.Now()
  383. patch, err := parent.Patch(c)
  384. if err != nil {
  385. return fmt.Errorf("could not generate Patch")
  386. }
  387. repo.Manager.RecordTime(manager.PatchTime(howLong(start)))
  388. scanPatch(patch, c, repo)
  389. return nil
  390. })
  391. }
  392. // scanFilesAtCommit accepts a Commit object and a repo. This function is only called when the --files-at-Commit=
  393. // option has been set. That option tells gitleaks to look only at ALL the files at a Commit and check the contents
  394. // of said Commit. Similar to scan(), if the files contained in the Commit are a binaries or if they are
  395. // allowlisted then those files will be skipped.
  396. func scanFilesAtCommit(c *object.Commit, repo *Repo) error {
  397. fIter, err := c.Files()
  398. if err != nil {
  399. return err
  400. }
  401. err = fIter.ForEach(func(f *object.File) error {
  402. bin, err := f.IsBinary()
  403. if bin || repo.timeoutReached() {
  404. return nil
  405. } else if err != nil {
  406. return err
  407. }
  408. content, err := f.Contents()
  409. if err != nil {
  410. return err
  411. }
  412. repo.CheckRules(&Bundle{
  413. Content: content,
  414. FilePath: f.Name,
  415. Commit: c,
  416. scanType: commitScan,
  417. Operation: fdiff.Add,
  418. })
  419. return nil
  420. })
  421. return err
  422. }
  423. // depthReached checks if i meets the depth (--depth=) if set
  424. func (repo *Repo) depthReached(i int) bool {
  425. if repo.Manager.Opts.Depth != 0 && repo.Manager.Opts.Depth == i {
  426. log.Warnf("Exceeded depth limit (%d)", i)
  427. return true
  428. }
  429. return false
  430. }
  431. // emptyCommit generates an empty commit used for scanning uncommitted changes
  432. func emptyCommit() *object.Commit {
  433. return &object.Commit{
  434. Hash: plumbing.Hash{},
  435. Message: "***STAGED CHANGES***",
  436. Author: object.Signature{
  437. Name: "",
  438. Email: "",
  439. When: time.Unix(0, 0).UTC(),
  440. },
  441. }
  442. }