detect.go 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656
  1. package detect
  2. import (
  3. "bufio"
  4. "context"
  5. "fmt"
  6. "io"
  7. "io/fs"
  8. "os"
  9. "path/filepath"
  10. "regexp"
  11. "strings"
  12. "sync"
  13. "github.com/h2non/filetype"
  14. "github.com/zricethezav/gitleaks/v8/config"
  15. "github.com/zricethezav/gitleaks/v8/detect/git"
  16. "github.com/zricethezav/gitleaks/v8/report"
  17. ahocorasick "github.com/BobuSumisu/aho-corasick"
  18. "github.com/fatih/semgroup"
  19. "github.com/gitleaks/go-gitdiff/gitdiff"
  20. "github.com/rs/zerolog/log"
  21. "github.com/spf13/viper"
  22. )
  23. // Type used to differentiate between git scan types:
  24. // $ gitleaks detect
  25. // $ gitleaks protect
  26. // $ gitleaks protect staged
  27. type GitScanType int
  28. const (
  29. DetectType GitScanType = iota
  30. ProtectType
  31. ProtectStagedType
  32. gitleaksAllowSignature = "gitleaks:allow"
  33. )
// Detector is the main detector struct
type Detector struct {
	// Config is the configuration for the detector
	Config config.Config

	// Redact is a flag to redact findings. This is exported
	// so users using gitleaks as a library can set this flag
	// without calling `detector.Start(cmd *cobra.Command)`
	Redact uint

	// Verbose is a flag to print findings as they are added
	Verbose bool

	// MaxTargetMegaBytes: files/fragments larger than this will be skipped
	// (checked in detectRule; 0 disables the limit)
	MaxTargetMegaBytes int

	// FollowSymlinks is a flag to enable scanning symlink files
	FollowSymlinks bool

	// NoColor is a flag to disable color output
	NoColor bool

	// IgnoreGitleaksAllow is a flag to ignore gitleaks:allow comments.
	IgnoreGitleaksAllow bool

	// commitMap is used to keep track of commits that have been scanned.
	// This is only used for logging purposes and git scans.
	commitMap map[string]bool

	// findingMutex is to prevent concurrent access to the
	// findings slice when adding findings.
	findingMutex *sync.Mutex

	// findings is a slice of report.Findings. This is the result
	// of the detector's scan which can then be used to generate a
	// report.
	findings []report.Finding

	// prefilter is an ahocorasick trie used for doing efficient string
	// matching given a set of words (keywords from the rules in the config)
	prefilter ahocorasick.Trie

	// baseline is a list of known findings that should be ignored
	baseline []report.Finding

	// baselinePath is the path to the baseline report, relative to the
	// scan source (so the baseline file itself is skipped during scans)
	baselinePath string

	// gitleaksIgnore is the set of finding fingerprints loaded from a
	// .gitleaksignore file; findings matching an entry are dropped
	gitleaksIgnore map[string]bool
}
// Fragment contains the data to be scanned
type Fragment struct {
	// Raw is the raw content of the fragment
	Raw string

	// FilePath is the path to the file if applicable
	FilePath string

	// SymlinkFile is the symlink path through which the file was reached,
	// if the fragment came from a followed symlink
	SymlinkFile string

	// CommitSHA is the SHA of the commit if applicable
	CommitSHA string

	// newlineIndices is a list of indices of newlines in the raw content.
	// This is used to calculate the line location of a finding
	newlineIndices [][]int

	// keywords is a map of all the keywords contained within the contents
	// of this fragment (lowercased; populated by Detect via the prefilter)
	keywords map[string]bool
}
  88. // NewDetector creates a new detector with the given config
  89. func NewDetector(cfg config.Config) *Detector {
  90. return &Detector{
  91. commitMap: make(map[string]bool),
  92. gitleaksIgnore: make(map[string]bool),
  93. findingMutex: &sync.Mutex{},
  94. findings: make([]report.Finding, 0),
  95. Config: cfg,
  96. prefilter: *ahocorasick.NewTrieBuilder().AddStrings(cfg.Keywords).Build(),
  97. }
  98. }
  99. // NewDetectorDefaultConfig creates a new detector with the default config
  100. func NewDetectorDefaultConfig() (*Detector, error) {
  101. viper.SetConfigType("toml")
  102. err := viper.ReadConfig(strings.NewReader(config.DefaultConfig))
  103. if err != nil {
  104. return nil, err
  105. }
  106. var vc config.ViperConfig
  107. err = viper.Unmarshal(&vc)
  108. if err != nil {
  109. return nil, err
  110. }
  111. cfg, err := vc.Translate()
  112. if err != nil {
  113. return nil, err
  114. }
  115. return NewDetector(cfg), nil
  116. }
  117. func (d *Detector) AddGitleaksIgnore(gitleaksIgnorePath string) error {
  118. log.Debug().Msgf("found .gitleaksignore file: %s", gitleaksIgnorePath)
  119. file, err := os.Open(gitleaksIgnorePath)
  120. if err != nil {
  121. return err
  122. }
  123. // https://github.com/securego/gosec/issues/512
  124. defer func() {
  125. if err := file.Close(); err != nil {
  126. log.Warn().Msgf("Error closing .gitleaksignore file: %s\n", err)
  127. }
  128. }()
  129. scanner := bufio.NewScanner(file)
  130. for scanner.Scan() {
  131. d.gitleaksIgnore[scanner.Text()] = true
  132. }
  133. return nil
  134. }
  135. func (d *Detector) AddBaseline(baselinePath string, source string) error {
  136. if baselinePath != "" {
  137. absoluteSource, err := filepath.Abs(source)
  138. if err != nil {
  139. return err
  140. }
  141. absoluteBaseline, err := filepath.Abs(baselinePath)
  142. if err != nil {
  143. return err
  144. }
  145. relativeBaseline, err := filepath.Rel(absoluteSource, absoluteBaseline)
  146. if err != nil {
  147. return err
  148. }
  149. baseline, err := LoadBaseline(baselinePath)
  150. if err != nil {
  151. return err
  152. }
  153. d.baseline = baseline
  154. baselinePath = relativeBaseline
  155. }
  156. d.baselinePath = baselinePath
  157. return nil
  158. }
  159. // DetectBytes scans the given bytes and returns a list of findings
  160. func (d *Detector) DetectBytes(content []byte) []report.Finding {
  161. return d.DetectString(string(content))
  162. }
  163. // DetectString scans the given string and returns a list of findings
  164. func (d *Detector) DetectString(content string) []report.Finding {
  165. return d.Detect(Fragment{
  166. Raw: content,
  167. })
  168. }
// detectRule scans the given fragment for the given rule and returns a list
// of findings. It handles path-only rules, combined path+regex rules, the
// per-rule and global allowlists, stopwords, and entropy filtering.
func (d *Detector) detectRule(fragment Fragment, rule config.Rule) []report.Finding {
	var findings []report.Finding

	// check if filepath or commit is allowed for this rule
	if rule.Allowlist.CommitAllowed(fragment.CommitSHA) ||
		rule.Allowlist.PathAllowed(fragment.FilePath) {
		return findings
	}

	if rule.Path != nil && rule.Regex == nil {
		// Path _only_ rule: the finding is the file itself, no content match
		if rule.Path.MatchString(fragment.FilePath) {
			finding := report.Finding{
				Description: rule.Description,
				File:        fragment.FilePath,
				SymlinkFile: fragment.SymlinkFile,
				RuleID:      rule.RuleID,
				Match:       fmt.Sprintf("file detected: %s", fragment.FilePath),
				Tags:        rule.Tags,
			}
			return append(findings, finding)
		}
	} else if rule.Path != nil {
		// if path is set _and_ a regex is set, then we need to check both
		// so if the path does not match, then we should return early and not
		// consider the regex
		if !rule.Path.MatchString(fragment.FilePath) {
			return findings
		}
	}

	// if path only rule, skip content checks
	if rule.Regex == nil {
		return findings
	}

	// If the size flag is configured and the raw data is bigger than the
	// flag, skip the fragment entirely (size computed in decimal MB).
	if d.MaxTargetMegaBytes > 0 {
		rawLength := len(fragment.Raw) / 1000000
		if rawLength > d.MaxTargetMegaBytes {
			log.Debug().Msgf("skipping file: %s scan due to size: %d", fragment.FilePath, rawLength)
			return findings
		}
	}

	matchIndices := rule.Regex.FindAllStringIndex(fragment.Raw, -1)
	for _, matchIndex := range matchIndices {
		// extract secret from match (leading/trailing newlines stripped)
		secret := strings.Trim(fragment.Raw[matchIndex[0]:matchIndex[1]], "\n")

		// determine location of match. Note that the location
		// in the finding will be the line/column numbers of the _match_
		// not the _secret_, which will be different if the secretGroup
		// value is set for this rule
		loc := location(fragment, matchIndex)

		// clamp: ensure the reported line span covers the whole match
		// (a multi-line match can extend past the computed end index)
		if matchIndex[1] > loc.endLineIndex {
			loc.endLineIndex = matchIndex[1]
		}

		finding := report.Finding{
			Description: rule.Description,
			File:        fragment.FilePath,
			SymlinkFile: fragment.SymlinkFile,
			RuleID:      rule.RuleID,
			StartLine:   loc.startLine,
			EndLine:     loc.endLine,
			StartColumn: loc.startColumn,
			EndColumn:   loc.endColumn,
			Secret:      secret,
			Match:       secret,
			Tags:        rule.Tags,
			Line:        fragment.Raw[loc.startLineIndex:loc.endLineIndex],
		}

		// honor an inline "gitleaks:allow" marker on the matched line(s),
		// unless the user asked to ignore such markers
		if strings.Contains(fragment.Raw[loc.startLineIndex:loc.endLineIndex],
			gitleaksAllowSignature) && !d.IgnoreGitleaksAllow {
			continue
		}

		// by default if secret group is not set, we will check to see if there
		// are any capture groups. If there are, we will use the first capture to start
		groups := rule.Regex.FindStringSubmatch(secret)
		if rule.SecretGroup == 0 {
			// if len(groups) == 2 that means there is only one capture group
			// the first element in groups is the full match, the second is the
			// first capture group
			if len(groups) == 2 {
				secret = groups[1]
				finding.Secret = secret
			}
		} else {
			if len(groups) <= rule.SecretGroup || len(groups) == 0 {
				// Config validation should prevent this
				continue
			}
			secret = groups[rule.SecretGroup]
			finding.Secret = secret
		}

		// check if the regexTarget is defined in the allowlist "regexes" entry;
		// the allowlist regex can be applied to the secret (default), the full
		// match, or the whole line
		allowlistTarget := finding.Secret
		switch rule.Allowlist.RegexTarget {
		case "match":
			allowlistTarget = finding.Match
		case "line":
			allowlistTarget = finding.Line
		}

		// same target selection, independently, for the global allowlist
		globalAllowlistTarget := finding.Secret
		switch d.Config.Allowlist.RegexTarget {
		case "match":
			globalAllowlistTarget = finding.Match
		case "line":
			globalAllowlistTarget = finding.Line
		}
		if rule.Allowlist.RegexAllowed(allowlistTarget) ||
			d.Config.Allowlist.RegexAllowed(globalAllowlistTarget) {
			continue
		}

		// check if the secret is in the list of stopwords
		if rule.Allowlist.ContainsStopWord(finding.Secret) ||
			d.Config.Allowlist.ContainsStopWord(finding.Secret) {
			continue
		}

		// check entropy
		entropy := shannonEntropy(finding.Secret)
		finding.Entropy = float32(entropy)
		if rule.Entropy != 0.0 {
			if entropy <= rule.Entropy {
				// entropy is too low, skip this finding
				continue
			}
			// NOTE: this is a goofy hack to get around the fact there golang's regex engine
			// does not support positive lookaheads. Ideally we would want to add a
			// restriction on generic rules regex that requires the secret match group
			// contains both numbers and alphabetical characters, not just alphabetical characters.
			// What this bit of code does is check if the ruleid is prepended with "generic" and enforces the
			// secret contains both digits and alphabetical characters.
			// TODO: this should be replaced with stop words
			if strings.HasPrefix(rule.RuleID, "generic") {
				if !containsDigit(secret) {
					continue
				}
			}
		}
		findings = append(findings, finding)
	}
	return findings
}
  308. // DetectGit accepts source directory, log opts and GitScanType and returns a slice of report.Finding.
  309. func (d *Detector) DetectGit(source string, logOpts string, gitScanType GitScanType) ([]report.Finding, error) {
  310. var (
  311. diffFilesCmd *git.DiffFilesCmd
  312. err error
  313. )
  314. switch gitScanType {
  315. case DetectType:
  316. diffFilesCmd, err = git.NewGitLogCmd(source, logOpts)
  317. if err != nil {
  318. return d.findings, err
  319. }
  320. case ProtectType:
  321. diffFilesCmd, err = git.NewGitDiffCmd(source, false)
  322. if err != nil {
  323. return d.findings, err
  324. }
  325. case ProtectStagedType:
  326. diffFilesCmd, err = git.NewGitDiffCmd(source, true)
  327. if err != nil {
  328. return d.findings, err
  329. }
  330. }
  331. defer diffFilesCmd.Wait()
  332. diffFilesCh := diffFilesCmd.DiffFilesCh()
  333. errCh := diffFilesCmd.ErrCh()
  334. s := semgroup.NewGroup(context.Background(), 4)
  335. // loop to range over both DiffFiles (stdout) and ErrCh (stderr)
  336. for diffFilesCh != nil || errCh != nil {
  337. select {
  338. case gitdiffFile, open := <-diffFilesCh:
  339. if !open {
  340. diffFilesCh = nil
  341. break
  342. }
  343. // skip binary files
  344. if gitdiffFile.IsBinary || gitdiffFile.IsDelete {
  345. continue
  346. }
  347. // Check if commit is allowed
  348. commitSHA := ""
  349. if gitdiffFile.PatchHeader != nil {
  350. commitSHA = gitdiffFile.PatchHeader.SHA
  351. if d.Config.Allowlist.CommitAllowed(gitdiffFile.PatchHeader.SHA) {
  352. continue
  353. }
  354. }
  355. d.addCommit(commitSHA)
  356. s.Go(func() error {
  357. for _, textFragment := range gitdiffFile.TextFragments {
  358. if textFragment == nil {
  359. return nil
  360. }
  361. fragment := Fragment{
  362. Raw: textFragment.Raw(gitdiff.OpAdd),
  363. CommitSHA: commitSHA,
  364. FilePath: gitdiffFile.NewName,
  365. }
  366. for _, finding := range d.Detect(fragment) {
  367. d.addFinding(augmentGitFinding(finding, textFragment, gitdiffFile))
  368. }
  369. }
  370. return nil
  371. })
  372. case err, open := <-errCh:
  373. if !open {
  374. errCh = nil
  375. break
  376. }
  377. return d.findings, err
  378. }
  379. }
  380. if err := s.Wait(); err != nil {
  381. return d.findings, err
  382. }
  383. log.Info().Msgf("%d commits scanned.", len(d.commitMap))
  384. log.Debug().Msg("Note: this number might be smaller than expected due to commits with no additions")
  385. return d.findings, nil
  386. }
// scanTarget is a unit of work for DetectFiles: the resolved path to scan
// and, when the file was reached through a symlink, the symlink's own path.
type scanTarget struct {
	Path    string
	Symlink string
}
  391. // DetectFiles accepts a path to a source directory or file and begins a scan of the
  392. // file or directory.
  393. func (d *Detector) DetectFiles(source string) ([]report.Finding, error) {
  394. s := semgroup.NewGroup(context.Background(), 4)
  395. paths := make(chan scanTarget)
  396. s.Go(func() error {
  397. defer close(paths)
  398. return filepath.Walk(source,
  399. func(path string, fInfo os.FileInfo, err error) error {
  400. if err != nil {
  401. return err
  402. }
  403. if fInfo.Name() == ".git" && fInfo.IsDir() {
  404. return filepath.SkipDir
  405. }
  406. if fInfo.Size() == 0 {
  407. return nil
  408. }
  409. if fInfo.Mode().IsRegular() {
  410. paths <- scanTarget{
  411. Path: path,
  412. Symlink: "",
  413. }
  414. }
  415. if fInfo.Mode().Type() == fs.ModeSymlink && d.FollowSymlinks {
  416. realPath, err := filepath.EvalSymlinks(path)
  417. if err != nil {
  418. return err
  419. }
  420. realPathFileInfo, _ := os.Stat(realPath)
  421. if realPathFileInfo.IsDir() {
  422. log.Debug().Msgf("found symlinked directory: %s -> %s [skipping]", path, realPath)
  423. return nil
  424. }
  425. paths <- scanTarget{
  426. Path: realPath,
  427. Symlink: path,
  428. }
  429. }
  430. return nil
  431. })
  432. })
  433. for pa := range paths {
  434. p := pa
  435. s.Go(func() error {
  436. b, err := os.ReadFile(p.Path)
  437. if err != nil {
  438. return err
  439. }
  440. mimetype, err := filetype.Match(b)
  441. if err != nil {
  442. return err
  443. }
  444. if mimetype.MIME.Type == "application" {
  445. return nil // skip binary files
  446. }
  447. fragment := Fragment{
  448. Raw: string(b),
  449. FilePath: p.Path,
  450. }
  451. if p.Symlink != "" {
  452. fragment.SymlinkFile = p.Symlink
  453. }
  454. for _, finding := range d.Detect(fragment) {
  455. // need to add 1 since line counting starts at 1
  456. finding.EndLine++
  457. finding.StartLine++
  458. d.addFinding(finding)
  459. }
  460. return nil
  461. })
  462. }
  463. if err := s.Wait(); err != nil {
  464. return d.findings, err
  465. }
  466. return d.findings, nil
  467. }
  468. // DetectReader accepts an io.Reader and a buffer size for the reader in KB
  469. func (d *Detector) DetectReader(r io.Reader, bufSize int) ([]report.Finding, error) {
  470. reader := bufio.NewReader(r)
  471. buf := make([]byte, 0, 1000*bufSize)
  472. findings := []report.Finding{}
  473. for {
  474. n, err := reader.Read(buf[:cap(buf)])
  475. buf = buf[:n]
  476. if err != nil {
  477. if err != io.EOF {
  478. return findings, err
  479. }
  480. break
  481. }
  482. fragment := Fragment{
  483. Raw: string(buf),
  484. }
  485. for _, finding := range d.Detect(fragment) {
  486. findings = append(findings, finding)
  487. if d.Verbose {
  488. printFinding(finding, d.NoColor)
  489. }
  490. }
  491. }
  492. return findings, nil
  493. }
// Detect scans the given fragment and returns a list of findings, applying
// every configured rule (prefiltered by keyword) and redacting per d.Redact.
func (d *Detector) Detect(fragment Fragment) []report.Finding {
	var findings []report.Finding

	// initiate fragment keywords
	fragment.keywords = make(map[string]bool)

	// skip the fragment entirely if its path is globally allowlisted, is the
	// gitleaks config itself, or is the baseline report
	if fragment.FilePath != "" && (d.Config.Allowlist.PathAllowed(fragment.FilePath) ||
		fragment.FilePath == d.Config.Path || (d.baselinePath != "" && fragment.FilePath == d.baselinePath)) {
		return findings
	}

	// add newline indices for location calculation in detectRule
	// NOTE(review): compiling this regexp on every call is avoidable —
	// consider hoisting to a package-level var
	fragment.newlineIndices = regexp.MustCompile("\n").FindAllStringIndex(fragment.Raw, -1)

	// build keyword map for prefiltering rules; matching is case-insensitive,
	// so the haystack is lowercased here and rule keywords are lowercased below
	normalizedRaw := strings.ToLower(fragment.Raw)
	matches := d.prefilter.MatchString(normalizedRaw)
	for _, m := range matches {
		// record the matched keyword by slicing the lowercased raw at the
		// match position
		fragment.keywords[normalizedRaw[m.Pos():int(m.Pos())+len(m.Match())]] = true
	}

	for _, rule := range d.Config.Rules {
		if len(rule.Keywords) == 0 {
			// if no keywords are associated with the rule always scan the
			// fragment using the rule
			findings = append(findings, d.detectRule(fragment, rule)...)
			continue
		}
		fragmentContainsKeyword := false
		// check if any of the rule's keywords are in the fragment
		for _, k := range rule.Keywords {
			if _, ok := fragment.keywords[strings.ToLower(k)]; ok {
				fragmentContainsKeyword = true
			}
		}
		if fragmentContainsKeyword {
			findings = append(findings, d.detectRule(fragment, rule)...)
		}
	}

	return filter(findings, d.Redact)
}
  532. // addFinding synchronously adds a finding to the findings slice
  533. func (d *Detector) addFinding(finding report.Finding) {
  534. globalFingerprint := fmt.Sprintf("%s:%s:%d", finding.File, finding.RuleID, finding.StartLine)
  535. if finding.Commit != "" {
  536. finding.Fingerprint = fmt.Sprintf("%s:%s:%s:%d", finding.Commit, finding.File, finding.RuleID, finding.StartLine)
  537. } else {
  538. finding.Fingerprint = globalFingerprint
  539. }
  540. // check if we should ignore this finding
  541. if _, ok := d.gitleaksIgnore[globalFingerprint]; ok {
  542. log.Debug().Msgf("ignoring finding with global Fingerprint %s",
  543. finding.Fingerprint)
  544. return
  545. } else if finding.Commit != "" {
  546. // Awkward nested if because I'm not sure how to chain these two conditions.
  547. if _, ok := d.gitleaksIgnore[finding.Fingerprint]; ok {
  548. log.Debug().Msgf("ignoring finding with Fingerprint %s",
  549. finding.Fingerprint)
  550. return
  551. }
  552. }
  553. if d.baseline != nil && !IsNew(finding, d.baseline) {
  554. log.Debug().Msgf("baseline duplicate -- ignoring finding with Fingerprint %s", finding.Fingerprint)
  555. return
  556. }
  557. d.findingMutex.Lock()
  558. d.findings = append(d.findings, finding)
  559. if d.Verbose {
  560. printFinding(finding, d.NoColor)
  561. }
  562. d.findingMutex.Unlock()
  563. }
// addCommit records a commit SHA in commitMap so the number of scanned
// commits can be reported after a git scan.
func (d *Detector) addCommit(commit string) {
	d.commitMap[commit] = true
}