// detect.go
  1. package detect
  2. import (
  3. "bufio"
  4. "context"
  5. "fmt"
  6. "io"
  7. "io/fs"
  8. "os"
  9. "path/filepath"
  10. "regexp"
  11. "strings"
  12. "sync"
  13. "github.com/h2non/filetype"
  14. "github.com/zricethezav/gitleaks/v8/config"
  15. "github.com/zricethezav/gitleaks/v8/detect/git"
  16. "github.com/zricethezav/gitleaks/v8/report"
  17. ahocorasick "github.com/BobuSumisu/aho-corasick"
  18. "github.com/fatih/semgroup"
  19. "github.com/gitleaks/go-gitdiff/gitdiff"
  20. "github.com/rs/zerolog/log"
  21. "github.com/spf13/viper"
  22. )
// GitScanType is used to differentiate between git scan types:
// $ gitleaks detect
// $ gitleaks protect
// $ gitleaks protect staged
type GitScanType int

const (
	DetectType GitScanType = iota
	ProtectType
	ProtectStagedType

	// gitleaksAllowSignature is the inline annotation that marks a line as an
	// accepted false positive; detectRule skips matches on such lines unless
	// Detector.IgnoreGitleaksAllow is set.
	gitleaksAllowSignature = "gitleaks:allow"
	// chunkSize is the read-buffer size used by DetectFiles when scanning
	// files in chunks.
	chunkSize = 10 * 1_000 // 10kb
)
// Detector is the main detector struct
type Detector struct {
	// Config is the configuration for the detector
	Config config.Config

	// Redact is a flag to redact findings. This is exported
	// so users using gitleaks as a library can set this flag
	// without calling `detector.Start(cmd *cobra.Command)`
	Redact uint

	// Verbose is a flag to print findings
	Verbose bool

	// MaxTargetMegaBytes: files larger than this will be skipped
	MaxTargetMegaBytes int

	// FollowSymlinks is a flag to enable scanning symlink files
	FollowSymlinks bool

	// NoColor is a flag to disable color output
	NoColor bool

	// IgnoreGitleaksAllow is a flag to ignore gitleaks:allow comments.
	IgnoreGitleaksAllow bool

	// commitMap is used to keep track of commits that have been scanned.
	// This is only used for logging purposes and git scans.
	commitMap map[string]bool

	// findingMutex is to prevent concurrent access to the
	// findings slice when adding findings.
	findingMutex *sync.Mutex

	// findings is a slice of report.Findings. This is the result
	// of the detector's scan which can then be used to generate a
	// report.
	findings []report.Finding

	// prefilter is a ahocorasick struct used for doing efficient string
	// matching given a set of words (keywords from the rules in the config)
	prefilter ahocorasick.Trie

	// baseline is a list of known findings that should be ignored
	baseline []report.Finding

	// baselinePath is the path to the baseline file; fragments whose
	// FilePath equals this path are excluded from scanning.
	baselinePath string

	// gitleaksIgnore holds fingerprints loaded from a .gitleaksignore file;
	// findings whose fingerprint appears here are dropped in addFinding.
	gitleaksIgnore map[string]bool
}
// Fragment contains the data to be scanned
type Fragment struct {
	// Raw is the raw content of the fragment
	Raw string

	// FilePath is the path to the file if applicable
	FilePath string

	// SymlinkFile is the symlink path that resolved to FilePath, when the
	// fragment came from a followed symlink (see DetectFiles).
	SymlinkFile string

	// CommitSHA is the SHA of the commit if applicable
	CommitSHA string

	// newlineIndices is a list of indices of newlines in the raw content.
	// This is used to calculate the line location of a finding
	newlineIndices [][]int

	// keywords is a map of all the keywords contain within the contents
	// of this fragment
	keywords map[string]bool
}
  89. // NewDetector creates a new detector with the given config
  90. func NewDetector(cfg config.Config) *Detector {
  91. return &Detector{
  92. commitMap: make(map[string]bool),
  93. gitleaksIgnore: make(map[string]bool),
  94. findingMutex: &sync.Mutex{},
  95. findings: make([]report.Finding, 0),
  96. Config: cfg,
  97. prefilter: *ahocorasick.NewTrieBuilder().AddStrings(cfg.Keywords).Build(),
  98. }
  99. }
  100. // NewDetectorDefaultConfig creates a new detector with the default config
  101. func NewDetectorDefaultConfig() (*Detector, error) {
  102. viper.SetConfigType("toml")
  103. err := viper.ReadConfig(strings.NewReader(config.DefaultConfig))
  104. if err != nil {
  105. return nil, err
  106. }
  107. var vc config.ViperConfig
  108. err = viper.Unmarshal(&vc)
  109. if err != nil {
  110. return nil, err
  111. }
  112. cfg, err := vc.Translate()
  113. if err != nil {
  114. return nil, err
  115. }
  116. return NewDetector(cfg), nil
  117. }
  118. func (d *Detector) AddGitleaksIgnore(gitleaksIgnorePath string) error {
  119. log.Debug().Msgf("found .gitleaksignore file: %s", gitleaksIgnorePath)
  120. file, err := os.Open(gitleaksIgnorePath)
  121. if err != nil {
  122. return err
  123. }
  124. // https://github.com/securego/gosec/issues/512
  125. defer func() {
  126. if err := file.Close(); err != nil {
  127. log.Warn().Msgf("Error closing .gitleaksignore file: %s\n", err)
  128. }
  129. }()
  130. scanner := bufio.NewScanner(file)
  131. for scanner.Scan() {
  132. d.gitleaksIgnore[scanner.Text()] = true
  133. }
  134. return nil
  135. }
  136. func (d *Detector) AddBaseline(baselinePath string, source string) error {
  137. if baselinePath != "" {
  138. absoluteSource, err := filepath.Abs(source)
  139. if err != nil {
  140. return err
  141. }
  142. absoluteBaseline, err := filepath.Abs(baselinePath)
  143. if err != nil {
  144. return err
  145. }
  146. relativeBaseline, err := filepath.Rel(absoluteSource, absoluteBaseline)
  147. if err != nil {
  148. return err
  149. }
  150. baseline, err := LoadBaseline(baselinePath)
  151. if err != nil {
  152. return err
  153. }
  154. d.baseline = baseline
  155. baselinePath = relativeBaseline
  156. }
  157. d.baselinePath = baselinePath
  158. return nil
  159. }
  160. // DetectBytes scans the given bytes and returns a list of findings
  161. func (d *Detector) DetectBytes(content []byte) []report.Finding {
  162. return d.DetectString(string(content))
  163. }
  164. // DetectString scans the given string and returns a list of findings
  165. func (d *Detector) DetectString(content string) []report.Finding {
  166. return d.Detect(Fragment{
  167. Raw: content,
  168. })
  169. }
// detectRule scans the given fragment for the given rule and returns a list of findings
func (d *Detector) detectRule(fragment Fragment, rule config.Rule) []report.Finding {
	var findings []report.Finding

	// check if filepath or commit is allowed for this rule
	if rule.Allowlist.CommitAllowed(fragment.CommitSHA) ||
		rule.Allowlist.PathAllowed(fragment.FilePath) {
		return findings
	}

	if rule.Path != nil && rule.Regex == nil {
		// Path _only_ rule: a matching file path is itself the finding,
		// no content scanning happens.
		if rule.Path.MatchString(fragment.FilePath) {
			finding := report.Finding{
				Description: rule.Description,
				File:        fragment.FilePath,
				SymlinkFile: fragment.SymlinkFile,
				RuleID:      rule.RuleID,
				Match:       fmt.Sprintf("file detected: %s", fragment.FilePath),
				Tags:        rule.Tags,
			}
			return append(findings, finding)
		}
	} else if rule.Path != nil {
		// if path is set _and_ a regex is set, then we need to check both
		// so if the path does not match, then we should return early and not
		// consider the regex
		if !rule.Path.MatchString(fragment.FilePath) {
			return findings
		}
	}

	// if path only rule, skip content checks
	if rule.Regex == nil {
		return findings
	}

	// If flag configure and raw data size bigger then the flag
	// (size is computed in whole megabytes, so fragments under 1MB pass).
	if d.MaxTargetMegaBytes > 0 {
		rawLength := len(fragment.Raw) / 1000000
		if rawLength > d.MaxTargetMegaBytes {
			log.Debug().Msgf("skipping file: %s scan due to size: %d", fragment.FilePath, rawLength)
			return findings
		}
	}

	matchIndices := rule.Regex.FindAllStringIndex(fragment.Raw, -1)
	for _, matchIndex := range matchIndices {
		// extract secret from match
		secret := strings.Trim(fragment.Raw[matchIndex[0]:matchIndex[1]], "\n")

		// determine location of match. Note that the location
		// in the finding will be the line/column numbers of the _match_
		// not the _secret_, which will be different if the secretGroup
		// value is set for this rule
		loc := location(fragment, matchIndex)

		// extend the reported line end when the match runs past the
		// computed end-of-line index (e.g. a match spanning the final,
		// newline-less line)
		if matchIndex[1] > loc.endLineIndex {
			loc.endLineIndex = matchIndex[1]
		}

		finding := report.Finding{
			Description: rule.Description,
			File:        fragment.FilePath,
			SymlinkFile: fragment.SymlinkFile,
			RuleID:      rule.RuleID,
			StartLine:   loc.startLine,
			EndLine:     loc.endLine,
			StartColumn: loc.startColumn,
			EndColumn:   loc.endColumn,
			Secret:      secret,
			Match:       secret,
			Tags:        rule.Tags,
			Line:        fragment.Raw[loc.startLineIndex:loc.endLineIndex],
		}

		// honor inline "gitleaks:allow" annotations on the matched line,
		// unless the detector is configured to ignore them
		if strings.Contains(fragment.Raw[loc.startLineIndex:loc.endLineIndex],
			gitleaksAllowSignature) && !d.IgnoreGitleaksAllow {
			continue
		}

		// by default if secret group is not set, we will check to see if there
		// are any capture groups. If there are, we will use the first capture to start
		groups := rule.Regex.FindStringSubmatch(secret)
		if rule.SecretGroup == 0 {
			// if len(groups) == 2 that means there is only one capture group
			// the first element in groups is the full match, the second is the
			// first capture group
			if len(groups) == 2 {
				secret = groups[1]
				finding.Secret = secret
			}
		} else {
			if len(groups) <= rule.SecretGroup || len(groups) == 0 {
				// Config validation should prevent this
				continue
			}
			secret = groups[rule.SecretGroup]
			finding.Secret = secret
		}

		// check if the regexTarget is defined in the allowlist "regexes" entry;
		// the allowlist regex can target the secret (default), the full match,
		// or the whole line
		allowlistTarget := finding.Secret
		switch rule.Allowlist.RegexTarget {
		case "match":
			allowlistTarget = finding.Match
		case "line":
			allowlistTarget = finding.Line
		}

		// the global (config-level) allowlist can use a different target
		globalAllowlistTarget := finding.Secret
		switch d.Config.Allowlist.RegexTarget {
		case "match":
			globalAllowlistTarget = finding.Match
		case "line":
			globalAllowlistTarget = finding.Line
		}
		if rule.Allowlist.RegexAllowed(allowlistTarget) ||
			d.Config.Allowlist.RegexAllowed(globalAllowlistTarget) {
			continue
		}

		// check if the secret is in the list of stopwords
		if rule.Allowlist.ContainsStopWord(finding.Secret) ||
			d.Config.Allowlist.ContainsStopWord(finding.Secret) {
			continue
		}

		// check entropy; a rule entropy of 0.0 means "no entropy threshold"
		entropy := shannonEntropy(finding.Secret)
		finding.Entropy = float32(entropy)
		if rule.Entropy != 0.0 {
			if entropy <= rule.Entropy {
				// entropy is too low, skip this finding
				continue
			}
			// NOTE: this is a goofy hack to get around the fact there golang's regex engine
			// does not support positive lookaheads. Ideally we would want to add a
			// restriction on generic rules regex that requires the secret match group
			// contains both numbers and alphabetical characters, not just alphabetical characters.
			// What this bit of code does is check if the ruleid is prepended with "generic" and enforces the
			// secret contains both digits and alphabetical characters.
			// TODO: this should be replaced with stop words
			if strings.HasPrefix(rule.RuleID, "generic") {
				if !containsDigit(secret) {
					continue
				}
			}
		}
		findings = append(findings, finding)
	}
	return findings
}
// DetectGit accepts source directory, log opts and GitScanType and returns a slice of report.Finding.
func (d *Detector) DetectGit(source string, logOpts string, gitScanType GitScanType) ([]report.Finding, error) {
	var (
		diffFilesCmd *git.DiffFilesCmd
		err          error
	)
	switch gitScanType {
	case DetectType:
		diffFilesCmd, err = git.NewGitLogCmd(source, logOpts)
		if err != nil {
			return d.findings, err
		}
	case ProtectType:
		diffFilesCmd, err = git.NewGitDiffCmd(source, false)
		if err != nil {
			return d.findings, err
		}
	case ProtectStagedType:
		diffFilesCmd, err = git.NewGitDiffCmd(source, true)
		if err != nil {
			return d.findings, err
		}
	}
	// NOTE(review): a gitScanType outside the three cases above leaves
	// diffFilesCmd nil and this deferred Wait would panic — confirm all
	// callers pass a defined GitScanType.
	defer diffFilesCmd.Wait()
	diffFilesCh := diffFilesCmd.DiffFilesCh()
	errCh := diffFilesCmd.ErrCh()
	s := semgroup.NewGroup(context.Background(), 4)

	// loop to range over both DiffFiles (stdout) and ErrCh (stderr);
	// a closed channel is set to nil so select stops considering it
	for diffFilesCh != nil || errCh != nil {
		select {
		case gitdiffFile, open := <-diffFilesCh:
			if !open {
				diffFilesCh = nil
				break
			}

			// skip binary files
			if gitdiffFile.IsBinary || gitdiffFile.IsDelete {
				continue
			}

			// Check if commit is allowed
			commitSHA := ""
			if gitdiffFile.PatchHeader != nil {
				commitSHA = gitdiffFile.PatchHeader.SHA
				if d.Config.Allowlist.CommitAllowed(gitdiffFile.PatchHeader.SHA) {
					continue
				}
			}
			d.addCommit(commitSHA)

			// scan each text fragment of this file concurrently (bounded by
			// the semgroup's limit of 4); gitdiffFile and commitSHA are fresh
			// per select iteration, so capturing them here is safe
			s.Go(func() error {
				for _, textFragment := range gitdiffFile.TextFragments {
					if textFragment == nil {
						return nil
					}
					fragment := Fragment{
						Raw:       textFragment.Raw(gitdiff.OpAdd),
						CommitSHA: commitSHA,
						FilePath:  gitdiffFile.NewName,
					}
					for _, finding := range d.Detect(fragment) {
						d.addFinding(augmentGitFinding(finding, textFragment, gitdiffFile))
					}
				}
				return nil
			})
		case err, open := <-errCh:
			if !open {
				errCh = nil
				break
			}
			// NOTE(review): returning here skips s.Wait(), so in-flight scan
			// goroutines may still be running when we return — confirm this
			// early exit is intentional.
			return d.findings, err
		}
	}

	if err := s.Wait(); err != nil {
		return d.findings, err
	}
	log.Info().Msgf("%d commits scanned.", len(d.commitMap))
	log.Debug().Msg("Note: this number might be smaller than expected due to commits with no additions")
	return d.findings, nil
}
// scanTarget is a unit of work for DetectFiles: the real path to scan and,
// when the target was reached through a symlink, the symlink's own path.
type scanTarget struct {
	Path    string
	Symlink string
}
  392. // DetectFiles accepts a path to a source directory or file and begins a scan of the
  393. // file or directory.
  394. func (d *Detector) DetectFiles(source string) ([]report.Finding, error) {
  395. s := semgroup.NewGroup(context.Background(), 4)
  396. paths := make(chan scanTarget)
  397. s.Go(func() error {
  398. defer close(paths)
  399. return filepath.Walk(source,
  400. func(path string, fInfo os.FileInfo, err error) error {
  401. if err != nil {
  402. return err
  403. }
  404. if fInfo.Name() == ".git" && fInfo.IsDir() {
  405. return filepath.SkipDir
  406. }
  407. if fInfo.Size() == 0 {
  408. return nil
  409. }
  410. if fInfo.Mode().IsRegular() {
  411. paths <- scanTarget{
  412. Path: path,
  413. Symlink: "",
  414. }
  415. }
  416. if fInfo.Mode().Type() == fs.ModeSymlink && d.FollowSymlinks {
  417. realPath, err := filepath.EvalSymlinks(path)
  418. if err != nil {
  419. return err
  420. }
  421. realPathFileInfo, _ := os.Stat(realPath)
  422. if realPathFileInfo.IsDir() {
  423. log.Debug().Msgf("found symlinked directory: %s -> %s [skipping]", path, realPath)
  424. return nil
  425. }
  426. paths <- scanTarget{
  427. Path: realPath,
  428. Symlink: path,
  429. }
  430. }
  431. return nil
  432. })
  433. })
  434. for pa := range paths {
  435. p := pa
  436. s.Go(func() error {
  437. f, err := os.Open(p.Path)
  438. if err != nil {
  439. return err
  440. }
  441. defer f.Close()
  442. // Buffer to hold file chunks
  443. buf := make([]byte, chunkSize)
  444. totalLines := 0
  445. for {
  446. n, err := f.Read(buf)
  447. if err != nil && err != io.EOF {
  448. return err
  449. }
  450. if n == 0 {
  451. break
  452. }
  453. // TODO: optimization could be introduced here
  454. mimetype, err := filetype.Match(buf[:n])
  455. if err != nil {
  456. return err
  457. }
  458. if mimetype.MIME.Type == "application" {
  459. return nil // skip binary files
  460. }
  461. // Count the number of newlines in this chunk
  462. linesInChunk := strings.Count(string(buf[:n]), "\n")
  463. totalLines += linesInChunk
  464. fragment := Fragment{
  465. Raw: string(buf[:n]),
  466. FilePath: p.Path,
  467. }
  468. if p.Symlink != "" {
  469. fragment.SymlinkFile = p.Symlink
  470. }
  471. for _, finding := range d.Detect(fragment) {
  472. // need to add 1 since line counting starts at 1
  473. finding.StartLine += (totalLines - linesInChunk) + 1
  474. finding.EndLine += (totalLines - linesInChunk) + 1
  475. d.addFinding(finding)
  476. }
  477. }
  478. return nil
  479. })
  480. }
  481. if err := s.Wait(); err != nil {
  482. return d.findings, err
  483. }
  484. return d.findings, nil
  485. }
  486. // DetectReader accepts an io.Reader and a buffer size for the reader in KB
  487. func (d *Detector) DetectReader(r io.Reader, bufSize int) ([]report.Finding, error) {
  488. reader := bufio.NewReader(r)
  489. buf := make([]byte, 0, 1000*bufSize)
  490. findings := []report.Finding{}
  491. for {
  492. n, err := reader.Read(buf[:cap(buf)])
  493. buf = buf[:n]
  494. if err != nil {
  495. if err != io.EOF {
  496. return findings, err
  497. }
  498. break
  499. }
  500. fragment := Fragment{
  501. Raw: string(buf),
  502. }
  503. for _, finding := range d.Detect(fragment) {
  504. findings = append(findings, finding)
  505. if d.Verbose {
  506. printFinding(finding, d.NoColor)
  507. }
  508. }
  509. }
  510. return findings, nil
  511. }
  512. // Detect scans the given fragment and returns a list of findings
  513. func (d *Detector) Detect(fragment Fragment) []report.Finding {
  514. var findings []report.Finding
  515. // initiate fragment keywords
  516. fragment.keywords = make(map[string]bool)
  517. // check if filepath is allowed
  518. if fragment.FilePath != "" && (d.Config.Allowlist.PathAllowed(fragment.FilePath) ||
  519. fragment.FilePath == d.Config.Path || (d.baselinePath != "" && fragment.FilePath == d.baselinePath)) {
  520. return findings
  521. }
  522. // add newline indices for location calculation in detectRule
  523. fragment.newlineIndices = regexp.MustCompile("\n").FindAllStringIndex(fragment.Raw, -1)
  524. // build keyword map for prefiltering rules
  525. normalizedRaw := strings.ToLower(fragment.Raw)
  526. matches := d.prefilter.MatchString(normalizedRaw)
  527. for _, m := range matches {
  528. fragment.keywords[normalizedRaw[m.Pos():int(m.Pos())+len(m.Match())]] = true
  529. }
  530. for _, rule := range d.Config.Rules {
  531. if len(rule.Keywords) == 0 {
  532. // if not keywords are associated with the rule always scan the
  533. // fragment using the rule
  534. findings = append(findings, d.detectRule(fragment, rule)...)
  535. continue
  536. }
  537. fragmentContainsKeyword := false
  538. // check if keywords are in the fragment
  539. for _, k := range rule.Keywords {
  540. if _, ok := fragment.keywords[strings.ToLower(k)]; ok {
  541. fragmentContainsKeyword = true
  542. }
  543. }
  544. if fragmentContainsKeyword {
  545. findings = append(findings, d.detectRule(fragment, rule)...)
  546. }
  547. }
  548. return filter(findings, d.Redact)
  549. }
  550. // addFinding synchronously adds a finding to the findings slice
  551. func (d *Detector) addFinding(finding report.Finding) {
  552. globalFingerprint := fmt.Sprintf("%s:%s:%d", finding.File, finding.RuleID, finding.StartLine)
  553. if finding.Commit != "" {
  554. finding.Fingerprint = fmt.Sprintf("%s:%s:%s:%d", finding.Commit, finding.File, finding.RuleID, finding.StartLine)
  555. } else {
  556. finding.Fingerprint = globalFingerprint
  557. }
  558. // check if we should ignore this finding
  559. if _, ok := d.gitleaksIgnore[globalFingerprint]; ok {
  560. log.Debug().Msgf("ignoring finding with global Fingerprint %s",
  561. finding.Fingerprint)
  562. return
  563. } else if finding.Commit != "" {
  564. // Awkward nested if because I'm not sure how to chain these two conditions.
  565. if _, ok := d.gitleaksIgnore[finding.Fingerprint]; ok {
  566. log.Debug().Msgf("ignoring finding with Fingerprint %s",
  567. finding.Fingerprint)
  568. return
  569. }
  570. }
  571. if d.baseline != nil && !IsNew(finding, d.baseline) {
  572. log.Debug().Msgf("baseline duplicate -- ignoring finding with Fingerprint %s", finding.Fingerprint)
  573. return
  574. }
  575. d.findingMutex.Lock()
  576. d.findings = append(d.findings, finding)
  577. if d.Verbose {
  578. printFinding(finding, d.NoColor)
  579. }
  580. d.findingMutex.Unlock()
  581. }
// addCommit synchronously adds a commit to the commit slice.
// NOTE(review): commitMap is written here from DetectGit's select loop
// goroutine only, so no lock is taken — confirm no concurrent callers exist.
func (d *Detector) addCommit(commit string) {
	d.commitMap[commit] = true
}