detect.go 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626
  1. package detect
  2. import (
  3. "bufio"
  4. "context"
  5. "fmt"
  6. "os"
  7. "runtime"
  8. "strings"
  9. "sync"
  10. "sync/atomic"
  11. "github.com/zricethezav/gitleaks/v8/config"
  12. "github.com/zricethezav/gitleaks/v8/logging"
  13. "github.com/zricethezav/gitleaks/v8/regexp"
  14. "github.com/zricethezav/gitleaks/v8/report"
  15. ahocorasick "github.com/BobuSumisu/aho-corasick"
  16. "github.com/fatih/semgroup"
  17. "github.com/rs/zerolog"
  18. "github.com/spf13/viper"
  19. "golang.org/x/exp/maps"
  20. )
const (
	// gitleaksAllowSignature is the marker text that, when present on the
	// line of a finding, causes that finding to be skipped (unless the
	// detector's IgnoreGitleaksAllow flag is set).
	gitleaksAllowSignature = "gitleaks:allow"
	// chunkSize evaluates to 100,000.
	// NOTE(review): presumably a size in bytes used when reading input in
	// chunks; it is not referenced in this part of the file — confirm.
	chunkSize = 100 * 1_000 // 100kb
)

var (
	// newLineRegexp matches a single newline character. Detect uses it to
	// record the newline positions of a fragment.
	newLineRegexp = regexp.MustCompile("\n")
	// isWindows reports whether the program was built for Windows.
	isWindows = runtime.GOOS == "windows"
)
// Detector is the main detector struct. It carries the scan configuration,
// the user-facing options, and the findings accumulated so far.
type Detector struct {
	// Config is the configuration for the detector
	Config config.Config

	// Redact is a flag to redact findings. This is exported
	// so users using gitleaks as a library can set this flag
	// without calling `detector.Start(cmd *cobra.Command)`.
	// Its value is handed to filter() at the end of Detect.
	Redact uint

	// Verbose is a flag to print findings. When set, AddFinding prints
	// every finding it stores.
	Verbose bool

	// MaxDecodeDepth limits how many recursive decoding passes are allowed.
	// Detect always scans the raw content once; decoding passes only run
	// while the pass counter does not exceed this value.
	MaxDecodeDepth int

	// MaxTargetMegaBytes is a size limit: content larger than this
	// (measured as len/1,000,000) is skipped by detectRule.
	// A value of 0 or less disables the check.
	MaxTargetMegaBytes int

	// FollowSymlinks is a flag to enable scanning symlink files
	FollowSymlinks bool

	// NoColor is a flag to disable color output
	NoColor bool

	// IgnoreGitleaksAllow is a flag to ignore gitleaks:allow comments.
	// When false, findings whose line contains the signature are skipped.
	IgnoreGitleaksAllow bool

	// commitMap is used to keep track of commits that have been scanned.
	// This is only used for logging purposes and git scans.
	commitMap map[string]bool

	// findingMutex is to prevent concurrent access to the
	// findings slice when adding findings.
	findingMutex *sync.Mutex

	// findings is a slice of report.Findings. This is the result
	// of the detector's scan which can then be used to generate a
	// report.
	findings []report.Finding

	// prefilter is a ahocorasick struct used for doing efficient string
	// matching given a set of words (keywords from the rules in the config)
	prefilter ahocorasick.Trie

	// baseline is a list of known findings that should be ignored.
	// AddFinding drops findings that are not new relative to it.
	baseline []report.Finding

	// baselinePath is the path to the baseline file. Fragments whose
	// FilePath equals it are not scanned.
	baselinePath string

	// gitleaksIgnore is the set of fingerprints that AddFinding skips.
	// It is populated by AddGitleaksIgnore.
	gitleaksIgnore map[string]struct{}

	// Sema (https://github.com/fatih/semgroup) controls the concurrency
	Sema *semgroup.Group

	// report-related settings.
	ReportPath string
	Reporter   report.Reporter

	// TotalBytes is incremented by Detect with the size of every
	// fragment it is given.
	TotalBytes atomic.Uint64
}
// Fragment contains the data to be scanned
type Fragment struct {
	// Raw is the raw content of the fragment
	Raw string

	// Bytes is the content as a byte slice. Within Detect it is only used
	// for byte accounting: when it is nil the length of Raw is counted
	// instead.
	Bytes []byte

	// FilePath is the path to the file, if applicable.
	// The path separator MUST be normalized to `/`.
	FilePath string

	// SymlinkFile is copied verbatim into each finding's SymlinkFile field.
	SymlinkFile string

	// WindowsFilePath is the path with the original separator.
	// This provides a backwards-compatible solution to https://github.com/gitleaks/gitleaks/issues/1565.
	WindowsFilePath string `json:"-"` // TODO: remove this in v9.

	// CommitSHA is the SHA of the commit if applicable
	CommitSHA string

	// newlineIndices is a list of indices of newlines in the raw content.
	// This is used to calculate the line location of a finding.
	// It is filled in by Detect before any rule is evaluated.
	newlineIndices [][]int
}
  93. // NewDetector creates a new detector with the given config
  94. func NewDetector(cfg config.Config) *Detector {
  95. return &Detector{
  96. commitMap: make(map[string]bool),
  97. gitleaksIgnore: make(map[string]struct{}),
  98. findingMutex: &sync.Mutex{},
  99. findings: make([]report.Finding, 0),
  100. Config: cfg,
  101. prefilter: *ahocorasick.NewTrieBuilder().AddStrings(maps.Keys(cfg.Keywords)).Build(),
  102. Sema: semgroup.NewGroup(context.Background(), 40),
  103. }
  104. }
  105. // NewDetectorDefaultConfig creates a new detector with the default config
  106. func NewDetectorDefaultConfig() (*Detector, error) {
  107. viper.SetConfigType("toml")
  108. err := viper.ReadConfig(strings.NewReader(config.DefaultConfig))
  109. if err != nil {
  110. return nil, err
  111. }
  112. var vc config.ViperConfig
  113. err = viper.Unmarshal(&vc)
  114. if err != nil {
  115. return nil, err
  116. }
  117. cfg, err := vc.Translate()
  118. if err != nil {
  119. return nil, err
  120. }
  121. return NewDetector(cfg), nil
  122. }
  123. func (d *Detector) AddGitleaksIgnore(gitleaksIgnorePath string) error {
  124. logging.Debug().Msgf("found .gitleaksignore file: %s", gitleaksIgnorePath)
  125. file, err := os.Open(gitleaksIgnorePath)
  126. if err != nil {
  127. return err
  128. }
  129. defer func() {
  130. // https://github.com/securego/gosec/issues/512
  131. if err := file.Close(); err != nil {
  132. logging.Warn().Msgf("Error closing .gitleaksignore file: %s\n", err)
  133. }
  134. }()
  135. scanner := bufio.NewScanner(file)
  136. replacer := strings.NewReplacer("\\", "/")
  137. for scanner.Scan() {
  138. line := strings.TrimSpace(scanner.Text())
  139. // Skip lines that start with a comment
  140. if line == "" || strings.HasPrefix(line, "#") {
  141. continue
  142. }
  143. // Normalize the path.
  144. // TODO: Make this a breaking change in v9.
  145. s := strings.Split(line, ":")
  146. switch len(s) {
  147. case 3:
  148. // Global fingerprint.
  149. // `file:rule-id:start-line`
  150. s[0] = replacer.Replace(s[0])
  151. case 4:
  152. // Commit fingerprint.
  153. // `commit:file:rule-id:start-line`
  154. s[1] = replacer.Replace(s[1])
  155. default:
  156. logging.Warn().Str("fingerprint", line).Msg("Invalid .gitleaksignore entry")
  157. }
  158. d.gitleaksIgnore[strings.Join(s, ":")] = struct{}{}
  159. }
  160. return nil
  161. }
  162. // DetectBytes scans the given bytes and returns a list of findings
  163. func (d *Detector) DetectBytes(content []byte) []report.Finding {
  164. return d.DetectString(string(content))
  165. }
  166. // DetectString scans the given string and returns a list of findings
  167. func (d *Detector) DetectString(content string) []report.Finding {
  168. return d.Detect(Fragment{
  169. Raw: content,
  170. })
  171. }
  172. // Detect scans the given fragment and returns a list of findings
  173. func (d *Detector) Detect(fragment Fragment) []report.Finding {
  174. if fragment.Bytes == nil {
  175. d.TotalBytes.Add(uint64(len(fragment.Raw)))
  176. }
  177. d.TotalBytes.Add(uint64(len(fragment.Bytes)))
  178. var findings []report.Finding
  179. // check if filepath is allowed
  180. if fragment.FilePath != "" {
  181. // is the path our config or baseline file?
  182. if fragment.FilePath == d.Config.Path || (d.baselinePath != "" && fragment.FilePath == d.baselinePath) ||
  183. // is the path excluded by the global allowlist?
  184. (d.Config.Allowlist.PathAllowed(fragment.FilePath) || (fragment.WindowsFilePath != "" && d.Config.Allowlist.PathAllowed(fragment.WindowsFilePath))) {
  185. return findings
  186. }
  187. }
  188. // add newline indices for location calculation in detectRule
  189. fragment.newlineIndices = newLineRegexp.FindAllStringIndex(fragment.Raw, -1)
  190. // setup variables to handle different decoding passes
  191. currentRaw := fragment.Raw
  192. encodedSegments := []EncodedSegment{}
  193. currentDecodeDepth := 0
  194. decoder := NewDecoder()
  195. for {
  196. // build keyword map for prefiltering rules
  197. keywords := make(map[string]bool)
  198. normalizedRaw := strings.ToLower(currentRaw)
  199. matches := d.prefilter.MatchString(normalizedRaw)
  200. for _, m := range matches {
  201. keywords[normalizedRaw[m.Pos():int(m.Pos())+len(m.Match())]] = true
  202. }
  203. for _, rule := range d.Config.Rules {
  204. if len(rule.Keywords) == 0 {
  205. // if no keywords are associated with the rule always scan the
  206. // fragment using the rule
  207. findings = append(findings, d.detectRule(fragment, currentRaw, rule, encodedSegments)...)
  208. continue
  209. }
  210. // check if keywords are in the fragment
  211. for _, k := range rule.Keywords {
  212. if _, ok := keywords[strings.ToLower(k)]; ok {
  213. findings = append(findings, d.detectRule(fragment, currentRaw, rule, encodedSegments)...)
  214. break
  215. }
  216. }
  217. }
  218. // increment the depth by 1 as we start our decoding pass
  219. currentDecodeDepth++
  220. // stop the loop if we've hit our max decoding depth
  221. if currentDecodeDepth > d.MaxDecodeDepth {
  222. break
  223. }
  224. // decode the currentRaw for the next pass
  225. currentRaw, encodedSegments = decoder.decode(currentRaw, encodedSegments)
  226. // stop the loop when there's nothing else to decode
  227. if len(encodedSegments) == 0 {
  228. break
  229. }
  230. }
  231. return filter(findings, d.Redact)
  232. }
  233. // detectRule scans the given fragment for the given rule and returns a list of findings
  234. func (d *Detector) detectRule(fragment Fragment, currentRaw string, r config.Rule, encodedSegments []EncodedSegment) []report.Finding {
  235. var (
  236. findings []report.Finding
  237. logger = func() zerolog.Logger {
  238. l := logging.With().Str("rule-id", r.RuleID).Str("path", fragment.FilePath)
  239. if fragment.CommitSHA != "" {
  240. l = l.Str("commit", fragment.CommitSHA)
  241. }
  242. return l.Logger()
  243. }()
  244. )
  245. // check if filepath or commit is allowed for this rule
  246. for _, a := range r.Allowlists {
  247. var (
  248. isAllowed bool
  249. commitAllowed, commit = a.CommitAllowed(fragment.CommitSHA)
  250. pathAllowed = a.PathAllowed(fragment.FilePath) || (fragment.WindowsFilePath != "" && a.PathAllowed(fragment.WindowsFilePath))
  251. )
  252. if a.MatchCondition == config.AllowlistMatchAnd {
  253. // Determine applicable checks.
  254. var allowlistChecks []bool
  255. if len(a.Commits) > 0 {
  256. allowlistChecks = append(allowlistChecks, commitAllowed)
  257. }
  258. if len(a.Paths) > 0 {
  259. allowlistChecks = append(allowlistChecks, pathAllowed)
  260. }
  261. // These will be checked later.
  262. if len(a.Regexes) > 0 {
  263. allowlistChecks = append(allowlistChecks, false)
  264. }
  265. if len(a.StopWords) > 0 {
  266. allowlistChecks = append(allowlistChecks, false)
  267. }
  268. // Check if allowed.
  269. isAllowed = allTrue(allowlistChecks)
  270. } else {
  271. isAllowed = commitAllowed || pathAllowed
  272. }
  273. if isAllowed {
  274. event := logger.Trace().Str("condition", a.MatchCondition.String())
  275. if commitAllowed {
  276. event.Str("allowed-commit", commit)
  277. }
  278. if pathAllowed {
  279. event.Bool("allowed-path", pathAllowed)
  280. }
  281. event.Msg("skipping file: rule allowlist")
  282. return findings
  283. }
  284. }
  285. if r.Path != nil {
  286. if r.Regex == nil && len(encodedSegments) == 0 {
  287. // Path _only_ rule
  288. if r.Path.MatchString(fragment.FilePath) || (fragment.WindowsFilePath != "" && r.Path.MatchString(fragment.WindowsFilePath)) {
  289. finding := report.Finding{
  290. RuleID: r.RuleID,
  291. Description: r.Description,
  292. File: fragment.FilePath,
  293. SymlinkFile: fragment.SymlinkFile,
  294. Match: fmt.Sprintf("file detected: %s", fragment.FilePath),
  295. Tags: r.Tags,
  296. }
  297. return append(findings, finding)
  298. }
  299. } else {
  300. // if path is set _and_ a regex is set, then we need to check both
  301. // so if the path does not match, then we should return early and not
  302. // consider the regex
  303. if !(r.Path.MatchString(fragment.FilePath) || (fragment.WindowsFilePath != "" && r.Path.MatchString(fragment.WindowsFilePath))) {
  304. return findings
  305. }
  306. }
  307. }
  308. // if path only rule, skip content checks
  309. if r.Regex == nil {
  310. return findings
  311. }
  312. // if flag configure and raw data size bigger then the flag
  313. if d.MaxTargetMegaBytes > 0 {
  314. rawLength := len(currentRaw) / 1000000
  315. if rawLength > d.MaxTargetMegaBytes {
  316. logger.Debug().
  317. Int("size", rawLength).
  318. Int("max-size", d.MaxTargetMegaBytes).
  319. Msg("skipping fragment: size")
  320. return findings
  321. }
  322. }
  323. // use currentRaw instead of fragment.Raw since this represents the current
  324. // decoding pass on the text
  325. MatchLoop:
  326. for _, matchIndex := range r.Regex.FindAllStringIndex(currentRaw, -1) {
  327. // Extract secret from match
  328. secret := strings.Trim(currentRaw[matchIndex[0]:matchIndex[1]], "\n")
  329. // For any meta data from decoding
  330. var metaTags []string
  331. currentLine := ""
  332. // Check if the decoded portions of the segment overlap with the match
  333. // to see if its potentially a new match
  334. if len(encodedSegments) > 0 {
  335. if segment := segmentWithDecodedOverlap(encodedSegments, matchIndex[0], matchIndex[1]); segment != nil {
  336. matchIndex = segment.adjustMatchIndex(matchIndex)
  337. metaTags = append(metaTags, segment.tags()...)
  338. currentLine = currentRaw[segment.lineStartIndex(currentRaw):segment.lineEndIndex(currentRaw, matchIndex[1]-matchIndex[0])]
  339. } else {
  340. // This item has already been added to a finding
  341. continue
  342. }
  343. } else {
  344. // Fixes: https://github.com/gitleaks/gitleaks/issues/1352
  345. // removes the incorrectly following line that was detected by regex expression '\n'
  346. matchIndex[1] = matchIndex[0] + len(secret)
  347. }
  348. // determine location of match. Note that the location
  349. // in the finding will be the line/column numbers of the _match_
  350. // not the _secret_, which will be different if the secretGroup
  351. // value is set for this rule
  352. loc := location(fragment, matchIndex)
  353. if matchIndex[1] > loc.endLineIndex {
  354. loc.endLineIndex = matchIndex[1]
  355. }
  356. finding := report.Finding{
  357. RuleID: r.RuleID,
  358. Description: r.Description,
  359. StartLine: loc.startLine,
  360. EndLine: loc.endLine,
  361. StartColumn: loc.startColumn,
  362. EndColumn: loc.endColumn,
  363. Line: fragment.Raw[loc.startLineIndex:loc.endLineIndex],
  364. Match: secret,
  365. Secret: secret,
  366. File: fragment.FilePath,
  367. SymlinkFile: fragment.SymlinkFile,
  368. Tags: append(r.Tags, metaTags...),
  369. }
  370. if !d.IgnoreGitleaksAllow && strings.Contains(finding.Line, gitleaksAllowSignature) {
  371. logger.Trace().
  372. Str("finding", finding.Secret).
  373. Msg("skipping finding: 'gitleaks:allow' signature")
  374. continue
  375. }
  376. if currentLine == "" {
  377. currentLine = finding.Line
  378. }
  379. // Set the value of |secret|, if the pattern contains at least one capture group.
  380. // (The first element is the full match, hence we check >= 2.)
  381. groups := r.Regex.FindStringSubmatch(finding.Secret)
  382. if len(groups) >= 2 {
  383. if r.SecretGroup > 0 {
  384. if len(groups) <= r.SecretGroup {
  385. // Config validation should prevent this
  386. continue
  387. }
  388. finding.Secret = groups[r.SecretGroup]
  389. } else {
  390. // If |secretGroup| is not set, we will use the first suitable capture group.
  391. for _, s := range groups[1:] {
  392. if len(s) > 0 {
  393. finding.Secret = s
  394. break
  395. }
  396. }
  397. }
  398. }
  399. // check entropy
  400. entropy := shannonEntropy(finding.Secret)
  401. finding.Entropy = float32(entropy)
  402. if r.Entropy != 0.0 {
  403. // entropy is too low, skip this finding
  404. if entropy <= r.Entropy {
  405. logger.Trace().
  406. Str("finding", finding.Secret).
  407. Float32("entropy", finding.Entropy).
  408. Msg("skipping finding: low entropy")
  409. continue
  410. }
  411. }
  412. // check if the regexTarget is defined in the allowlist "regexes" entry
  413. // or if the secret is in the list of stopwords
  414. globalAllowlistTarget := finding.Secret
  415. switch d.Config.Allowlist.RegexTarget {
  416. case "match":
  417. globalAllowlistTarget = finding.Match
  418. case "line":
  419. globalAllowlistTarget = currentLine
  420. }
  421. if d.Config.Allowlist.RegexAllowed(globalAllowlistTarget) {
  422. logger.Trace().
  423. Str("finding", globalAllowlistTarget).
  424. Msg("skipping finding: global allowlist regex")
  425. continue
  426. } else if ok, word := d.Config.Allowlist.ContainsStopWord(finding.Secret); ok {
  427. logger.Trace().
  428. Str("finding", finding.Secret).
  429. Str("allowed-stopword", word).
  430. Msg("skipping finding: global allowlist stopword")
  431. continue
  432. }
  433. // check if the result matches any of the rule allowlists.
  434. for _, a := range r.Allowlists {
  435. allowlistTarget := finding.Secret
  436. switch a.RegexTarget {
  437. case "match":
  438. allowlistTarget = finding.Match
  439. case "line":
  440. allowlistTarget = currentLine
  441. }
  442. var (
  443. isAllowed bool
  444. commitAllowed bool
  445. commit string
  446. pathAllowed bool
  447. regexAllowed = a.RegexAllowed(allowlistTarget)
  448. containsStopword, word = a.ContainsStopWord(finding.Secret)
  449. )
  450. // check if the secret is in the list of stopwords
  451. if a.MatchCondition == config.AllowlistMatchAnd {
  452. // Determine applicable checks.
  453. var allowlistChecks []bool
  454. if len(a.Commits) > 0 {
  455. commitAllowed, commit = a.CommitAllowed(fragment.CommitSHA)
  456. allowlistChecks = append(allowlistChecks, commitAllowed)
  457. }
  458. if len(a.Paths) > 0 {
  459. pathAllowed = a.PathAllowed(fragment.FilePath) || (fragment.WindowsFilePath != "" && a.PathAllowed(fragment.WindowsFilePath))
  460. allowlistChecks = append(allowlistChecks, pathAllowed)
  461. }
  462. if len(a.Regexes) > 0 {
  463. allowlistChecks = append(allowlistChecks, regexAllowed)
  464. }
  465. if len(a.StopWords) > 0 {
  466. allowlistChecks = append(allowlistChecks, containsStopword)
  467. }
  468. // Check if allowed.
  469. isAllowed = allTrue(allowlistChecks)
  470. } else {
  471. isAllowed = regexAllowed || containsStopword
  472. }
  473. if isAllowed {
  474. event := logger.Trace().
  475. Str("finding", finding.Secret).
  476. Str("condition", a.MatchCondition.String())
  477. if commitAllowed {
  478. event.Str("allowed-commit", commit)
  479. }
  480. if pathAllowed {
  481. event.Bool("allowed-path", pathAllowed)
  482. }
  483. if regexAllowed {
  484. event.Bool("allowed-regex", regexAllowed)
  485. }
  486. if containsStopword {
  487. event.Str("allowed-stopword", word)
  488. }
  489. event.Msg("skipping finding: rule allowlist")
  490. continue MatchLoop
  491. }
  492. }
  493. findings = append(findings, finding)
  494. }
  495. return findings
  496. }
  497. func allTrue(bools []bool) bool {
  498. allMatch := true
  499. for _, check := range bools {
  500. if !check {
  501. allMatch = false
  502. break
  503. }
  504. }
  505. return allMatch
  506. }
  507. // AddFinding synchronously adds a finding to the findings slice
  508. func (d *Detector) AddFinding(finding report.Finding) {
  509. globalFingerprint := fmt.Sprintf("%s:%s:%d", finding.File, finding.RuleID, finding.StartLine)
  510. if finding.Commit != "" {
  511. finding.Fingerprint = fmt.Sprintf("%s:%s:%s:%d", finding.Commit, finding.File, finding.RuleID, finding.StartLine)
  512. } else {
  513. finding.Fingerprint = globalFingerprint
  514. }
  515. // check if we should ignore this finding
  516. logger := logging.With().Str("finding", finding.Secret).Logger()
  517. if _, ok := d.gitleaksIgnore[globalFingerprint]; ok {
  518. logger.Debug().
  519. Str("fingerprint", globalFingerprint).
  520. Msg("skipping finding: global fingerprint")
  521. return
  522. } else if finding.Commit != "" {
  523. // Awkward nested if because I'm not sure how to chain these two conditions.
  524. if _, ok := d.gitleaksIgnore[finding.Fingerprint]; ok {
  525. logger.Debug().
  526. Str("fingerprint", finding.Fingerprint).
  527. Msgf("skipping finding: fingerprint")
  528. return
  529. }
  530. }
  531. if d.baseline != nil && !IsNew(finding, d.baseline) {
  532. logger.Debug().
  533. Str("fingerprint", finding.Fingerprint).
  534. Msgf("skipping finding: baseline")
  535. return
  536. }
  537. d.findingMutex.Lock()
  538. d.findings = append(d.findings, finding)
  539. if d.Verbose {
  540. printFinding(finding, d.NoColor)
  541. }
  542. d.findingMutex.Unlock()
  543. }
  544. // Findings returns the findings added to the detector
  545. func (d *Detector) Findings() []report.Finding {
  546. return d.findings
  547. }
  548. // AddCommit synchronously adds a commit to the commit slice
  549. func (d *Detector) addCommit(commit string) {
  550. d.commitMap[commit] = true
  551. }