4
0

detect.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584
  1. package detect
  2. import (
  3. "bufio"
  4. "context"
  5. "fmt"
  6. "os"
  7. "strings"
  8. "sync"
  9. "sync/atomic"
  10. "github.com/zricethezav/gitleaks/v8/config"
  11. "github.com/zricethezav/gitleaks/v8/logging"
  12. "github.com/zricethezav/gitleaks/v8/regexp"
  13. "github.com/zricethezav/gitleaks/v8/report"
  14. ahocorasick "github.com/BobuSumisu/aho-corasick"
  15. "github.com/fatih/semgroup"
  16. "github.com/rs/zerolog"
  17. "github.com/spf13/viper"
  18. "golang.org/x/exp/maps"
  19. )
  20. const (
  21. gitleaksAllowSignature = "gitleaks:allow"
  22. chunkSize = 100 * 1_000 // 100kb
  23. )
  24. var newLineRegexp = regexp.MustCompile("\n")
  25. // Detector is the main detector struct
  26. type Detector struct {
  27. // Config is the configuration for the detector
  28. Config config.Config
  29. // Redact is a flag to redact findings. This is exported
  30. // so users using gitleaks as a library can set this flag
  31. // without calling `detector.Start(cmd *cobra.Command)`
  32. Redact uint
  33. // verbose is a flag to print findings
  34. Verbose bool
  35. // MaxDecodeDepths limits how many recursive decoding passes are allowed
  36. MaxDecodeDepth int
  37. // files larger than this will be skipped
  38. MaxTargetMegaBytes int
  39. // followSymlinks is a flag to enable scanning symlink files
  40. FollowSymlinks bool
  41. // NoColor is a flag to disable color output
  42. NoColor bool
  43. // IgnoreGitleaksAllow is a flag to ignore gitleaks:allow comments.
  44. IgnoreGitleaksAllow bool
  45. // commitMap is used to keep track of commits that have been scanned.
  46. // This is only used for logging purposes and git scans.
  47. commitMap map[string]bool
  48. // findingMutex is to prevent concurrent access to the
  49. // findings slice when adding findings.
  50. findingMutex *sync.Mutex
  51. // findings is a slice of report.Findings. This is the result
  52. // of the detector's scan which can then be used to generate a
  53. // report.
  54. findings []report.Finding
  55. // prefilter is a ahocorasick struct used for doing efficient string
  56. // matching given a set of words (keywords from the rules in the config)
  57. prefilter ahocorasick.Trie
  58. // a list of known findings that should be ignored
  59. baseline []report.Finding
  60. // path to baseline
  61. baselinePath string
  62. // gitleaksIgnore
  63. gitleaksIgnore map[string]bool
  64. // Sema (https://github.com/fatih/semgroup) controls the concurrency
  65. Sema *semgroup.Group
  66. // report-related settings.
  67. ReportPath string
  68. Reporter report.Reporter
  69. TotalBytes atomic.Uint64
  70. }
  71. // Fragment contains the data to be scanned
  72. type Fragment struct {
  73. // Raw is the raw content of the fragment
  74. Raw string
  75. Bytes []byte
  76. // FilePath is the path to the file if applicable
  77. FilePath string
  78. SymlinkFile string
  79. // CommitSHA is the SHA of the commit if applicable
  80. CommitSHA string
  81. // newlineIndices is a list of indices of newlines in the raw content.
  82. // This is used to calculate the line location of a finding
  83. newlineIndices [][]int
  84. }
  85. // NewDetector creates a new detector with the given config
  86. func NewDetector(cfg config.Config) *Detector {
  87. return &Detector{
  88. commitMap: make(map[string]bool),
  89. gitleaksIgnore: make(map[string]bool),
  90. findingMutex: &sync.Mutex{},
  91. findings: make([]report.Finding, 0),
  92. Config: cfg,
  93. prefilter: *ahocorasick.NewTrieBuilder().AddStrings(maps.Keys(cfg.Keywords)).Build(),
  94. Sema: semgroup.NewGroup(context.Background(), 40),
  95. }
  96. }
  97. // NewDetectorDefaultConfig creates a new detector with the default config
  98. func NewDetectorDefaultConfig() (*Detector, error) {
  99. viper.SetConfigType("toml")
  100. err := viper.ReadConfig(strings.NewReader(config.DefaultConfig))
  101. if err != nil {
  102. return nil, err
  103. }
  104. var vc config.ViperConfig
  105. err = viper.Unmarshal(&vc)
  106. if err != nil {
  107. return nil, err
  108. }
  109. cfg, err := vc.Translate()
  110. if err != nil {
  111. return nil, err
  112. }
  113. return NewDetector(cfg), nil
  114. }
  115. func (d *Detector) AddGitleaksIgnore(gitleaksIgnorePath string) error {
  116. logging.Debug().Msgf("found .gitleaksignore file: %s", gitleaksIgnorePath)
  117. file, err := os.Open(gitleaksIgnorePath)
  118. if err != nil {
  119. return err
  120. }
  121. // https://github.com/securego/gosec/issues/512
  122. defer func() {
  123. if err := file.Close(); err != nil {
  124. logging.Warn().Msgf("Error closing .gitleaksignore file: %s\n", err)
  125. }
  126. }()
  127. scanner := bufio.NewScanner(file)
  128. for scanner.Scan() {
  129. line := strings.TrimSpace(scanner.Text())
  130. // Skip lines that start with a comment
  131. if line != "" && !strings.HasPrefix(line, "#") {
  132. d.gitleaksIgnore[line] = true
  133. }
  134. }
  135. return nil
  136. }
  137. // DetectBytes scans the given bytes and returns a list of findings
  138. func (d *Detector) DetectBytes(content []byte) []report.Finding {
  139. return d.DetectString(string(content))
  140. }
  141. // DetectString scans the given string and returns a list of findings
  142. func (d *Detector) DetectString(content string) []report.Finding {
  143. return d.Detect(Fragment{
  144. Raw: content,
  145. })
  146. }
  147. // Detect scans the given fragment and returns a list of findings
  148. func (d *Detector) Detect(fragment Fragment) []report.Finding {
  149. if fragment.Bytes == nil {
  150. d.TotalBytes.Add(uint64(len(fragment.Raw)))
  151. }
  152. d.TotalBytes.Add(uint64(len(fragment.Bytes)))
  153. var findings []report.Finding
  154. // check if filepath is allowed
  155. if fragment.FilePath != "" && (d.Config.Allowlist.PathAllowed(fragment.FilePath) ||
  156. fragment.FilePath == d.Config.Path || (d.baselinePath != "" && fragment.FilePath == d.baselinePath)) {
  157. return findings
  158. }
  159. // add newline indices for location calculation in detectRule
  160. fragment.newlineIndices = newLineRegexp.FindAllStringIndex(fragment.Raw, -1)
  161. // setup variables to handle different decoding passes
  162. currentRaw := fragment.Raw
  163. encodedSegments := []EncodedSegment{}
  164. currentDecodeDepth := 0
  165. decoder := NewDecoder()
  166. for {
  167. // build keyword map for prefiltering rules
  168. keywords := make(map[string]bool)
  169. normalizedRaw := strings.ToLower(currentRaw)
  170. matches := d.prefilter.MatchString(normalizedRaw)
  171. for _, m := range matches {
  172. keywords[normalizedRaw[m.Pos():int(m.Pos())+len(m.Match())]] = true
  173. }
  174. for _, rule := range d.Config.Rules {
  175. if len(rule.Keywords) == 0 {
  176. // if no keywords are associated with the rule always scan the
  177. // fragment using the rule
  178. findings = append(findings, d.detectRule(fragment, currentRaw, rule, encodedSegments)...)
  179. continue
  180. }
  181. // check if keywords are in the fragment
  182. for _, k := range rule.Keywords {
  183. if _, ok := keywords[strings.ToLower(k)]; ok {
  184. findings = append(findings, d.detectRule(fragment, currentRaw, rule, encodedSegments)...)
  185. break
  186. }
  187. }
  188. }
  189. // increment the depth by 1 as we start our decoding pass
  190. currentDecodeDepth++
  191. // stop the loop if we've hit our max decoding depth
  192. if currentDecodeDepth > d.MaxDecodeDepth {
  193. break
  194. }
  195. // decode the currentRaw for the next pass
  196. currentRaw, encodedSegments = decoder.decode(currentRaw, encodedSegments)
  197. // stop the loop when there's nothing else to decode
  198. if len(encodedSegments) == 0 {
  199. break
  200. }
  201. }
  202. return filter(findings, d.Redact)
  203. }
  204. // detectRule scans the given fragment for the given rule and returns a list of findings
  205. func (d *Detector) detectRule(fragment Fragment, currentRaw string, r config.Rule, encodedSegments []EncodedSegment) []report.Finding {
  206. var (
  207. findings []report.Finding
  208. logger = func() zerolog.Logger {
  209. l := logging.With().Str("rule-id", r.RuleID).Str("path", fragment.FilePath)
  210. if fragment.CommitSHA != "" {
  211. l = l.Str("commit", fragment.CommitSHA)
  212. }
  213. return l.Logger()
  214. }()
  215. )
  216. // check if filepath or commit is allowed for this rule
  217. for _, a := range r.Allowlists {
  218. var (
  219. isAllowed bool
  220. commitAllowed = a.CommitAllowed(fragment.CommitSHA)
  221. pathAllowed = a.PathAllowed(fragment.FilePath)
  222. )
  223. if a.MatchCondition == config.AllowlistMatchAnd {
  224. // Determine applicable checks.
  225. var allowlistChecks []bool
  226. if len(a.Commits) > 0 {
  227. allowlistChecks = append(allowlistChecks, commitAllowed)
  228. }
  229. if len(a.Paths) > 0 {
  230. allowlistChecks = append(allowlistChecks, pathAllowed)
  231. }
  232. // These will be checked later.
  233. if len(a.Regexes) > 0 {
  234. allowlistChecks = append(allowlistChecks, false)
  235. }
  236. if len(a.StopWords) > 0 {
  237. allowlistChecks = append(allowlistChecks, false)
  238. }
  239. // Check if allowed.
  240. isAllowed = allTrue(allowlistChecks)
  241. } else {
  242. isAllowed = commitAllowed || pathAllowed
  243. }
  244. if isAllowed {
  245. event := logger.Trace().Str("condition", a.MatchCondition.String())
  246. if commitAllowed {
  247. event.Bool("allowed-commit", commitAllowed)
  248. }
  249. if pathAllowed {
  250. event.Bool("allowed-path", pathAllowed)
  251. }
  252. event.Msg("skipping file: rule allowlist")
  253. return findings
  254. }
  255. }
  256. if r.Path != nil && r.Regex == nil && len(encodedSegments) == 0 {
  257. // Path _only_ rule
  258. if r.Path.MatchString(fragment.FilePath) {
  259. finding := report.Finding{
  260. Description: r.Description,
  261. File: fragment.FilePath,
  262. SymlinkFile: fragment.SymlinkFile,
  263. RuleID: r.RuleID,
  264. Match: fmt.Sprintf("file detected: %s", fragment.FilePath),
  265. Tags: r.Tags,
  266. }
  267. return append(findings, finding)
  268. }
  269. } else if r.Path != nil {
  270. // if path is set _and_ a regex is set, then we need to check both
  271. // so if the path does not match, then we should return early and not
  272. // consider the regex
  273. if !r.Path.MatchString(fragment.FilePath) {
  274. return findings
  275. }
  276. }
  277. // if path only rule, skip content checks
  278. if r.Regex == nil {
  279. return findings
  280. }
  281. // if flag configure and raw data size bigger then the flag
  282. if d.MaxTargetMegaBytes > 0 {
  283. rawLength := len(currentRaw) / 1000000
  284. if rawLength > d.MaxTargetMegaBytes {
  285. logger.Debug().
  286. Int("size", rawLength).
  287. Int("max-size", d.MaxTargetMegaBytes).
  288. Msg("skipping fragment: size")
  289. return findings
  290. }
  291. }
  292. // use currentRaw instead of fragment.Raw since this represents the current
  293. // decoding pass on the text
  294. MatchLoop:
  295. for _, matchIndex := range r.Regex.FindAllStringIndex(currentRaw, -1) {
  296. // Extract secret from match
  297. secret := strings.Trim(currentRaw[matchIndex[0]:matchIndex[1]], "\n")
  298. // For any meta data from decoding
  299. var metaTags []string
  300. // Check if the decoded portions of the segment overlap with the match
  301. // to see if its potentially a new match
  302. if len(encodedSegments) > 0 {
  303. if segment := segmentWithDecodedOverlap(encodedSegments, matchIndex[0], matchIndex[1]); segment != nil {
  304. matchIndex = segment.adjustMatchIndex(matchIndex)
  305. metaTags = append(metaTags, segment.tags()...)
  306. } else {
  307. // This item has already been added to a finding
  308. continue
  309. }
  310. } else {
  311. // Fixes: https://github.com/gitleaks/gitleaks/issues/1352
  312. // removes the incorrectly following line that was detected by regex expression '\n'
  313. matchIndex[1] = matchIndex[0] + len(secret)
  314. }
  315. // determine location of match. Note that the location
  316. // in the finding will be the line/column numbers of the _match_
  317. // not the _secret_, which will be different if the secretGroup
  318. // value is set for this rule
  319. loc := location(fragment, matchIndex)
  320. if matchIndex[1] > loc.endLineIndex {
  321. loc.endLineIndex = matchIndex[1]
  322. }
  323. finding := report.Finding{
  324. Description: r.Description,
  325. File: fragment.FilePath,
  326. SymlinkFile: fragment.SymlinkFile,
  327. RuleID: r.RuleID,
  328. StartLine: loc.startLine,
  329. EndLine: loc.endLine,
  330. StartColumn: loc.startColumn,
  331. EndColumn: loc.endColumn,
  332. Secret: secret,
  333. Match: secret,
  334. Tags: append(r.Tags, metaTags...),
  335. Line: fragment.Raw[loc.startLineIndex:loc.endLineIndex],
  336. }
  337. if !d.IgnoreGitleaksAllow &&
  338. strings.Contains(fragment.Raw[loc.startLineIndex:loc.endLineIndex], gitleaksAllowSignature) {
  339. logger.Trace().
  340. Str("finding", finding.Secret).
  341. Msg("skipping finding: 'gitleaks:allow' signature")
  342. continue
  343. }
  344. // Set the value of |secret|, if the pattern contains at least one capture group.
  345. // (The first element is the full match, hence we check >= 2.)
  346. groups := r.Regex.FindStringSubmatch(finding.Secret)
  347. if len(groups) >= 2 {
  348. if r.SecretGroup > 0 {
  349. if len(groups) <= r.SecretGroup {
  350. // Config validation should prevent this
  351. continue
  352. }
  353. finding.Secret = groups[r.SecretGroup]
  354. } else {
  355. // If |secretGroup| is not set, we will use the first suitable capture group.
  356. for _, s := range groups[1:] {
  357. if len(s) > 0 {
  358. finding.Secret = s
  359. break
  360. }
  361. }
  362. }
  363. }
  364. // check entropy
  365. entropy := shannonEntropy(finding.Secret)
  366. finding.Entropy = float32(entropy)
  367. if r.Entropy != 0.0 {
  368. // entropy is too low, skip this finding
  369. if entropy <= r.Entropy {
  370. logger.Trace().
  371. Str("finding", finding.Secret).
  372. Float32("entropy", finding.Entropy).
  373. Msg("skipping finding: low entropy")
  374. continue
  375. }
  376. }
  377. // check if the regexTarget is defined in the allowlist "regexes" entry
  378. // or if the secret is in the list of stopwords
  379. globalAllowlistTarget := finding.Secret
  380. switch d.Config.Allowlist.RegexTarget {
  381. case "match":
  382. globalAllowlistTarget = finding.Match
  383. case "line":
  384. globalAllowlistTarget = finding.Line
  385. }
  386. if d.Config.Allowlist.RegexAllowed(globalAllowlistTarget) {
  387. logger.Trace().
  388. Str("finding", globalAllowlistTarget).
  389. Msg("skipping finding: global allowlist regex")
  390. continue
  391. } else if d.Config.Allowlist.ContainsStopWord(finding.Secret) {
  392. logger.Trace().
  393. Str("finding", finding.Secret).
  394. Msg("skipping finding: global allowlist stopword")
  395. continue
  396. }
  397. // check if the result matches any of the rule allowlists.
  398. for _, a := range r.Allowlists {
  399. allowlistTarget := finding.Secret
  400. switch a.RegexTarget {
  401. case "match":
  402. allowlistTarget = finding.Match
  403. case "line":
  404. allowlistTarget = finding.Line
  405. }
  406. var (
  407. isAllowed bool
  408. commitAllowed bool
  409. pathAllowed bool
  410. regexAllowed = a.RegexAllowed(allowlistTarget)
  411. containsStopword = a.ContainsStopWord(finding.Secret)
  412. )
  413. // check if the secret is in the list of stopwords
  414. if a.MatchCondition == config.AllowlistMatchAnd {
  415. // Determine applicable checks.
  416. var allowlistChecks []bool
  417. if len(a.Commits) > 0 {
  418. commitAllowed = a.CommitAllowed(fragment.CommitSHA)
  419. allowlistChecks = append(allowlistChecks, commitAllowed)
  420. }
  421. if len(a.Paths) > 0 {
  422. pathAllowed = a.PathAllowed(fragment.FilePath)
  423. allowlistChecks = append(allowlistChecks, pathAllowed)
  424. }
  425. if len(a.Regexes) > 0 {
  426. allowlistChecks = append(allowlistChecks, regexAllowed)
  427. }
  428. if len(a.StopWords) > 0 {
  429. allowlistChecks = append(allowlistChecks, containsStopword)
  430. }
  431. // Check if allowed.
  432. isAllowed = allTrue(allowlistChecks)
  433. } else {
  434. isAllowed = regexAllowed || containsStopword
  435. }
  436. if isAllowed {
  437. event := logger.Trace().
  438. Str("finding", finding.Secret).
  439. Str("condition", a.MatchCondition.String())
  440. if commitAllowed {
  441. event.Bool("allowed-commit", commitAllowed)
  442. }
  443. if pathAllowed {
  444. event.Bool("allowed-path", pathAllowed)
  445. }
  446. if regexAllowed {
  447. event.Bool("allowed-regex", regexAllowed)
  448. }
  449. if containsStopword {
  450. event.Bool("allowed-stopword", containsStopword)
  451. }
  452. event.Msg("skipping finding: rule allowlist")
  453. continue MatchLoop
  454. }
  455. }
  456. findings = append(findings, finding)
  457. }
  458. return findings
  459. }
  460. func allTrue(bools []bool) bool {
  461. allMatch := true
  462. for _, check := range bools {
  463. if !check {
  464. allMatch = false
  465. break
  466. }
  467. }
  468. return allMatch
  469. }
  470. // addFinding synchronously adds a finding to the findings slice
  471. func (d *Detector) addFinding(finding report.Finding) {
  472. globalFingerprint := fmt.Sprintf("%s:%s:%d", finding.File, finding.RuleID, finding.StartLine)
  473. if finding.Commit != "" {
  474. finding.Fingerprint = fmt.Sprintf("%s:%s:%s:%d", finding.Commit, finding.File, finding.RuleID, finding.StartLine)
  475. } else {
  476. finding.Fingerprint = globalFingerprint
  477. }
  478. // check if we should ignore this finding
  479. logger := logging.With().Str("finding", finding.Secret).Logger()
  480. if _, ok := d.gitleaksIgnore[globalFingerprint]; ok {
  481. logger.Debug().
  482. Str("fingerprint", globalFingerprint).
  483. Msg("skipping finding: global fingerprint")
  484. return
  485. } else if finding.Commit != "" {
  486. // Awkward nested if because I'm not sure how to chain these two conditions.
  487. if _, ok := d.gitleaksIgnore[finding.Fingerprint]; ok {
  488. logger.Debug().
  489. Str("fingerprint", finding.Fingerprint).
  490. Msgf("skipping finding: fingerprint")
  491. return
  492. }
  493. }
  494. if d.baseline != nil && !IsNew(finding, d.baseline) {
  495. logger.Debug().
  496. Str("fingerprint", finding.Fingerprint).
  497. Msgf("skipping finding: baseline")
  498. return
  499. }
  500. d.findingMutex.Lock()
  501. d.findings = append(d.findings, finding)
  502. if d.Verbose {
  503. printFinding(finding, d.NoColor)
  504. }
  505. d.findingMutex.Unlock()
  506. }
  507. // addCommit synchronously adds a commit to the commit slice
  508. func (d *Detector) addCommit(commit string) {
  509. d.commitMap[commit] = true
  510. }