detect.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566
  1. package detect
  2. import (
  3. "bufio"
  4. "context"
  5. "fmt"
  6. "os"
  7. "regexp"
  8. "strings"
  9. "sync"
  10. "github.com/zricethezav/gitleaks/v8/config"
  11. "github.com/zricethezav/gitleaks/v8/report"
  12. ahocorasick "github.com/BobuSumisu/aho-corasick"
  13. "github.com/fatih/semgroup"
  14. "github.com/rs/zerolog"
  15. "github.com/rs/zerolog/log"
  16. "github.com/spf13/viper"
  17. "golang.org/x/exp/maps"
  18. )
const (
	// gitleaksAllowSignature is the marker searched for in the line(s)
	// surrounding a match; when it is present and IgnoreGitleaksAllow is
	// not set, the finding is skipped.
	gitleaksAllowSignature = "gitleaks:allow"
	// chunkSize is 100 * 1_000 bytes.
	// NOTE(review): not referenced in the visible part of this file --
	// presumably used by the file/reader scanning code; confirm.
	chunkSize = 100 * 1_000 // 100kb
)

// newLineRegexp matches a single newline character. Detect uses it to record
// the position of every newline in a fragment.
var newLineRegexp = regexp.MustCompile("\n")
// Detector is the main detector struct. It holds the scan configuration,
// the user-tunable flags, and the state accumulated while scanning.
type Detector struct {
	// Config is the configuration for the detector
	Config config.Config

	// Redact is a flag to redact findings. This is exported
	// so users using gitleaks as a library can set this flag
	// without calling `detector.Start(cmd *cobra.Command)`.
	// NOTE(review): typed as uint and passed straight to filter() --
	// presumably a redaction amount rather than a plain on/off switch; confirm.
	Redact uint

	// Verbose is a flag to print findings as they are added
	Verbose bool

	// MaxDecodeDepth limits how many recursive decoding passes are allowed
	MaxDecodeDepth int

	// MaxTargetMegaBytes is a size limit in megabytes; when it is greater
	// than zero, content larger than this will be skipped
	MaxTargetMegaBytes int

	// FollowSymlinks is a flag to enable scanning symlink files
	FollowSymlinks bool

	// NoColor is a flag to disable color output
	NoColor bool

	// IgnoreGitleaksAllow is a flag to ignore gitleaks:allow comments.
	IgnoreGitleaksAllow bool

	// commitMap is used to keep track of commits that have been scanned.
	// This is only used for logging purposes and git scans.
	commitMap map[string]bool

	// findingMutex is to prevent concurrent access to the
	// findings slice when adding findings.
	findingMutex *sync.Mutex

	// findings is a slice of report.Findings. This is the result
	// of the detector's scan which can then be used to generate a
	// report.
	findings []report.Finding

	// prefilter is an ahocorasick struct used for doing efficient string
	// matching given a set of words (keywords from the rules in the config)
	prefilter ahocorasick.Trie

	// baseline is a list of known findings that should be ignored
	baseline []report.Finding

	// baselinePath is the path to the baseline file; fragments read from
	// this path are not scanned
	baselinePath string

	// gitleaksIgnore is the set of finding fingerprints loaded from a
	// .gitleaksignore file; findings whose fingerprint is present are dropped
	gitleaksIgnore map[string]bool

	// Sema (https://github.com/fatih/semgroup) controls the concurrency
	Sema *semgroup.Group

	// report-related settings.
	ReportPath string
	Reporter   report.Reporter
}
// Fragment contains the data to be scanned along with the metadata
// (path, commit) that is copied onto any finding produced from it.
type Fragment struct {
	// Raw is the raw content of the fragment
	Raw string

	// FilePath is the path to the file if applicable
	FilePath string
	// SymlinkFile is copied verbatim into report.Finding.SymlinkFile.
	// NOTE(review): presumably the path of the symlink that led to
	// FilePath -- confirm against the code that builds fragments.
	SymlinkFile string

	// CommitSHA is the SHA of the commit if applicable
	CommitSHA string

	// newlineIndices is a list of [start, end] index pairs, one per newline
	// in the raw content (as returned by FindAllStringIndex).
	// This is used to calculate the line location of a finding
	newlineIndices [][]int
}
  82. // NewDetector creates a new detector with the given config
  83. func NewDetector(cfg config.Config) *Detector {
  84. return &Detector{
  85. commitMap: make(map[string]bool),
  86. gitleaksIgnore: make(map[string]bool),
  87. findingMutex: &sync.Mutex{},
  88. findings: make([]report.Finding, 0),
  89. Config: cfg,
  90. prefilter: *ahocorasick.NewTrieBuilder().AddStrings(maps.Keys(cfg.Keywords)).Build(),
  91. Sema: semgroup.NewGroup(context.Background(), 40),
  92. }
  93. }
  94. // NewDetectorDefaultConfig creates a new detector with the default config
  95. func NewDetectorDefaultConfig() (*Detector, error) {
  96. viper.SetConfigType("toml")
  97. err := viper.ReadConfig(strings.NewReader(config.DefaultConfig))
  98. if err != nil {
  99. return nil, err
  100. }
  101. var vc config.ViperConfig
  102. err = viper.Unmarshal(&vc)
  103. if err != nil {
  104. return nil, err
  105. }
  106. cfg, err := vc.Translate()
  107. if err != nil {
  108. return nil, err
  109. }
  110. return NewDetector(cfg), nil
  111. }
  112. func (d *Detector) AddGitleaksIgnore(gitleaksIgnorePath string) error {
  113. log.Debug().Msgf("found .gitleaksignore file: %s", gitleaksIgnorePath)
  114. file, err := os.Open(gitleaksIgnorePath)
  115. if err != nil {
  116. return err
  117. }
  118. // https://github.com/securego/gosec/issues/512
  119. defer func() {
  120. if err := file.Close(); err != nil {
  121. log.Warn().Msgf("Error closing .gitleaksignore file: %s\n", err)
  122. }
  123. }()
  124. scanner := bufio.NewScanner(file)
  125. for scanner.Scan() {
  126. line := strings.TrimSpace(scanner.Text())
  127. // Skip lines that start with a comment
  128. if line != "" && !strings.HasPrefix(line, "#") {
  129. d.gitleaksIgnore[line] = true
  130. }
  131. }
  132. return nil
  133. }
  134. // DetectBytes scans the given bytes and returns a list of findings
  135. func (d *Detector) DetectBytes(content []byte) []report.Finding {
  136. return d.DetectString(string(content))
  137. }
  138. // DetectString scans the given string and returns a list of findings
  139. func (d *Detector) DetectString(content string) []report.Finding {
  140. return d.Detect(Fragment{
  141. Raw: content,
  142. })
  143. }
  144. // Detect scans the given fragment and returns a list of findings
  145. func (d *Detector) Detect(fragment Fragment) []report.Finding {
  146. var findings []report.Finding
  147. // check if filepath is allowed
  148. if fragment.FilePath != "" && (d.Config.Allowlist.PathAllowed(fragment.FilePath) ||
  149. fragment.FilePath == d.Config.Path || (d.baselinePath != "" && fragment.FilePath == d.baselinePath)) {
  150. return findings
  151. }
  152. // add newline indices for location calculation in detectRule
  153. fragment.newlineIndices = newLineRegexp.FindAllStringIndex(fragment.Raw, -1)
  154. // setup variables to handle different decoding passes
  155. currentRaw := fragment.Raw
  156. encodedSegments := []EncodedSegment{}
  157. currentDecodeDepth := 0
  158. decoder := NewDecoder()
  159. for {
  160. // build keyword map for prefiltering rules
  161. keywords := make(map[string]bool)
  162. normalizedRaw := strings.ToLower(currentRaw)
  163. matches := d.prefilter.MatchString(normalizedRaw)
  164. for _, m := range matches {
  165. keywords[normalizedRaw[m.Pos():int(m.Pos())+len(m.Match())]] = true
  166. }
  167. for _, rule := range d.Config.Rules {
  168. if len(rule.Keywords) == 0 {
  169. // if no keywords are associated with the rule always scan the
  170. // fragment using the rule
  171. findings = append(findings, d.detectRule(fragment, currentRaw, rule, encodedSegments)...)
  172. continue
  173. }
  174. // check if keywords are in the fragment
  175. for _, k := range rule.Keywords {
  176. if _, ok := keywords[strings.ToLower(k)]; ok {
  177. findings = append(findings, d.detectRule(fragment, currentRaw, rule, encodedSegments)...)
  178. break
  179. }
  180. }
  181. }
  182. // increment the depth by 1 as we start our decoding pass
  183. currentDecodeDepth++
  184. // stop the loop if we've hit our max decoding depth
  185. if currentDecodeDepth > d.MaxDecodeDepth {
  186. break
  187. }
  188. // decode the currentRaw for the next pass
  189. currentRaw, encodedSegments = decoder.decode(currentRaw, encodedSegments)
  190. // stop the loop when there's nothing else to decode
  191. if len(encodedSegments) == 0 {
  192. break
  193. }
  194. }
  195. return filter(findings, d.Redact)
  196. }
  197. // detectRule scans the given fragment for the given rule and returns a list of findings
  198. func (d *Detector) detectRule(fragment Fragment, currentRaw string, r config.Rule, encodedSegments []EncodedSegment) []report.Finding {
  199. var (
  200. findings []report.Finding
  201. logger = func() zerolog.Logger {
  202. l := log.With().Str("rule-id", r.RuleID)
  203. if fragment.CommitSHA != "" {
  204. l = l.Str("commit", fragment.CommitSHA)
  205. }
  206. l = l.Str("path", fragment.FilePath)
  207. return l.Logger()
  208. }()
  209. )
  210. // check if filepath or commit is allowed for this rule
  211. for _, a := range r.Allowlists {
  212. var (
  213. isAllowed bool
  214. commitAllowed = a.CommitAllowed(fragment.CommitSHA)
  215. pathAllowed = a.PathAllowed(fragment.FilePath)
  216. )
  217. if a.MatchCondition == config.AllowlistMatchAnd {
  218. // Determine applicable checks.
  219. var allowlistChecks []bool
  220. if len(a.Commits) > 0 {
  221. allowlistChecks = append(allowlistChecks, commitAllowed)
  222. }
  223. if len(a.Paths) > 0 {
  224. allowlistChecks = append(allowlistChecks, pathAllowed)
  225. }
  226. // These will be checked later.
  227. if len(a.Regexes) > 0 {
  228. allowlistChecks = append(allowlistChecks, false)
  229. }
  230. if len(a.StopWords) > 0 {
  231. allowlistChecks = append(allowlistChecks, false)
  232. }
  233. // Check if allowed.
  234. isAllowed = allTrue(allowlistChecks)
  235. } else {
  236. isAllowed = commitAllowed || pathAllowed
  237. }
  238. if isAllowed {
  239. logger.Trace().
  240. Str("condition", a.MatchCondition.String()).
  241. Bool("commit-allowed", commitAllowed).
  242. Bool("path-allowed", commitAllowed).
  243. Msg("Skipping fragment due to rule allowlist")
  244. return findings
  245. }
  246. }
  247. if r.Path != nil && r.Regex == nil && len(encodedSegments) == 0 {
  248. // Path _only_ rule
  249. if r.Path.MatchString(fragment.FilePath) {
  250. finding := report.Finding{
  251. Description: r.Description,
  252. File: fragment.FilePath,
  253. SymlinkFile: fragment.SymlinkFile,
  254. RuleID: r.RuleID,
  255. Match: fmt.Sprintf("file detected: %s", fragment.FilePath),
  256. Tags: r.Tags,
  257. }
  258. return append(findings, finding)
  259. }
  260. } else if r.Path != nil {
  261. // if path is set _and_ a regex is set, then we need to check both
  262. // so if the path does not match, then we should return early and not
  263. // consider the regex
  264. if !r.Path.MatchString(fragment.FilePath) {
  265. return findings
  266. }
  267. }
  268. // if path only rule, skip content checks
  269. if r.Regex == nil {
  270. return findings
  271. }
  272. // if flag configure and raw data size bigger then the flag
  273. if d.MaxTargetMegaBytes > 0 {
  274. rawLength := len(currentRaw) / 1000000
  275. if rawLength > d.MaxTargetMegaBytes {
  276. log.Debug().Msgf("skipping file: %s scan due to size: %d", fragment.FilePath, rawLength)
  277. return findings
  278. }
  279. }
  280. // use currentRaw instead of fragment.Raw since this represents the current
  281. // decoding pass on the text
  282. MatchLoop:
  283. for _, matchIndex := range r.Regex.FindAllStringIndex(currentRaw, -1) {
  284. // Extract secret from match
  285. secret := strings.Trim(currentRaw[matchIndex[0]:matchIndex[1]], "\n")
  286. // For any meta data from decoding
  287. var metaTags []string
  288. // Check if the decoded portions of the segment overlap with the match
  289. // to see if its potentially a new match
  290. if len(encodedSegments) > 0 {
  291. if segment := segmentWithDecodedOverlap(encodedSegments, matchIndex[0], matchIndex[1]); segment != nil {
  292. matchIndex = segment.adjustMatchIndex(matchIndex)
  293. metaTags = append(metaTags, segment.tags()...)
  294. } else {
  295. // This item has already been added to a finding
  296. continue
  297. }
  298. } else {
  299. // Fixes: https://github.com/gitleaks/gitleaks/issues/1352
  300. // removes the incorrectly following line that was detected by regex expression '\n'
  301. matchIndex[1] = matchIndex[0] + len(secret)
  302. }
  303. // determine location of match. Note that the location
  304. // in the finding will be the line/column numbers of the _match_
  305. // not the _secret_, which will be different if the secretGroup
  306. // value is set for this rule
  307. loc := location(fragment, matchIndex)
  308. if matchIndex[1] > loc.endLineIndex {
  309. loc.endLineIndex = matchIndex[1]
  310. }
  311. finding := report.Finding{
  312. Description: r.Description,
  313. File: fragment.FilePath,
  314. SymlinkFile: fragment.SymlinkFile,
  315. RuleID: r.RuleID,
  316. StartLine: loc.startLine,
  317. EndLine: loc.endLine,
  318. StartColumn: loc.startColumn,
  319. EndColumn: loc.endColumn,
  320. Secret: secret,
  321. Match: secret,
  322. Tags: append(r.Tags, metaTags...),
  323. Line: fragment.Raw[loc.startLineIndex:loc.endLineIndex],
  324. }
  325. if !d.IgnoreGitleaksAllow &&
  326. strings.Contains(fragment.Raw[loc.startLineIndex:loc.endLineIndex], gitleaksAllowSignature) {
  327. logger.Trace().
  328. Str("finding", finding.Secret).
  329. Msg("Skipping finding due to 'gitleaks:allow' signature")
  330. continue
  331. }
  332. // Set the value of |secret|, if the pattern contains at least one capture group.
  333. // (The first element is the full match, hence we check >= 2.)
  334. groups := r.Regex.FindStringSubmatch(finding.Secret)
  335. if len(groups) >= 2 {
  336. if r.SecretGroup > 0 {
  337. if len(groups) <= r.SecretGroup {
  338. // Config validation should prevent this
  339. continue
  340. }
  341. finding.Secret = groups[r.SecretGroup]
  342. } else {
  343. // If |secretGroup| is not set, we will use the first suitable capture group.
  344. if len(groups) == 2 {
  345. // Use the only group.
  346. finding.Secret = groups[1]
  347. } else {
  348. // Use the first non-empty group.
  349. for _, s := range groups[1:] {
  350. if len(s) > 0 {
  351. finding.Secret = s
  352. break
  353. }
  354. }
  355. }
  356. }
  357. }
  358. // check if the regexTarget is defined in the allowlist "regexes" entry
  359. // or if the secret is in the list of stopwords
  360. globalAllowlistTarget := finding.Secret
  361. switch d.Config.Allowlist.RegexTarget {
  362. case "match":
  363. globalAllowlistTarget = finding.Match
  364. case "line":
  365. globalAllowlistTarget = finding.Line
  366. }
  367. if d.Config.Allowlist.RegexAllowed(globalAllowlistTarget) {
  368. logger.Trace().
  369. Str("finding", globalAllowlistTarget).
  370. Msg("Skipping finding due to global allowlist regex")
  371. continue
  372. } else if d.Config.Allowlist.ContainsStopWord(finding.Secret) {
  373. logger.Trace().
  374. Str("finding", finding.Secret).
  375. Msg("Skipping finding due to global allowlist stopword")
  376. continue
  377. }
  378. // check if the result matches any of the rule allowlists.
  379. for _, a := range r.Allowlists {
  380. allowlistTarget := finding.Secret
  381. switch a.RegexTarget {
  382. case "match":
  383. allowlistTarget = finding.Match
  384. case "line":
  385. allowlistTarget = finding.Line
  386. }
  387. var (
  388. isAllowed bool
  389. regexAllowed = a.RegexAllowed(allowlistTarget)
  390. containsStopword = a.ContainsStopWord(finding.Secret)
  391. )
  392. // check if the secret is in the list of stopwords
  393. if a.MatchCondition == config.AllowlistMatchAnd {
  394. // Determine applicable checks.
  395. var allowlistChecks []bool
  396. if len(a.Commits) > 0 {
  397. allowlistChecks = append(allowlistChecks, a.CommitAllowed(fragment.CommitSHA))
  398. }
  399. if len(a.Paths) > 0 {
  400. allowlistChecks = append(allowlistChecks, a.PathAllowed(fragment.FilePath))
  401. }
  402. if len(a.Regexes) > 0 {
  403. allowlistChecks = append(allowlistChecks, regexAllowed)
  404. }
  405. if len(a.StopWords) > 0 {
  406. allowlistChecks = append(allowlistChecks, containsStopword)
  407. }
  408. // Check if allowed.
  409. isAllowed = allTrue(allowlistChecks)
  410. } else {
  411. isAllowed = regexAllowed || containsStopword
  412. }
  413. if isAllowed {
  414. logger.Trace().
  415. Str("finding", finding.Secret).
  416. Str("condition", a.MatchCondition.String()).
  417. Bool("regex-allowed", regexAllowed).
  418. Bool("contains-stopword", containsStopword).
  419. Msg("Skipping finding due to rule allowlist")
  420. continue MatchLoop
  421. }
  422. }
  423. // check entropy
  424. entropy := shannonEntropy(finding.Secret)
  425. finding.Entropy = float32(entropy)
  426. if r.Entropy != 0.0 {
  427. if entropy <= r.Entropy {
  428. // entropy is too low, skip this finding
  429. continue
  430. }
  431. // NOTE: this is a goofy hack to get around the fact there golang's regex engine
  432. // does not support positive lookaheads. Ideally we would want to add a
  433. // restriction on generic rules regex that requires the secret match group
  434. // contains both numbers and alphabetical characters, not just alphabetical characters.
  435. // What this bit of code does is check if the ruleid is prepended with "generic" and enforces the
  436. // secret contains both digits and alphabetical characters.
  437. // TODO: this should be replaced with stop words
  438. if strings.HasPrefix(r.RuleID, "generic") {
  439. if !containsDigit(finding.Secret) {
  440. continue
  441. }
  442. }
  443. }
  444. findings = append(findings, finding)
  445. }
  446. return findings
  447. }
  448. func allTrue(bools []bool) bool {
  449. allMatch := true
  450. for _, check := range bools {
  451. if !check {
  452. allMatch = false
  453. break
  454. }
  455. }
  456. return allMatch
  457. }
  458. // addFinding synchronously adds a finding to the findings slice
  459. func (d *Detector) addFinding(finding report.Finding) {
  460. globalFingerprint := fmt.Sprintf("%s:%s:%d", finding.File, finding.RuleID, finding.StartLine)
  461. if finding.Commit != "" {
  462. finding.Fingerprint = fmt.Sprintf("%s:%s:%s:%d", finding.Commit, finding.File, finding.RuleID, finding.StartLine)
  463. } else {
  464. finding.Fingerprint = globalFingerprint
  465. }
  466. // check if we should ignore this finding
  467. if _, ok := d.gitleaksIgnore[globalFingerprint]; ok {
  468. log.Debug().Msgf("ignoring finding with global Fingerprint %s",
  469. finding.Fingerprint)
  470. return
  471. } else if finding.Commit != "" {
  472. // Awkward nested if because I'm not sure how to chain these two conditions.
  473. if _, ok := d.gitleaksIgnore[finding.Fingerprint]; ok {
  474. log.Debug().Msgf("ignoring finding with Fingerprint %s",
  475. finding.Fingerprint)
  476. return
  477. }
  478. }
  479. if d.baseline != nil && !IsNew(finding, d.baseline) {
  480. log.Debug().Msgf("baseline duplicate -- ignoring finding with Fingerprint %s", finding.Fingerprint)
  481. return
  482. }
  483. d.findingMutex.Lock()
  484. d.findings = append(d.findings, finding)
  485. if d.Verbose {
  486. printFinding(finding, d.NoColor)
  487. }
  488. d.findingMutex.Unlock()
  489. }
// addCommit records commit in the detector's commitMap.
// NOTE(review): the map write is not guarded by a mutex here -- presumably
// callers invoke this from a single goroutine; confirm before calling it
// concurrently.
func (d *Detector) addCommit(commit string) {
	d.commitMap[commit] = true
}