4
0

detect.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562
  1. package detect
  2. import (
  3. "bufio"
  4. "context"
  5. "fmt"
  6. "os"
  7. "regexp"
  8. "strings"
  9. "sync"
  10. "github.com/zricethezav/gitleaks/v8/config"
  11. "github.com/zricethezav/gitleaks/v8/report"
  12. ahocorasick "github.com/BobuSumisu/aho-corasick"
  13. "github.com/fatih/semgroup"
  14. "github.com/rs/zerolog"
  15. "github.com/rs/zerolog/log"
  16. "github.com/spf13/viper"
  17. "golang.org/x/exp/maps"
  18. )
const (
	// gitleaksAllowSignature is the in-line comment marker that suppresses a
	// finding on the line that contains it (unless IgnoreGitleaksAllow is set).
	gitleaksAllowSignature = "gitleaks:allow"
	// chunkSize is the read-chunk size used when scanning content. 10kb.
	chunkSize = 10 * 1_000
)

// newLineRegexp matches newline characters; used to pre-compute newline
// indices for line/column location calculations.
var newLineRegexp = regexp.MustCompile("\n")
// Detector is the main detector struct
type Detector struct {
	// Config is the configuration for the detector
	Config config.Config

	// Redact is a flag to redact findings. This is exported
	// so users using gitleaks as a library can set this flag
	// without calling `detector.Start(cmd *cobra.Command)`
	Redact uint

	// Verbose is a flag to print findings as they are added.
	Verbose bool

	// MaxDecodeDepth limits how many recursive decoding passes are allowed
	MaxDecodeDepth int

	// MaxTargetMegaBytes: files larger than this will be skipped
	MaxTargetMegaBytes int

	// FollowSymlinks is a flag to enable scanning symlink files
	FollowSymlinks bool

	// NoColor is a flag to disable color output
	NoColor bool

	// IgnoreGitleaksAllow is a flag to ignore gitleaks:allow comments.
	IgnoreGitleaksAllow bool

	// commitMap is used to keep track of commits that have been scanned.
	// This is only used for logging purposes and git scans.
	commitMap map[string]bool

	// findingMutex is to prevent concurrent access to the
	// findings slice when adding findings.
	findingMutex *sync.Mutex

	// findings is a slice of report.Findings. This is the result
	// of the detector's scan which can then be used to generate a
	// report.
	findings []report.Finding

	// prefilter is an ahocorasick trie used for doing efficient string
	// matching given a set of words (keywords from the rules in the config)
	prefilter ahocorasick.Trie

	// baseline is a list of known findings that should be ignored
	baseline []report.Finding

	// baselinePath is the path to the baseline file; the file itself is
	// excluded from scanning.
	baselinePath string

	// gitleaksIgnore holds fingerprints loaded from a .gitleaksignore file;
	// findings whose fingerprint is present here are dropped.
	gitleaksIgnore map[string]bool

	// Sema (https://github.com/fatih/semgroup) controls the concurrency
	Sema *semgroup.Group
}
// Fragment contains the data to be scanned
type Fragment struct {
	// Raw is the raw content of the fragment
	Raw string

	// FilePath is the path to the file if applicable
	FilePath string

	// SymlinkFile is the path of the symlink that resolved to this file,
	// if the fragment was reached through a symlink.
	SymlinkFile string

	// CommitSHA is the SHA of the commit if applicable
	CommitSHA string

	// newlineIndices is a list of indices of newlines in the raw content.
	// This is used to calculate the line location of a finding
	newlineIndices [][]int
}
  79. // NewDetector creates a new detector with the given config
  80. func NewDetector(cfg config.Config) *Detector {
  81. return &Detector{
  82. commitMap: make(map[string]bool),
  83. gitleaksIgnore: make(map[string]bool),
  84. findingMutex: &sync.Mutex{},
  85. findings: make([]report.Finding, 0),
  86. Config: cfg,
  87. prefilter: *ahocorasick.NewTrieBuilder().AddStrings(maps.Keys(cfg.Keywords)).Build(),
  88. Sema: semgroup.NewGroup(context.Background(), 40),
  89. }
  90. }
  91. // NewDetectorDefaultConfig creates a new detector with the default config
  92. func NewDetectorDefaultConfig() (*Detector, error) {
  93. viper.SetConfigType("toml")
  94. err := viper.ReadConfig(strings.NewReader(config.DefaultConfig))
  95. if err != nil {
  96. return nil, err
  97. }
  98. var vc config.ViperConfig
  99. err = viper.Unmarshal(&vc)
  100. if err != nil {
  101. return nil, err
  102. }
  103. cfg, err := vc.Translate()
  104. if err != nil {
  105. return nil, err
  106. }
  107. return NewDetector(cfg), nil
  108. }
  109. func (d *Detector) AddGitleaksIgnore(gitleaksIgnorePath string) error {
  110. log.Debug().Msgf("found .gitleaksignore file: %s", gitleaksIgnorePath)
  111. file, err := os.Open(gitleaksIgnorePath)
  112. if err != nil {
  113. return err
  114. }
  115. // https://github.com/securego/gosec/issues/512
  116. defer func() {
  117. if err := file.Close(); err != nil {
  118. log.Warn().Msgf("Error closing .gitleaksignore file: %s\n", err)
  119. }
  120. }()
  121. scanner := bufio.NewScanner(file)
  122. for scanner.Scan() {
  123. line := strings.TrimSpace(scanner.Text())
  124. // Skip lines that start with a comment
  125. if line != "" && !strings.HasPrefix(line, "#") {
  126. d.gitleaksIgnore[line] = true
  127. }
  128. }
  129. return nil
  130. }
  131. // DetectBytes scans the given bytes and returns a list of findings
  132. func (d *Detector) DetectBytes(content []byte) []report.Finding {
  133. return d.DetectString(string(content))
  134. }
  135. // DetectString scans the given string and returns a list of findings
  136. func (d *Detector) DetectString(content string) []report.Finding {
  137. return d.Detect(Fragment{
  138. Raw: content,
  139. })
  140. }
// Detect scans the given fragment and returns a list of findings
func (d *Detector) Detect(fragment Fragment) []report.Finding {
	var findings []report.Finding

	// check if filepath is allowed: skip globally-allowlisted paths, the
	// config file itself, and the baseline file (to avoid self-detection)
	if fragment.FilePath != "" && (d.Config.Allowlist.PathAllowed(fragment.FilePath) ||
		fragment.FilePath == d.Config.Path || (d.baselinePath != "" && fragment.FilePath == d.baselinePath)) {
		return findings
	}

	// add newline indices for location calculation in detectRule
	fragment.newlineIndices = newLineRegexp.FindAllStringIndex(fragment.Raw, -1)

	// setup variables to handle different decoding passes: each pass may
	// decode encoded segments of the text and rescan the decoded form
	currentRaw := fragment.Raw
	encodedSegments := []EncodedSegment{}
	currentDecodeDepth := 0
	decoder := NewDecoder()

	for {
		// build keyword map for prefiltering rules; rebuilt every pass since
		// decoding can reveal new keywords in currentRaw
		keywords := make(map[string]bool)
		normalizedRaw := strings.ToLower(currentRaw)
		matches := d.prefilter.MatchString(normalizedRaw)
		for _, m := range matches {
			keywords[normalizedRaw[m.Pos():int(m.Pos())+len(m.Match())]] = true
		}

		for _, rule := range d.Config.Rules {
			if len(rule.Keywords) == 0 {
				// if no keywords are associated with the rule always scan the
				// fragment using the rule
				findings = append(findings, d.detectRule(fragment, currentRaw, rule, encodedSegments)...)
				continue
			}

			// check if keywords are in the fragment; run the rule on the
			// first keyword hit only (break) to avoid duplicate scans
			for _, k := range rule.Keywords {
				if _, ok := keywords[strings.ToLower(k)]; ok {
					findings = append(findings, d.detectRule(fragment, currentRaw, rule, encodedSegments)...)
					break
				}
			}
		}

		// increment the depth by 1 as we start our decoding pass
		currentDecodeDepth++

		// stop the loop if we've hit our max decoding depth
		if currentDecodeDepth > d.MaxDecodeDepth {
			break
		}

		// decode the currentRaw for the next pass
		currentRaw, encodedSegments = decoder.decode(currentRaw, encodedSegments)

		// stop the loop when there's nothing else to decode
		if len(encodedSegments) == 0 {
			break
		}
	}

	// apply redaction (controlled by d.Redact) before returning
	return filter(findings, d.Redact)
}
  194. // detectRule scans the given fragment for the given rule and returns a list of findings
  195. func (d *Detector) detectRule(fragment Fragment, currentRaw string, rule config.Rule, encodedSegments []EncodedSegment) []report.Finding {
  196. var (
  197. findings []report.Finding
  198. logger = func() zerolog.Logger {
  199. l := log.With().Str("rule-id", rule.RuleID)
  200. if fragment.CommitSHA != "" {
  201. l = l.Str("commit", fragment.CommitSHA)
  202. }
  203. l = l.Str("path", fragment.FilePath)
  204. return l.Logger()
  205. }()
  206. )
  207. // check if filepath or commit is allowed for this rule
  208. for _, a := range rule.Allowlists {
  209. var (
  210. isAllowed bool
  211. commitAllowed = a.CommitAllowed(fragment.CommitSHA)
  212. pathAllowed = a.PathAllowed(fragment.FilePath)
  213. )
  214. if a.MatchCondition == config.AllowlistMatchAnd {
  215. // Determine applicable checks.
  216. var allowlistChecks []bool
  217. if len(a.Commits) > 0 {
  218. allowlistChecks = append(allowlistChecks, commitAllowed)
  219. }
  220. if len(a.Paths) > 0 {
  221. allowlistChecks = append(allowlistChecks, pathAllowed)
  222. }
  223. // These will be checked later.
  224. if len(a.Regexes) > 0 {
  225. allowlistChecks = append(allowlistChecks, false)
  226. }
  227. if len(a.StopWords) > 0 {
  228. allowlistChecks = append(allowlistChecks, false)
  229. }
  230. // Check if allowed.
  231. isAllowed = allTrue(allowlistChecks)
  232. } else {
  233. isAllowed = commitAllowed || pathAllowed
  234. }
  235. if isAllowed {
  236. logger.Trace().
  237. Str("condition", a.MatchCondition.String()).
  238. Bool("commit-allowed", commitAllowed).
  239. Bool("path-allowed", commitAllowed).
  240. Msg("Skipping fragment due to rule allowlist")
  241. return findings
  242. }
  243. }
  244. if rule.Path != nil && rule.Regex == nil && len(encodedSegments) == 0 {
  245. // Path _only_ rule
  246. if rule.Path.MatchString(fragment.FilePath) {
  247. finding := report.Finding{
  248. Description: rule.Description,
  249. File: fragment.FilePath,
  250. SymlinkFile: fragment.SymlinkFile,
  251. RuleID: rule.RuleID,
  252. Match: fmt.Sprintf("file detected: %s", fragment.FilePath),
  253. Tags: rule.Tags,
  254. }
  255. return append(findings, finding)
  256. }
  257. } else if rule.Path != nil {
  258. // if path is set _and_ a regex is set, then we need to check both
  259. // so if the path does not match, then we should return early and not
  260. // consider the regex
  261. if !rule.Path.MatchString(fragment.FilePath) {
  262. return findings
  263. }
  264. }
  265. // if path only rule, skip content checks
  266. if rule.Regex == nil {
  267. return findings
  268. }
  269. // if flag configure and raw data size bigger then the flag
  270. if d.MaxTargetMegaBytes > 0 {
  271. rawLength := len(currentRaw) / 1000000
  272. if rawLength > d.MaxTargetMegaBytes {
  273. log.Debug().Msgf("skipping file: %s scan due to size: %d", fragment.FilePath, rawLength)
  274. return findings
  275. }
  276. }
  277. // use currentRaw instead of fragment.Raw since this represents the current
  278. // decoding pass on the text
  279. MatchLoop:
  280. for _, matchIndex := range rule.Regex.FindAllStringIndex(currentRaw, -1) {
  281. // Extract secret from match
  282. secret := strings.Trim(currentRaw[matchIndex[0]:matchIndex[1]], "\n")
  283. // For any meta data from decoding
  284. var metaTags []string
  285. // Check if the decoded portions of the segment overlap with the match
  286. // to see if its potentially a new match
  287. if len(encodedSegments) > 0 {
  288. if segment := segmentWithDecodedOverlap(encodedSegments, matchIndex[0], matchIndex[1]); segment != nil {
  289. matchIndex = segment.adjustMatchIndex(matchIndex)
  290. metaTags = append(metaTags, segment.tags()...)
  291. } else {
  292. // This item has already been added to a finding
  293. continue
  294. }
  295. } else {
  296. // Fixes: https://github.com/gitleaks/gitleaks/issues/1352
  297. // removes the incorrectly following line that was detected by regex expression '\n'
  298. matchIndex[1] = matchIndex[0] + len(secret)
  299. }
  300. // determine location of match. Note that the location
  301. // in the finding will be the line/column numbers of the _match_
  302. // not the _secret_, which will be different if the secretGroup
  303. // value is set for this rule
  304. loc := location(fragment, matchIndex)
  305. if matchIndex[1] > loc.endLineIndex {
  306. loc.endLineIndex = matchIndex[1]
  307. }
  308. finding := report.Finding{
  309. Description: rule.Description,
  310. File: fragment.FilePath,
  311. SymlinkFile: fragment.SymlinkFile,
  312. RuleID: rule.RuleID,
  313. StartLine: loc.startLine,
  314. EndLine: loc.endLine,
  315. StartColumn: loc.startColumn,
  316. EndColumn: loc.endColumn,
  317. Secret: secret,
  318. Match: secret,
  319. Tags: append(rule.Tags, metaTags...),
  320. Line: fragment.Raw[loc.startLineIndex:loc.endLineIndex],
  321. }
  322. if !d.IgnoreGitleaksAllow &&
  323. strings.Contains(fragment.Raw[loc.startLineIndex:loc.endLineIndex], gitleaksAllowSignature) {
  324. logger.Trace().
  325. Str("finding", finding.Secret).
  326. Msg("Skipping finding due to 'gitleaks:allow' signature")
  327. continue
  328. }
  329. // Set the value of |secret|, if the pattern contains at least one capture group.
  330. // (The first element is the full match, hence we check >= 2.)
  331. groups := rule.Regex.FindStringSubmatch(finding.Secret)
  332. if len(groups) >= 2 {
  333. if rule.SecretGroup > 0 {
  334. if len(groups) <= rule.SecretGroup {
  335. // Config validation should prevent this
  336. continue
  337. }
  338. finding.Secret = groups[rule.SecretGroup]
  339. } else {
  340. // If |secretGroup| is not set, we will use the first suitable capture group.
  341. if len(groups) == 2 {
  342. // Use the only group.
  343. finding.Secret = groups[1]
  344. } else {
  345. // Use the first non-empty group.
  346. for _, s := range groups[1:] {
  347. if len(s) > 0 {
  348. finding.Secret = s
  349. break
  350. }
  351. }
  352. }
  353. }
  354. }
  355. // check if the regexTarget is defined in the allowlist "regexes" entry
  356. // or if the secret is in the list of stopwords
  357. globalAllowlistTarget := finding.Secret
  358. switch d.Config.Allowlist.RegexTarget {
  359. case "match":
  360. globalAllowlistTarget = finding.Match
  361. case "line":
  362. globalAllowlistTarget = finding.Line
  363. }
  364. if d.Config.Allowlist.RegexAllowed(globalAllowlistTarget) {
  365. logger.Trace().
  366. Str("finding", globalAllowlistTarget).
  367. Msg("Skipping finding due to global allowlist regex")
  368. continue
  369. } else if d.Config.Allowlist.ContainsStopWord(finding.Secret) {
  370. logger.Trace().
  371. Str("finding", finding.Secret).
  372. Msg("Skipping finding due to global allowlist stopword")
  373. continue
  374. }
  375. // check if the result matches any of the rule allowlists.
  376. for _, a := range rule.Allowlists {
  377. allowlistTarget := finding.Secret
  378. switch a.RegexTarget {
  379. case "match":
  380. allowlistTarget = finding.Match
  381. case "line":
  382. allowlistTarget = finding.Line
  383. }
  384. var (
  385. isAllowed bool
  386. regexAllowed = a.RegexAllowed(allowlistTarget)
  387. containsStopword = a.ContainsStopWord(finding.Secret)
  388. )
  389. // check if the secret is in the list of stopwords
  390. if a.MatchCondition == config.AllowlistMatchAnd {
  391. // Determine applicable checks.
  392. var allowlistChecks []bool
  393. if len(a.Commits) > 0 {
  394. allowlistChecks = append(allowlistChecks, a.CommitAllowed(fragment.CommitSHA))
  395. }
  396. if len(a.Paths) > 0 {
  397. allowlistChecks = append(allowlistChecks, a.PathAllowed(fragment.FilePath))
  398. }
  399. if len(a.Regexes) > 0 {
  400. allowlistChecks = append(allowlistChecks, regexAllowed)
  401. }
  402. if len(a.StopWords) > 0 {
  403. allowlistChecks = append(allowlistChecks, containsStopword)
  404. }
  405. // Check if allowed.
  406. isAllowed = allTrue(allowlistChecks)
  407. } else {
  408. isAllowed = regexAllowed || containsStopword
  409. }
  410. if isAllowed {
  411. logger.Trace().
  412. Str("finding", finding.Secret).
  413. Str("condition", a.MatchCondition.String()).
  414. Bool("regex-allowed", regexAllowed).
  415. Bool("contains-stopword", containsStopword).
  416. Msg("Skipping finding due to rule allowlist")
  417. continue MatchLoop
  418. }
  419. }
  420. // check entropy
  421. entropy := shannonEntropy(finding.Secret)
  422. finding.Entropy = float32(entropy)
  423. if rule.Entropy != 0.0 {
  424. if entropy <= rule.Entropy {
  425. // entropy is too low, skip this finding
  426. continue
  427. }
  428. // NOTE: this is a goofy hack to get around the fact there golang's regex engine
  429. // does not support positive lookaheads. Ideally we would want to add a
  430. // restriction on generic rules regex that requires the secret match group
  431. // contains both numbers and alphabetical characters, not just alphabetical characters.
  432. // What this bit of code does is check if the ruleid is prepended with "generic" and enforces the
  433. // secret contains both digits and alphabetical characters.
  434. // TODO: this should be replaced with stop words
  435. if strings.HasPrefix(rule.RuleID, "generic") {
  436. if !containsDigit(finding.Secret) {
  437. continue
  438. }
  439. }
  440. }
  441. findings = append(findings, finding)
  442. }
  443. return findings
  444. }
  445. func allTrue(bools []bool) bool {
  446. allMatch := true
  447. for _, check := range bools {
  448. if !check {
  449. allMatch = false
  450. break
  451. }
  452. }
  453. return allMatch
  454. }
  455. // addFinding synchronously adds a finding to the findings slice
  456. func (d *Detector) addFinding(finding report.Finding) {
  457. globalFingerprint := fmt.Sprintf("%s:%s:%d", finding.File, finding.RuleID, finding.StartLine)
  458. if finding.Commit != "" {
  459. finding.Fingerprint = fmt.Sprintf("%s:%s:%s:%d", finding.Commit, finding.File, finding.RuleID, finding.StartLine)
  460. } else {
  461. finding.Fingerprint = globalFingerprint
  462. }
  463. // check if we should ignore this finding
  464. if _, ok := d.gitleaksIgnore[globalFingerprint]; ok {
  465. log.Debug().Msgf("ignoring finding with global Fingerprint %s",
  466. finding.Fingerprint)
  467. return
  468. } else if finding.Commit != "" {
  469. // Awkward nested if because I'm not sure how to chain these two conditions.
  470. if _, ok := d.gitleaksIgnore[finding.Fingerprint]; ok {
  471. log.Debug().Msgf("ignoring finding with Fingerprint %s",
  472. finding.Fingerprint)
  473. return
  474. }
  475. }
  476. if d.baseline != nil && !IsNew(finding, d.baseline) {
  477. log.Debug().Msgf("baseline duplicate -- ignoring finding with Fingerprint %s", finding.Fingerprint)
  478. return
  479. }
  480. d.findingMutex.Lock()
  481. d.findings = append(d.findings, finding)
  482. if d.Verbose {
  483. printFinding(finding, d.NoColor)
  484. }
  485. d.findingMutex.Unlock()
  486. }
// addCommit synchronously adds a commit to the commit slice
// NOTE(review): commitMap is a plain map with no lock here; callers are
// presumably single-threaded for git scans — confirm before adding
// concurrent callers.
func (d *Detector) addCommit(commit string) {
	d.commitMap[commit] = true
}