detect.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461
  1. package detect
  2. import (
  3. "bufio"
  4. "context"
  5. "fmt"
  6. "os"
  7. "regexp"
  8. "strings"
  9. "sync"
  10. "github.com/zricethezav/gitleaks/v8/config"
  11. "github.com/zricethezav/gitleaks/v8/report"
  12. ahocorasick "github.com/BobuSumisu/aho-corasick"
  13. "github.com/fatih/semgroup"
  14. "github.com/rs/zerolog/log"
  15. "github.com/spf13/viper"
  16. "golang.org/x/exp/maps"
  17. )
const (
	// gitleaksAllowSignature is the inline marker ("gitleaks:allow") that,
	// when present on a finding's line, tells the detector to skip that
	// finding (see the check in detectRule, honored unless
	// IgnoreGitleaksAllow is set).
	gitleaksAllowSignature = "gitleaks:allow"
	// chunkSize is 10kb; presumably the read-chunk size used when scanning
	// files elsewhere in this package — its consumer is not in this chunk,
	// TODO confirm.
	chunkSize = 10 * 1_000 // 10kb
)

// newLineRegexp matches newline characters. Detect uses it to pre-compute
// newline offsets in a fragment so findings can be mapped to line/column
// locations.
var newLineRegexp = regexp.MustCompile("\n")
// Detector is the main detector struct. Construct it with NewDetector or
// NewDetectorDefaultConfig; the zero value is not usable (maps and mutex
// would be nil).
type Detector struct {
	// Config is the configuration for the detector
	Config config.Config

	// Redact is a flag to redact findings. This is exported
	// so users using gitleaks as a library can set this flag
	// without calling `detector.Start(cmd *cobra.Command)`
	Redact uint

	// Verbose is a flag to print findings as they are added.
	Verbose bool

	// MaxDecodeDepth limits how many recursive decoding passes are allowed
	// in Detect's decode loop.
	MaxDecodeDepth int

	// MaxTargetMegaBytes: fragments larger than this (in MB) will be
	// skipped by detectRule; 0 disables the limit.
	MaxTargetMegaBytes int

	// FollowSymlinks is a flag to enable scanning symlink files.
	FollowSymlinks bool

	// NoColor is a flag to disable color output.
	NoColor bool

	// IgnoreGitleaksAllow is a flag to ignore gitleaks:allow comments.
	IgnoreGitleaksAllow bool

	// commitMap is used to keep track of commits that have been scanned.
	// This is only used for logging purposes and git scans.
	commitMap map[string]bool

	// findingMutex is to prevent concurrent access to the
	// findings slice when adding findings.
	findingMutex *sync.Mutex

	// findings is a slice of report.Findings. This is the result
	// of the detector's scan which can then be used to generate a
	// report.
	findings []report.Finding

	// prefilter is an ahocorasick trie used for doing efficient string
	// matching given a set of words (keywords from the rules in the config);
	// Detect consults it before running any rule regexes.
	prefilter ahocorasick.Trie

	// baseline is a list of known findings that should be ignored
	// (see the IsNew check in addFinding).
	baseline []report.Finding

	// baselinePath is the path to the baseline file; fragments from this
	// path are themselves excluded from scanning.
	baselinePath string

	// gitleaksIgnore holds fingerprints loaded from a .gitleaksignore file
	// via AddGitleaksIgnore; matching findings are dropped in addFinding.
	gitleaksIgnore map[string]bool

	// Sema (https://github.com/fatih/semgroup) controls the concurrency
	Sema *semgroup.Group
}
// Fragment contains the data to be scanned.
type Fragment struct {
	// Raw is the raw content of the fragment.
	Raw string

	// FilePath is the path to the file if applicable.
	FilePath string

	// SymlinkFile is the symlink path that resolved to this file, if the
	// fragment was reached through a symlink — TODO confirm against callers.
	SymlinkFile string

	// CommitSHA is the SHA of the commit if applicable.
	CommitSHA string

	// newlineIndices is a list of [start, end) index pairs of newlines in
	// the raw content, populated by Detect. This is used to calculate the
	// line location of a finding.
	newlineIndices [][]int
}
  78. // NewDetector creates a new detector with the given config
  79. func NewDetector(cfg config.Config) *Detector {
  80. return &Detector{
  81. commitMap: make(map[string]bool),
  82. gitleaksIgnore: make(map[string]bool),
  83. findingMutex: &sync.Mutex{},
  84. findings: make([]report.Finding, 0),
  85. Config: cfg,
  86. prefilter: *ahocorasick.NewTrieBuilder().AddStrings(maps.Keys(cfg.Keywords)).Build(),
  87. Sema: semgroup.NewGroup(context.Background(), 40),
  88. }
  89. }
  90. // NewDetectorDefaultConfig creates a new detector with the default config
  91. func NewDetectorDefaultConfig() (*Detector, error) {
  92. viper.SetConfigType("toml")
  93. err := viper.ReadConfig(strings.NewReader(config.DefaultConfig))
  94. if err != nil {
  95. return nil, err
  96. }
  97. var vc config.ViperConfig
  98. err = viper.Unmarshal(&vc)
  99. if err != nil {
  100. return nil, err
  101. }
  102. cfg, err := vc.Translate()
  103. if err != nil {
  104. return nil, err
  105. }
  106. return NewDetector(cfg), nil
  107. }
  108. func (d *Detector) AddGitleaksIgnore(gitleaksIgnorePath string) error {
  109. log.Debug().Msgf("found .gitleaksignore file: %s", gitleaksIgnorePath)
  110. file, err := os.Open(gitleaksIgnorePath)
  111. if err != nil {
  112. return err
  113. }
  114. // https://github.com/securego/gosec/issues/512
  115. defer func() {
  116. if err := file.Close(); err != nil {
  117. log.Warn().Msgf("Error closing .gitleaksignore file: %s\n", err)
  118. }
  119. }()
  120. scanner := bufio.NewScanner(file)
  121. for scanner.Scan() {
  122. line := strings.TrimSpace(scanner.Text())
  123. // Skip lines that start with a comment
  124. if line != "" && !strings.HasPrefix(line, "#") {
  125. d.gitleaksIgnore[line] = true
  126. }
  127. }
  128. return nil
  129. }
  130. // DetectBytes scans the given bytes and returns a list of findings
  131. func (d *Detector) DetectBytes(content []byte) []report.Finding {
  132. return d.DetectString(string(content))
  133. }
  134. // DetectString scans the given string and returns a list of findings
  135. func (d *Detector) DetectString(content string) []report.Finding {
  136. return d.Detect(Fragment{
  137. Raw: content,
  138. })
  139. }
// Detect scans the given fragment and returns a list of findings.
//
// The scan runs in passes: pass 0 scans the raw content, and each subsequent
// pass (up to d.MaxDecodeDepth) scans a decoded version of the content
// produced by the decoder. Before each pass, an Aho-Corasick prefilter
// collects which rule keywords actually occur in the (lowercased) content so
// that only rules whose keywords are present run their regexes. Findings are
// finally filtered/redacted according to d.Redact.
func (d *Detector) Detect(fragment Fragment) []report.Finding {
	var findings []report.Finding

	// Skip the fragment entirely if its path is allowlisted, is the config
	// file itself, or is the baseline file.
	if fragment.FilePath != "" && (d.Config.Allowlist.PathAllowed(fragment.FilePath) ||
		fragment.FilePath == d.Config.Path || (d.baselinePath != "" && fragment.FilePath == d.baselinePath)) {
		return findings
	}

	// add newline indices for location calculation in detectRule
	fragment.newlineIndices = newLineRegexp.FindAllStringIndex(fragment.Raw, -1)

	// setup variables to handle different decoding passes
	currentRaw := fragment.Raw
	encodedSegments := []EncodedSegment{}
	currentDecodeDepth := 0
	decoder := NewDecoder()

	for {
		// Build the keyword set for prefiltering rules: every prefilter
		// match found in the lowercased content, keyed by the matched text.
		keywords := make(map[string]bool)
		normalizedRaw := strings.ToLower(currentRaw)
		matches := d.prefilter.MatchString(normalizedRaw)
		for _, m := range matches {
			keywords[normalizedRaw[m.Pos():int(m.Pos())+len(m.Match())]] = true
		}

		for _, rule := range d.Config.Rules {
			if len(rule.Keywords) == 0 {
				// if no keywords are associated with the rule always scan the
				// fragment using the rule
				findings = append(findings, d.detectRule(fragment, currentRaw, rule, encodedSegments)...)
				continue
			}

			// Run the rule only if at least one of its keywords appears in
			// the fragment; break after the first hit so the rule runs once.
			for _, k := range rule.Keywords {
				if _, ok := keywords[strings.ToLower(k)]; ok {
					findings = append(findings, d.detectRule(fragment, currentRaw, rule, encodedSegments)...)
					break
				}
			}
		}

		// increment the depth by 1 as we start our decoding pass
		currentDecodeDepth++

		// stop the loop if we've hit our max decoding depth
		if currentDecodeDepth > d.MaxDecodeDepth {
			break
		}

		// decode the currentRaw for the next pass
		currentRaw, encodedSegments = decoder.decode(currentRaw, encodedSegments)

		// stop the loop when there's nothing else to decode
		if len(encodedSegments) == 0 {
			break
		}
	}

	return filter(findings, d.Redact)
}
// detectRule scans the given fragment for the given rule and returns a list
// of findings.
//
// currentRaw is the content of the current decoding pass (equal to
// fragment.Raw on the first pass); encodedSegments describes regions of
// currentRaw that were produced by decoding, so matches can be mapped back
// and tagged. The function handles path-only rules, path+regex rules, size
// limits, gitleaks:allow comments, secret capture groups, stopwords,
// allowlist regexes, and entropy thresholds — in that order.
func (d *Detector) detectRule(fragment Fragment, currentRaw string, rule config.Rule, encodedSegments []EncodedSegment) []report.Finding {
	var findings []report.Finding

	// check if filepath or commit is allowed for this rule
	if rule.Allowlist.CommitAllowed(fragment.CommitSHA) ||
		rule.Allowlist.PathAllowed(fragment.FilePath) {
		return findings
	}

	if rule.Path != nil && rule.Regex == nil && len(encodedSegments) == 0 {
		// Path _only_ rule: a matching path is itself the finding; there is
		// no secret/line information to attach.
		if rule.Path.MatchString(fragment.FilePath) {
			finding := report.Finding{
				Description: rule.Description,
				File:        fragment.FilePath,
				SymlinkFile: fragment.SymlinkFile,
				RuleID:      rule.RuleID,
				Match:       fmt.Sprintf("file detected: %s", fragment.FilePath),
				Tags:        rule.Tags,
			}
			return append(findings, finding)
		}
	} else if rule.Path != nil {
		// if path is set _and_ a regex is set, then we need to check both
		// so if the path does not match, then we should return early and not
		// consider the regex
		if !rule.Path.MatchString(fragment.FilePath) {
			return findings
		}
	}

	// if path only rule, skip content checks
	if rule.Regex == nil {
		return findings
	}

	// If the size flag is configured and the raw data is bigger than the
	// flag (in whole megabytes), skip the content scan.
	if d.MaxTargetMegaBytes > 0 {
		rawLength := len(currentRaw) / 1000000
		if rawLength > d.MaxTargetMegaBytes {
			log.Debug().Msgf("skipping file: %s scan due to size: %d", fragment.FilePath, rawLength)
			return findings
		}
	}

	// use currentRaw instead of fragment.Raw since this represents the current
	// decoding pass on the text
	for _, matchIndex := range rule.Regex.FindAllStringIndex(currentRaw, -1) {
		// Extract secret from match
		secret := strings.Trim(currentRaw[matchIndex[0]:matchIndex[1]], "\n")

		// For any meta data from decoding
		var metaTags []string

		// Check if the decoded portions of the segment overlap with the match
		// to see if its potentially a new match
		if len(encodedSegments) > 0 {
			if segment := segmentWithDecodedOverlap(encodedSegments, matchIndex[0], matchIndex[1]); segment != nil {
				// Map the match indices back to the original (pre-decode)
				// coordinates and tag the finding with decode metadata.
				matchIndex = segment.adjustMatchIndex(matchIndex)
				metaTags = append(metaTags, segment.tags()...)
			} else {
				// This item has already been added to a finding
				continue
			}
		} else {
			// Fixes: https://github.com/gitleaks/gitleaks/issues/1352
			// removes the incorrectly following line that was detected by regex expression '\n'
			matchIndex[1] = matchIndex[0] + len(secret)
		}

		// determine location of match. Note that the location
		// in the finding will be the line/column numbers of the _match_
		// not the _secret_, which will be different if the secretGroup
		// value is set for this rule
		loc := location(fragment, matchIndex)

		if matchIndex[1] > loc.endLineIndex {
			loc.endLineIndex = matchIndex[1]
		}

		finding := report.Finding{
			Description: rule.Description,
			File:        fragment.FilePath,
			SymlinkFile: fragment.SymlinkFile,
			RuleID:      rule.RuleID,
			StartLine:   loc.startLine,
			EndLine:     loc.endLine,
			StartColumn: loc.startColumn,
			EndColumn:   loc.endColumn,
			Secret:      secret,
			Match:       secret,
			Tags:        append(rule.Tags, metaTags...),
			Line:        fragment.Raw[loc.startLineIndex:loc.endLineIndex],
		}

		// Honor an inline gitleaks:allow comment on the matched line unless
		// the detector was told to ignore them.
		if strings.Contains(fragment.Raw[loc.startLineIndex:loc.endLineIndex],
			gitleaksAllowSignature) && !d.IgnoreGitleaksAllow {
			continue
		}

		// Set the value of |secret|, if the pattern contains at least one capture group.
		// (The first element is the full match, hence we check >= 2.)
		groups := rule.Regex.FindStringSubmatch(finding.Secret)
		if len(groups) >= 2 {
			if rule.SecretGroup > 0 {
				if len(groups) <= rule.SecretGroup {
					// Config validation should prevent this
					continue
				}
				finding.Secret = groups[rule.SecretGroup]
			} else {
				// If |secretGroup| is not set, we will use the first suitable capture group.
				if len(groups) == 2 {
					// Use the only group.
					finding.Secret = groups[1]
				} else {
					// Use the first non-empty group.
					for _, s := range groups[1:] {
						if len(s) > 0 {
							finding.Secret = s
							break
						}
					}
				}
			}
		}

		// check if the secret is in the list of stopwords
		if rule.Allowlist.ContainsStopWord(finding.Secret) ||
			d.Config.Allowlist.ContainsStopWord(finding.Secret) {
			continue
		}

		// Pick which piece of the finding the rule allowlist regexes apply
		// to ("secret" by default, "match" or "line" if configured).
		allowlistTarget := finding.Secret
		switch rule.Allowlist.RegexTarget {
		case "match":
			allowlistTarget = finding.Match
		case "line":
			allowlistTarget = finding.Line
		}

		// Same selection, independently, for the global allowlist.
		globalAllowlistTarget := finding.Secret
		switch d.Config.Allowlist.RegexTarget {
		case "match":
			globalAllowlistTarget = finding.Match
		case "line":
			globalAllowlistTarget = finding.Line
		}
		if rule.Allowlist.RegexAllowed(allowlistTarget) ||
			d.Config.Allowlist.RegexAllowed(globalAllowlistTarget) {
			continue
		}

		// check entropy
		entropy := shannonEntropy(finding.Secret)
		finding.Entropy = float32(entropy)
		if rule.Entropy != 0.0 {
			if entropy <= rule.Entropy {
				// entropy is too low, skip this finding
				continue
			}
			// NOTE: this is a goofy hack to get around the fact there golang's regex engine
			// does not support positive lookaheads. Ideally we would want to add a
			// restriction on generic rules regex that requires the secret match group
			// contains both numbers and alphabetical characters, not just alphabetical characters.
			// What this bit of code does is check if the ruleid is prepended with "generic" and enforces the
			// secret contains both digits and alphabetical characters.
			// TODO: this should be replaced with stop words
			if strings.HasPrefix(rule.RuleID, "generic") {
				if !containsDigit(finding.Secret) {
					continue
				}
			}
		}

		findings = append(findings, finding)
	}

	return findings
}
  357. // addFinding synchronously adds a finding to the findings slice
  358. func (d *Detector) addFinding(finding report.Finding) {
  359. globalFingerprint := fmt.Sprintf("%s:%s:%d", finding.File, finding.RuleID, finding.StartLine)
  360. if finding.Commit != "" {
  361. finding.Fingerprint = fmt.Sprintf("%s:%s:%s:%d", finding.Commit, finding.File, finding.RuleID, finding.StartLine)
  362. } else {
  363. finding.Fingerprint = globalFingerprint
  364. }
  365. // check if we should ignore this finding
  366. if _, ok := d.gitleaksIgnore[globalFingerprint]; ok {
  367. log.Debug().Msgf("ignoring finding with global Fingerprint %s",
  368. finding.Fingerprint)
  369. return
  370. } else if finding.Commit != "" {
  371. // Awkward nested if because I'm not sure how to chain these two conditions.
  372. if _, ok := d.gitleaksIgnore[finding.Fingerprint]; ok {
  373. log.Debug().Msgf("ignoring finding with Fingerprint %s",
  374. finding.Fingerprint)
  375. return
  376. }
  377. }
  378. if d.baseline != nil && !IsNew(finding, d.baseline) {
  379. log.Debug().Msgf("baseline duplicate -- ignoring finding with Fingerprint %s", finding.Fingerprint)
  380. return
  381. }
  382. d.findingMutex.Lock()
  383. d.findings = append(d.findings, finding)
  384. if d.Verbose {
  385. printFinding(finding, d.NoColor)
  386. }
  387. d.findingMutex.Unlock()
  388. }
// addCommit synchronously adds a commit to the commit map, recording that it
// has been scanned (used only for logging and git scans — see commitMap).
func (d *Detector) addCommit(commit string) {
	d.commitMap[commit] = true
}