// detect.go
  1. package detect
  2. import (
  3. "bufio"
  4. "context"
  5. "fmt"
  6. "os"
  7. "regexp"
  8. "strings"
  9. "sync"
  10. "github.com/zricethezav/gitleaks/v8/config"
  11. "github.com/zricethezav/gitleaks/v8/report"
  12. ahocorasick "github.com/BobuSumisu/aho-corasick"
  13. "github.com/fatih/semgroup"
  14. "github.com/rs/zerolog/log"
  15. "github.com/spf13/viper"
  16. )
const (
	// gitleaksAllowSignature is the inline marker that, when present on the
	// matched line, causes detectRule to drop the finding (unless
	// Detector.IgnoreGitleaksAllow is set).
	gitleaksAllowSignature = "gitleaks:allow"
	// chunkSize is 10 kB. It is not referenced in this portion of the file;
	// presumably used by the git-scanning code elsewhere — confirm before
	// removing.
	chunkSize = 10 * 1_000 // 10kb
)
// Detector is the main detector struct
type Detector struct {
	// Config is the configuration for the detector
	Config config.Config

	// Redact is a flag to redact findings. This is exported
	// so users using gitleaks as a library can set this flag
	// without calling `detector.Start(cmd *cobra.Command)`.
	// NOTE(review): declared uint rather than bool — presumably a
	// redaction amount/percentage consumed by filter() (see Detect);
	// confirm against filter's implementation.
	Redact uint

	// Verbose is a flag to print findings as they are added (see addFinding).
	Verbose bool

	// MaxTargetMegaBytes: files larger than this will be skipped.
	// A value of 0 disables the size check (see detectRule).
	MaxTargetMegaBytes int

	// FollowSymlinks is a flag to enable scanning symlink files
	FollowSymlinks bool

	// NoColor is a flag to disable color output
	NoColor bool

	// IgnoreGitleaksAllow is a flag to ignore gitleaks:allow comments.
	IgnoreGitleaksAllow bool

	// commitMap is used to keep track of commits that have been scanned.
	// This is only used for logging purposes and git scans.
	commitMap map[string]bool

	// findingMutex is to prevent concurrent access to the
	// findings slice when adding findings.
	findingMutex *sync.Mutex

	// findings is a slice of report.Findings. This is the result
	// of the detector's scan which can then be used to generate a
	// report. Guarded by findingMutex.
	findings []report.Finding

	// prefilter is a ahocorasick struct used for doing efficient string
	// matching given a set of words (keywords from the rules in the config);
	// Detect uses it to skip rules whose keywords do not occur in a fragment.
	prefilter ahocorasick.Trie

	// baseline is a list of known findings that should be ignored
	// (see addFinding / IsNew).
	baseline []report.Finding

	// baselinePath is the path to the baseline file; fragments with this
	// path are skipped entirely by Detect.
	baselinePath string

	// gitleaksIgnore is the set of finding fingerprints loaded from a
	// .gitleaksignore file (see AddGitleaksIgnore).
	gitleaksIgnore map[string]bool

	// Sema (https://github.com/fatih/semgroup) controls the concurrency
	Sema *semgroup.Group
}
// Fragment contains the data to be scanned
type Fragment struct {
	// Raw is the raw content of the fragment
	Raw string

	// FilePath is the path to the file if applicable
	FilePath string

	// SymlinkFile is the symlink path that resolved to FilePath, if the
	// fragment was reached through a symlink; propagated into findings.
	SymlinkFile string

	// CommitSHA is the SHA of the commit if applicable
	CommitSHA string

	// newlineIndices is a list of indices of newlines in the raw content.
	// This is used to calculate the line location of a finding
	// (populated by Detect, consumed by location()).
	newlineIndices [][]int

	// keywords is a map of all the keywords contained within the contents
	// of this fragment (lowercased; populated by Detect via the
	// aho-corasick prefilter).
	keywords map[string]bool
}
  77. // NewDetector creates a new detector with the given config
  78. func NewDetector(cfg config.Config) *Detector {
  79. return &Detector{
  80. commitMap: make(map[string]bool),
  81. gitleaksIgnore: make(map[string]bool),
  82. findingMutex: &sync.Mutex{},
  83. findings: make([]report.Finding, 0),
  84. Config: cfg,
  85. prefilter: *ahocorasick.NewTrieBuilder().AddStrings(cfg.Keywords).Build(),
  86. Sema: semgroup.NewGroup(context.Background(), 40),
  87. }
  88. }
  89. // NewDetectorDefaultConfig creates a new detector with the default config
  90. func NewDetectorDefaultConfig() (*Detector, error) {
  91. viper.SetConfigType("toml")
  92. err := viper.ReadConfig(strings.NewReader(config.DefaultConfig))
  93. if err != nil {
  94. return nil, err
  95. }
  96. var vc config.ViperConfig
  97. err = viper.Unmarshal(&vc)
  98. if err != nil {
  99. return nil, err
  100. }
  101. cfg, err := vc.Translate()
  102. if err != nil {
  103. return nil, err
  104. }
  105. return NewDetector(cfg), nil
  106. }
  107. func (d *Detector) AddGitleaksIgnore(gitleaksIgnorePath string) error {
  108. log.Debug().Msgf("found .gitleaksignore file: %s", gitleaksIgnorePath)
  109. file, err := os.Open(gitleaksIgnorePath)
  110. if err != nil {
  111. return err
  112. }
  113. // https://github.com/securego/gosec/issues/512
  114. defer func() {
  115. if err := file.Close(); err != nil {
  116. log.Warn().Msgf("Error closing .gitleaksignore file: %s\n", err)
  117. }
  118. }()
  119. scanner := bufio.NewScanner(file)
  120. for scanner.Scan() {
  121. d.gitleaksIgnore[scanner.Text()] = true
  122. }
  123. return nil
  124. }
  125. // DetectBytes scans the given bytes and returns a list of findings
  126. func (d *Detector) DetectBytes(content []byte) []report.Finding {
  127. return d.DetectString(string(content))
  128. }
  129. // DetectString scans the given string and returns a list of findings
  130. func (d *Detector) DetectString(content string) []report.Finding {
  131. return d.Detect(Fragment{
  132. Raw: content,
  133. })
  134. }
  135. // Detect scans the given fragment and returns a list of findings
  136. func (d *Detector) Detect(fragment Fragment) []report.Finding {
  137. var findings []report.Finding
  138. // initiate fragment keywords
  139. fragment.keywords = make(map[string]bool)
  140. // check if filepath is allowed
  141. if fragment.FilePath != "" && (d.Config.Allowlist.PathAllowed(fragment.FilePath) ||
  142. fragment.FilePath == d.Config.Path || (d.baselinePath != "" && fragment.FilePath == d.baselinePath)) {
  143. return findings
  144. }
  145. // add newline indices for location calculation in detectRule
  146. fragment.newlineIndices = regexp.MustCompile("\n").FindAllStringIndex(fragment.Raw, -1)
  147. // build keyword map for prefiltering rules
  148. normalizedRaw := strings.ToLower(fragment.Raw)
  149. matches := d.prefilter.MatchString(normalizedRaw)
  150. for _, m := range matches {
  151. fragment.keywords[normalizedRaw[m.Pos():int(m.Pos())+len(m.Match())]] = true
  152. }
  153. for _, rule := range d.Config.Rules {
  154. if len(rule.Keywords) == 0 {
  155. // if not keywords are associated with the rule always scan the
  156. // fragment using the rule
  157. findings = append(findings, d.detectRule(fragment, rule)...)
  158. continue
  159. }
  160. fragmentContainsKeyword := false
  161. // check if keywords are in the fragment
  162. for _, k := range rule.Keywords {
  163. if _, ok := fragment.keywords[strings.ToLower(k)]; ok {
  164. fragmentContainsKeyword = true
  165. }
  166. }
  167. if fragmentContainsKeyword {
  168. findings = append(findings, d.detectRule(fragment, rule)...)
  169. }
  170. }
  171. return filter(findings, d.Redact)
  172. }
// detectRule scans the given fragment for the given rule and returns a list
// of findings. The pipeline, in order: per-rule commit/path allowlists →
// path-only / path+regex gating → optional max-size skip → regex matching →
// per-match filtering (gitleaks:allow marker, capture-group extraction,
// regex allowlists, stopwords, entropy threshold).
func (d *Detector) detectRule(fragment Fragment, rule config.Rule) []report.Finding {
	var findings []report.Finding

	// check if filepath or commit is allowed for this rule
	if rule.Allowlist.CommitAllowed(fragment.CommitSHA) ||
		rule.Allowlist.PathAllowed(fragment.FilePath) {
		return findings
	}

	if rule.Path != nil && rule.Regex == nil {
		// Path _only_ rule: the mere presence of a matching file path is
		// the finding; there is no secret/content match.
		if rule.Path.MatchString(fragment.FilePath) {
			finding := report.Finding{
				Description: rule.Description,
				File:        fragment.FilePath,
				SymlinkFile: fragment.SymlinkFile,
				RuleID:      rule.RuleID,
				Match:       fmt.Sprintf("file detected: %s", fragment.FilePath),
				Tags:        rule.Tags,
			}
			return append(findings, finding)
		}
	} else if rule.Path != nil {
		// if path is set _and_ a regex is set, then we need to check both
		// so if the path does not match, then we should return early and not
		// consider the regex
		if !rule.Path.MatchString(fragment.FilePath) {
			return findings
		}
	}

	// if path only rule, skip content checks
	if rule.Regex == nil {
		return findings
	}

	// If flag configured and raw data size bigger than the flag, skip the
	// fragment. Note: size is computed in whole megabytes (integer
	// division), so fragments under 1 MB always pass this check.
	if d.MaxTargetMegaBytes > 0 {
		rawLength := len(fragment.Raw) / 1000000
		if rawLength > d.MaxTargetMegaBytes {
			log.Debug().Msgf("skipping file: %s scan due to size: %d", fragment.FilePath, rawLength)
			return findings
		}
	}

	matchIndices := rule.Regex.FindAllStringIndex(fragment.Raw, -1)
	for _, matchIndex := range matchIndices {
		// extract secret from match
		secret := strings.Trim(fragment.Raw[matchIndex[0]:matchIndex[1]], "\n")

		// determine location of match. Note that the location
		// in the finding will be the line/column numbers of the _match_
		// not the _secret_, which will be different if the secretGroup
		// value is set for this rule
		loc := location(fragment, matchIndex)

		// extend the reported end index when the match runs past the end
		// of the last newline-terminated line (e.g. match at EOF)
		if matchIndex[1] > loc.endLineIndex {
			loc.endLineIndex = matchIndex[1]
		}

		finding := report.Finding{
			Description: rule.Description,
			File:        fragment.FilePath,
			SymlinkFile: fragment.SymlinkFile,
			RuleID:      rule.RuleID,
			StartLine:   loc.startLine,
			EndLine:     loc.endLine,
			StartColumn: loc.startColumn,
			EndColumn:   loc.endColumn,
			Secret:      secret,
			Match:       secret,
			Tags:        rule.Tags,
			Line:        fragment.Raw[loc.startLineIndex:loc.endLineIndex],
		}

		// drop findings whose surrounding line carries the
		// "gitleaks:allow" marker, unless the user asked to ignore it
		if strings.Contains(fragment.Raw[loc.startLineIndex:loc.endLineIndex],
			gitleaksAllowSignature) && !d.IgnoreGitleaksAllow {
			continue
		}

		// by default if secret group is not set, we will check to see if there
		// are any capture groups. If there are, we will use the first capture.
		// Note: this re-runs the regex against the already-extracted secret.
		groups := rule.Regex.FindStringSubmatch(secret)
		if rule.SecretGroup == 0 {
			// if len(groups) == 2 that means there is only one capture group;
			// the first element in groups is the full match, the second is the
			// first capture group
			if len(groups) == 2 {
				secret = groups[1]
				finding.Secret = secret
			}
		} else {
			if len(groups) <= rule.SecretGroup || len(groups) == 0 {
				// Config validation should prevent this
				continue
			}
			secret = groups[rule.SecretGroup]
			finding.Secret = secret
		}

		// check if the regexTarget is defined in the allowlist "regexes" entry;
		// the allowlist may target the secret (default), the full match, or
		// the whole line — per-rule and global settings are independent
		allowlistTarget := finding.Secret
		switch rule.Allowlist.RegexTarget {
		case "match":
			allowlistTarget = finding.Match
		case "line":
			allowlistTarget = finding.Line
		}

		globalAllowlistTarget := finding.Secret
		switch d.Config.Allowlist.RegexTarget {
		case "match":
			globalAllowlistTarget = finding.Match
		case "line":
			globalAllowlistTarget = finding.Line
		}
		if rule.Allowlist.RegexAllowed(allowlistTarget) ||
			d.Config.Allowlist.RegexAllowed(globalAllowlistTarget) {
			continue
		}

		// check if the secret is in the list of stopwords
		if rule.Allowlist.ContainsStopWord(finding.Secret) ||
			d.Config.Allowlist.ContainsStopWord(finding.Secret) {
			continue
		}

		// check entropy; a rule entropy of 0.0 means "no entropy check"
		entropy := shannonEntropy(finding.Secret)
		finding.Entropy = float32(entropy)
		if rule.Entropy != 0.0 {
			if entropy <= rule.Entropy {
				// entropy is too low, skip this finding
				continue
			}
			// NOTE: this is a goofy hack to get around the fact that golang's
			// regex engine does not support positive lookaheads. Ideally we
			// would want to add a restriction on generic rules regex that
			// requires the secret match group contains both numbers and
			// alphabetical characters, not just alphabetical characters.
			// What this bit of code does is check if the ruleid is prepended
			// with "generic" and enforces that the secret contains both
			// digits and alphabetical characters.
			// TODO: this should be replaced with stop words
			if strings.HasPrefix(rule.RuleID, "generic") {
				if !containsDigit(secret) {
					continue
				}
			}
		}

		findings = append(findings, finding)
	}

	return findings
}
  312. // addFinding synchronously adds a finding to the findings slice
  313. func (d *Detector) addFinding(finding report.Finding) {
  314. globalFingerprint := fmt.Sprintf("%s:%s:%d", finding.File, finding.RuleID, finding.StartLine)
  315. if finding.Commit != "" {
  316. finding.Fingerprint = fmt.Sprintf("%s:%s:%s:%d", finding.Commit, finding.File, finding.RuleID, finding.StartLine)
  317. } else {
  318. finding.Fingerprint = globalFingerprint
  319. }
  320. // check if we should ignore this finding
  321. if _, ok := d.gitleaksIgnore[globalFingerprint]; ok {
  322. log.Debug().Msgf("ignoring finding with global Fingerprint %s",
  323. finding.Fingerprint)
  324. return
  325. } else if finding.Commit != "" {
  326. // Awkward nested if because I'm not sure how to chain these two conditions.
  327. if _, ok := d.gitleaksIgnore[finding.Fingerprint]; ok {
  328. log.Debug().Msgf("ignoring finding with Fingerprint %s",
  329. finding.Fingerprint)
  330. return
  331. }
  332. }
  333. if d.baseline != nil && !IsNew(finding, d.baseline) {
  334. log.Debug().Msgf("baseline duplicate -- ignoring finding with Fingerprint %s", finding.Fingerprint)
  335. return
  336. }
  337. d.findingMutex.Lock()
  338. d.findings = append(d.findings, finding)
  339. if d.Verbose {
  340. printFinding(finding, d.NoColor)
  341. }
  342. d.findingMutex.Unlock()
  343. }
// addCommit records a scanned commit in the commit map (used only for
// logging/accounting during git scans).
// NOTE(review): despite the original "synchronously adds" wording, this is
// a bare map write with no lock — callers must not invoke it concurrently;
// confirm call sites are single-goroutine.
func (d *Detector) addCommit(commit string) {
	d.commitMap[commit] = true
}