// detect.go
  1. package detect
  2. import (
  3. "bufio"
  4. "context"
  5. "fmt"
  6. "os"
  7. "path/filepath"
  8. "regexp"
  9. "strings"
  10. "sync"
  11. "github.com/zricethezav/gitleaks/v8/config"
  12. "github.com/zricethezav/gitleaks/v8/detect/git"
  13. "github.com/zricethezav/gitleaks/v8/report"
  14. "github.com/fatih/semgroup"
  15. "github.com/gitleaks/go-gitdiff/gitdiff"
  16. "github.com/h2non/filetype"
  17. ahocorasick "github.com/petar-dambovaliev/aho-corasick"
  18. "github.com/rs/zerolog/log"
  19. "github.com/spf13/viper"
  20. )
  21. // Type used to differentiate between git scan types:
  22. // $ gitleaks detect
  23. // $ gitleaks protect
  24. // $ gitleaks protect staged
  25. type GitScanType int
  26. const (
  27. DetectType GitScanType = iota
  28. ProtectType
  29. ProtectStagedType
  30. gitleaksAllowSignature = "gitleaks:allow"
  31. )
// Detector is the main detector struct
type Detector struct {
	// Config is the configuration for the detector
	Config config.Config

	// Redact is a flag to redact findings. This is exported
	// so users using gitleaks as a library can set this flag
	// without calling `detector.Start(cmd *cobra.Command)`
	Redact bool

	// Verbose is a flag to print findings as they are added
	Verbose bool

	// commitMap is used to keep track of commits that have been scanned.
	// This is only used for logging purposes and git scans.
	commitMap map[string]bool

	// findingMutex is to prevent concurrent access to the
	// findings slice when adding findings (scans run detection
	// goroutines in parallel; see DetectGit/DetectFiles).
	findingMutex *sync.Mutex

	// findings is a slice of report.Findings. This is the result
	// of the detector's scan which can then be used to generate a
	// report.
	findings []report.Finding

	// prefilter is an ahocorasick struct used for doing efficient string
	// matching given a set of words (keywords from the rules in the config)
	prefilter ahocorasick.AhoCorasick

	// baseline is a list of known findings that should be ignored
	baseline []report.Finding

	// baselinePath is the path to the baseline file; the file itself is
	// excluded from scanning (see Detect)
	baselinePath string

	// gitleaksIgnore holds finding fingerprints loaded from a
	// .gitleaksignore file; matching findings are dropped in addFinding
	gitleaksIgnore map[string]bool
}
// Fragment contains the data to be scanned
type Fragment struct {
	// Raw is the raw content of the fragment
	Raw string

	// FilePath is the path to the file if applicable
	FilePath string

	// CommitSHA is the SHA of the commit if applicable
	CommitSHA string

	// newlineIndices is a list of indices of newlines in the raw content.
	// This is used to calculate the line location of a finding
	newlineIndices [][]int

	// keywords is a set of all the prefilter keywords contained within the
	// contents of this fragment (populated by Detect, consumed to decide
	// which rules need a full regex scan)
	keywords map[string]bool
}
  77. // NewDetector creates a new detector with the given config
  78. func NewDetector(cfg config.Config) *Detector {
  79. builder := ahocorasick.NewAhoCorasickBuilder(ahocorasick.Opts{
  80. AsciiCaseInsensitive: true,
  81. MatchOnlyWholeWords: false,
  82. MatchKind: ahocorasick.LeftMostLongestMatch,
  83. DFA: true,
  84. })
  85. return &Detector{
  86. commitMap: make(map[string]bool),
  87. gitleaksIgnore: make(map[string]bool),
  88. findingMutex: &sync.Mutex{},
  89. findings: make([]report.Finding, 0),
  90. Config: cfg,
  91. prefilter: builder.Build(cfg.Keywords),
  92. }
  93. }
  94. // NewDetectorDefaultConfig creates a new detector with the default config
  95. func NewDetectorDefaultConfig() (*Detector, error) {
  96. viper.SetConfigType("toml")
  97. err := viper.ReadConfig(strings.NewReader(config.DefaultConfig))
  98. if err != nil {
  99. return nil, err
  100. }
  101. var vc config.ViperConfig
  102. err = viper.Unmarshal(&vc)
  103. if err != nil {
  104. return nil, err
  105. }
  106. cfg, err := vc.Translate()
  107. if err != nil {
  108. return nil, err
  109. }
  110. return NewDetector(cfg), nil
  111. }
  112. func (d *Detector) AddGitleaksIgnore(gitleaksIgnorePath string) error {
  113. log.Debug().Msg("found .gitleaksignore file")
  114. file, err := os.Open(gitleaksIgnorePath)
  115. if err != nil {
  116. return err
  117. }
  118. defer file.Close()
  119. scanner := bufio.NewScanner(file)
  120. for scanner.Scan() {
  121. d.gitleaksIgnore[scanner.Text()] = true
  122. }
  123. return nil
  124. }
  125. func (d *Detector) AddBaseline(baselinePath string) error {
  126. if baselinePath != "" {
  127. baseline, err := LoadBaseline(baselinePath)
  128. if err != nil {
  129. return err
  130. }
  131. d.baseline = baseline
  132. }
  133. d.baselinePath = baselinePath
  134. return nil
  135. }
  136. // DetectBytes scans the given bytes and returns a list of findings
  137. func (d *Detector) DetectBytes(content []byte) []report.Finding {
  138. return d.DetectString(string(content))
  139. }
  140. // DetectString scans the given string and returns a list of findings
  141. func (d *Detector) DetectString(content string) []report.Finding {
  142. return d.Detect(Fragment{
  143. Raw: content,
  144. })
  145. }
// detectRule scans the given fragment against a single rule and returns any
// findings. Filters are applied in this order: rule-level commit/path
// allowlists, path matching, the rule regex, the inline gitleaks:allow
// marker, the regex allowlists, secret-group extraction, stopwords, and
// finally the rule's entropy threshold.
func (d *Detector) detectRule(fragment Fragment, rule config.Rule) []report.Finding {
	var findings []report.Finding

	// check if filepath or commit is allowed for this rule
	if rule.Allowlist.CommitAllowed(fragment.CommitSHA) ||
		rule.Allowlist.PathAllowed(fragment.FilePath) {
		return findings
	}

	if rule.Path != nil && rule.Regex == nil {
		// Path _only_ rule: the file path itself is the finding; no content
		// is inspected
		if rule.Path.Match([]byte(fragment.FilePath)) {
			finding := report.Finding{
				Description: rule.Description,
				File:        fragment.FilePath,
				RuleID:      rule.RuleID,
				Match:       fmt.Sprintf("file detected: %s", fragment.FilePath),
				Tags:        rule.Tags,
			}
			return append(findings, finding)
		}
	} else if rule.Path != nil {
		// if path is set _and_ a regex is set, then we need to check both
		// so if the path does not match, then we should return early and not
		// consider the regex
		if !rule.Path.Match([]byte(fragment.FilePath)) {
			return findings
		}
	}

	// if path only rule, skip content checks
	if rule.Regex == nil {
		return findings
	}

	matchIndices := rule.Regex.FindAllStringIndex(fragment.Raw, -1)
	for _, matchIndex := range matchIndices {
		// extract secret from match
		secret := strings.Trim(fragment.Raw[matchIndex[0]:matchIndex[1]], "\n")

		// determine location of match. Note that the location
		// in the finding will be the line/column numbers of the _match_
		// not the _secret_, which will be different if the secretGroup
		// value is set for this rule
		loc := location(fragment, matchIndex)

		// clamp: if the match runs past the computed line end (e.g. a match
		// on the fragment's final, newline-less line), extend the line end
		if matchIndex[1] > loc.endLineIndex {
			loc.endLineIndex = matchIndex[1]
		}

		finding := report.Finding{
			Description: rule.Description,
			File:        fragment.FilePath,
			RuleID:      rule.RuleID,
			StartLine:   loc.startLine,
			EndLine:     loc.endLine,
			StartColumn: loc.startColumn,
			EndColumn:   loc.endColumn,
			Secret:      secret,
			Match:       secret,
			Tags:        rule.Tags,
			Line:        fragment.Raw[loc.startLineIndex:loc.endLineIndex],
		}

		// lines carrying the inline "gitleaks:allow" marker are exempt
		if strings.Contains(fragment.Raw[loc.startLineIndex:loc.endLineIndex],
			gitleaksAllowSignature) {
			continue
		}

		// check if the secret is in the allowlist
		if rule.Allowlist.RegexAllowed(finding.Secret) ||
			d.Config.Allowlist.RegexAllowed(finding.Secret) {
			continue
		}

		// extract secret from secret group if set
		if rule.SecretGroup != 0 {
			groups := rule.Regex.FindStringSubmatch(secret)
			if len(groups) <= rule.SecretGroup || len(groups) == 0 {
				// Config validation should prevent this
				continue
			}
			secret = groups[rule.SecretGroup]
			finding.Secret = secret
		}

		// check if the secret is in the list of stopwords
		if rule.Allowlist.ContainsStopWord(finding.Secret) ||
			d.Config.Allowlist.ContainsStopWord(finding.Secret) {
			continue
		}

		// check entropy; a threshold of 0.0 means "no entropy requirement"
		entropy := shannonEntropy(finding.Secret)
		finding.Entropy = float32(entropy)
		if rule.Entropy != 0.0 {
			if entropy <= rule.Entropy {
				// entropy is too low, skip this finding
				continue
			}
			// NOTE: this is a goofy hack to get around the fact that golang's
			// regex engine does not support positive lookaheads. Ideally we
			// would want to add a restriction on generic rules regex that
			// requires the secret match group to contain both numbers and
			// alphabetical characters, not just alphabetical characters.
			// What this bit of code does is check if the ruleid is prepended
			// with "generic" and enforces that the secret contains at least
			// one digit.
			// TODO: this should be replaced with stop words
			if strings.HasPrefix(rule.RuleID, "generic") {
				if !containsDigit(secret) {
					continue
				}
			}
		}
		findings = append(findings, finding)
	}
	return findings
}
// DetectGit scans a git repository at source and returns the accumulated
// findings. The patch stream depends on gitScanType:
//   - DetectType: history from `git log -p` (filtered by logOpts)
//   - ProtectType: unstaged diff
//   - ProtectStagedType: staged diff
//
// Each file (patch) in the stream is scanned for findings; patches are
// processed concurrently (at most 4 at a time).
func (d *Detector) DetectGit(source string, logOpts string, gitScanType GitScanType) ([]report.Finding, error) {
	var (
		gitdiffFiles <-chan *gitdiff.File
		err          error
	)
	switch gitScanType {
	case DetectType:
		gitdiffFiles, err = git.GitLog(source, logOpts)
		if err != nil {
			return d.findings, err
		}
	case ProtectType:
		gitdiffFiles, err = git.GitDiff(source, false)
		if err != nil {
			return d.findings, err
		}
	case ProtectStagedType:
		gitdiffFiles, err = git.GitDiff(source, true)
		if err != nil {
			return d.findings, err
		}
	}

	s := semgroup.NewGroup(context.Background(), 4)
	for gitdiffFile := range gitdiffFiles {
		gitdiffFile := gitdiffFile // capture loop variable for the goroutine below

		// skip binary files and deletions
		if gitdiffFile.IsBinary || gitdiffFile.IsDelete {
			continue
		}

		// Check if commit is allowed
		commitSHA := ""
		if gitdiffFile.PatchHeader != nil {
			commitSHA = gitdiffFile.PatchHeader.SHA
			if d.Config.Allowlist.CommitAllowed(gitdiffFile.PatchHeader.SHA) {
				continue
			}
		}
		// record the commit for the "N commits scanned" log (serial loop,
		// so the unguarded map write in addCommit is safe here)
		d.addCommit(commitSHA)

		s.Go(func() error {
			for _, textFragment := range gitdiffFile.TextFragments {
				if textFragment == nil {
					return nil
				}
				// only added lines are scanned (gitdiff.OpAdd); removed
				// content is not considered a leak in this patch
				fragment := Fragment{
					Raw:       textFragment.Raw(gitdiff.OpAdd),
					CommitSHA: commitSHA,
					FilePath:  gitdiffFile.NewName,
				}
				for _, finding := range d.Detect(fragment) {
					d.addFinding(augmentGitFinding(finding, textFragment, gitdiffFile))
				}
			}
			return nil
		})
	}
	if err := s.Wait(); err != nil {
		return d.findings, err
	}
	log.Info().Msgf("%d commits scanned.", len(d.commitMap))
	log.Debug().Msg("Note: this number might be smaller than expected due to commits with no additions")
	if git.ErrEncountered {
		return d.findings, fmt.Errorf("%s", "git error encountered, see logs")
	}
	return d.findings, nil
}
// DetectFiles accepts a path to a source directory or file and begins a scan
// of the file or directory. A walker goroutine feeds regular-file paths into
// a channel while up to 4 worker goroutines read and scan each file.
// Empty files, .git directories, and files whose detected MIME type is
// "application" (treated as binary) are skipped.
func (d *Detector) DetectFiles(source string) ([]report.Finding, error) {
	s := semgroup.NewGroup(context.Background(), 4)
	paths := make(chan string)

	// producer: walk the tree and emit scannable paths; closing the channel
	// on return is what ends the consumer loop below
	s.Go(func() error {
		defer close(paths)
		return filepath.Walk(source,
			func(path string, fInfo os.FileInfo, err error) error {
				if err != nil {
					return err
				}
				if fInfo.Name() == ".git" && fInfo.IsDir() {
					return filepath.SkipDir
				}
				if fInfo.Size() == 0 {
					return nil
				}
				if fInfo.Mode().IsRegular() {
					paths <- path
				}
				return nil
			})
	})

	for pa := range paths {
		p := pa // capture loop variable for the goroutine below
		s.Go(func() error {
			b, err := os.ReadFile(p)
			if err != nil {
				return err
			}

			// sniff the content type; "application/*" is treated as binary
			mimetype, err := filetype.Match(b)
			if err != nil {
				return err
			}
			if mimetype.MIME.Type == "application" {
				return nil // skip binary files
			}

			fragment := Fragment{
				Raw:      string(b),
				FilePath: p,
			}
			for _, finding := range d.Detect(fragment) {
				// need to add 1 since line counting starts at 1
				finding.EndLine++
				finding.StartLine++
				d.addFinding(finding)
			}
			return nil
		})
	}

	if err := s.Wait(); err != nil {
		return d.findings, err
	}
	return d.findings, nil
}
  376. // Detect scans the given fragment and returns a list of findings
  377. func (d *Detector) Detect(fragment Fragment) []report.Finding {
  378. var findings []report.Finding
  379. // initiate fragment keywords
  380. fragment.keywords = make(map[string]bool)
  381. // check if filepath is allowed
  382. if fragment.FilePath != "" && (d.Config.Allowlist.PathAllowed(fragment.FilePath) ||
  383. fragment.FilePath == d.Config.Path || (d.baselinePath != "" && fragment.FilePath == d.baselinePath)) {
  384. return findings
  385. }
  386. // add newline indices for location calculation in detectRule
  387. fragment.newlineIndices = regexp.MustCompile("\n").FindAllStringIndex(fragment.Raw, -1)
  388. // build keyword map for prefiltering rules
  389. normalizedRaw := strings.ToLower(fragment.Raw)
  390. matches := d.prefilter.FindAll(normalizedRaw)
  391. for _, m := range matches {
  392. fragment.keywords[normalizedRaw[m.Start():m.End()]] = true
  393. }
  394. for _, rule := range d.Config.Rules {
  395. if len(rule.Keywords) == 0 {
  396. // if not keywords are associated with the rule always scan the
  397. // fragment using the rule
  398. findings = append(findings, d.detectRule(fragment, rule)...)
  399. continue
  400. }
  401. fragmentContainsKeyword := false
  402. // check if keywords are in the fragment
  403. for _, k := range rule.Keywords {
  404. if _, ok := fragment.keywords[strings.ToLower(k)]; ok {
  405. fragmentContainsKeyword = true
  406. }
  407. }
  408. if fragmentContainsKeyword {
  409. findings = append(findings, d.detectRule(fragment, rule)...)
  410. }
  411. }
  412. return filter(findings, d.Redact)
  413. }
  414. // addFinding synchronously adds a finding to the findings slice
  415. func (d *Detector) addFinding(finding report.Finding) {
  416. if finding.Commit == "" {
  417. finding.Fingerprint = fmt.Sprintf("%s:%s:%d", finding.File, finding.RuleID, finding.StartLine)
  418. } else {
  419. finding.Fingerprint = fmt.Sprintf("%s:%s:%s:%d", finding.Commit, finding.File, finding.RuleID, finding.StartLine)
  420. }
  421. // check if we should ignore this finding
  422. if _, ok := d.gitleaksIgnore[finding.Fingerprint]; ok {
  423. log.Debug().Msgf("ignoring finding with Fingerprint %s",
  424. finding.Fingerprint)
  425. return
  426. }
  427. if d.baseline != nil && !IsNew(finding, d.baseline) {
  428. log.Debug().Msgf("baseline duplicate -- ignoring finding with Fingerprint %s", finding.Fingerprint)
  429. return
  430. }
  431. d.findingMutex.Lock()
  432. d.findings = append(d.findings, finding)
  433. if d.Verbose {
  434. printFinding(finding)
  435. }
  436. d.findingMutex.Unlock()
  437. }
// addCommit records a scanned commit SHA in the commit map (used only for
// the "N commits scanned" log in DetectGit).
// NOTE(review): the map write is not mutex-guarded; DetectGit calls this
// from its serial producer loop, so callers must not invoke it from
// concurrent goroutines.
func (d *Detector) addCommit(commit string) {
	d.commitMap[commit] = true
}