detect.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523
  1. package detect
  2. import (
  3. "bufio"
  4. "context"
  5. "fmt"
  6. "os"
  7. "path/filepath"
  8. "regexp"
  9. "strings"
  10. "sync"
  11. "github.com/zricethezav/gitleaks/v8/config"
  12. "github.com/zricethezav/gitleaks/v8/detect/git"
  13. "github.com/zricethezav/gitleaks/v8/report"
  14. "github.com/fatih/semgroup"
  15. "github.com/gitleaks/go-gitdiff/gitdiff"
  16. "github.com/h2non/filetype"
  17. ahocorasick "github.com/petar-dambovaliev/aho-corasick"
  18. "github.com/rs/zerolog/log"
  19. "github.com/spf13/viper"
  20. )
// GitScanType differentiates between the git scan modes exposed by the CLI:
//
//	$ gitleaks detect          -> DetectType
//	$ gitleaks protect         -> ProtectType
//	$ gitleaks protect staged  -> ProtectStagedType
type GitScanType int

const (
	DetectType GitScanType = iota
	ProtectType
	ProtectStagedType

	// gitleaksAllowSignature is the inline marker that, when present on the
	// same line as a match, tells detectRule to skip that finding.
	gitleaksAllowSignature = "gitleaks:allow"
)
// Detector is the main detector struct. Construct one with NewDetector or
// NewDetectorDefaultConfig; the zero value is not usable (maps are nil).
type Detector struct {
	// Config is the configuration for the detector
	Config config.Config

	// Redact is a flag to redact findings. This is exported
	// so users using gitleaks as a library can set this flag
	// without calling `detector.Start(cmd *cobra.Command)`
	Redact bool

	// Verbose is a flag to print findings as they are added (see addFinding)
	Verbose bool

	// MaxTargetMegaBytes: files larger than this will be skipped.
	// A value of 0 disables the size check (see detectRule).
	MaxTargetMegaBytes int

	// commitMap is used to keep track of commits that have been scanned.
	// This is only used for logging purposes and git scans.
	commitMap map[string]bool

	// findingMutex is to prevent concurrent access to the
	// findings slice when adding findings.
	findingMutex *sync.Mutex

	// findings is a slice of report.Findings. This is the result
	// of the detector's scan which can then be used to generate a
	// report.
	findings []report.Finding

	// prefilter is an ahocorasick automaton used for doing efficient string
	// matching given a set of words (keywords from the rules in the config)
	prefilter ahocorasick.AhoCorasick

	// baseline is a list of known findings that should be ignored
	baseline []report.Finding

	// baselinePath is the path to the baseline report; the file itself is
	// excluded from scanning (see Detect)
	baselinePath string

	// gitleaksIgnore holds fingerprints loaded from a .gitleaksignore file;
	// findings with a matching fingerprint are dropped (see addFinding)
	gitleaksIgnore map[string]bool
}
// Fragment contains the data to be scanned by a single Detect call.
type Fragment struct {
	// Raw is the raw content of the fragment
	Raw string

	// FilePath is the path to the file if applicable
	FilePath string

	// CommitSHA is the SHA of the commit if applicable (git scans only)
	CommitSHA string

	// newlineIndices is a list of indices of newlines in the raw content.
	// This is used to calculate the line location of a finding
	newlineIndices [][]int

	// keywords is a map of all the (lowercased) keywords contained within
	// the contents of this fragment; populated by Detect via the prefilter
	keywords map[string]bool
}
  79. // NewDetector creates a new detector with the given config
  80. func NewDetector(cfg config.Config) *Detector {
  81. builder := ahocorasick.NewAhoCorasickBuilder(ahocorasick.Opts{
  82. AsciiCaseInsensitive: true,
  83. MatchOnlyWholeWords: false,
  84. MatchKind: ahocorasick.LeftMostLongestMatch,
  85. DFA: true,
  86. })
  87. return &Detector{
  88. commitMap: make(map[string]bool),
  89. gitleaksIgnore: make(map[string]bool),
  90. findingMutex: &sync.Mutex{},
  91. findings: make([]report.Finding, 0),
  92. Config: cfg,
  93. prefilter: builder.Build(cfg.Keywords),
  94. }
  95. }
  96. // NewDetectorDefaultConfig creates a new detector with the default config
  97. func NewDetectorDefaultConfig() (*Detector, error) {
  98. viper.SetConfigType("toml")
  99. err := viper.ReadConfig(strings.NewReader(config.DefaultConfig))
  100. if err != nil {
  101. return nil, err
  102. }
  103. var vc config.ViperConfig
  104. err = viper.Unmarshal(&vc)
  105. if err != nil {
  106. return nil, err
  107. }
  108. cfg, err := vc.Translate()
  109. if err != nil {
  110. return nil, err
  111. }
  112. return NewDetector(cfg), nil
  113. }
  114. func (d *Detector) AddGitleaksIgnore(gitleaksIgnorePath string) error {
  115. log.Debug().Msg("found .gitleaksignore file")
  116. file, err := os.Open(gitleaksIgnorePath)
  117. if err != nil {
  118. return err
  119. }
  120. defer file.Close()
  121. scanner := bufio.NewScanner(file)
  122. for scanner.Scan() {
  123. d.gitleaksIgnore[scanner.Text()] = true
  124. }
  125. return nil
  126. }
  127. func (d *Detector) AddBaseline(baselinePath string) error {
  128. if baselinePath != "" {
  129. baseline, err := LoadBaseline(baselinePath)
  130. if err != nil {
  131. return err
  132. }
  133. d.baseline = baseline
  134. }
  135. d.baselinePath = baselinePath
  136. return nil
  137. }
  138. // DetectBytes scans the given bytes and returns a list of findings
  139. func (d *Detector) DetectBytes(content []byte) []report.Finding {
  140. return d.DetectString(string(content))
  141. }
  142. // DetectString scans the given string and returns a list of findings
  143. func (d *Detector) DetectString(content string) []report.Finding {
  144. return d.Detect(Fragment{
  145. Raw: content,
  146. })
  147. }
// detectRule scans the given fragment against a single rule and returns any
// findings. Detect has already prefiltered rules by keyword, so every rule
// passed here is worth running. Filters are applied in a fixed order:
// commit/path allowlists, path match, size limit, regex match, inline allow
// signature, regex allowlists, secret-group extraction, stopwords, entropy.
func (d *Detector) detectRule(fragment Fragment, rule config.Rule) []report.Finding {
	var findings []report.Finding

	// check if filepath or commit is allowed for this rule
	if rule.Allowlist.CommitAllowed(fragment.CommitSHA) ||
		rule.Allowlist.PathAllowed(fragment.FilePath) {
		return findings
	}

	if rule.Path != nil && rule.Regex == nil {
		// Path _only_ rule: a matching file path is itself the finding; the
		// content is never inspected and no line/column location is set.
		if rule.Path.Match([]byte(fragment.FilePath)) {
			finding := report.Finding{
				Description: rule.Description,
				File:        fragment.FilePath,
				RuleID:      rule.RuleID,
				Match:       fmt.Sprintf("file detected: %s", fragment.FilePath),
				Tags:        rule.Tags,
			}
			return append(findings, finding)
		}
	} else if rule.Path != nil {
		// if path is set _and_ a regex is set, then we need to check both
		// so if the path does not match, then we should return early and not
		// consider the regex
		if !rule.Path.Match([]byte(fragment.FilePath)) {
			return findings
		}
	}

	// if path only rule, skip content checks
	if rule.Regex == nil {
		return findings
	}

	// If the size flag is configured and the raw data is bigger than the
	// flag, skip the fragment. NOTE: "megabytes" here are decimal
	// (1 MB = 1,000,000 bytes), not MiB.
	if d.MaxTargetMegaBytes > 0 {
		rawLength := len(fragment.Raw) / 1000000
		if rawLength > d.MaxTargetMegaBytes {
			log.Debug().Msgf("skipping file: %s scan due to size: %d", fragment.FilePath, rawLength)
			return findings
		}
	}

	matchIndices := rule.Regex.FindAllStringIndex(fragment.Raw, -1)
	for _, matchIndex := range matchIndices {
		// extract secret from match
		secret := strings.Trim(fragment.Raw[matchIndex[0]:matchIndex[1]], "\n")

		// determine location of match. Note that the location
		// in the finding will be the line/column numbers of the _match_
		// not the _secret_, which will be different if the secretGroup
		// value is set for this rule
		loc := location(fragment, matchIndex)

		// NOTE(review): widens the reported line slice when the regex match
		// runs past the line end computed by location() — presumably for
		// multi-line matches; confirm against location()'s semantics.
		if matchIndex[1] > loc.endLineIndex {
			loc.endLineIndex = matchIndex[1]
		}

		finding := report.Finding{
			Description: rule.Description,
			File:        fragment.FilePath,
			RuleID:      rule.RuleID,
			StartLine:   loc.startLine,
			EndLine:     loc.endLine,
			StartColumn: loc.startColumn,
			EndColumn:   loc.endColumn,
			Secret:      secret,
			Match:       secret,
			Tags:        rule.Tags,
			Line:        fragment.Raw[loc.startLineIndex:loc.endLineIndex],
		}

		// lines carrying the inline "gitleaks:allow" marker are skipped
		if strings.Contains(fragment.Raw[loc.startLineIndex:loc.endLineIndex],
			gitleaksAllowSignature) {
			continue
		}

		// check if the secret is in the allowlist
		if rule.Allowlist.RegexAllowed(finding.Secret) ||
			d.Config.Allowlist.RegexAllowed(finding.Secret) {
			continue
		}

		// extract secret from secret group if set
		if rule.SecretGroup != 0 {
			groups := rule.Regex.FindStringSubmatch(secret)
			if len(groups) <= rule.SecretGroup || len(groups) == 0 {
				// Config validation should prevent this
				continue
			}
			secret = groups[rule.SecretGroup]
			finding.Secret = secret
		}

		// check if the secret is in the list of stopwords
		if rule.Allowlist.ContainsStopWord(finding.Secret) ||
			d.Config.Allowlist.ContainsStopWord(finding.Secret) {
			continue
		}

		// check entropy
		entropy := shannonEntropy(finding.Secret)
		finding.Entropy = float32(entropy)
		if rule.Entropy != 0.0 {
			if entropy <= rule.Entropy {
				// entropy is too low, skip this finding
				continue
			}
			// NOTE: this is a goofy hack to get around the fact there golang's regex engine
			// does not support positive lookaheads. Ideally we would want to add a
			// restriction on generic rules regex that requires the secret match group
			// contains both numbers and alphabetical characters, not just alphabetical characters.
			// What this bit of code does is check if the ruleid is prepended with "generic" and enforces the
			// secret contains both digits and alphabetical characters.
			// TODO: this should be replaced with stop words
			if strings.HasPrefix(rule.RuleID, "generic") {
				if !containsDigit(secret) {
					continue
				}
			}
		}
		findings = append(findings, finding)
	}
	return findings
}
// DetectGit scans a git history for findings. gitScanType selects the
// underlying git command that produces the *gitdiff.File channel:
//
//	DetectType:        git log -p (with logOpts appended)
//	ProtectType:       git diff of uncommitted changes
//	ProtectStagedType: git diff of staged changes
//
// Each file (patch) in the history is scanned concurrently; only the added
// lines of each patch are inspected. Findings accumulate on the detector and
// are also returned.
func (d *Detector) DetectGit(source string, logOpts string, gitScanType GitScanType) ([]report.Finding, error) {
	var (
		gitdiffFiles <-chan *gitdiff.File
		err          error
	)
	switch gitScanType {
	case DetectType:
		gitdiffFiles, err = git.GitLog(source, logOpts)
		if err != nil {
			return d.findings, err
		}
	case ProtectType:
		gitdiffFiles, err = git.GitDiff(source, false)
		if err != nil {
			return d.findings, err
		}
	case ProtectStagedType:
		gitdiffFiles, err = git.GitDiff(source, true)
		if err != nil {
			return d.findings, err
		}
	}

	// scan at most 4 patches concurrently
	s := semgroup.NewGroup(context.Background(), 4)
	for gitdiffFile := range gitdiffFiles {
		gitdiffFile := gitdiffFile // capture loop variable for the goroutine below

		// skip binary files
		if gitdiffFile.IsBinary || gitdiffFile.IsDelete {
			continue
		}

		// Check if commit is allowed
		commitSHA := ""
		if gitdiffFile.PatchHeader != nil {
			commitSHA = gitdiffFile.PatchHeader.SHA
			if d.Config.Allowlist.CommitAllowed(gitdiffFile.PatchHeader.SHA) {
				continue
			}
		}
		// addCommit is called from this single coordinating goroutine only,
		// before the worker is spawned, so commitMap needs no locking here.
		d.addCommit(commitSHA)

		s.Go(func() error {
			for _, textFragment := range gitdiffFile.TextFragments {
				if textFragment == nil {
					return nil
				}
				// only the added side of the patch is scanned
				fragment := Fragment{
					Raw:       textFragment.Raw(gitdiff.OpAdd),
					CommitSHA: commitSHA,
					FilePath:  gitdiffFile.NewName,
				}
				for _, finding := range d.Detect(fragment) {
					d.addFinding(augmentGitFinding(finding, textFragment, gitdiffFile))
				}
			}
			return nil
		})
	}
	if err := s.Wait(); err != nil {
		return d.findings, err
	}
	log.Info().Msgf("%d commits scanned.", len(d.commitMap))
	log.Debug().Msg("Note: this number might be smaller than expected due to commits with no additions")
	if git.ErrEncountered {
		return d.findings, fmt.Errorf("%s", "git error encountered, see logs")
	}
	return d.findings, nil
}
// DetectFiles accepts a path to a source directory or file and begins a scan of the
// file or directory. A producer goroutine walks the tree and feeds regular-file
// paths to worker goroutines (at most 4 concurrent) that read and scan each file.
// Findings accumulate on the detector and are also returned.
func (d *Detector) DetectFiles(source string) ([]report.Finding, error) {
	s := semgroup.NewGroup(context.Background(), 4)
	paths := make(chan string)

	// producer: walk the tree; closing paths ends the consumer loop below
	s.Go(func() error {
		defer close(paths)
		return filepath.Walk(source,
			func(path string, fInfo os.FileInfo, err error) error {
				if err != nil {
					return err
				}
				// don't descend into .git directories
				if fInfo.Name() == ".git" && fInfo.IsDir() {
					return filepath.SkipDir
				}
				// empty files can't contain secrets
				if fInfo.Size() == 0 {
					return nil
				}
				if fInfo.Mode().IsRegular() {
					paths <- path
				}
				return nil
			})
	})

	for pa := range paths {
		p := pa // capture loop variable for the goroutine below
		s.Go(func() error {
			// NOTE(review): reads the whole file into memory; the
			// MaxTargetMegaBytes limit is only applied later in detectRule.
			b, err := os.ReadFile(p)
			if err != nil {
				return err
			}
			// sniff the content type; "application/*" is treated as binary
			mimetype, err := filetype.Match(b)
			if err != nil {
				return err
			}
			if mimetype.MIME.Type == "application" {
				return nil // skip binary files
			}

			fragment := Fragment{
				Raw:      string(b),
				FilePath: p,
			}
			for _, finding := range d.Detect(fragment) {
				// need to add 1 since line counting starts at 1
				finding.EndLine++
				finding.StartLine++
				d.addFinding(finding)
			}
			return nil
		})
	}

	if err := s.Wait(); err != nil {
		return d.findings, err
	}
	return d.findings, nil
}
  386. // Detect scans the given fragment and returns a list of findings
  387. func (d *Detector) Detect(fragment Fragment) []report.Finding {
  388. var findings []report.Finding
  389. // initiate fragment keywords
  390. fragment.keywords = make(map[string]bool)
  391. // check if filepath is allowed
  392. if fragment.FilePath != "" && (d.Config.Allowlist.PathAllowed(fragment.FilePath) ||
  393. fragment.FilePath == d.Config.Path || (d.baselinePath != "" && fragment.FilePath == d.baselinePath)) {
  394. return findings
  395. }
  396. // add newline indices for location calculation in detectRule
  397. fragment.newlineIndices = regexp.MustCompile("\n").FindAllStringIndex(fragment.Raw, -1)
  398. // build keyword map for prefiltering rules
  399. normalizedRaw := strings.ToLower(fragment.Raw)
  400. matches := d.prefilter.FindAll(normalizedRaw)
  401. for _, m := range matches {
  402. fragment.keywords[normalizedRaw[m.Start():m.End()]] = true
  403. }
  404. for _, rule := range d.Config.Rules {
  405. if len(rule.Keywords) == 0 {
  406. // if not keywords are associated with the rule always scan the
  407. // fragment using the rule
  408. findings = append(findings, d.detectRule(fragment, rule)...)
  409. continue
  410. }
  411. fragmentContainsKeyword := false
  412. // check if keywords are in the fragment
  413. for _, k := range rule.Keywords {
  414. if _, ok := fragment.keywords[strings.ToLower(k)]; ok {
  415. fragmentContainsKeyword = true
  416. }
  417. }
  418. if fragmentContainsKeyword {
  419. findings = append(findings, d.detectRule(fragment, rule)...)
  420. }
  421. }
  422. return filter(findings, d.Redact)
  423. }
  424. // addFinding synchronously adds a finding to the findings slice
  425. func (d *Detector) addFinding(finding report.Finding) {
  426. if finding.Commit == "" {
  427. finding.Fingerprint = fmt.Sprintf("%s:%s:%d", finding.File, finding.RuleID, finding.StartLine)
  428. } else {
  429. finding.Fingerprint = fmt.Sprintf("%s:%s:%s:%d", finding.Commit, finding.File, finding.RuleID, finding.StartLine)
  430. }
  431. // check if we should ignore this finding
  432. if _, ok := d.gitleaksIgnore[finding.Fingerprint]; ok {
  433. log.Debug().Msgf("ignoring finding with Fingerprint %s",
  434. finding.Fingerprint)
  435. return
  436. }
  437. if d.baseline != nil && !IsNew(finding, d.baseline) {
  438. log.Debug().Msgf("baseline duplicate -- ignoring finding with Fingerprint %s", finding.Fingerprint)
  439. return
  440. }
  441. d.findingMutex.Lock()
  442. d.findings = append(d.findings, finding)
  443. if d.Verbose {
  444. printFinding(finding)
  445. }
  446. d.findingMutex.Unlock()
  447. }
// addCommit marks a commit SHA as scanned. commitMap is used only to report
// the number of commits scanned at the end of a git scan.
// NOTE(review): unlike addFinding this is not mutex-guarded; DetectGit calls
// it only from the coordinating goroutine, before spawning workers.
func (d *Detector) addCommit(commit string) {
	d.commitMap[commit] = true
}