detect.go

package detect

import (
	"bufio"
	"context"
	"fmt"
	"io"
	"io/fs"
	"os"
	"path/filepath"
	"regexp"
	"strings"
	"sync"

	"github.com/h2non/filetype"
	"github.com/zricethezav/gitleaks/v8/config"
	"github.com/zricethezav/gitleaks/v8/detect/git"
	"github.com/zricethezav/gitleaks/v8/report"

	"github.com/fatih/semgroup"
	"github.com/gitleaks/go-gitdiff/gitdiff"
	ahocorasick "github.com/petar-dambovaliev/aho-corasick"
	"github.com/rs/zerolog/log"
	"github.com/spf13/viper"
)

// GitScanType is used to differentiate between git scan types:
// $ gitleaks detect
// $ gitleaks protect
// $ gitleaks protect staged
type GitScanType int

const (
	DetectType GitScanType = iota
	ProtectType
	ProtectStagedType

	gitleaksAllowSignature = "gitleaks:allow"
)
// Detector is the main detector struct
type Detector struct {
	// Config is the configuration for the detector
	Config config.Config
	// Redact is a flag to redact findings. This is exported
	// so users using gitleaks as a library can set this flag
	// without calling `detector.Start(cmd *cobra.Command)`
	Redact bool
	// Verbose is a flag to print findings
	Verbose bool
	// MaxTargetMegaBytes is the maximum target size in megabytes;
	// files larger than this will be skipped
	MaxTargetMegaBytes int
	// FollowSymlinks is a flag to enable scanning symlink files
	FollowSymlinks bool
	// commitMap is used to keep track of commits that have been scanned.
	// This is only used for logging purposes and git scans.
	commitMap map[string]bool
	// findingMutex is used to prevent concurrent access to the
	// findings slice when adding findings.
	findingMutex *sync.Mutex
	// findings is a slice of report.Findings. This is the result
	// of the detector's scan which can then be used to generate a
	// report.
	findings []report.Finding
	// prefilter is an Aho-Corasick automaton used for efficient string
	// matching given a set of words (the keywords from the rules in the config)
	prefilter ahocorasick.AhoCorasick
	// baseline is a list of known findings that should be ignored
	baseline []report.Finding
	// baselinePath is the path to the baseline file
	baselinePath string
	// gitleaksIgnore is a set of finding fingerprints loaded from a .gitleaksignore file
	gitleaksIgnore map[string]bool
}
// Fragment contains the data to be scanned
type Fragment struct {
	// Raw is the raw content of the fragment
	Raw string
	// FilePath is the path to the file if applicable
	FilePath string
	// SymlinkFile is the path of the symlink that resolved to this file, if applicable
	SymlinkFile string
	// CommitSHA is the SHA of the commit if applicable
	CommitSHA string
	// newlineIndices is a list of indices of newlines in the raw content.
	// This is used to calculate the line location of a finding
	newlineIndices [][]int
	// keywords is a map of all the keywords contained within the contents
	// of this fragment
	keywords map[string]bool
}
// NewDetector creates a new detector with the given config
func NewDetector(cfg config.Config) *Detector {
	builder := ahocorasick.NewAhoCorasickBuilder(ahocorasick.Opts{
		AsciiCaseInsensitive: true,
		MatchOnlyWholeWords:  false,
		MatchKind:            ahocorasick.LeftMostLongestMatch,
		DFA:                  true,
	})

	return &Detector{
		commitMap:      make(map[string]bool),
		gitleaksIgnore: make(map[string]bool),
		findingMutex:   &sync.Mutex{},
		findings:       make([]report.Finding, 0),
		Config:         cfg,
		prefilter:      builder.Build(cfg.Keywords),
	}
}
// NewDetectorDefaultConfig creates a new detector with the default config
func NewDetectorDefaultConfig() (*Detector, error) {
	viper.SetConfigType("toml")
	err := viper.ReadConfig(strings.NewReader(config.DefaultConfig))
	if err != nil {
		return nil, err
	}
	var vc config.ViperConfig
	err = viper.Unmarshal(&vc)
	if err != nil {
		return nil, err
	}
	cfg, err := vc.Translate()
	if err != nil {
		return nil, err
	}
	return NewDetector(cfg), nil
}
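
// Example (library usage): a minimal sketch of building a detector from the default
// config and scanning an in-memory string. The secret value is a placeholder; this
// comment is illustrative only and is not part of the package API.
//
//	detector, err := NewDetectorDefaultConfig()
//	if err != nil {
//		// handle error
//	}
//	findings := detector.DetectString(`aws_secret_access_key = "AKIAEXAMPLEONLY"`)
//	for _, f := range findings {
//		fmt.Println(f.RuleID, f.Secret)
//	}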
// AddGitleaksIgnore loads finding fingerprints from a .gitleaksignore file into the
// detector's ignore set.
func (d *Detector) AddGitleaksIgnore(gitleaksIgnorePath string) error {
	log.Debug().Msg("found .gitleaksignore file")
	file, err := os.Open(gitleaksIgnorePath)
	if err != nil {
		return err
	}
	defer file.Close()

	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		d.gitleaksIgnore[scanner.Text()] = true
	}
	// surface any error encountered while reading the ignore file
	return scanner.Err()
}
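
// A .gitleaksignore file contains one finding fingerprint per line. Based on the
// fingerprints built in addFinding below, a line is expected to look roughly like
// the following (values are illustrative only):
//
//	path/to/file.go:rule-id:42
//	<commit-sha>:path/to/file.go:rule-id:42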
// AddBaseline loads a baseline of known findings that should be ignored and records
// the baseline path (relative to the scan source) so the baseline file itself is not
// scanned.
func (d *Detector) AddBaseline(baselinePath string, source string) error {
	if baselinePath != "" {
		absoluteSource, err := filepath.Abs(source)
		if err != nil {
			return err
		}
		absoluteBaseline, err := filepath.Abs(baselinePath)
		if err != nil {
			return err
		}
		relativeBaseline, err := filepath.Rel(absoluteSource, absoluteBaseline)
		if err != nil {
			return err
		}
		baseline, err := LoadBaseline(baselinePath)
		if err != nil {
			return err
		}
		d.baseline = baseline
		baselinePath = relativeBaseline
	}
	d.baselinePath = baselinePath
	return nil
}
// DetectBytes scans the given bytes and returns a list of findings
func (d *Detector) DetectBytes(content []byte) []report.Finding {
	return d.DetectString(string(content))
}

// DetectString scans the given string and returns a list of findings
func (d *Detector) DetectString(content string) []report.Finding {
	return d.Detect(Fragment{
		Raw: content,
	})
}
// detectRule scans the given fragment for the given rule and returns a list of findings
func (d *Detector) detectRule(fragment Fragment, rule config.Rule) []report.Finding {
	var findings []report.Finding

	// check if filepath or commit is allowed for this rule
	if rule.Allowlist.CommitAllowed(fragment.CommitSHA) ||
		rule.Allowlist.PathAllowed(fragment.FilePath) {
		return findings
	}

	if rule.Path != nil && rule.Regex == nil {
		// path _only_ rule
		if rule.Path.Match([]byte(fragment.FilePath)) {
			finding := report.Finding{
				Description: rule.Description,
				File:        fragment.FilePath,
				SymlinkFile: fragment.SymlinkFile,
				RuleID:      rule.RuleID,
				Match:       fmt.Sprintf("file detected: %s", fragment.FilePath),
				Tags:        rule.Tags,
			}
			return append(findings, finding)
		}
	} else if rule.Path != nil {
		// if both a path and a regex are set, then we need to check both,
		// so if the path does not match we return early and do not
		// consider the regex
		if !rule.Path.Match([]byte(fragment.FilePath)) {
			return findings
		}
	}

	// if this is a path-only rule, skip content checks
	if rule.Regex == nil {
		return findings
	}

	// if the MaxTargetMegaBytes flag is configured and the raw data is larger
	// than that limit, skip this fragment
	if d.MaxTargetMegaBytes > 0 {
		rawLength := len(fragment.Raw) / 1000000
		if rawLength > d.MaxTargetMegaBytes {
			log.Debug().Msgf("skipping file: %s scan due to size: %d", fragment.FilePath, rawLength)
			return findings
		}
	}

	matchIndices := rule.Regex.FindAllStringIndex(fragment.Raw, -1)
	for _, matchIndex := range matchIndices {
		// extract secret from match
		secret := strings.Trim(fragment.Raw[matchIndex[0]:matchIndex[1]], "\n")

		// determine location of match. Note that the location
		// in the finding will be the line/column numbers of the _match_,
		// not the _secret_, which will be different if the secretGroup
		// value is set for this rule
		loc := location(fragment, matchIndex)

		if matchIndex[1] > loc.endLineIndex {
			loc.endLineIndex = matchIndex[1]
		}

		finding := report.Finding{
			Description: rule.Description,
			File:        fragment.FilePath,
			SymlinkFile: fragment.SymlinkFile,
			RuleID:      rule.RuleID,
			StartLine:   loc.startLine,
			EndLine:     loc.endLine,
			StartColumn: loc.startColumn,
			EndColumn:   loc.endColumn,
			Secret:      secret,
			Match:       secret,
			Tags:        rule.Tags,
			Line:        fragment.Raw[loc.startLineIndex:loc.endLineIndex],
		}

		// skip findings on lines annotated with the gitleaks:allow signature
		if strings.Contains(fragment.Raw[loc.startLineIndex:loc.endLineIndex],
			gitleaksAllowSignature) {
			continue
		}

		// check if the regexTarget is defined in the allowlist "regexes" entry
		allowlistTarget := finding.Secret
		switch rule.Allowlist.RegexTarget {
		case "match":
			allowlistTarget = finding.Match
		case "line":
			allowlistTarget = finding.Line
		}

		globalAllowlistTarget := finding.Secret
		switch d.Config.Allowlist.RegexTarget {
		case "match":
			globalAllowlistTarget = finding.Match
		case "line":
			globalAllowlistTarget = finding.Line
		}
		if rule.Allowlist.RegexAllowed(allowlistTarget) ||
			d.Config.Allowlist.RegexAllowed(globalAllowlistTarget) {
			continue
		}

		// extract secret from secret group if set
		if rule.SecretGroup != 0 {
			groups := rule.Regex.FindStringSubmatch(secret)
			if len(groups) <= rule.SecretGroup || len(groups) == 0 {
				// config validation should prevent this
				continue
			}
			secret = groups[rule.SecretGroup]
			finding.Secret = secret
		}

		// check if the secret is in the list of stopwords
		if rule.Allowlist.ContainsStopWord(finding.Secret) ||
			d.Config.Allowlist.ContainsStopWord(finding.Secret) {
			continue
		}

		// check entropy
		entropy := shannonEntropy(finding.Secret)
		finding.Entropy = float32(entropy)
		if rule.Entropy != 0.0 {
			if entropy <= rule.Entropy {
				// entropy is too low, skip this finding
				continue
			}
			// NOTE: this is a goofy hack to get around the fact that Go's regex engine
			// does not support positive lookaheads. Ideally we would want to add a
			// restriction on generic rules' regexes that requires the secret match group
			// to contain both numbers and alphabetical characters, not just alphabetical characters.
			// What this bit of code does is check if the rule ID is prefixed with "generic"
			// and enforce that the secret contains both digits and alphabetical characters.
			// TODO: this should be replaced with stop words
			if strings.HasPrefix(rule.RuleID, "generic") {
				if !containsDigit(secret) {
					continue
				}
			}
		}

		findings = append(findings, finding)
	}
	return findings
}
// DetectGit accepts a source directory, git log options, and a scan type, and scans
// the git history generated from the output of `git log -p ...` (or `git diff` for
// protect scans). DetectGit looks at each file (patch) in the history and determines
// whether the patch contains any findings.
func (d *Detector) DetectGit(source string, logOpts string, gitScanType GitScanType) ([]report.Finding, error) {
	var (
		gitdiffFiles <-chan *gitdiff.File
		err          error
	)
	switch gitScanType {
	case DetectType:
		gitdiffFiles, err = git.GitLog(source, logOpts)
		if err != nil {
			return d.findings, err
		}
	case ProtectType:
		gitdiffFiles, err = git.GitDiff(source, false)
		if err != nil {
			return d.findings, err
		}
	case ProtectStagedType:
		gitdiffFiles, err = git.GitDiff(source, true)
		if err != nil {
			return d.findings, err
		}
	}

	s := semgroup.NewGroup(context.Background(), 4)
	for gitdiffFile := range gitdiffFiles {
		gitdiffFile := gitdiffFile

		// skip binary files
		if gitdiffFile.IsBinary || gitdiffFile.IsDelete {
			continue
		}

		// check if commit is allowed
		commitSHA := ""
		if gitdiffFile.PatchHeader != nil {
			commitSHA = gitdiffFile.PatchHeader.SHA
			if d.Config.Allowlist.CommitAllowed(gitdiffFile.PatchHeader.SHA) {
				continue
			}
		}
		d.addCommit(commitSHA)

		s.Go(func() error {
			for _, textFragment := range gitdiffFile.TextFragments {
				if textFragment == nil {
					return nil
				}
				fragment := Fragment{
					Raw:       textFragment.Raw(gitdiff.OpAdd),
					CommitSHA: commitSHA,
					FilePath:  gitdiffFile.NewName,
				}
				for _, finding := range d.Detect(fragment) {
					d.addFinding(augmentGitFinding(finding, textFragment, gitdiffFile))
				}
			}
			return nil
		})
	}
	if err := s.Wait(); err != nil {
		return d.findings, err
	}
	log.Info().Msgf("%d commits scanned.", len(d.commitMap))
	log.Debug().Msg("Note: this number might be smaller than expected due to commits with no additions")
	if git.ErrEncountered {
		return d.findings, fmt.Errorf("%s", "git error encountered, see logs")
	}
	return d.findings, nil
}
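
// Example (library usage): a minimal sketch of a full git-history scan, assuming "."
// is a git repository and empty log options mean the full history. This comment is
// illustrative only and is not part of the package API.
//
//	detector, err := NewDetectorDefaultConfig()
//	if err != nil {
//		// handle error
//	}
//	findings, err := detector.DetectGit(".", "", DetectType)
//	if err != nil {
//		// handle error
//	}
//	fmt.Printf("found %d leaks in git history\n", len(findings))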
type scanTarget struct {
	Path    string
	Symlink string
}

// DetectFiles accepts a path to a source directory or file and begins a scan of the
// file or directory.
func (d *Detector) DetectFiles(source string) ([]report.Finding, error) {
	s := semgroup.NewGroup(context.Background(), 4)
	paths := make(chan scanTarget)
	s.Go(func() error {
		defer close(paths)
		return filepath.Walk(source,
			func(path string, fInfo os.FileInfo, err error) error {
				if err != nil {
					return err
				}
				if fInfo.Name() == ".git" && fInfo.IsDir() {
					return filepath.SkipDir
				}
				if fInfo.Size() == 0 {
					return nil
				}
				if fInfo.Mode().IsRegular() {
					paths <- scanTarget{
						Path:    path,
						Symlink: "",
					}
				}
				if fInfo.Mode().Type() == fs.ModeSymlink && d.FollowSymlinks {
					realPath, err := filepath.EvalSymlinks(path)
					if err != nil {
						return err
					}
					realPathFileInfo, err := os.Stat(realPath)
					if err != nil {
						return err
					}
					if realPathFileInfo.IsDir() {
						log.Debug().Msgf("found symlinked directory: %s -> %s [skipping]", path, realPath)
						return nil
					}
					paths <- scanTarget{
						Path:    realPath,
						Symlink: path,
					}
				}
				return nil
			})
	})
	for pa := range paths {
		p := pa
		s.Go(func() error {
			b, err := os.ReadFile(p.Path)
			if err != nil {
				return err
			}
			mimetype, err := filetype.Match(b)
			if err != nil {
				return err
			}
			if mimetype.MIME.Type == "application" {
				return nil // skip binary files
			}

			fragment := Fragment{
				Raw:      string(b),
				FilePath: p.Path,
			}
			if p.Symlink != "" {
				fragment.SymlinkFile = p.Symlink
			}
			for _, finding := range d.Detect(fragment) {
				// need to add 1 since line counting starts at 1
				finding.EndLine++
				finding.StartLine++
				d.addFinding(finding)
			}
			return nil
		})
	}
	if err := s.Wait(); err != nil {
		return d.findings, err
	}
	return d.findings, nil
}
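
// Example (library usage): a minimal sketch of a filesystem scan. The path below is a
// placeholder; this comment is illustrative only and is not part of the package API.
//
//	detector, err := NewDetectorDefaultConfig()
//	if err != nil {
//		// handle error
//	}
//	detector.FollowSymlinks = true // optional: also scan symlinked files
//	findings, err := detector.DetectFiles("/path/to/source")
//	if err != nil {
//		// handle error
//	}
//	for _, f := range findings {
//		fmt.Println(f.File, f.RuleID, f.StartLine)
//	}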
// DetectReader accepts an io.Reader and a buffer size for the reader in KB
func (d *Detector) DetectReader(r io.Reader, bufSize int) ([]report.Finding, error) {
	reader := bufio.NewReader(r)
	buf := make([]byte, 0, 1000*bufSize)
	findings := []report.Finding{}

	for {
		n, err := reader.Read(buf[:cap(buf)])
		buf = buf[:n]
		if err != nil {
			if err != io.EOF {
				return findings, err
			}
			break
		}

		fragment := Fragment{
			Raw: string(buf),
		}
		for _, finding := range d.Detect(fragment) {
			findings = append(findings, finding)
			if d.Verbose {
				printFinding(finding)
			}
		}
	}

	return findings, nil
}
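
// Example (library usage): a minimal sketch of scanning a stream such as stdin with a
// 10 KB read buffer. This comment is illustrative only and is not part of the package API.
//
//	detector, err := NewDetectorDefaultConfig()
//	if err != nil {
//		// handle error
//	}
//	findings, err := detector.DetectReader(os.Stdin, 10)
//	if err != nil {
//		// handle error
//	}
//	fmt.Printf("found %d leaks on stdin\n", len(findings))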
// Detect scans the given fragment and returns a list of findings
func (d *Detector) Detect(fragment Fragment) []report.Finding {
	var findings []report.Finding

	// initialize fragment keywords
	fragment.keywords = make(map[string]bool)

	// check if filepath is allowed
	if fragment.FilePath != "" && (d.Config.Allowlist.PathAllowed(fragment.FilePath) ||
		fragment.FilePath == d.Config.Path || (d.baselinePath != "" && fragment.FilePath == d.baselinePath)) {
		return findings
	}

	// add newline indices for location calculation in detectRule
	fragment.newlineIndices = regexp.MustCompile("\n").FindAllStringIndex(fragment.Raw, -1)

	// build keyword map for prefiltering rules
	normalizedRaw := strings.ToLower(fragment.Raw)
	matches := d.prefilter.FindAll(normalizedRaw)
	for _, m := range matches {
		fragment.keywords[normalizedRaw[m.Start():m.End()]] = true
	}

	for _, rule := range d.Config.Rules {
		if len(rule.Keywords) == 0 {
			// if no keywords are associated with the rule, always scan the
			// fragment using the rule
			findings = append(findings, d.detectRule(fragment, rule)...)
			continue
		}

		fragmentContainsKeyword := false
		// check if keywords are in the fragment
		for _, k := range rule.Keywords {
			if _, ok := fragment.keywords[strings.ToLower(k)]; ok {
				fragmentContainsKeyword = true
			}
		}
		if fragmentContainsKeyword {
			findings = append(findings, d.detectRule(fragment, rule)...)
		}
	}

	return filter(findings, d.Redact)
}
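
// Example (library usage): a minimal sketch of scanning an in-memory fragment directly
// with Detect, attaching a file path so path-based rules and allowlists apply. The
// values are placeholders; this comment is illustrative only and is not part of the
// package API.
//
//	detector, err := NewDetectorDefaultConfig()
//	if err != nil {
//		// handle error
//	}
//	findings := detector.Detect(Fragment{
//		Raw:      `export STRIPE_API_KEY="sk_live_placeholder"`,
//		FilePath: "config/.env",
//	})
//	for _, f := range findings {
//		fmt.Println(f.RuleID, f.StartLine, f.Secret)
//	}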
// addFinding synchronously adds a finding to the findings slice
func (d *Detector) addFinding(finding report.Finding) {
	if finding.Commit == "" {
		finding.Fingerprint = fmt.Sprintf("%s:%s:%d", finding.File, finding.RuleID, finding.StartLine)
	} else {
		finding.Fingerprint = fmt.Sprintf("%s:%s:%s:%d", finding.Commit, finding.File, finding.RuleID, finding.StartLine)
	}

	// check if we should ignore this finding
	if _, ok := d.gitleaksIgnore[finding.Fingerprint]; ok {
		log.Debug().Msgf("ignoring finding with Fingerprint %s",
			finding.Fingerprint)
		return
	}

	if d.baseline != nil && !IsNew(finding, d.baseline) {
		log.Debug().Msgf("baseline duplicate -- ignoring finding with Fingerprint %s", finding.Fingerprint)
		return
	}

	d.findingMutex.Lock()
	d.findings = append(d.findings, finding)
	if d.Verbose {
		printFinding(finding)
	}
	d.findingMutex.Unlock()
}
// addCommit adds a commit to the commit map, which is used to track
// (and log) how many commits have been scanned
func (d *Detector) addCommit(commit string) {
	d.commitMap[commit] = true
}