detect.go

package detect

import (
	"bufio"
	"context"
	"fmt"
	"io/fs"
	"os"
	"path/filepath"
	"regexp"
	"strings"
	"sync"

	"github.com/zricethezav/gitleaks/v8/config"
	"github.com/zricethezav/gitleaks/v8/detect/git"
	"github.com/zricethezav/gitleaks/v8/report"

	"github.com/fatih/semgroup"
	"github.com/gitleaks/go-gitdiff/gitdiff"
	"github.com/h2non/filetype"
	ahocorasick "github.com/petar-dambovaliev/aho-corasick"
	"github.com/rs/zerolog/log"
	"github.com/spf13/viper"
)
// GitScanType is used to differentiate between git scan types:
// $ gitleaks detect
// $ gitleaks protect
// $ gitleaks protect staged
type GitScanType int

const (
	DetectType GitScanType = iota
	ProtectType
	ProtectStagedType

	gitleaksAllowSignature = "gitleaks:allow"
)
// Detector is the main detector struct
type Detector struct {
	// Config is the configuration for the detector
	Config config.Config

	// Redact is a flag to redact findings. This is exported
	// so users using gitleaks as a library can set this flag
	// without calling `detector.Start(cmd *cobra.Command)`
	Redact bool

	// Verbose is a flag to print findings
	Verbose bool

	// MaxTargetMegaBytes is a size limit; files larger than this will be skipped
	MaxTargetMegaBytes int

	// FollowSymlinks is a flag to enable scanning symlink files
	FollowSymlinks bool

	// commitMap is used to keep track of commits that have been scanned.
	// This is only used for logging purposes and git scans.
	commitMap map[string]bool

	// findingMutex is to prevent concurrent access to the
	// findings slice when adding findings.
	findingMutex *sync.Mutex

	// findings is a slice of report.Findings. This is the result
	// of the detector's scan which can then be used to generate a
	// report.
	findings []report.Finding

	// prefilter is an Aho-Corasick automaton used for efficient string
	// matching against a set of words (the keywords from the rules in the config)
	prefilter ahocorasick.AhoCorasick

	// baseline is a list of known findings that should be ignored
	baseline []report.Finding

	// baselinePath is the path to the baseline file
	baselinePath string

	// gitleaksIgnore is a set of finding fingerprints to ignore,
	// loaded from a .gitleaksignore file
	gitleaksIgnore map[string]bool
}
// Fragment contains the data to be scanned
type Fragment struct {
	// Raw is the raw content of the fragment
	Raw string

	// FilePath is the path to the file if applicable
	FilePath    string
	SymlinkFile string

	// CommitSHA is the SHA of the commit if applicable
	CommitSHA string

	// newlineIndices is a list of indices of newlines in the raw content.
	// This is used to calculate the line location of a finding
	newlineIndices [][]int

	// keywords is a map of all the keywords contained within the contents
	// of this fragment
	keywords map[string]bool
}
// NewDetector creates a new detector with the given config
func NewDetector(cfg config.Config) *Detector {
	builder := ahocorasick.NewAhoCorasickBuilder(ahocorasick.Opts{
		AsciiCaseInsensitive: true,
		MatchOnlyWholeWords:  false,
		MatchKind:            ahocorasick.LeftMostLongestMatch,
		DFA:                  true,
	})

	return &Detector{
		commitMap:      make(map[string]bool),
		gitleaksIgnore: make(map[string]bool),
		findingMutex:   &sync.Mutex{},
		findings:       make([]report.Finding, 0),
		Config:         cfg,
		prefilter:      builder.Build(cfg.Keywords),
	}
}
// NewDetectorDefaultConfig creates a new detector with the default config
func NewDetectorDefaultConfig() (*Detector, error) {
	viper.SetConfigType("toml")
	err := viper.ReadConfig(strings.NewReader(config.DefaultConfig))
	if err != nil {
		return nil, err
	}
	var vc config.ViperConfig
	err = viper.Unmarshal(&vc)
	if err != nil {
		return nil, err
	}
	cfg, err := vc.Translate()
	if err != nil {
		return nil, err
	}
	return NewDetector(cfg), nil
}
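// Usage sketch (editorial, not part of the original file): constructing a detector
// with the default config and scanning an in-memory string via the exported API
// above. The secret value is a placeholder.
//
//	detector, err := NewDetectorDefaultConfig()
//	if err != nil {
//		log.Fatal().Err(err).Msg("could not load default config")
//	}
//	findings := detector.DetectString(`aws_secret_access_key = "AKIA_PLACEHOLDER"`)
//	for _, f := range findings {
//		fmt.Printf("%s: %s\n", f.RuleID, f.Secret)
//	}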
func (d *Detector) AddGitleaksIgnore(gitleaksIgnorePath string) error {
	log.Debug().Msg("found .gitleaksignore file")
	file, err := os.Open(gitleaksIgnorePath)
	if err != nil {
		return err
	}
	defer file.Close()

	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		d.gitleaksIgnore[scanner.Text()] = true
	}
	return nil
}
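// Note (editorial, not in the original source): each line of the .gitleaksignore
// file is treated as an opaque fingerprint and matched against the fingerprints
// built in addFinding below, i.e. "commit:file:ruleID:startLine" for git scans and
// "file:ruleID:startLine" otherwise.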
func (d *Detector) AddBaseline(baselinePath string) error {
	if baselinePath != "" {
		baseline, err := LoadBaseline(baselinePath)
		if err != nil {
			return err
		}
		d.baseline = baseline
	}
	d.baselinePath = baselinePath
	return nil
}
// DetectBytes scans the given bytes and returns a list of findings
func (d *Detector) DetectBytes(content []byte) []report.Finding {
	return d.DetectString(string(content))
}

// DetectString scans the given string and returns a list of findings
func (d *Detector) DetectString(content string) []report.Finding {
	return d.Detect(Fragment{
		Raw: content,
	})
}
// detectRule scans the given fragment for the given rule and returns a list of findings
func (d *Detector) detectRule(fragment Fragment, rule config.Rule) []report.Finding {
	var findings []report.Finding

	// check if filepath or commit is allowed for this rule
	if rule.Allowlist.CommitAllowed(fragment.CommitSHA) ||
		rule.Allowlist.PathAllowed(fragment.FilePath) {
		return findings
	}

	if rule.Path != nil && rule.Regex == nil {
		// Path _only_ rule
		if rule.Path.Match([]byte(fragment.FilePath)) {
			finding := report.Finding{
				Description: rule.Description,
				File:        fragment.FilePath,
				SymlinkFile: fragment.SymlinkFile,
				RuleID:      rule.RuleID,
				Match:       fmt.Sprintf("file detected: %s", fragment.FilePath),
				Tags:        rule.Tags,
			}
			return append(findings, finding)
		}
	} else if rule.Path != nil {
		// if path is set _and_ a regex is set, then we need to check both
		// so if the path does not match, then we should return early and not
		// consider the regex
		if !rule.Path.Match([]byte(fragment.FilePath)) {
			return findings
		}
	}

	// if path only rule, skip content checks
	if rule.Regex == nil {
		return findings
	}
	// if the max target size flag is configured, skip fragments larger than the limit
	if d.MaxTargetMegaBytes > 0 {
		rawLength := len(fragment.Raw) / 1000000
		if rawLength > d.MaxTargetMegaBytes {
			log.Debug().Msgf("skipping file: %s scan due to size: %d", fragment.FilePath, rawLength)
			return findings
		}
	}

	matchIndices := rule.Regex.FindAllStringIndex(fragment.Raw, -1)
	for _, matchIndex := range matchIndices {
		// extract secret from match
		secret := strings.Trim(fragment.Raw[matchIndex[0]:matchIndex[1]], "\n")

		// determine location of match. Note that the location
		// in the finding will be the line/column numbers of the _match_
		// not the _secret_, which will be different if the secretGroup
		// value is set for this rule
		loc := location(fragment, matchIndex)

		if matchIndex[1] > loc.endLineIndex {
			loc.endLineIndex = matchIndex[1]
		}

		finding := report.Finding{
			Description: rule.Description,
			File:        fragment.FilePath,
			SymlinkFile: fragment.SymlinkFile,
			RuleID:      rule.RuleID,
			StartLine:   loc.startLine,
			EndLine:     loc.endLine,
			StartColumn: loc.startColumn,
			EndColumn:   loc.endColumn,
			Secret:      secret,
			Match:       secret,
			Tags:        rule.Tags,
			Line:        fragment.Raw[loc.startLineIndex:loc.endLineIndex],
		}

		if strings.Contains(fragment.Raw[loc.startLineIndex:loc.endLineIndex],
			gitleaksAllowSignature) {
			continue
		}

		// check if the secret is in the allowlist
		if rule.Allowlist.RegexAllowed(finding.Secret) ||
			d.Config.Allowlist.RegexAllowed(finding.Secret) {
			continue
		}

		// extract secret from secret group if set
		if rule.SecretGroup != 0 {
			groups := rule.Regex.FindStringSubmatch(secret)
			if len(groups) <= rule.SecretGroup || len(groups) == 0 {
				// Config validation should prevent this
				continue
			}
			secret = groups[rule.SecretGroup]
			finding.Secret = secret
		}

		// check if the secret is in the list of stopwords
		if rule.Allowlist.ContainsStopWord(finding.Secret) ||
			d.Config.Allowlist.ContainsStopWord(finding.Secret) {
			continue
		}

		// check entropy
		entropy := shannonEntropy(finding.Secret)
		finding.Entropy = float32(entropy)
		if rule.Entropy != 0.0 {
			if entropy <= rule.Entropy {
				// entropy is too low, skip this finding
				continue
			}
			// NOTE: this is a goofy hack to get around the fact that Go's regex engine
			// does not support positive lookaheads. Ideally we would want to add a
			// restriction on generic rules' regexes requiring that the secret match group
			// contains both numbers and alphabetical characters, not just alphabetical characters.
			// What this bit of code does is check if the rule ID is prefixed with "generic" and
			// enforce that the secret contains both digits and alphabetical characters.
			// TODO: this should be replaced with stop words
			if strings.HasPrefix(rule.RuleID, "generic") {
				if !containsDigit(secret) {
					continue
				}
			}
		}

		findings = append(findings, finding)
	}
	return findings
}
// DetectGit accepts a source directory, git log options, and a scan type. It consumes a
// *gitdiff.File channel containing the git history generated from the output of
// `git log -p ...` (or `git diff` for protect scans), looks at each file (patch) in the
// history, and determines if the patch contains any findings.
func (d *Detector) DetectGit(source string, logOpts string, gitScanType GitScanType) ([]report.Finding, error) {
	var (
		gitdiffFiles <-chan *gitdiff.File
		err          error
	)
	switch gitScanType {
	case DetectType:
		gitdiffFiles, err = git.GitLog(source, logOpts)
		if err != nil {
			return d.findings, err
		}
	case ProtectType:
		gitdiffFiles, err = git.GitDiff(source, false)
		if err != nil {
			return d.findings, err
		}
	case ProtectStagedType:
		gitdiffFiles, err = git.GitDiff(source, true)
		if err != nil {
			return d.findings, err
		}
	}

	s := semgroup.NewGroup(context.Background(), 4)
	for gitdiffFile := range gitdiffFiles {
		gitdiffFile := gitdiffFile

		// skip binary files
		if gitdiffFile.IsBinary || gitdiffFile.IsDelete {
			continue
		}

		// Check if commit is allowed
		commitSHA := ""
		if gitdiffFile.PatchHeader != nil {
			commitSHA = gitdiffFile.PatchHeader.SHA
			if d.Config.Allowlist.CommitAllowed(gitdiffFile.PatchHeader.SHA) {
				continue
			}
		}
		d.addCommit(commitSHA)

		s.Go(func() error {
			for _, textFragment := range gitdiffFile.TextFragments {
				if textFragment == nil {
					return nil
				}

				fragment := Fragment{
					Raw:       textFragment.Raw(gitdiff.OpAdd),
					CommitSHA: commitSHA,
					FilePath:  gitdiffFile.NewName,
				}

				for _, finding := range d.Detect(fragment) {
					d.addFinding(augmentGitFinding(finding, textFragment, gitdiffFile))
				}
			}
			return nil
		})
	}

	if err := s.Wait(); err != nil {
		return d.findings, err
	}
	log.Info().Msgf("%d commits scanned.", len(d.commitMap))
	log.Debug().Msg("Note: this number might be smaller than expected due to commits with no additions")
	if git.ErrEncountered {
		return d.findings, fmt.Errorf("%s", "git error encountered, see logs")
	}
	return d.findings, nil
}
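// Usage sketch (editorial, not in the original file): scanning the full history of a
// repository the way `gitleaks detect` does. The repository path and the empty log
// options below are placeholders.
//
//	detector, _ := NewDetectorDefaultConfig()
//	findings, err := detector.DetectGit("/path/to/repo", "", DetectType)
//	if err != nil {
//		log.Error().Err(err).Msg("git scan failed")
//	}
//	log.Info().Msgf("found %d leaks", len(findings))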
type scanTarget struct {
	Path    string
	Symlink string
}

// DetectFiles accepts a path to a source directory or file and begins a scan of the
// file or directory.
func (d *Detector) DetectFiles(source string) ([]report.Finding, error) {
	s := semgroup.NewGroup(context.Background(), 4)
	paths := make(chan scanTarget)
	s.Go(func() error {
		defer close(paths)
		return filepath.Walk(source,
			func(path string, fInfo os.FileInfo, err error) error {
				if err != nil {
					return err
				}
				if fInfo.Name() == ".git" && fInfo.IsDir() {
					return filepath.SkipDir
				}
				if fInfo.Size() == 0 {
					return nil
				}
				if fInfo.Mode().IsRegular() {
					paths <- scanTarget{
						Path:    path,
						Symlink: "",
					}
				}
				if fInfo.Mode().Type() == fs.ModeSymlink && d.FollowSymlinks {
					realPath, err := filepath.EvalSymlinks(path)
					if err != nil {
						return err
					}
					realPathFileInfo, _ := os.Stat(realPath)
					if realPathFileInfo.IsDir() {
						log.Debug().Msgf("found symlinked directory: %s -> %s [skipping]", path, realPath)
						return nil
					}
					paths <- scanTarget{
						Path:    realPath,
						Symlink: path,
					}
				}
				return nil
			})
	})

	for pa := range paths {
		p := pa
		s.Go(func() error {
			b, err := os.ReadFile(p.Path)
			if err != nil {
				return err
			}

			mimetype, err := filetype.Match(b)
			if err != nil {
				return err
			}
			if mimetype.MIME.Type == "application" {
				return nil // skip binary files
			}

			fragment := Fragment{
				Raw:      string(b),
				FilePath: p.Path,
			}
			if p.Symlink != "" {
				fragment.SymlinkFile = p.Symlink
			}
			for _, finding := range d.Detect(fragment) {
				// need to add 1 since line counting starts at 1
				finding.EndLine++
				finding.StartLine++
				d.addFinding(finding)
			}
			return nil
		})
	}

	if err := s.Wait(); err != nil {
		return d.findings, err
	}

	return d.findings, nil
}
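// Usage sketch (editorial, not in the original file): scanning a directory on disk
// the way `gitleaks detect --no-git` does. The source path is a placeholder.
//
//	detector, _ := NewDetectorDefaultConfig()
//	detector.FollowSymlinks = true
//	findings, err := detector.DetectFiles("/path/to/source")
//	if err != nil {
//		log.Error().Err(err).Msg("file scan failed")
//	}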
// Detect scans the given fragment and returns a list of findings
func (d *Detector) Detect(fragment Fragment) []report.Finding {
	var findings []report.Finding

	// initiate fragment keywords
	fragment.keywords = make(map[string]bool)

	// check if filepath is allowed
	if fragment.FilePath != "" && (d.Config.Allowlist.PathAllowed(fragment.FilePath) ||
		fragment.FilePath == d.Config.Path || (d.baselinePath != "" && fragment.FilePath == d.baselinePath)) {
		return findings
	}

	// add newline indices for location calculation in detectRule
	fragment.newlineIndices = regexp.MustCompile("\n").FindAllStringIndex(fragment.Raw, -1)

	// build keyword map for prefiltering rules
	normalizedRaw := strings.ToLower(fragment.Raw)
	matches := d.prefilter.FindAll(normalizedRaw)
	for _, m := range matches {
		fragment.keywords[normalizedRaw[m.Start():m.End()]] = true
	}
	for _, rule := range d.Config.Rules {
		if len(rule.Keywords) == 0 {
			// if no keywords are associated with the rule, always scan the
			// fragment using the rule
			findings = append(findings, d.detectRule(fragment, rule)...)
			continue
		}
		fragmentContainsKeyword := false
		// check if keywords are in the fragment
		for _, k := range rule.Keywords {
			if _, ok := fragment.keywords[strings.ToLower(k)]; ok {
				fragmentContainsKeyword = true
			}
		}
		if fragmentContainsKeyword {
			findings = append(findings, d.detectRule(fragment, rule)...)
		}
	}

	return filter(findings, d.Redact)
}
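// Usage sketch (editorial, not in the original file): calling Detect directly with a
// hand-built Fragment so that path-based rules and path allowlists apply. The file
// name and contents are placeholders.
//
//	fragment := Fragment{
//		Raw:      "export STRIPE_KEY=sk_live_placeholder",
//		FilePath: "config/.env",
//	}
//	for _, f := range detector.Detect(fragment) {
//		fmt.Printf("%s at line %d\n", f.RuleID, f.StartLine)
//	}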
// addFinding synchronously adds a finding to the findings slice
func (d *Detector) addFinding(finding report.Finding) {
	if finding.Commit == "" {
		finding.Fingerprint = fmt.Sprintf("%s:%s:%d", finding.File, finding.RuleID, finding.StartLine)
	} else {
		finding.Fingerprint = fmt.Sprintf("%s:%s:%s:%d", finding.Commit, finding.File, finding.RuleID, finding.StartLine)
	}
	// check if we should ignore this finding
	if _, ok := d.gitleaksIgnore[finding.Fingerprint]; ok {
		log.Debug().Msgf("ignoring finding with Fingerprint %s",
			finding.Fingerprint)
		return
	}

	if d.baseline != nil && !IsNew(finding, d.baseline) {
		log.Debug().Msgf("baseline duplicate -- ignoring finding with Fingerprint %s", finding.Fingerprint)
		return
	}

	d.findingMutex.Lock()
	d.findings = append(d.findings, finding)
	if d.Verbose {
		printFinding(finding)
	}
	d.findingMutex.Unlock()
}
// addCommit adds a commit to the commit map
func (d *Detector) addCommit(commit string) {
	d.commitMap[commit] = true
}