detect.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442
  1. package detect
  2. import (
  3. "context"
  4. "fmt"
  5. "os"
  6. "path/filepath"
  7. "regexp"
  8. "strings"
  9. "sync"
  10. "github.com/zricethezav/gitleaks/v8/config"
  11. "github.com/zricethezav/gitleaks/v8/detect/git"
  12. "github.com/zricethezav/gitleaks/v8/report"
  13. "github.com/fatih/semgroup"
  14. "github.com/gitleaks/go-gitdiff/gitdiff"
  15. "github.com/h2non/filetype"
  16. ahocorasick "github.com/petar-dambovaliev/aho-corasick"
  17. "github.com/rs/zerolog/log"
  18. "github.com/spf13/viper"
  19. )
  20. // Type used to differentiate between git scan types:
  21. // $ gitleaks detect
  22. // $ gitleaks protect
  23. // $ gitleaks protect staged
  24. type GitScanType int
  25. const (
  26. DetectType GitScanType = iota
  27. ProtectType
  28. ProtectStagedType
  29. gitleaksAllowSignature = "gitleaks:allow"
  30. )
// Detector is the main detector struct
type Detector struct {
	// Config is the configuration for the detector
	Config config.Config
	// Redact is a flag to redact findings. This is exported
	// so users using gitleaks as a library can set this flag
	// without calling `detector.Start(cmd *cobra.Command)`
	Redact bool
	// Verbose is a flag to print findings as they are added
	Verbose bool
	// commitMap is used to keep track of commits that have been scanned.
	// This is only used for logging purposes and git scans.
	// NOTE(review): written without a lock in addCommit — safe only from a
	// single goroutine; confirm before adding concurrent callers.
	commitMap map[string]bool
	// findingMutex is to prevent concurrent access to the
	// findings slice when adding findings (Detect runs from multiple
	// goroutines during git and file scans).
	findingMutex *sync.Mutex
	// findings is a slice of report.Findings. This is the result
	// of the detector's scan which can then be used to generate a
	// report.
	findings []report.Finding
	// prefilter is a ahocorasick struct used for doing efficient string
	// matching given a set of words (keywords from the rules in the config)
	prefilter ahocorasick.AhoCorasick
}
// Fragment contains the data to be scanned
type Fragment struct {
	// Raw is the raw content of the fragment
	Raw string
	// FilePath is the path to the file if applicable
	FilePath string
	// CommitSHA is the SHA of the commit if applicable
	CommitSHA string
	// newlineIndices is a list of [start, end) index pairs of newlines in
	// the raw content. This is used to calculate the line location of a
	// finding
	newlineIndices [][]int
	// keywords is a map of all the (lowercased) keywords contained within
	// the contents of this fragment, populated by Detect's prefilter pass
	keywords map[string]bool
}
  70. // NewDetector creates a new detector with the given config
  71. func NewDetector(cfg config.Config) *Detector {
  72. builder := ahocorasick.NewAhoCorasickBuilder(ahocorasick.Opts{
  73. AsciiCaseInsensitive: true,
  74. MatchOnlyWholeWords: false,
  75. MatchKind: ahocorasick.LeftMostLongestMatch,
  76. DFA: true,
  77. })
  78. return &Detector{
  79. commitMap: make(map[string]bool),
  80. findingMutex: &sync.Mutex{},
  81. findings: make([]report.Finding, 0),
  82. Config: cfg,
  83. prefilter: builder.Build(cfg.Keywords),
  84. }
  85. }
  86. // NewDetectorDefaultConfig creates a new detector with the default config
  87. func NewDetectorDefaultConfig() (*Detector, error) {
  88. viper.SetConfigType("toml")
  89. err := viper.ReadConfig(strings.NewReader(config.DefaultConfig))
  90. if err != nil {
  91. return nil, err
  92. }
  93. var vc config.ViperConfig
  94. err = viper.Unmarshal(&vc)
  95. if err != nil {
  96. return nil, err
  97. }
  98. cfg, err := vc.Translate()
  99. if err != nil {
  100. return nil, err
  101. }
  102. return NewDetector(cfg), nil
  103. }
  104. // DetectBytes scans the given bytes and returns a list of findings
  105. func (d *Detector) DetectBytes(content []byte) []report.Finding {
  106. return d.DetectString(string(content))
  107. }
  108. // DetectString scans the given string and returns a list of findings
  109. func (d *Detector) DetectString(content string) []report.Finding {
  110. return d.Detect(Fragment{
  111. Raw: content,
  112. })
  113. }
// detectRule scans the given fragment against a single rule and returns any
// findings. It handles three rule shapes — path-only, path+regex, and
// regex-only — and then runs each regex match through an ordered filter
// pipeline: inline allow signature, allowlist regexes, secret-group
// extraction, stopwords, and the entropy threshold. The order of these
// checks is significant (e.g. allowlist regexes are tested against the full
// match, before group extraction narrows the secret).
func (d *Detector) detectRule(fragment Fragment, rule *config.Rule) []report.Finding {
	var findings []report.Finding

	// check if filepath or commit is allowed for this rule
	if rule.Allowlist.CommitAllowed(fragment.CommitSHA) ||
		rule.Allowlist.PathAllowed(fragment.FilePath) {
		return findings
	}

	if rule.Path != nil && rule.Regex == nil {
		// Path _only_ rule: a matching path is itself the finding; there is
		// no secret to extract.
		if rule.Path.Match([]byte(fragment.FilePath)) {
			finding := report.Finding{
				Description: rule.Description,
				File:        fragment.FilePath,
				RuleID:      rule.RuleID,
				Match:       fmt.Sprintf("file detected: %s", fragment.FilePath),
				Tags:        rule.Tags,
			}
			return append(findings, finding)
		}
	} else if rule.Path != nil {
		// if path is set _and_ a regex is set, then we need to check both
		// so if the path does not match, then we should return early and not
		// consider the regex
		if !rule.Path.Match([]byte(fragment.FilePath)) {
			return findings
		}
	}

	// if path only rule, skip content checks
	if rule.Regex == nil {
		return findings
	}

	matchIndices := rule.Regex.FindAllStringIndex(fragment.Raw, -1)
	for _, matchIndex := range matchIndices {
		// extract secret from match (surrounding newlines stripped)
		secret := strings.Trim(fragment.Raw[matchIndex[0]:matchIndex[1]], "\n")

		// determine location of match. Note that the location
		// in the finding will be the line/column numbers of the _match_
		// not the _secret_, which will be different if the secretGroup
		// value is set for this rule
		loc := location(fragment, matchIndex)

		finding := report.Finding{
			Description: rule.Description,
			File:        fragment.FilePath,
			RuleID:      rule.RuleID,
			StartLine:   loc.startLine,
			EndLine:     loc.endLine,
			StartColumn: loc.startColumn,
			EndColumn:   loc.endColumn,
			Secret:      secret,
			Match:       secret,
			Tags:        rule.Tags,
		}

		// an inline "gitleaks:allow" comment anywhere on the matched
		// line(s) suppresses the finding
		if strings.Contains(fragment.Raw[loc.startLineIndex:loc.endLineIndex],
			gitleaksAllowSignature) {
			continue
		}

		// check if the secret is in the allowlist (rule-level or global);
		// note this tests the full match, before group extraction below
		if rule.Allowlist.RegexAllowed(finding.Secret) ||
			d.Config.Allowlist.RegexAllowed(finding.Secret) {
			continue
		}

		// extract secret from secret group if set; the regex is re-run on
		// the trimmed match to obtain capture groups
		if rule.SecretGroup != 0 {
			groups := rule.Regex.FindStringSubmatch(secret)
			if len(groups) <= rule.SecretGroup || len(groups) == 0 {
				// Config validation should prevent this
				continue
			}
			secret = groups[rule.SecretGroup]
			finding.Secret = secret
		}

		// check if the secret is in the list of stopwords
		if rule.Allowlist.ContainsStopWord(finding.Secret) ||
			d.Config.Allowlist.ContainsStopWord(finding.Secret) {
			continue
		}

		// check entropy; Entropy is recorded on the finding regardless of
		// whether the rule enforces a threshold
		entropy := shannonEntropy(finding.Secret)
		finding.Entropy = float32(entropy)
		if rule.Entropy != 0.0 {
			if entropy <= rule.Entropy {
				// entropy is too low, skip this finding
				continue
			}
			// NOTE: this is a goofy hack to get around the fact there golang's regex engine
			// does not support positive lookaheads. Ideally we would want to add a
			// restriction on generic rules regex that requires the secret match group
			// contains both numbers and alphabetical characters, not just alphabetical characters.
			// What this bit of code does is check if the ruleid is prepended with "generic" and enforces the
			// secret contains both digits and alphabetical characters.
			// TODO: this should be replaced with stop words
			if strings.HasPrefix(rule.RuleID, "generic") {
				if !containsDigit(secret) {
					continue
				}
			}
		}
		findings = append(findings, finding)
	}
	return findings
}
  216. // GitScan accepts a *gitdiff.File channel which contents a git history generated from
  217. // the output of `git log -p ...`. startGitScan will look at each file (patch) in the history
  218. // and determine if the patch contains any findings.
  219. func (d *Detector) DetectGit(source string, logOpts string, gitScanType GitScanType) ([]report.Finding, error) {
  220. var (
  221. gitdiffFiles <-chan *gitdiff.File
  222. err error
  223. )
  224. switch gitScanType {
  225. case DetectType:
  226. gitdiffFiles, err = git.GitLog(source, logOpts)
  227. if err != nil {
  228. return d.findings, err
  229. }
  230. case ProtectType:
  231. gitdiffFiles, err = git.GitDiff(source, false)
  232. if err != nil {
  233. return d.findings, err
  234. }
  235. case ProtectStagedType:
  236. gitdiffFiles, err = git.GitDiff(source, true)
  237. if err != nil {
  238. return d.findings, err
  239. }
  240. }
  241. s := semgroup.NewGroup(context.Background(), 4)
  242. for gitdiffFile := range gitdiffFiles {
  243. gitdiffFile := gitdiffFile
  244. // skip binary files
  245. if gitdiffFile.IsBinary || gitdiffFile.IsDelete {
  246. continue
  247. }
  248. // Check if commit is allowed
  249. commitSHA := ""
  250. if gitdiffFile.PatchHeader != nil {
  251. commitSHA = gitdiffFile.PatchHeader.SHA
  252. if d.Config.Allowlist.CommitAllowed(gitdiffFile.PatchHeader.SHA) {
  253. continue
  254. }
  255. }
  256. d.addCommit(commitSHA)
  257. s.Go(func() error {
  258. for _, textFragment := range gitdiffFile.TextFragments {
  259. if textFragment == nil {
  260. return nil
  261. }
  262. fragment := Fragment{
  263. Raw: textFragment.Raw(gitdiff.OpAdd),
  264. CommitSHA: commitSHA,
  265. FilePath: gitdiffFile.NewName,
  266. }
  267. for _, finding := range d.Detect(fragment) {
  268. d.addFinding(augmentGitFinding(finding, textFragment, gitdiffFile))
  269. }
  270. }
  271. return nil
  272. })
  273. }
  274. if err := s.Wait(); err != nil {
  275. return d.findings, err
  276. }
  277. log.Debug().Msgf("%d commits scanned. Note: this number might be smaller than expected due to commits with no additions", len(d.commitMap))
  278. return d.findings, nil
  279. }
  280. // DetectFiles accepts a path to a source directory or file and begins a scan of the
  281. // file or directory.
  282. func (d *Detector) DetectFiles(source string) ([]report.Finding, error) {
  283. s := semgroup.NewGroup(context.Background(), 4)
  284. paths := make(chan string)
  285. s.Go(func() error {
  286. defer close(paths)
  287. return filepath.Walk(source,
  288. func(path string, fInfo os.FileInfo, err error) error {
  289. if err != nil {
  290. return err
  291. }
  292. if fInfo.Name() == ".git" && fInfo.IsDir() {
  293. return filepath.SkipDir
  294. }
  295. if fInfo.Mode().IsRegular() {
  296. paths <- path
  297. }
  298. return nil
  299. })
  300. })
  301. for pa := range paths {
  302. p := pa
  303. s.Go(func() error {
  304. b, err := os.ReadFile(p)
  305. if err != nil {
  306. return err
  307. }
  308. mimetype, err := filetype.Match(b)
  309. if err != nil {
  310. return err
  311. }
  312. if mimetype.MIME.Type == "application" {
  313. return nil // skip binary files
  314. }
  315. fragment := Fragment{
  316. Raw: string(b),
  317. FilePath: p,
  318. }
  319. for _, finding := range d.Detect(fragment) {
  320. // need to add 1 since line counting starts at 1
  321. finding.EndLine++
  322. finding.StartLine++
  323. d.addFinding(finding)
  324. }
  325. return nil
  326. })
  327. }
  328. if err := s.Wait(); err != nil {
  329. return d.findings, err
  330. }
  331. return d.findings, nil
  332. }
  333. // Detect scans the given fragment and returns a list of findings
  334. func (d *Detector) Detect(fragment Fragment) []report.Finding {
  335. var findings []report.Finding
  336. // initiate fragment keywords
  337. fragment.keywords = make(map[string]bool)
  338. // check if filepath is allowed
  339. if fragment.FilePath != "" && (d.Config.Allowlist.PathAllowed(fragment.FilePath) ||
  340. fragment.FilePath == d.Config.Path) {
  341. return findings
  342. }
  343. // add newline indices for location calculation in detectRule
  344. fragment.newlineIndices = regexp.MustCompile("\n").FindAllStringIndex(fragment.Raw, -1)
  345. // build keyword map for prefiltering rules
  346. normalizedRaw := strings.ToLower(fragment.Raw)
  347. matches := d.prefilter.FindAll(normalizedRaw)
  348. for _, m := range matches {
  349. fragment.keywords[normalizedRaw[m.Start():m.End()]] = true
  350. }
  351. for _, rule := range d.Config.Rules {
  352. if len(rule.Keywords) == 0 {
  353. // if not keywords are associated with the rule always scan the
  354. // fragment using the rule
  355. findings = append(findings, d.detectRule(fragment, rule)...)
  356. continue
  357. }
  358. fragmentContainsKeyword := false
  359. // check if keywords are in the fragment
  360. for _, k := range rule.Keywords {
  361. if _, ok := fragment.keywords[strings.ToLower(k)]; ok {
  362. fragmentContainsKeyword = true
  363. }
  364. }
  365. if fragmentContainsKeyword {
  366. findings = append(findings, d.detectRule(fragment, rule)...)
  367. }
  368. }
  369. return filter(findings, d.Redact)
  370. }
  371. // addFinding synchronously adds a finding to the findings slice
  372. func (d *Detector) addFinding(finding report.Finding) {
  373. d.findingMutex.Lock()
  374. d.findings = append(d.findings, finding)
  375. if d.Verbose {
  376. printFinding(finding)
  377. }
  378. d.findingMutex.Unlock()
  379. }
// addCommit records a commit SHA as scanned; the commitMap is used only to
// log the number of scanned commits after a git scan.
//
// NOTE(review): despite the original "synchronously" wording, this map write
// is not mutex-guarded. It appears safe because DetectGit calls it from its
// single dispatch loop, not from the worker goroutines — confirm before
// calling it from concurrent code.
func (d *Detector) addCommit(commit string) {
	d.commitMap[commit] = true
}