detect.go

package detect

import (
	"context"
	"fmt"
	"os"
	"path/filepath"
	"regexp"
	"strings"
	"sync"

	"github.com/zricethezav/gitleaks/v8/config"
	"github.com/zricethezav/gitleaks/v8/detect/git"
	"github.com/zricethezav/gitleaks/v8/report"

	"github.com/fatih/semgroup"
	"github.com/gitleaks/go-gitdiff/gitdiff"
	"github.com/h2non/filetype"
	ahocorasick "github.com/petar-dambovaliev/aho-corasick"
	"github.com/rs/zerolog/log"
	"github.com/spf13/viper"
)

// GitScanType is used to differentiate between git scan types:
// $ gitleaks detect
// $ gitleaks protect
// $ gitleaks protect staged
type GitScanType int

const (
	DetectType GitScanType = iota
	ProtectType
	ProtectStagedType

	gitleaksAllowSignature = "gitleaks:allow"
)

// Detector is the main detector struct
type Detector struct {
	// Config is the configuration for the detector
	Config config.Config

	// Redact is a flag to redact findings. This is exported
	// so users using gitleaks as a library can set this flag
	// without calling `detector.Start(cmd *cobra.Command)`
	Redact bool

	// Verbose is a flag to print findings
	Verbose bool

	// commitMap is used to keep track of commits that have been scanned.
	// This is only used for logging purposes and git scans.
	commitMap map[string]bool

	// findingMutex is to prevent concurrent access to the
	// findings slice when adding findings.
	findingMutex *sync.Mutex

	// findings is a slice of report.Findings. This is the result
	// of the detector's scan which can then be used to generate a
	// report.
	findings []report.Finding

	// prefilter is an ahocorasick struct used for efficient string
	// matching given a set of words (the keywords from the rules in the config)
	prefilter ahocorasick.AhoCorasick
}

// Fragment contains the data to be scanned
type Fragment struct {
	// Raw is the raw content of the fragment
	Raw string

	// FilePath is the path to the file, if applicable
	FilePath string

	// CommitSHA is the SHA of the commit, if applicable
	CommitSHA string

	// newlineIndices is a list of indices of newlines in the raw content.
	// This is used to calculate the line location of a finding
	newlineIndices [][]int

	// keywords is a map of all the keywords contained within the contents
	// of this fragment
	keywords map[string]bool
}

// NewDetector creates a new detector with the given config
func NewDetector(cfg config.Config) *Detector {
	builder := ahocorasick.NewAhoCorasickBuilder(ahocorasick.Opts{
		AsciiCaseInsensitive: true,
		MatchOnlyWholeWords:  false,
		MatchKind:            ahocorasick.LeftMostLongestMatch,
		DFA:                  true,
	})

	return &Detector{
		commitMap:    make(map[string]bool),
		findingMutex: &sync.Mutex{},
		findings:     make([]report.Finding, 0),
		Config:       cfg,
		prefilter:    builder.Build(cfg.Keywords),
	}
}
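
// exampleNewDetectorFromFile is a minimal sketch, not part of the original source,
// showing how a library user might build a Detector from a gitleaks config file on
// disk. The function name and configPath argument are hypothetical; the viper and
// Translate calls mirror the ones used in NewDetectorDefaultConfig below.
func exampleNewDetectorFromFile(configPath string) (*Detector, error) {
	viper.SetConfigFile(configPath)
	if err := viper.ReadInConfig(); err != nil {
		return nil, err
	}
	var vc config.ViperConfig
	if err := viper.Unmarshal(&vc); err != nil {
		return nil, err
	}
	cfg, err := vc.Translate()
	if err != nil {
		return nil, err
	}
	return NewDetector(cfg), nil
}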
// NewDetectorDefaultConfig creates a new detector with the default config
func NewDetectorDefaultConfig() (*Detector, error) {
	viper.SetConfigType("toml")
	err := viper.ReadConfig(strings.NewReader(config.DefaultConfig))
	if err != nil {
		return nil, err
	}
	var vc config.ViperConfig
	err = viper.Unmarshal(&vc)
	if err != nil {
		return nil, err
	}
	cfg, err := vc.Translate()
	if err != nil {
		return nil, err
	}
	return NewDetector(cfg), nil
}
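
// exampleDetectString is a minimal sketch, not part of the original source, showing
// how the default-config detector might be used to scan an in-memory string. The
// function name and the sample content are hypothetical placeholders.
func exampleDetectString() ([]report.Finding, error) {
	detector, err := NewDetectorDefaultConfig()
	if err != nil {
		return nil, err
	}
	content := `aws_access_key_id = "placeholder-not-a-real-key"` // placeholder content
	return detector.DetectString(content), nil
}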
// DetectBytes scans the given bytes and returns a list of findings
func (d *Detector) DetectBytes(content []byte) []report.Finding {
	return d.DetectString(string(content))
}

// DetectString scans the given string and returns a list of findings
func (d *Detector) DetectString(content string) []report.Finding {
	return d.Detect(Fragment{
		Raw: content,
	})
}
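
// exampleDetectFileBytes is a minimal sketch, not part of the original source,
// showing DetectBytes applied to the contents of a single file; the function name
// and path argument are hypothetical. For whole directory trees, see DetectFiles.
func exampleDetectFileBytes(d *Detector, path string) ([]report.Finding, error) {
	b, err := os.ReadFile(path)
	if err != nil {
		return nil, err
	}
	return d.DetectBytes(b), nil
}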
// detectRule scans the given fragment for the given rule and returns a list of findings
func (d *Detector) detectRule(fragment Fragment, rule *config.Rule) []report.Finding {
	var findings []report.Finding

	// check if the filepath or commit is allowed for this rule
	if rule.Allowlist.CommitAllowed(fragment.CommitSHA) ||
		rule.Allowlist.PathAllowed(fragment.FilePath) {
		return findings
	}

	if rule.Path != nil && rule.Regex == nil {
		// Path _only_ rule
		if rule.Path.Match([]byte(fragment.FilePath)) {
			finding := report.Finding{
				Description: rule.Description,
				File:        fragment.FilePath,
				RuleID:      rule.RuleID,
				Match:       fmt.Sprintf("file detected: %s", fragment.FilePath),
				Tags:        rule.Tags,
			}
			return append(findings, finding)
		}
	} else if rule.Path != nil {
		// if both a path and a regex are set, then we need to check both,
		// so if the path does not match we return early and never
		// consider the regex
		if !rule.Path.Match([]byte(fragment.FilePath)) {
			return findings
		}
	}

	// if this is a path-only rule, skip content checks
	if rule.Regex == nil {
		return findings
	}

	matchIndices := rule.Regex.FindAllStringIndex(fragment.Raw, -1)
	for _, matchIndex := range matchIndices {
		// extract the secret from the match
		secret := strings.Trim(fragment.Raw[matchIndex[0]:matchIndex[1]], "\n")

		// determine the location of the match. Note that the location
		// in the finding will be the line/column numbers of the _match_,
		// not the _secret_, which will differ if the secretGroup
		// value is set for this rule
		loc := location(fragment, matchIndex)

		finding := report.Finding{
			Description: rule.Description,
			File:        fragment.FilePath,
			RuleID:      rule.RuleID,
			StartLine:   loc.startLine,
			EndLine:     loc.endLine,
			StartColumn: loc.startColumn,
			EndColumn:   loc.endColumn,
			Secret:      secret,
			Match:       secret,
			Tags:        rule.Tags,
		}

		// skip lines that carry the inline gitleaks:allow signature
		if strings.Contains(fragment.Raw[loc.startLineIndex:loc.endLineIndex],
			gitleaksAllowSignature) {
			continue
		}

		// check if the secret is in the allowlist
		if rule.Allowlist.RegexAllowed(finding.Secret) ||
			d.Config.Allowlist.RegexAllowed(finding.Secret) {
			continue
		}

		// extract the secret from the secret group if set
		if rule.SecretGroup != 0 {
			groups := rule.Regex.FindStringSubmatch(secret)
			if len(groups) <= rule.SecretGroup || len(groups) == 0 {
				// Config validation should prevent this
				continue
			}
			secret = groups[rule.SecretGroup]
			finding.Secret = secret
		}

		// check entropy
		entropy := shannonEntropy(finding.Secret)
		finding.Entropy = float32(entropy)
		if rule.Entropy != 0.0 {
			if entropy <= rule.Entropy {
				// entropy is too low, skip this finding
				continue
			}
			// NOTE: this is a goofy hack to get around the fact that Go's regex
			// engine does not support positive lookaheads. Ideally we would add a
			// restriction on generic rules' regexes requiring the secret match
			// group to contain both digits and alphabetical characters, not just
			// alphabetical characters. What this bit of code does is check whether
			// the rule ID is prefixed with "generic" and, if so, enforce that the
			// secret contains at least one digit.
			// TODO: this should be replaced with stop words
			if strings.HasPrefix(rule.RuleID, "generic") {
				if !containsDigit(secret) {
					continue
				}
			}
		}
		findings = append(findings, finding)
	}
	return findings
}
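
// exampleAllowedLine is a minimal sketch, not part of the original source,
// demonstrating the gitleaks:allow signature checked in detectRule: any matched line
// containing the signature is skipped. The value below is a hypothetical placeholder.
func exampleAllowedLine() string {
	return "placeholder-not-a-real-secret" // gitleaks:allow
}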
// DetectGit runs a git scan of the given source and returns any findings. Depending
// on the scan type, it consumes a *gitdiff.File channel containing a git history
// generated from the output of `git log -p ...` or `git diff`, and determines
// whether each file (patch) in the history contains any findings.
func (d *Detector) DetectGit(source string, logOpts string, gitScanType GitScanType) ([]report.Finding, error) {
	var (
		gitdiffFiles <-chan *gitdiff.File
		err          error
	)
	switch gitScanType {
	case DetectType:
		gitdiffFiles, err = git.GitLog(source, logOpts)
		if err != nil {
			return d.findings, err
		}
	case ProtectType:
		gitdiffFiles, err = git.GitDiff(source, false)
		if err != nil {
			return d.findings, err
		}
	case ProtectStagedType:
		gitdiffFiles, err = git.GitDiff(source, true)
		if err != nil {
			return d.findings, err
		}
	}

	s := semgroup.NewGroup(context.Background(), 4)
	for gitdiffFile := range gitdiffFiles {
		gitdiffFile := gitdiffFile

		// skip binary and deleted files
		if gitdiffFile.IsBinary || gitdiffFile.IsDelete {
			continue
		}

		// check if the commit is allowed
		commitSHA := ""
		if gitdiffFile.PatchHeader != nil {
			commitSHA = gitdiffFile.PatchHeader.SHA
			if d.Config.Allowlist.CommitAllowed(gitdiffFile.PatchHeader.SHA) {
				continue
			}
		}
		d.addCommit(commitSHA)

		s.Go(func() error {
			for _, textFragment := range gitdiffFile.TextFragments {
				if textFragment == nil {
					return nil
				}
				fragment := Fragment{
					Raw:       textFragment.Raw(gitdiff.OpAdd),
					CommitSHA: commitSHA,
					FilePath:  gitdiffFile.NewName,
				}
				for _, finding := range d.Detect(fragment) {
					d.addFinding(augmentGitFinding(finding, textFragment, gitdiffFile))
				}
			}
			return nil
		})
	}

	if err := s.Wait(); err != nil {
		return d.findings, err
	}
	log.Debug().Msgf("%d commits scanned. Note: this number might be smaller than expected due to commits with no additions", len(d.commitMap))
	return d.findings, nil
}
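
// exampleDetectGit is a minimal sketch, not part of the original source, showing how
// a git history scan might be invoked. The repository path is hypothetical, and
// logOpts is passed through to `git log -p ...`; "--all" is just one possible option.
func exampleDetectGit() ([]report.Finding, error) {
	detector, err := NewDetectorDefaultConfig()
	if err != nil {
		return nil, err
	}
	return detector.DetectGit("/path/to/repo", "--all", DetectType)
}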
// DetectFiles accepts a path to a source directory or file and begins a scan of the
// file or directory.
func (d *Detector) DetectFiles(source string) ([]report.Finding, error) {
	s := semgroup.NewGroup(context.Background(), 4)
	paths := make(chan string)
	s.Go(func() error {
		defer close(paths)
		return filepath.Walk(source,
			func(path string, fInfo os.FileInfo, err error) error {
				if err != nil {
					return err
				}
				if fInfo.Name() == ".git" {
					return filepath.SkipDir
				}
				if fInfo.Mode().IsRegular() {
					paths <- path
				}
				return nil
			})
	})

	for pa := range paths {
		p := pa
		s.Go(func() error {
			b, err := os.ReadFile(p)
			if err != nil {
				return err
			}

			mimetype, err := filetype.Match(b)
			if err != nil {
				return err
			}
			if mimetype.MIME.Type == "application" {
				return nil // skip binary files
			}

			fragment := Fragment{
				Raw:      string(b),
				FilePath: p,
			}
			for _, finding := range d.Detect(fragment) {
				// need to add 1 since line counting starts at 1
				finding.EndLine++
				finding.StartLine++
				d.addFinding(finding)
			}
			return nil
		})
	}

	if err := s.Wait(); err != nil {
		return d.findings, err
	}
	return d.findings, nil
}
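
// exampleDetectFiles is a minimal sketch, not part of the original source, showing a
// filesystem scan of a directory tree; the function name and path are hypothetical.
func exampleDetectFiles() ([]report.Finding, error) {
	detector, err := NewDetectorDefaultConfig()
	if err != nil {
		return nil, err
	}
	return detector.DetectFiles("/path/to/source")
}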
// Detect scans the given fragment and returns a list of findings
func (d *Detector) Detect(fragment Fragment) []report.Finding {
	var findings []report.Finding

	// initialize the fragment keywords
	fragment.keywords = make(map[string]bool)

	// check if the filepath is allowed
	if fragment.FilePath != "" && (d.Config.Allowlist.PathAllowed(fragment.FilePath) ||
		fragment.FilePath == d.Config.Path) {
		return findings
	}

	// add newline indices for location calculation in detectRule
	fragment.newlineIndices = regexp.MustCompile("\n").FindAllStringIndex(fragment.Raw, -1)

	// build the keyword map for prefiltering rules
	normalizedRaw := strings.ToLower(fragment.Raw)
	matches := d.prefilter.FindAll(normalizedRaw)
	for _, m := range matches {
		fragment.keywords[normalizedRaw[m.Start():m.End()]] = true
	}

	for _, rule := range d.Config.Rules {
		if len(rule.Keywords) == 0 {
			// if no keywords are associated with the rule, always scan the
			// fragment using the rule
			findings = append(findings, d.detectRule(fragment, rule)...)
			continue
		}

		// check if any of the rule's keywords are in the fragment
		fragmentContainsKeyword := false
		for _, k := range rule.Keywords {
			if _, ok := fragment.keywords[strings.ToLower(k)]; ok {
				fragmentContainsKeyword = true
			}
		}
		if fragmentContainsKeyword {
			findings = append(findings, d.detectRule(fragment, rule)...)
		}
	}

	return filter(findings, d.Redact)
}
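
// exampleDetectFragment is a minimal sketch, not part of the original source, showing
// how a Fragment can be built by hand when the content does not come from a file walk
// or a git history; the FilePath and content below are hypothetical placeholders.
func exampleDetectFragment(d *Detector) []report.Finding {
	fragment := Fragment{
		Raw:      `password = "placeholder-value"`,
		FilePath: "config/app.properties",
	}
	return d.Detect(fragment)
}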
// addFinding synchronously adds a finding to the findings slice
func (d *Detector) addFinding(finding report.Finding) {
	d.findingMutex.Lock()
	d.findings = append(d.findings, finding)
	if d.Verbose {
		printFinding(finding)
	}
	d.findingMutex.Unlock()
}

// addCommit adds a commit to the commitMap
func (d *Detector) addCommit(commit string) {
	d.commitMap[commit] = true
}