git.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492
  1. package sources
  2. import (
  3. "bufio"
  4. "bytes"
  5. "context"
  6. "errors"
  7. "fmt"
  8. "io"
  9. "net/url"
  10. "os/exec"
  11. "path/filepath"
  12. "regexp"
  13. "strings"
  14. "sync"
  15. "time"
  16. "github.com/fatih/semgroup"
  17. "github.com/gitleaks/go-gitdiff/gitdiff"
  18. "github.com/zricethezav/gitleaks/v8/cmd/scm"
  19. "github.com/zricethezav/gitleaks/v8/config"
  20. "github.com/zricethezav/gitleaks/v8/logging"
  21. )
  // quotedOptPattern matches a string that is wrapped, from start to end, in a
  // pair of double quotes or a pair of single quotes with at least one character
  // in between. NewGitLogCmd uses it to warn about `--log-opts` values that are
  // still quoted.
  var quotedOptPattern = regexp.MustCompile(`^(?:"[^"]+"|'[^']+')$`)
  // GitCmd helps to work with Git's output.
  //
  // It bundles a started git process with the channels derived from its stdout
  // and stderr, plus the repository path the process was pointed at.
  type GitCmd struct {
  	// cmd is the git process; Wait delegates to it.
  	cmd *exec.Cmd
  	// diffFilesCh carries the files returned by gitdiff.Parse for git's stdout.
  	diffFilesCh <-chan *gitdiff.File
  	// errCh is fed and closed by listenForStdErr, which watches git's stderr.
  	errCh <-chan error
  	// repoPath is the filepath.Clean'd source path; NewBlobReader passes it to `git -C`.
  	repoPath string
  }
  // blobReader is the io.ReadCloser handed out for a `git cat-file blob`
  // invocation: reads come from the command's stdout pipe, and the command
  // itself is kept so that Close can reap it.
  type blobReader struct {
  	io.ReadCloser
  	cmd *exec.Cmd
  }

  // Close drains whatever is left on the pipe, closes it, and then waits for the
  // git command so that its resources are released.
  //
  // All three steps always run, in that order. The returned error is the first
  // non-nil one among drain, close and wait.
  func (br *blobReader) Close() error {
  	// Unread data is discarded first so the command is not left blocked on a
  	// full pipe.
  	_, errDrain := io.Copy(io.Discard, br)
  	// Closing the pipe signals the command to stop if it has not already.
  	errClose := br.ReadCloser.Close()
  	// Waiting prevents a zombie process.
  	errWait := br.cmd.Wait()

  	switch {
  	case errDrain != nil:
  		return errDrain
  	case errClose != nil:
  		return errClose
  	default:
  		return errWait
  	}
  }
  54. // NewGitLogCmd returns `*DiffFilesCmd` with two channels: `<-chan *gitdiff.File` and `<-chan error`.
  55. // Caller should read everything from channels until receiving a signal about their closure and call
  56. // the `func (*DiffFilesCmd) Wait()` error in order to release resources.
  57. func NewGitLogCmd(source string, logOpts string) (*GitCmd, error) {
  58. sourceClean := filepath.Clean(source)
  59. var cmd *exec.Cmd
  60. if logOpts != "" {
  61. args := []string{"-C", sourceClean, "log", "-p", "-U0"}
  62. // Ensure that the user-provided |logOpts| aren't wrapped in quotes.
  63. // https://github.com/gitleaks/gitleaks/issues/1153
  64. userArgs := strings.Split(logOpts, " ")
  65. var quotedOpts []string
  66. for _, element := range userArgs {
  67. if quotedOptPattern.MatchString(element) {
  68. quotedOpts = append(quotedOpts, element)
  69. }
  70. }
  71. if len(quotedOpts) > 0 {
  72. logging.Warn().Msgf("the following `--log-opts` values may not work as expected: %v\n\tsee https://github.com/gitleaks/gitleaks/issues/1153 for more information", quotedOpts)
  73. }
  74. args = append(args, userArgs...)
  75. cmd = exec.Command("git", args...)
  76. } else {
  77. cmd = exec.Command("git", "-C", sourceClean, "log", "-p", "-U0",
  78. "--full-history", "--all", "--diff-filter=tuxdb")
  79. }
  80. logging.Debug().Msgf("executing: %s", cmd.String())
  81. stdout, err := cmd.StdoutPipe()
  82. if err != nil {
  83. return nil, err
  84. }
  85. stderr, err := cmd.StderrPipe()
  86. if err != nil {
  87. return nil, err
  88. }
  89. if err := cmd.Start(); err != nil {
  90. return nil, err
  91. }
  92. errCh := make(chan error)
  93. go listenForStdErr(stderr, errCh)
  94. gitdiffFiles, err := gitdiff.Parse(stdout)
  95. if err != nil {
  96. return nil, err
  97. }
  98. return &GitCmd{
  99. cmd: cmd,
  100. diffFilesCh: gitdiffFiles,
  101. errCh: errCh,
  102. repoPath: sourceClean,
  103. }, nil
  104. }
  105. // NewGitDiffCmd returns `*DiffFilesCmd` with two channels: `<-chan *gitdiff.File` and `<-chan error`.
  106. // Caller should read everything from channels until receiving a signal about their closure and call
  107. // the `func (*DiffFilesCmd) Wait()` error in order to release resources.
  108. func NewGitDiffCmd(source string, staged bool) (*GitCmd, error) {
  109. sourceClean := filepath.Clean(source)
  110. var cmd *exec.Cmd
  111. cmd = exec.Command("git", "-C", sourceClean, "diff", "-U0", "--no-ext-diff", ".")
  112. if staged {
  113. cmd = exec.Command("git", "-C", sourceClean, "diff", "-U0", "--no-ext-diff",
  114. "--staged", ".")
  115. }
  116. logging.Debug().Msgf("executing: %s", cmd.String())
  117. stdout, err := cmd.StdoutPipe()
  118. if err != nil {
  119. return nil, err
  120. }
  121. stderr, err := cmd.StderrPipe()
  122. if err != nil {
  123. return nil, err
  124. }
  125. if err := cmd.Start(); err != nil {
  126. return nil, err
  127. }
  128. errCh := make(chan error)
  129. go listenForStdErr(stderr, errCh)
  130. gitdiffFiles, err := gitdiff.Parse(stdout)
  131. if err != nil {
  132. return nil, err
  133. }
  134. return &GitCmd{
  135. cmd: cmd,
  136. diffFilesCh: gitdiffFiles,
  137. errCh: errCh,
  138. repoPath: sourceClean,
  139. }, nil
  140. }
  // DiffFilesCh returns the receive-only channel of *gitdiff.File values that
  // was stored on the GitCmd when it was created (the result of gitdiff.Parse
  // on git's stdout).
  func (c *GitCmd) DiffFilesCh() <-chan *gitdiff.File {
  	return c.diffFilesCh
  }
  // ErrCh returns a channel that could produce an error if there is something
  // in stderr. The channel is fed and closed by listenForStdErr.
  func (c *GitCmd) ErrCh() <-chan error {
  	return c.errCh
  }
  // Wait waits for the command to exit and waits for any copying to
  // stdin or copying from stdout or stderr to complete.
  //
  // Wait also closes underlying stdout and stderr.
  //
  // It is a direct delegation to (*exec.Cmd).Wait on the wrapped git process
  // and returns that call's error unchanged.
  func (c *GitCmd) Wait() error {
  	return c.cmd.Wait()
  }
  156. // NewBlobReader returns an io.ReadCloser that can be used to read a blob
  157. // within the git repo used to create the GitCmd.
  158. //
  159. // The caller is responsible for closing the reader.
  160. func (c *GitCmd) NewBlobReader(commit, path string) (io.ReadCloser, error) {
  161. gitArgs := []string{"-C", c.repoPath, "cat-file", "blob", commit + ":" + path}
  162. cmd := exec.Command("git", gitArgs...)
  163. cmd.Stderr = io.Discard
  164. stdout, err := cmd.StdoutPipe()
  165. if err != nil {
  166. return nil, fmt.Errorf("failed to get stdout pipe: %w", err)
  167. }
  168. if err := cmd.Start(); err != nil {
  169. return nil, fmt.Errorf("failed to start git command: %w", err)
  170. }
  171. return &blobReader{
  172. ReadCloser: stdout,
  173. cmd: cmd,
  174. }, nil
  175. }
  176. // listenForStdErr listens for stderr output from git, prints it to stdout,
  177. // sends to errCh and closes it.
  178. func listenForStdErr(stderr io.ReadCloser, errCh chan<- error) {
  179. defer close(errCh)
  180. var errEncountered bool
  181. scanner := bufio.NewScanner(stderr)
  182. for scanner.Scan() {
  183. // if git throws one of the following errors:
  184. //
  185. // exhaustive rename detection was skipped due to too many files.
  186. // you may want to set your diff.renameLimit variable to at least
  187. // (some large number) and retry the command.
  188. //
  189. // inexact rename detection was skipped due to too many files.
  190. // you may want to set your diff.renameLimit variable to at least
  191. // (some large number) and retry the command.
  192. //
  193. // Auto packing the repository in background for optimum performance.
  194. // See "git help gc" for manual housekeeping.
  195. //
  196. // we skip exiting the program as git log -p/git diff will continue
  197. // to send data to stdout and finish executing. This next bit of
  198. // code prevents gitleaks from stopping mid scan if this error is
  199. // encountered
  200. if strings.Contains(scanner.Text(),
  201. "exhaustive rename detection was skipped") ||
  202. strings.Contains(scanner.Text(),
  203. "inexact rename detection was skipped") ||
  204. strings.Contains(scanner.Text(),
  205. "you may want to set your diff.renameLimit") ||
  206. strings.Contains(scanner.Text(),
  207. "See \"git help gc\" for manual housekeeping") ||
  208. strings.Contains(scanner.Text(),
  209. "Auto packing the repository in background for optimum performance") {
  210. logging.Warn().Msg(scanner.Text())
  211. } else {
  212. logging.Error().Msgf("[git] %s", scanner.Text())
  213. errEncountered = true
  214. }
  215. }
  216. if errEncountered {
  217. errCh <- errors.New("stderr is not empty")
  218. return
  219. }
  220. }
  // RemoteInfo provides the info needed for reconstructing links from findings
  type RemoteInfo struct {
  	// Platform is the SCM platform associated with the remote.
  	Platform scm.Platform
  	// Url is the remote URL as a string. NewRemoteInfo leaves it empty when no
  	// remote URL could be obtained.
  	Url string
  }
  // Git is a source for yielding fragments from a git repo
  type Git struct {
  	// Cmd is the git command whose diff-file and error channels Fragments consumes.
  	Cmd *GitCmd
  	// Config supplies the allowlists consulted per commit and is passed on to
  	// File when an archive blob is scanned.
  	Config *config.Config
  	// Remote is attached to the CommitInfo built for each patch header.
  	Remote *RemoteInfo
  	// Sema runs the per-file work that Fragments schedules.
  	Sema *semgroup.Group
  	// MaxArchiveDepth is forwarded to File for binary archive blobs.
  	MaxArchiveDepth int
  }
  // CommitInfo captures metadata about the commit
  type CommitInfo struct {
  	// AuthorEmail and AuthorName are copied from the patch header's author
  	// when one is present.
  	AuthorEmail string
  	AuthorName string
  	// Date is the author date; Fragments fills it in as UTC in RFC 3339 format.
  	Date string
  	// Message is the commit message taken from the patch header.
  	Message string
  	// Remote is the remote info of the Git source that produced this commit.
  	Remote *RemoteInfo
  	// SHA is the commit hash from the patch header.
  	SHA string
  }
  // Fragments yields fragments from a git repo.
  //
  // It consumes s.Cmd's diff-file and error channels until both are closed.
  // Deleted files and non-archive binary files are skipped. Every remaining file
  // is handed to s.Sema as a task: text files produce one yield call per text
  // fragment (added lines only), while binary archives are streamed through
  // NewBlobReader and File.Fragments. An error received from the error channel
  // is passed to yield and ends the method with yield's result. s.Cmd.Wait is
  // always called on return and its error is discarded.
  func (s *Git) Fragments(ctx context.Context, yield FragmentsFunc) error {
  	defer func() {
  		// Best effort: the command's exit status is intentionally ignored here.
  		_ = s.Cmd.Wait()
  	}()

  	var (
  		diffFilesCh = s.Cmd.DiffFilesCh()
  		errCh = s.Cmd.ErrCh()
  		wg sync.WaitGroup
  	)

  	// loop to range over both DiffFiles (stdout) and ErrCh (stderr)
  	// A channel variable is set to nil once its channel is closed; a nil channel
  	// is never selected, and the loop ends when both are nil.
  	for diffFilesCh != nil || errCh != nil {
  		select {
  		case gitdiffFile, open := <-diffFilesCh:
  			if !open {
  				diffFilesCh = nil
  				// break leaves the select only; the for condition is re-checked.
  				break
  			}

  			if gitdiffFile.IsDelete {
  				continue
  			}

  			// skip non-archive binary files
  			yieldAsArchive := false
  			if gitdiffFile.IsBinary {
  				if !isArchive(ctx, gitdiffFile.NewName) {
  					continue
  				}
  				yieldAsArchive = true
  			}

  			// Check if commit is allowed
  			commitSHA := ""
  			var commitInfo *CommitInfo
  			if gitdiffFile.PatchHeader != nil {
  				commitSHA = gitdiffFile.PatchHeader.SHA
  				for _, a := range s.Config.Allowlists {
  					if ok, c := a.CommitAllowed(gitdiffFile.PatchHeader.SHA); ok {
  						logging.Trace().Str("allowed-commit", c).Msg("skipping commit: global allowlist")
  						// NOTE(review): this continue targets the allowlist loop, not
  						// the outer for/select, so the file is still processed below
  						// despite the log message — confirm whether that is intended.
  						continue
  					}
  				}
  				commitInfo = &CommitInfo{
  					Date: gitdiffFile.PatchHeader.AuthorDate.UTC().Format(time.RFC3339),
  					Message: gitdiffFile.PatchHeader.Message(),
  					Remote: s.Remote,
  					SHA: commitSHA,
  				}
  				if gitdiffFile.PatchHeader.Author != nil {
  					commitInfo.AuthorName = gitdiffFile.PatchHeader.Author.Name
  					commitInfo.AuthorEmail = gitdiffFile.PatchHeader.Author.Email
  				}
  			}

  			// wg tracks the tasks handed to s.Sema so the method can wait for
  			// them after both channels are drained.
  			// NOTE(review): errors returned by these tasks go to s.Sema; this
  			// method never reads them — presumably the owner of s.Sema does.
  			wg.Add(1)
  			s.Sema.Go(func() error {
  				defer wg.Done()

  				if yieldAsArchive {
  					blob, err := s.Cmd.NewBlobReader(commitSHA, gitdiffFile.NewName)
  					if err != nil {
  						// A blob that cannot be opened is logged and skipped.
  						logging.Error().Err(err).Msg("could not read archive blob")
  						return nil
  					}

  					file := File{
  						Content: blob,
  						Path: gitdiffFile.NewName,
  						MaxArchiveDepth: s.MaxArchiveDepth,
  						Config: s.Config,
  					}

  					// enrich and yield fragments
  					err = file.Fragments(ctx, func(fragment Fragment, err error) error {
  						fragment.CommitSHA = commitSHA
  						fragment.CommitInfo = commitInfo
  						return yield(fragment, err)
  					})

  					// Close the blob reader and log any issues
  					if err := blob.Close(); err != nil {
  						logging.Debug().Err(err).Msg("blobReader.Close() returned an error")
  					}

  					// The error from file.Fragments (not from Close) is returned.
  					return err
  				}

  				for _, textFragment := range gitdiffFile.TextFragments {
  					// A nil entry stops processing of this file's fragments.
  					if textFragment == nil {
  						return nil
  					}

  					fragment := Fragment{
  						CommitSHA: commitSHA,
  						FilePath: gitdiffFile.NewName,
  						Raw: textFragment.Raw(gitdiff.OpAdd),
  						StartLine: int(textFragment.NewPosition),
  						CommitInfo: commitInfo,
  					}

  					if err := yield(fragment, nil); err != nil {
  						return err
  					}
  				}

  				return nil
  			})
  		case err, open := <-errCh:
  			if !open {
  				errCh = nil
  				// break leaves the select only; the for condition is re-checked.
  				break
  			}

  			// NOTE(review): this return bypasses wg.Wait(), so tasks that were
  			// already scheduled may still be running — confirm this is acceptable.
  			return yield(Fragment{}, err)
  		}
  	}

  	wg.Wait()
  	return nil
  }
  349. // NewRemoteInfo builds a new RemoteInfo for generating finding links
  350. func NewRemoteInfo(platform scm.Platform, source string) *RemoteInfo {
  351. if platform == scm.NoPlatform {
  352. return &RemoteInfo{Platform: platform}
  353. }
  354. remoteUrl, err := getRemoteUrl(source)
  355. if err != nil {
  356. if strings.Contains(err.Error(), "No remote configured") {
  357. logging.Debug().Msg("skipping finding links: repository has no configured remote.")
  358. platform = scm.NoPlatform
  359. } else {
  360. logging.Error().Err(err).Msg("skipping finding links: unable to parse remote URL")
  361. }
  362. goto End
  363. }
  364. if platform == scm.UnknownPlatform {
  365. platform = platformFromHost(remoteUrl)
  366. if platform == scm.UnknownPlatform {
  367. logging.Info().
  368. Str("host", remoteUrl.Hostname()).
  369. Msg("Unknown SCM platform. Use --platform to include links in findings.")
  370. } else {
  371. logging.Debug().
  372. Str("host", remoteUrl.Hostname()).
  373. Str("platform", platform.String()).
  374. Msg("SCM platform parsed from host")
  375. }
  376. }
  377. End:
  378. var rUrl string
  379. if remoteUrl != nil {
  380. rUrl = remoteUrl.String()
  381. }
  382. return &RemoteInfo{
  383. Platform: platform,
  384. Url: rUrl,
  385. }
  386. }
  // sshUrlpat matches scp-style remotes of the form `git@host:path`, with an
  // optional numeric first path segment and an optional `.git` suffix. Group 1
  // is the host and group 2 the repository path.
  var sshUrlpat = regexp.MustCompile(`^git@([a-zA-Z0-9.-]+):(?:\d{1,5}/)?([\w/.-]+?)(?:\.git)?$`)

  // getRemoteUrl runs `git ls-remote --quiet --get-url` (inside source unless
  // source is ".") and returns the reported remote as a parsed URL.
  //
  // scp-style `git@host:path` remotes are rewritten to `https://host/path`, a
  // trailing ".git" is removed, and any user info is stripped from the result.
  // If git exits with a failure status, the returned error carries the exit
  // code and git's trimmed stderr.
  func getRemoteUrl(source string) (*url.URL, error) {
  	// This will return the first remote — typically, "origin".
  	lsRemote := exec.Command("git", "ls-remote", "--quiet", "--get-url")
  	if source != "." {
  		lsRemote.Dir = source
  	}

  	out, runErr := lsRemote.Output()
  	if runErr != nil {
  		if exitErr := (*exec.ExitError)(nil); errors.As(runErr, &exitErr) {
  			stderrText := string(bytes.TrimSpace(exitErr.Stderr))
  			return nil, fmt.Errorf("command failed (%d): %w, stderr: %s", exitErr.ExitCode(), runErr, stderrText)
  		}
  		return nil, runErr
  	}

  	raw := string(bytes.TrimSpace(out))
  	if m := sshUrlpat.FindStringSubmatch(raw); m != nil {
  		raw = "https://" + m[1] + "/" + m[2]
  	}
  	raw = strings.TrimSuffix(raw, ".git")

  	parsed, parseErr := url.Parse(raw)
  	if parseErr != nil {
  		return nil, fmt.Errorf("unable to parse remote URL: %w", parseErr)
  	}

  	// Remove any user info.
  	parsed.User = nil
  	return parsed, nil
  }
  415. func platformFromHost(u *url.URL) scm.Platform {
  416. switch strings.ToLower(u.Hostname()) {
  417. case "github.com":
  418. return scm.GitHubPlatform
  419. case "gitlab.com":
  420. return scm.GitLabPlatform
  421. case "dev.azure.com", "visualstudio.com":
  422. return scm.AzureDevOpsPlatform
  423. case "gitea.com", "code.forgejo.org", "codeberg.org":
  424. return scm.GiteaPlatform
  425. case "bitbucket.org":
  426. return scm.BitbucketPlatform
  427. default:
  428. return scm.UnknownPlatform
  429. }
  430. }