4
0

git.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530
  1. package sources
  2. import (
  3. "bufio"
  4. "bytes"
  5. "context"
  6. "errors"
  7. "fmt"
  8. "io"
  9. "net/url"
  10. "os/exec"
  11. "path/filepath"
  12. "regexp"
  13. "strings"
  14. "sync"
  15. "time"
  16. "github.com/fatih/semgroup"
  17. "github.com/gitleaks/go-gitdiff/gitdiff"
  18. "github.com/zricethezav/gitleaks/v8/cmd/scm"
  19. "github.com/zricethezav/gitleaks/v8/config"
  20. "github.com/zricethezav/gitleaks/v8/logging"
  21. )
  // quotedOptPattern matches a token that is wrapped, in its entirety, in a pair
  // of double quotes or a pair of single quotes (for example "--all" or '--all').
  22. var quotedOptPattern = regexp.MustCompile(`^(?:"[^"]+"|'[^']+')$`)
  23. // GitCmd wraps a git command together with the channels carrying its parsed stdout (diff files) and its stderr-derived error.
  24. type GitCmd struct {
  25. cmd *exec.Cmd // the git command; Wait and String delegate to it
  26. diffFilesCh <-chan *gitdiff.File // diff files produced by gitdiff.Parse from the command's stdout
  27. errCh <-chan error // fed by listenForStdErr, which reads the command's stderr
  28. repoPath string // cleaned repository path, reused for `git -C` when reading blobs
  29. }
  30. // blobReader is the io.ReadCloser handed out for a `git cat-file blob` command:
  31. // reads come from the command's stdout pipe, and Close also waits for the command.
  32. type blobReader struct {
  33. io.ReadCloser // stdout pipe of the cat-file command
  34. cmd *exec.Cmd // the started cat-file command, waited on in Close
  35. }
  36. // Close closes the underlying reader and then waits for the command to complete,
  37. // releasing its resources.
  38. func (br *blobReader) Close() error {
  39. // Discard the remaining data from the pipe to avoid blocking
  40. _, drainErr := io.Copy(io.Discard, br)
  41. // Close the pipe (should signal the command to stop if it hasn't already)
  42. closeErr := br.ReadCloser.Close()
  43. // Wait to prevent zombie processes.
  44. waitErr := br.cmd.Wait()
  45. // Return the first error encountered
  46. if drainErr != nil {
  47. return drainErr
  48. }
  49. if closeErr != nil {
  50. return closeErr
  51. }
  52. return waitErr
  53. }
  54. // NewGitLogCmd returns `*DiffFilesCmd` with two channels: `<-chan *gitdiff.File` and `<-chan error`.
  55. // Caller should read everything from channels until receiving a signal about their closure and call
  56. // the `func (*DiffFilesCmd) Wait()` error in order to release resources.
  57. func NewGitLogCmd(source string, logOpts string) (*GitCmd, error) {
  58. return NewGitLogCmdContext(context.Background(), source, logOpts)
  59. }
  60. // NewGitLogCmdContext is the same as NewGitLogCmd but supports passing in a
  61. // context to use for timeouts
  62. func NewGitLogCmdContext(ctx context.Context, source string, logOpts string) (*GitCmd, error) {
  63. sourceClean := filepath.Clean(source)
  64. var cmd *exec.Cmd
  65. if logOpts != "" {
  66. args := []string{"-C", sourceClean, "log", "-p", "-U0"}
  67. // Ensure that the user-provided |logOpts| aren't wrapped in quotes.
  68. // https://github.com/gitleaks/gitleaks/issues/1153
  69. userArgs := strings.Split(logOpts, " ")
  70. var quotedOpts []string
  71. for _, element := range userArgs {
  72. if quotedOptPattern.MatchString(element) {
  73. quotedOpts = append(quotedOpts, element)
  74. }
  75. }
  76. if len(quotedOpts) > 0 {
  77. logging.Warn().Msgf("the following `--log-opts` values may not work as expected: %v\n\tsee https://github.com/gitleaks/gitleaks/issues/1153 for more information", quotedOpts)
  78. }
  79. args = append(args, userArgs...)
  80. cmd = exec.CommandContext(ctx, "git", args...)
  81. } else {
  82. cmd = exec.CommandContext(ctx, "git", "-C", sourceClean, "log", "-p", "-U0",
  83. "--full-history", "--all", "--diff-filter=tuxdb")
  84. }
  85. logging.Debug().Msgf("executing: %s", cmd.String())
  86. stdout, err := cmd.StdoutPipe()
  87. if err != nil {
  88. return nil, err
  89. }
  90. stderr, err := cmd.StderrPipe()
  91. if err != nil {
  92. return nil, err
  93. }
  94. if err := cmd.Start(); err != nil {
  95. return nil, err
  96. }
  97. errCh := make(chan error)
  98. go listenForStdErr(stderr, errCh)
  99. gitdiffFiles, err := gitdiff.Parse(stdout)
  100. if err != nil {
  101. return nil, err
  102. }
  103. return &GitCmd{
  104. cmd: cmd,
  105. diffFilesCh: gitdiffFiles,
  106. errCh: errCh,
  107. repoPath: sourceClean,
  108. }, nil
  109. }
  110. // NewGitDiffCmd returns `*DiffFilesCmd` with two channels: `<-chan *gitdiff.File` and `<-chan error`.
  111. // Caller should read everything from channels until receiving a signal about their closure and call
  112. // the `func (*DiffFilesCmd) Wait()` error in order to release resources.
  113. func NewGitDiffCmd(source string, staged bool) (*GitCmd, error) {
  114. return NewGitDiffCmdContext(context.Background(), source, staged)
  115. }
  116. // NewGitDiffCmdContext is the same as NewGitDiffCmd but supports passing in a
  117. // context to use for timeouts
  118. func NewGitDiffCmdContext(ctx context.Context, source string, staged bool) (*GitCmd, error) {
  119. sourceClean := filepath.Clean(source)
  120. var cmd *exec.Cmd
  121. cmd = exec.CommandContext(ctx, "git", "-C", sourceClean, "diff", "-U0", "--no-ext-diff", ".")
  122. if staged {
  123. cmd = exec.CommandContext(ctx, "git", "-C", sourceClean, "diff", "-U0", "--no-ext-diff",
  124. "--staged", ".")
  125. }
  126. logging.Debug().Msgf("executing: %s", cmd.String())
  127. stdout, err := cmd.StdoutPipe()
  128. if err != nil {
  129. return nil, err
  130. }
  131. stderr, err := cmd.StderrPipe()
  132. if err != nil {
  133. return nil, err
  134. }
  135. if err := cmd.Start(); err != nil {
  136. return nil, err
  137. }
  138. errCh := make(chan error)
  139. go listenForStdErr(stderr, errCh)
  140. gitdiffFiles, err := gitdiff.Parse(stdout)
  141. if err != nil {
  142. return nil, err
  143. }
  144. return &GitCmd{
  145. cmd: cmd,
  146. diffFilesCh: gitdiffFiles,
  147. errCh: errCh,
  148. repoPath: sourceClean,
  149. }, nil
  150. }
  151. // DiffFilesCh returns a channel with *gitdiff.File.
  152. func (c *GitCmd) DiffFilesCh() <-chan *gitdiff.File {
  153. return c.diffFilesCh
  154. }
  155. // ErrCh returns a channel that could produce an error if there is something in stderr.
  156. func (c *GitCmd) ErrCh() <-chan error {
  157. return c.errCh
  158. }
  159. // Wait waits for the command to exit and waits for any copying to
  160. // stdin or copying from stdout or stderr to complete.
  161. //
  162. // Wait also closes underlying stdout and stderr.
  163. func (c *GitCmd) Wait() error {
  164. return c.cmd.Wait()
  165. }
  166. // String displays the command used for GitCmd
  167. func (c *GitCmd) String() string {
  168. return c.cmd.String()
  169. }
  170. // NewBlobReader returns an io.ReadCloser that can be used to read a blob
  171. // within the git repo used to create the GitCmd.
  172. //
  173. // The caller is responsible for closing the reader.
  174. func (c *GitCmd) NewBlobReader(commit, path string) (io.ReadCloser, error) {
  175. return c.NewBlobReaderContext(context.Background(), commit, path)
  176. }
  177. // NewBlobReaderContext is the same as NewBlobReader but supports passing in a
  178. // context to use for timeouts
  179. func (c *GitCmd) NewBlobReaderContext(ctx context.Context, commit, path string) (io.ReadCloser, error) {
  180. gitArgs := []string{"-C", c.repoPath, "cat-file", "blob", commit + ":" + path}
  181. cmd := exec.CommandContext(ctx, "git", gitArgs...)
  182. cmd.Stderr = io.Discard
  183. stdout, err := cmd.StdoutPipe()
  184. if err != nil {
  185. return nil, fmt.Errorf("failed to get stdout pipe: %w", err)
  186. }
  187. if err := cmd.Start(); err != nil {
  188. return nil, fmt.Errorf("failed to start git command: %w", err)
  189. }
  190. return &blobReader{
  191. ReadCloser: stdout,
  192. cmd: cmd,
  193. }, nil
  194. }
  195. // listenForStdErr listens for stderr output from git, prints it to stdout,
  196. // sends to errCh and closes it.
  197. func listenForStdErr(stderr io.ReadCloser, errCh chan<- error) {
  198. defer close(errCh)
  199. var errEncountered bool
  200. scanner := bufio.NewScanner(stderr)
  201. for scanner.Scan() {
  202. // if git throws one of the following errors:
  203. //
  204. // exhaustive rename detection was skipped due to too many files.
  205. // you may want to set your diff.renameLimit variable to at least
  206. // (some large number) and retry the command.
  207. //
  208. // inexact rename detection was skipped due to too many files.
  209. // you may want to set your diff.renameLimit variable to at least
  210. // (some large number) and retry the command.
  211. //
  212. // Auto packing the repository in background for optimum performance.
  213. // See "git help gc" for manual housekeeping.
  214. //
  215. // we skip exiting the program as git log -p/git diff will continue
  216. // to send data to stdout and finish executing. This next bit of
  217. // code prevents gitleaks from stopping mid scan if this error is
  218. // encountered
  219. if strings.Contains(scanner.Text(),
  220. "exhaustive rename detection was skipped") ||
  221. strings.Contains(scanner.Text(),
  222. "inexact rename detection was skipped") ||
  223. strings.Contains(scanner.Text(),
  224. "you may want to set your diff.renameLimit") ||
  225. strings.Contains(scanner.Text(),
  226. "See \"git help gc\" for manual housekeeping") ||
  227. strings.Contains(scanner.Text(),
  228. "Auto packing the repository in background for optimum performance") {
  229. logging.Warn().Msg(scanner.Text())
  230. } else {
  231. logging.Error().Msgf("[git] %s", scanner.Text())
  232. errEncountered = true
  233. }
  234. }
  235. if errEncountered {
  236. errCh <- errors.New("stderr is not empty")
  237. return
  238. }
  239. }
  240. // RemoteInfo provides the info needed for reconstructing links from findings.
  241. type RemoteInfo struct {
  242. Platform scm.Platform // SCM platform of the remote; scm.NoPlatform when requested or when no remote is configured
  243. Url string // remote URL rendered as a string; left empty when no remote URL was resolved
  244. }
  245. // Git is a source for yielding fragments from a git repo.
  246. type Git struct {
  247. Cmd *GitCmd // git command whose diff files and stderr error Fragments consumes, and which is used to read archive blobs
  248. Config *config.Config // supplies the allowlists checked per commit; also passed to File for archive blobs
  249. Remote *RemoteInfo // attached to every CommitInfo built by Fragments
  250. Sema *semgroup.Group // runs the per-diff-file work scheduled by Fragments
  251. MaxArchiveDepth int // passed to File when a binary archive blob is scanned
  252. }
  253. // CommitInfo captures metadata about the commit a fragment was taken from.
  254. type CommitInfo struct {
  255. AuthorEmail string // author e-mail from the patch header; empty when the header has no author
  256. AuthorName string // author name from the patch header; empty when the header has no author
  257. Date string // author date converted to UTC and formatted as time.RFC3339
  258. Message string // commit message as returned by the patch header
  259. Remote *RemoteInfo // remote info copied from the Git source
  260. SHA string // commit SHA from the patch header
  261. }
  262. // Fragments yields fragments from a git repo
  263. func (s *Git) Fragments(ctx context.Context, yield FragmentsFunc) error {
  264. defer func() {
  265. if err := s.Cmd.Wait(); err != nil {
  266. logging.Debug().Err(err).Str("cmd", s.Cmd.String()).Msg("command aborted")
  267. }
  268. }()
  269. var (
  270. diffFilesCh = s.Cmd.DiffFilesCh()
  271. errCh = s.Cmd.ErrCh()
  272. wg sync.WaitGroup
  273. )
  274. // loop to range over both DiffFiles (stdout) and ErrCh (stderr)
  275. for diffFilesCh != nil || errCh != nil {
  276. select {
  277. case <-ctx.Done():
  278. return ctx.Err()
  279. case gitdiffFile, open := <-diffFilesCh:
  280. if !open {
  281. diffFilesCh = nil
  282. break
  283. }
  284. if gitdiffFile.IsDelete {
  285. continue
  286. }
  287. // skip non-archive binary files
  288. yieldAsArchive := false
  289. if gitdiffFile.IsBinary {
  290. if !isArchive(ctx, gitdiffFile.NewName) {
  291. continue
  292. }
  293. yieldAsArchive = true
  294. }
  295. // Check if commit is allowed
  296. commitSHA := ""
  297. var commitInfo *CommitInfo
  298. if gitdiffFile.PatchHeader != nil {
  299. commitSHA = gitdiffFile.PatchHeader.SHA
  300. for _, a := range s.Config.Allowlists {
  301. if ok, c := a.CommitAllowed(gitdiffFile.PatchHeader.SHA); ok {
  302. logging.Trace().Str("allowed-commit", c).Msg("skipping commit: global allowlist")
  303. continue
  304. }
  305. }
  306. commitInfo = &CommitInfo{
  307. Date: gitdiffFile.PatchHeader.AuthorDate.UTC().Format(time.RFC3339),
  308. Message: gitdiffFile.PatchHeader.Message(),
  309. Remote: s.Remote,
  310. SHA: commitSHA,
  311. }
  312. if gitdiffFile.PatchHeader.Author != nil {
  313. commitInfo.AuthorName = gitdiffFile.PatchHeader.Author.Name
  314. commitInfo.AuthorEmail = gitdiffFile.PatchHeader.Author.Email
  315. }
  316. }
  317. wg.Add(1)
  318. s.Sema.Go(func() error {
  319. defer wg.Done()
  320. if yieldAsArchive {
  321. blob, err := s.Cmd.NewBlobReaderContext(ctx, commitSHA, gitdiffFile.NewName)
  322. if err != nil {
  323. logging.Error().Err(err).Msg("could not read archive blob")
  324. return nil
  325. }
  326. file := File{
  327. Content: blob,
  328. Path: gitdiffFile.NewName,
  329. MaxArchiveDepth: s.MaxArchiveDepth,
  330. Config: s.Config,
  331. }
  332. // enrich and yield fragments
  333. err = file.Fragments(ctx, func(fragment Fragment, err error) error {
  334. fragment.CommitSHA = commitSHA
  335. fragment.CommitInfo = commitInfo
  336. return yield(fragment, err)
  337. })
  338. // Close the blob reader and log any issues
  339. if err := blob.Close(); err != nil {
  340. logging.Debug().Err(err).Msg("blobReader.Close() returned an error")
  341. }
  342. return err
  343. }
  344. for _, textFragment := range gitdiffFile.TextFragments {
  345. if textFragment == nil {
  346. return nil
  347. }
  348. fragment := Fragment{
  349. CommitSHA: commitSHA,
  350. FilePath: gitdiffFile.NewName,
  351. Raw: textFragment.Raw(gitdiff.OpAdd),
  352. StartLine: int(textFragment.NewPosition),
  353. CommitInfo: commitInfo,
  354. }
  355. if err := yield(fragment, nil); err != nil {
  356. return err
  357. }
  358. }
  359. return nil
  360. })
  361. case err, open := <-errCh:
  362. if !open {
  363. errCh = nil
  364. break
  365. }
  366. return yield(Fragment{}, err)
  367. }
  368. }
  369. select {
  370. case <-ctx.Done():
  371. return ctx.Err()
  372. default:
  373. wg.Wait()
  374. return nil
  375. }
  376. }
  377. // NewRemoteInfo builds a new RemoteInfo for generating finding links
  378. func NewRemoteInfo(platform scm.Platform, source string) *RemoteInfo {
  379. return NewRemoteInfoContext(context.Background(), platform, source)
  380. }
  381. // NewRemoteInfoContext is the same as NewRemoteInfo but supports passing in a
  382. // context to use for timeouts
  383. func NewRemoteInfoContext(ctx context.Context, platform scm.Platform, source string) *RemoteInfo {
  384. if platform == scm.NoPlatform {
  385. return &RemoteInfo{Platform: platform}
  386. }
  387. remoteUrl, err := getRemoteUrl(ctx, source)
  388. if err != nil {
  389. if strings.Contains(err.Error(), "No remote configured") {
  390. logging.Debug().Msg("skipping finding links: repository has no configured remote.")
  391. platform = scm.NoPlatform
  392. } else {
  393. logging.Error().Err(err).Msg("skipping finding links: unable to parse remote URL")
  394. }
  395. goto End
  396. }
  397. if platform == scm.UnknownPlatform {
  398. platform = platformFromHost(remoteUrl)
  399. if platform == scm.UnknownPlatform {
  400. logging.Info().
  401. Str("host", remoteUrl.Hostname()).
  402. Msg("Unknown SCM platform. Use --platform to include links in findings.")
  403. } else {
  404. logging.Debug().
  405. Str("host", remoteUrl.Hostname()).
  406. Str("platform", platform.String()).
  407. Msg("SCM platform parsed from host")
  408. }
  409. }
  410. End:
  411. var rUrl string
  412. if remoteUrl != nil {
  413. rUrl = remoteUrl.String()
  414. }
  415. return &RemoteInfo{
  416. Platform: platform,
  417. Url: rUrl,
  418. }
  419. }
  // sshUrlpat matches scp-style SSH remotes of the form
  // git@<host>:[<port>/]<path>[.git], capturing the host and the path.
  var sshUrlpat = regexp.MustCompile(`^git@([a-zA-Z0-9.-]+):(?:\d{1,5}/)?([\w/.-]+?)(?:\.git)?$`)

  // getRemoteUrl runs `git ls-remote --quiet --get-url` (in source, unless source
  // is ".") and returns the printed remote as a parsed URL. This will return the
  // first remote — typically, "origin".
  //
  // scp-style SSH remotes are rewritten to https://<host>/<path>, a trailing
  // ".git" is removed, and any user info is stripped from the result. When git
  // exits with a failure, the returned error includes the exit code and git's
  // stderr output.
  func getRemoteUrl(ctx context.Context, source string) (*url.URL, error) {
  	cmd := exec.CommandContext(ctx, "git", "ls-remote", "--quiet", "--get-url")
  	if source != "." {
  		cmd.Dir = source
  	}

  	out, err := cmd.Output()
  	if err != nil {
  		var exitErr *exec.ExitError
  		if !errors.As(err, &exitErr) {
  			return nil, err
  		}
  		stderrText := string(bytes.TrimSpace(exitErr.Stderr))
  		return nil, fmt.Errorf("command failed (%d): %w, stderr: %s", exitErr.ExitCode(), err, stderrText)
  	}

  	raw := string(bytes.TrimSpace(out))
  	if m := sshUrlpat.FindStringSubmatch(raw); m != nil {
  		raw = "https://" + m[1] + "/" + m[2]
  	}
  	raw = strings.TrimSuffix(raw, ".git")

  	parsed, err := url.Parse(raw)
  	if err != nil {
  		return nil, fmt.Errorf("unable to parse remote URL: %w", err)
  	}

  	// Remove any user info.
  	parsed.User = nil
  	return parsed, nil
  }
  448. func platformFromHost(u *url.URL) scm.Platform {
  449. switch strings.ToLower(u.Hostname()) {
  450. case "github.com":
  451. return scm.GitHubPlatform
  452. case "gitlab.com":
  453. return scm.GitLabPlatform
  454. case "dev.azure.com", "visualstudio.com":
  455. return scm.AzureDevOpsPlatform
  456. case "gitea.com", "code.forgejo.org", "codeberg.org":
  457. return scm.GiteaPlatform
  458. case "bitbucket.org":
  459. return scm.BitbucketPlatform
  460. default:
  461. return scm.UnknownPlatform
  462. }
  463. }