| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488 |
- package sources
- import (
- "bufio"
- "bytes"
- "context"
- "errors"
- "fmt"
- "io"
- "net/url"
- "os/exec"
- "path/filepath"
- "regexp"
- "strings"
- "sync"
- "time"
- "github.com/fatih/semgroup"
- "github.com/gitleaks/go-gitdiff/gitdiff"
- "github.com/zricethezav/gitleaks/v8/cmd/scm"
- "github.com/zricethezav/gitleaks/v8/config"
- "github.com/zricethezav/gitleaks/v8/logging"
- )
// quotedOptPattern matches a string that consists of exactly one non-empty
// value wrapped in a pair of double quotes or a pair of single quotes.
var quotedOptPattern = regexp.MustCompile(`^(?:"[^"]+"|'[^']+')$`)
- // GitCmd helps to work with Git's output.
- type GitCmd struct {
- cmd *exec.Cmd
- diffFilesCh <-chan *gitdiff.File
- errCh <-chan error
- repoPath string
- }
// blobReader is an io.ReadCloser over the stdout of a `git cat-file blob`
// process, used to fetch the contents of a blob from a repo.
type blobReader struct {
	// ReadCloser is the stdout pipe of the git process.
	io.ReadCloser
	// cmd is the git process producing the blob.
	cmd *exec.Cmd
}
- // Close closes the underlying reader and then waits for the command to complete,
- // releasing its resources.
- func (br *blobReader) Close() error {
- // Discard the remaining data from the pipe to avoid blocking
- _, drainErr := io.Copy(io.Discard, br)
- // Close the pipe (should signal the command to stop if it hasn't already)
- closeErr := br.ReadCloser.Close()
- // Wait to prevent zombie processes.
- waitErr := br.cmd.Wait()
- // Return the first error encountered
- if drainErr != nil {
- return drainErr
- }
- if closeErr != nil {
- return closeErr
- }
- return waitErr
- }
- // NewGitLogCmd returns `*DiffFilesCmd` with two channels: `<-chan *gitdiff.File` and `<-chan error`.
- // Caller should read everything from channels until receiving a signal about their closure and call
- // the `func (*DiffFilesCmd) Wait()` error in order to release resources.
- func NewGitLogCmd(source string, logOpts string) (*GitCmd, error) {
- sourceClean := filepath.Clean(source)
- var cmd *exec.Cmd
- if logOpts != "" {
- args := []string{"-C", sourceClean, "log", "-p", "-U0"}
- // Ensure that the user-provided |logOpts| aren't wrapped in quotes.
- // https://github.com/gitleaks/gitleaks/issues/1153
- userArgs := strings.Split(logOpts, " ")
- var quotedOpts []string
- for _, element := range userArgs {
- if quotedOptPattern.MatchString(element) {
- quotedOpts = append(quotedOpts, element)
- }
- }
- if len(quotedOpts) > 0 {
- logging.Warn().Msgf("the following `--log-opts` values may not work as expected: %v\n\tsee https://github.com/gitleaks/gitleaks/issues/1153 for more information", quotedOpts)
- }
- args = append(args, userArgs...)
- cmd = exec.Command("git", args...)
- } else {
- cmd = exec.Command("git", "-C", sourceClean, "log", "-p", "-U0",
- "--full-history", "--all", "--diff-filter=tuxdb")
- }
- logging.Debug().Msgf("executing: %s", cmd.String())
- stdout, err := cmd.StdoutPipe()
- if err != nil {
- return nil, err
- }
- stderr, err := cmd.StderrPipe()
- if err != nil {
- return nil, err
- }
- if err := cmd.Start(); err != nil {
- return nil, err
- }
- errCh := make(chan error)
- go listenForStdErr(stderr, errCh)
- gitdiffFiles, err := gitdiff.Parse(stdout)
- if err != nil {
- return nil, err
- }
- return &GitCmd{
- cmd: cmd,
- diffFilesCh: gitdiffFiles,
- errCh: errCh,
- repoPath: sourceClean,
- }, nil
- }
- // NewGitDiffCmd returns `*DiffFilesCmd` with two channels: `<-chan *gitdiff.File` and `<-chan error`.
- // Caller should read everything from channels until receiving a signal about their closure and call
- // the `func (*DiffFilesCmd) Wait()` error in order to release resources.
- func NewGitDiffCmd(source string, staged bool) (*GitCmd, error) {
- sourceClean := filepath.Clean(source)
- var cmd *exec.Cmd
- cmd = exec.Command("git", "-C", sourceClean, "diff", "-U0", "--no-ext-diff", ".")
- if staged {
- cmd = exec.Command("git", "-C", sourceClean, "diff", "-U0", "--no-ext-diff",
- "--staged", ".")
- }
- logging.Debug().Msgf("executing: %s", cmd.String())
- stdout, err := cmd.StdoutPipe()
- if err != nil {
- return nil, err
- }
- stderr, err := cmd.StderrPipe()
- if err != nil {
- return nil, err
- }
- if err := cmd.Start(); err != nil {
- return nil, err
- }
- errCh := make(chan error)
- go listenForStdErr(stderr, errCh)
- gitdiffFiles, err := gitdiff.Parse(stdout)
- if err != nil {
- return nil, err
- }
- return &GitCmd{
- cmd: cmd,
- diffFilesCh: gitdiffFiles,
- errCh: errCh,
- repoPath: sourceClean,
- }, nil
- }
- // DiffFilesCh returns a channel with *gitdiff.File.
- func (c *GitCmd) DiffFilesCh() <-chan *gitdiff.File {
- return c.diffFilesCh
- }
- // ErrCh returns a channel that could produce an error if there is something in stderr.
- func (c *GitCmd) ErrCh() <-chan error {
- return c.errCh
- }
- // Wait waits for the command to exit and waits for any copying to
- // stdin or copying from stdout or stderr to complete.
- //
- // Wait also closes underlying stdout and stderr.
- func (c *GitCmd) Wait() error {
- return c.cmd.Wait()
- }
- // NewBlobReader returns an io.ReadCloser that can be used to read a blob
- // within the git repo used to create the GitCmd.
- //
- // The caller is responsible for closing the reader.
- func (c *GitCmd) NewBlobReader(commit, path string) (io.ReadCloser, error) {
- gitArgs := []string{"-C", c.repoPath, "cat-file", "blob", commit + ":" + path}
- cmd := exec.Command("git", gitArgs...)
- cmd.Stderr = io.Discard
- stdout, err := cmd.StdoutPipe()
- if err != nil {
- return nil, fmt.Errorf("failed to get stdout pipe: %w", err)
- }
- if err := cmd.Start(); err != nil {
- return nil, fmt.Errorf("failed to start git command: %w", err)
- }
- return &blobReader{
- ReadCloser: stdout,
- cmd: cmd,
- }, nil
- }
- // listenForStdErr listens for stderr output from git, prints it to stdout,
- // sends to errCh and closes it.
- func listenForStdErr(stderr io.ReadCloser, errCh chan<- error) {
- defer close(errCh)
- var errEncountered bool
- scanner := bufio.NewScanner(stderr)
- for scanner.Scan() {
- // if git throws one of the following errors:
- //
- // exhaustive rename detection was skipped due to too many files.
- // you may want to set your diff.renameLimit variable to at least
- // (some large number) and retry the command.
- //
- // inexact rename detection was skipped due to too many files.
- // you may want to set your diff.renameLimit variable to at least
- // (some large number) and retry the command.
- //
- // Auto packing the repository in background for optimum performance.
- // See "git help gc" for manual housekeeping.
- //
- // we skip exiting the program as git log -p/git diff will continue
- // to send data to stdout and finish executing. This next bit of
- // code prevents gitleaks from stopping mid scan if this error is
- // encountered
- if strings.Contains(scanner.Text(),
- "exhaustive rename detection was skipped") ||
- strings.Contains(scanner.Text(),
- "inexact rename detection was skipped") ||
- strings.Contains(scanner.Text(),
- "you may want to set your diff.renameLimit") ||
- strings.Contains(scanner.Text(),
- "See \"git help gc\" for manual housekeeping") ||
- strings.Contains(scanner.Text(),
- "Auto packing the repository in background for optimum performance") {
- logging.Warn().Msg(scanner.Text())
- } else {
- logging.Error().Msgf("[git] %s", scanner.Text())
- errEncountered = true
- }
- }
- if errEncountered {
- errCh <- errors.New("stderr is not empty")
- return
- }
- }
- // RemoteInfo provides the info needed for reconstructing links from findings
- type RemoteInfo struct {
- Platform scm.Platform
- Url string
- }
- // Git is a source for yielding fragments from a git repo
- type Git struct {
- Cmd *GitCmd
- Config *config.Config
- Remote *RemoteInfo
- Sema *semgroup.Group
- MaxArchiveDepth int
- }
- // CommitInfo captures metadata about the commit
- type CommitInfo struct {
- AuthorEmail string
- AuthorName string
- Date string
- Message string
- Remote *RemoteInfo
- SHA string
- }
- // Fragments yields fragments from a git repo
- func (s *Git) Fragments(ctx context.Context, yield FragmentsFunc) error {
- defer func() {
- _ = s.Cmd.Wait()
- }()
- var (
- diffFilesCh = s.Cmd.DiffFilesCh()
- errCh = s.Cmd.ErrCh()
- wg sync.WaitGroup
- )
- // loop to range over both DiffFiles (stdout) and ErrCh (stderr)
- for diffFilesCh != nil || errCh != nil {
- select {
- case gitdiffFile, open := <-diffFilesCh:
- if !open {
- diffFilesCh = nil
- break
- }
- if gitdiffFile.IsDelete {
- continue
- }
- // skip non-archive binary files
- yieldAsArchive := false
- if gitdiffFile.IsBinary {
- if !isArchive(ctx, gitdiffFile.NewName) {
- continue
- }
- yieldAsArchive = true
- }
- // Check if commit is allowed
- commitSHA := ""
- var commitInfo *CommitInfo
- if gitdiffFile.PatchHeader != nil {
- commitSHA = gitdiffFile.PatchHeader.SHA
- for _, a := range s.Config.Allowlists {
- if ok, c := a.CommitAllowed(gitdiffFile.PatchHeader.SHA); ok {
- logging.Trace().Str("allowed-commit", c).Msg("skipping commit: global allowlist")
- continue
- }
- }
- commitInfo = &CommitInfo{
- Date: gitdiffFile.PatchHeader.AuthorDate.UTC().Format(time.RFC3339),
- Message: gitdiffFile.PatchHeader.Message(),
- Remote: s.Remote,
- SHA: commitSHA,
- }
- if gitdiffFile.PatchHeader.Author != nil {
- commitInfo.AuthorName = gitdiffFile.PatchHeader.Author.Name
- commitInfo.AuthorEmail = gitdiffFile.PatchHeader.Author.Email
- }
- }
- wg.Add(1)
- s.Sema.Go(func() error {
- defer wg.Done()
- if yieldAsArchive {
- blob, err := s.Cmd.NewBlobReader(commitSHA, gitdiffFile.NewName)
- if err != nil {
- logging.Error().Err(err).Msg("could not read archive blob")
- return nil
- }
- file := File{
- Content: blob,
- Path: gitdiffFile.NewName,
- MaxArchiveDepth: s.MaxArchiveDepth,
- Config: s.Config,
- }
- // enrich and yield fragments
- err = file.Fragments(ctx, func(fragment Fragment, err error) error {
- fragment.CommitSHA = commitSHA
- fragment.CommitInfo = commitInfo
- return yield(fragment, err)
- })
- // Close the blob reader and log any issues
- if err := blob.Close(); err != nil {
- logging.Debug().Err(err).Msg("blobReader.Close() returned an error")
- }
- return err
- }
- for _, textFragment := range gitdiffFile.TextFragments {
- if textFragment == nil {
- return nil
- }
- fragment := Fragment{
- CommitSHA: commitSHA,
- FilePath: gitdiffFile.NewName,
- Raw: textFragment.Raw(gitdiff.OpAdd),
- StartLine: int(textFragment.NewPosition),
- CommitInfo: commitInfo,
- }
- if err := yield(fragment, nil); err != nil {
- return err
- }
- }
- return nil
- })
- case err, open := <-errCh:
- if !open {
- errCh = nil
- break
- }
- return yield(Fragment{}, err)
- }
- }
- wg.Wait()
- return nil
- }
- // NewRemoteInfo builds a new RemoteInfo for generating finding links
- func NewRemoteInfo(platform scm.Platform, source string) *RemoteInfo {
- if platform == scm.NoPlatform {
- return &RemoteInfo{Platform: platform}
- }
- remoteUrl, err := getRemoteUrl(source)
- if err != nil {
- if strings.Contains(err.Error(), "No remote configured") {
- logging.Debug().Msg("skipping finding links: repository has no configured remote.")
- platform = scm.NoPlatform
- } else {
- logging.Error().Err(err).Msg("skipping finding links: unable to parse remote URL")
- }
- goto End
- }
- if platform == scm.UnknownPlatform {
- platform = platformFromHost(remoteUrl)
- if platform == scm.UnknownPlatform {
- logging.Info().
- Str("host", remoteUrl.Hostname()).
- Msg("Unknown SCM platform. Use --platform to include links in findings.")
- } else {
- logging.Debug().
- Str("host", remoteUrl.Hostname()).
- Str("platform", platform.String()).
- Msg("SCM platform parsed from host")
- }
- }
- End:
- var rUrl string
- if remoteUrl != nil {
- rUrl = remoteUrl.String()
- }
- return &RemoteInfo{
- Platform: platform,
- Url: rUrl,
- }
- }
// sshUrlpat matches scp-style remotes of the form `git@<host>:<path>` with an
// optional `.git` suffix, capturing the host (group 1) and the path without
// the suffix (group 2).
var sshUrlpat = regexp.MustCompile(`^git@([a-zA-Z0-9.-]+):([\w/.-]+?)(?:\.git)?$`)
- func getRemoteUrl(source string) (*url.URL, error) {
- // This will return the first remote — typically, "origin".
- cmd := exec.Command("git", "ls-remote", "--quiet", "--get-url")
- if source != "." {
- cmd.Dir = source
- }
- stdout, err := cmd.Output()
- if err != nil {
- var exitError *exec.ExitError
- if errors.As(err, &exitError) {
- return nil, fmt.Errorf("command failed (%d): %w, stderr: %s", exitError.ExitCode(), err, string(bytes.TrimSpace(exitError.Stderr)))
- }
- return nil, err
- }
- remoteUrl := string(bytes.TrimSpace(stdout))
- if matches := sshUrlpat.FindStringSubmatch(remoteUrl); matches != nil {
- remoteUrl = fmt.Sprintf("https://%s/%s", matches[1], matches[2])
- }
- remoteUrl = strings.TrimSuffix(remoteUrl, ".git")
- parsedUrl, err := url.Parse(remoteUrl)
- if err != nil {
- return nil, fmt.Errorf("unable to parse remote URL: %w", err)
- }
- // Remove any user info.
- parsedUrl.User = nil
- return parsedUrl, nil
- }
- func platformFromHost(u *url.URL) scm.Platform {
- switch strings.ToLower(u.Hostname()) {
- case "github.com":
- return scm.GitHubPlatform
- case "gitlab.com":
- return scm.GitLabPlatform
- case "dev.azure.com", "visualstudio.com":
- return scm.AzureDevOpsPlatform
- default:
- return scm.UnknownPlatform
- }
- }
|