repo.go 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224
  1. package main
  2. import (
  3. "path/filepath"
  4. "os"
  5. "fmt"
  6. "os/exec"
  7. "sync"
  8. "bytes"
  9. "encoding/json"
  10. "log"
  11. "strings"
  12. "io/ioutil"
  13. )
  14. type Repo struct {
  15. name string
  16. url string
  17. path string
  18. status string // TODO
  19. leaks []Leak
  20. owner *Owner
  21. }
  22. type Leak struct {
  23. Line string `json:"line"`
  24. Commit string `json:"commit"`
  25. Offender string `json:"string"`
  26. Reason string `json:"reason"`
  27. Msg string `json:"commitMsg"`
  28. Time string `json:"time"`
  29. Author string `json:"author"`
  30. File string `json:"file"`
  31. RepoURL string `json:"repoURL"`
  32. }
  33. type Commit struct {
  34. Hash string
  35. Author string
  36. Time string
  37. Msg string
  38. }
  39. func newRepo(owner *Owner, name string, url string) *Repo {
  40. repo := &Repo{
  41. name: name,
  42. url: url,
  43. path: owner.path + "/" + name,
  44. }
  45. return repo
  46. }
  47. // Audit operates on a single repo and searches the full or partial history of the repo.
  48. // A semaphore is declared for every repo to bind concurrency. If unbounded, the system will throw a
  49. // `too many open files` error. Eventually, gitleaks should use src-d/go-git to avoid shelling out
  50. // commands so that users could opt for doing all clones/diffs in memory.
  51. // Audit also declares two WaitGroups, one for distributing regex/entropy checks, and one for receiving
  52. // the leaks if there are any. This could be done a little more elegantly in the future.
  53. func (repo *Repo) audit(owner *Owner, opts *Options) error {
  54. var (
  55. out []byte
  56. err error
  57. commitWG sync.WaitGroup
  58. gitLeakReceiverWG sync.WaitGroup
  59. gitLeaksChan = make(chan Leak)
  60. leaks []Leak
  61. semaphoreChan = make(chan struct{}, opts.Concurrency)
  62. )
  63. dotGitPath := filepath.Join(repo.path, ".git")
  64. // Navigate to proper location to being audit. Clone repo
  65. // if not present, otherwise fetch for new changes.
  66. if _, err := os.Stat(dotGitPath); os.IsNotExist(err) {
  67. // no repo present, clone it
  68. fmt.Printf("Cloning \x1b[37;1m%s\x1b[0m into %s...\n", repo.url, repo.path)
  69. err = exec.Command("git", "clone", repo.url, repo.path).Run()
  70. if err != nil{
  71. fmt.Println("can run clonse")
  72. }
  73. } else {
  74. fmt.Printf("Checking \x1b[37;1m%s\x1b[0m from %s...\n", repo.url, repo.path)
  75. err = exec.Command("git", "fetch").Run()
  76. if err != nil{
  77. fmt.Println("can run fetch")
  78. }
  79. }
  80. if err := os.Chdir(fmt.Sprintf(repo.path)); err != nil {
  81. fmt.Println("cant chdir")
  82. }
  83. gitFormat := "--format=%H%n%an%n%s%n%ci"
  84. out, err = exec.Command("git", "rev-list", "--all",
  85. "--remotes", "--topo-order", gitFormat).Output()
  86. if err != nil {
  87. fmt.Println("problem with rev list")
  88. }
  89. revListLines := bytes.Split(out, []byte("\n"))
  90. commits := parseRevList(revListLines)
  91. for _, commit := range commits {
  92. if commit.Hash == "" {
  93. continue
  94. }
  95. commitWG.Add(1)
  96. go auditDiff(commit, repo, &commitWG, &gitLeakReceiverWG, opts,
  97. semaphoreChan, gitLeaksChan)
  98. if commit.Hash == opts.SinceCommit {
  99. break
  100. }
  101. }
  102. go reportAggregator(&gitLeakReceiverWG, gitLeaksChan, &leaks)
  103. commitWG.Wait()
  104. gitLeakReceiverWG.Wait()
  105. // repo audit has finished
  106. repo.leaks = leaks
  107. if opts.EnableJSON && len(leaks) != 0 {
  108. repo.writeReport(owner)
  109. }
  110. return nil
  111. }
  112. func (repo *Repo) log() {
  113. }
  114. // Used by audit, writeReport will generate a report and write it out to
  115. // $GITLEAKS_HOME/report/<owner>/<repo>. No report will be generated if
  116. // no leaks have been found
  117. func (repo *Repo) writeReport(owner *Owner) {
  118. reportJSON, _ := json.MarshalIndent(repo.leaks, "", "\t")
  119. if _, err := os.Stat(owner.reportPath); os.IsNotExist(err) {
  120. os.Mkdir(repo.owner.reportPath, os.ModePerm)
  121. }
  122. reportFileName := fmt.Sprintf("%s_leaks.json", repo.name)
  123. reportFile := filepath.Join(owner.reportPath, reportFileName)
  124. err := ioutil.WriteFile(reportFile, reportJSON, 0644)
  125. if err != nil {
  126. fmt.Println("cant write report")
  127. }
  128. fmt.Printf("Report written to %s\n", reportFile)
  129. }
  130. // parseRevList is responsible for parsing the output of
  131. // $ `git rev-list --all -remotes --topo-order --format=%H%n%an%n%s%n%ci`
  132. // sample output from the above command looks like:
  133. // ...
  134. // SHA
  135. // Author Name
  136. // Commit Msg
  137. // Commit Date
  138. // ...
  139. // Used by audit
  140. func parseRevList(revList [][]byte) []Commit {
  141. var commits []Commit
  142. for i := 0; i < len(revList)-1; i = i + 5 {
  143. commit := Commit{
  144. Hash: string(revList[i+1]),
  145. Author: string(revList[i+2]),
  146. Msg: string(revList[i+3]),
  147. Time: string(revList[i+4]),
  148. }
  149. commits = append(commits, commit)
  150. }
  151. return commits
  152. }
  153. // reportAggregator is a go func responsible for ...
  154. func reportAggregator(gitLeakReceiverWG *sync.WaitGroup, gitLeaks chan Leak, leaks *[]Leak) {
  155. for gitLeak := range gitLeaks {
  156. b, err := json.MarshalIndent(gitLeak, "", " ")
  157. if err != nil {
  158. fmt.Println("failed to output leak:", err)
  159. }
  160. fmt.Println(string(b))
  161. *leaks = append(*leaks, gitLeak)
  162. gitLeakReceiverWG.Done()
  163. }
  164. }
  165. // Used by audit, auditDiff is a go func responsible for diffing and auditing a commit.
  166. // Three channels are input here: 1. a semaphore to bind gitleaks, 2. a leak stream, 3. error handling (TODO)
  167. // This func performs a diff and runs regexes checks on each line of the diff.
  168. func auditDiff(currCommit Commit, repo *Repo, commitWG *sync.WaitGroup,
  169. gitLeakReceiverWG *sync.WaitGroup, opts *Options, semaphoreChan chan struct{},
  170. gitLeaks chan Leak) {
  171. // signal to WG this diff is done being audited
  172. defer commitWG.Done()
  173. if err := os.Chdir(fmt.Sprintf(repo.path)); err != nil {
  174. log.Fatal(err)
  175. }
  176. commitCmp := fmt.Sprintf("%s^!", currCommit.Hash)
  177. semaphoreChan <- struct{}{}
  178. out, err := exec.Command("git", "diff", commitCmp).Output()
  179. <-semaphoreChan
  180. if err != nil {
  181. // TODO
  182. if strings.Contains(err.Error(), "too many files open") {
  183. log.Printf("error retrieving diff for commit %s. Try turning concurrency down. %v\n", currCommit, err)
  184. }
  185. }
  186. leaks := doChecks(string(out), currCommit, opts, repo)
  187. if len(leaks) == 0 {
  188. return
  189. }
  190. for _, leak := range leaks {
  191. gitLeakReceiverWG.Add(1)
  192. gitLeaks <- leak
  193. }
  194. }