encodings.go 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161
  1. package codec
  2. import (
  3. "fmt"
  4. "math"
  5. "strings"
  6. "github.com/zricethezav/gitleaks/v8/regexp"
  7. )
  8. var (
  9. // encodingsRe is a regex built by combining all the encoding patterns
  10. // into named capture groups so that a single pass can detect multiple
  11. // encodings
  12. encodingsRe *regexp.Regexp
  13. // encodings contains all the encoding configurations for the detector.
  14. // The precedence is important. You want more specific encodings to
  15. // have a higher precedence or encodings that partially encode the
  16. // values (e.g. percent) unlike encodings that fully encode the string
  17. // (e.g. base64). If two encoding matches overlap the decoder will use
  18. // this order to determine which encoding should wait till the next pass.
  19. encodings = []*encoding{
  20. {
  21. kind: percentKind,
  22. pattern: `%[0-9A-Fa-f]{2}(?:.*%[0-9A-Fa-f]{2})?`,
  23. decode: decodePercent,
  24. },
  25. {
  26. kind: unicodeKind,
  27. pattern: `(?:(?:U\+[a-fA-F0-9]{4}(?:\s|$))+|(?i)(?:\\{1,2}u[a-fA-F0-9]{4})+)`,
  28. decode: decodeUnicode,
  29. },
  30. {
  31. kind: hexKind,
  32. pattern: `[0-9A-Fa-f]{32,}`,
  33. decode: decodeHex,
  34. },
  35. {
  36. kind: base64Kind,
  37. pattern: `[\w\/+-]{16,}={0,2}`,
  38. decode: decodeBase64,
  39. },
  40. }
  41. )
  42. // encodingNames is used to map the encodingKinds to their name
  43. var encodingNames = []string{
  44. "percent",
  45. "unicode",
  46. "hex",
  47. "base64",
  48. }
  49. // encodingKind can be or'd together to capture all of the unique encodings
  50. // that were present in a segment
  51. type encodingKind int
  52. var (
  53. // make sure these go up by powers of 2
  54. percentKind = encodingKind(1)
  55. unicodeKind = encodingKind(2)
  56. hexKind = encodingKind(4)
  57. base64Kind = encodingKind(8)
  58. )
  59. func (e encodingKind) String() string {
  60. i := int(math.Log2(float64(e)))
  61. if i >= len(encodingNames) {
  62. return ""
  63. }
  64. return encodingNames[i]
  65. }
  66. // kinds returns a list of encodingKinds combined in this one
  67. func (e encodingKind) kinds() []encodingKind {
  68. kinds := []encodingKind{}
  69. for i := 0; i < len(encodingNames); i++ {
  70. if kind := int(e) & int(math.Pow(2, float64(i))); kind != 0 {
  71. kinds = append(kinds, encodingKind(kind))
  72. }
  73. }
  74. return kinds
  75. }
  76. // encodingMatch represents a match of an encoding in the text
  77. type encodingMatch struct {
  78. encoding *encoding
  79. startEnd
  80. }
  81. // encoding represent a type of coding supported by the decoder.
  82. type encoding struct {
  83. // the kind of decoding (e.g. base64, etc)
  84. kind encodingKind
  85. // the regex pattern that matches the encoding format
  86. pattern string
  87. // take the match and return the decoded value
  88. decode func(string) string
  89. // determine which encoding should win out when two overlap
  90. precedence int
  91. }
  92. func init() {
  93. count := len(encodings)
  94. namedPatterns := make([]string, count)
  95. for i, encoding := range encodings {
  96. encoding.precedence = count - i
  97. namedPatterns[i] = fmt.Sprintf(
  98. "(?P<%s>%s)",
  99. encoding.kind,
  100. encoding.pattern,
  101. )
  102. }
  103. encodingsRe = regexp.MustCompile(strings.Join(namedPatterns, "|"))
  104. }
  105. // findEncodingMatches finds as many encodings as it can for this pass
  106. func findEncodingMatches(data string) []encodingMatch {
  107. var all []encodingMatch
  108. for _, matchIndex := range encodingsRe.FindAllStringSubmatchIndex(data, -1) {
  109. // Add the encodingMatch with its proper encoding
  110. for i, j := 2, 0; i < len(matchIndex); i, j = i+2, j+1 {
  111. if matchIndex[i] > -1 {
  112. all = append(all, encodingMatch{
  113. encoding: encodings[j],
  114. startEnd: startEnd{
  115. start: matchIndex[i],
  116. end: matchIndex[i+1],
  117. },
  118. })
  119. }
  120. }
  121. }
  122. totalMatches := len(all)
  123. if totalMatches == 1 {
  124. return all
  125. }
  126. // filter out lower precedence ones that overlap their neigbors
  127. filtered := make([]encodingMatch, 0, len(all))
  128. for i, m := range all {
  129. if i > 0 {
  130. prev := all[i-1]
  131. if m.overlaps(prev.startEnd) && prev.encoding.precedence > m.encoding.precedence {
  132. continue // skip this one
  133. }
  134. }
  135. if i+1 < totalMatches {
  136. next := all[i+1]
  137. if m.overlaps(next.startEnd) && next.encoding.precedence > m.encoding.precedence {
  138. continue // skip this one
  139. }
  140. }
  141. filtered = append(filtered, m)
  142. }
  143. return filtered
  144. }