4
0

decoder.go 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303
  1. package detect
  2. import (
  3. "bytes"
  4. "encoding/base64"
  5. "fmt"
  6. "regexp"
  7. "unicode"
  8. "github.com/zricethezav/gitleaks/v8/logging"
  9. )
  10. var b64LikelyChars [128]byte
  11. var b64Regexp = regexp.MustCompile(`[\w/+-]{16,}={0,3}`)
  12. var decoders = []func(string) ([]byte, error){
  13. base64.StdEncoding.DecodeString,
  14. base64.RawURLEncoding.DecodeString,
  15. }
  16. func init() {
  17. // Basically look for anything that isn't just letters
  18. for _, c := range `0123456789+/-_` {
  19. b64LikelyChars[c] = 1
  20. }
  21. }
  22. // EncodedSegment represents a portion of text that is encoded in some way.
  23. // `decode` supports recusive decoding and can result in "segment trees".
  24. // There can be multiple segments in the original text, so each can be thought
  25. // of as its own tree with the root being the original segment.
  26. type EncodedSegment struct {
  27. // The parent segment in a segment tree. If nil, it is a root segment
  28. parent *EncodedSegment
  29. // Relative start/end are the bounds of the encoded value in the current pass.
  30. relativeStart int
  31. relativeEnd int
  32. // Absolute start/end refer to the bounds of the root segment in this segment
  33. // tree
  34. absoluteStart int
  35. absoluteEnd int
  36. // Decoded start/end refer to the bounds of the decoded value in the current
  37. // pass. These can differ from relative values because decoding can shrink
  38. // or grow the size of the segment.
  39. decodedStart int
  40. decodedEnd int
  41. // This is the actual decoded content in the segment
  42. decodedValue string
  43. // This is the type of encoding
  44. encoding string
  45. }
  46. // isChildOf inspects the bounds of two segments to determine
  47. // if one should be the child of another
  48. func (s EncodedSegment) isChildOf(parent EncodedSegment) bool {
  49. return parent.decodedStart <= s.relativeStart && parent.decodedEnd >= s.relativeEnd
  50. }
  51. // decodedOverlaps checks if the decoded bounds of the segment overlaps a range
  52. func (s EncodedSegment) decodedOverlaps(start, end int) bool {
  53. return start <= s.decodedEnd && end >= s.decodedStart
  54. }
  55. // adjustMatchIndex takes the matchIndex from the current decoding pass and
  56. // updates it to match the absolute matchIndex in the original text.
  57. func (s EncodedSegment) adjustMatchIndex(matchIndex []int) []int {
  58. // The match is within the bounds of the segment so we just return
  59. // the absolute start and end of the root segment.
  60. if s.decodedStart <= matchIndex[0] && matchIndex[1] <= s.decodedEnd {
  61. return []int{
  62. s.absoluteStart,
  63. s.absoluteEnd,
  64. }
  65. }
  66. // Since it overlaps one side and/or the other, we're going to have to adjust
  67. // and climb parents until we're either at the root or we've determined
  68. // we're fully inside one of the parent segments.
  69. adjustedMatchIndex := make([]int, 2)
  70. if matchIndex[0] < s.decodedStart {
  71. // It starts before the encoded segment so adjust the start to match
  72. // the location before it was decoded
  73. matchStartDelta := s.decodedStart - matchIndex[0]
  74. adjustedMatchIndex[0] = s.relativeStart - matchStartDelta
  75. } else {
  76. // It starts within the encoded segment so set the bound to the
  77. // relative start
  78. adjustedMatchIndex[0] = s.relativeStart
  79. }
  80. if matchIndex[1] > s.decodedEnd {
  81. // It ends after the encoded segment so adjust the end to match
  82. // the location before it was decoded
  83. matchEndDelta := matchIndex[1] - s.decodedEnd
  84. adjustedMatchIndex[1] = s.relativeEnd + matchEndDelta
  85. } else {
  86. // It ends within the encoded segment so set the bound to the relative end
  87. adjustedMatchIndex[1] = s.relativeEnd
  88. }
  89. // We're still not at a root segment so we'll need to keep on adjusting
  90. if s.parent != nil {
  91. return s.parent.adjustMatchIndex(adjustedMatchIndex)
  92. }
  93. return adjustedMatchIndex
  94. }
  95. // depth reports how many levels of decoding needed to be done (default is 1)
  96. func (s EncodedSegment) depth() int {
  97. depth := 1
  98. // Climb the tree and increment the depth
  99. for current := &s; current.parent != nil; current = current.parent {
  100. depth++
  101. }
  102. return depth
  103. }
  104. // tags returns additional meta data tags related to the types of segments
  105. func (s EncodedSegment) tags() []string {
  106. return []string{
  107. fmt.Sprintf("decoded:%s", s.encoding),
  108. fmt.Sprintf("decode-depth:%d", s.depth()),
  109. }
  110. }
  111. // Decoder decodes various types of data in place
  112. type Decoder struct {
  113. decodedMap map[string]string
  114. }
  115. // NewDecoder creates a default decoder struct
  116. func NewDecoder() *Decoder {
  117. return &Decoder{
  118. decodedMap: make(map[string]string),
  119. }
  120. }
  121. // decode returns the data with the values decoded in-place
  122. func (d *Decoder) decode(data string, parentSegments []EncodedSegment) (string, []EncodedSegment) {
  123. segments := d.findEncodedSegments(data, parentSegments)
  124. if len(segments) > 0 {
  125. result := bytes.NewBuffer(make([]byte, 0, len(data)))
  126. relativeStart := 0
  127. for _, segment := range segments {
  128. result.WriteString(data[relativeStart:segment.relativeStart])
  129. result.WriteString(segment.decodedValue)
  130. relativeStart = segment.relativeEnd
  131. }
  132. result.WriteString(data[relativeStart:])
  133. return result.String(), segments
  134. }
  135. return data, segments
  136. }
  137. // findEncodedSegments finds the encoded segments in the data and updates the
  138. // segment tree for this pass
  139. func (d *Decoder) findEncodedSegments(data string, parentSegments []EncodedSegment) []EncodedSegment {
  140. if len(data) == 0 {
  141. return []EncodedSegment{}
  142. }
  143. matchIndices := b64Regexp.FindAllStringIndex(data, -1)
  144. if matchIndices == nil {
  145. return []EncodedSegment{}
  146. }
  147. segments := make([]EncodedSegment, 0, len(matchIndices))
  148. // Keeps up with offsets from the text changing size as things are decoded
  149. decodedShift := 0
  150. for _, matchIndex := range matchIndices {
  151. encodedValue := data[matchIndex[0]:matchIndex[1]]
  152. if !isLikelyB64(encodedValue) {
  153. d.decodedMap[encodedValue] = ""
  154. continue
  155. }
  156. decodedValue, alreadyDecoded := d.decodedMap[encodedValue]
  157. // We haven't decoded this yet, so go ahead and decode it
  158. if !alreadyDecoded {
  159. decodedValue = decodeValue(encodedValue)
  160. d.decodedMap[encodedValue] = decodedValue
  161. }
  162. // Skip this segment because there was nothing to check
  163. if len(decodedValue) == 0 {
  164. continue
  165. }
  166. // Create a segment for the encoded data
  167. segment := EncodedSegment{
  168. relativeStart: matchIndex[0],
  169. relativeEnd: matchIndex[1],
  170. absoluteStart: matchIndex[0],
  171. absoluteEnd: matchIndex[1],
  172. decodedStart: matchIndex[0] + decodedShift,
  173. decodedEnd: matchIndex[0] + decodedShift + len(decodedValue),
  174. decodedValue: decodedValue,
  175. encoding: "base64",
  176. }
  177. // Shift decoded start and ends based on size changes
  178. decodedShift += len(decodedValue) - len(encodedValue)
  179. // Adjust the absolute position of segments contained in parent segments
  180. for _, parentSegment := range parentSegments {
  181. if segment.isChildOf(parentSegment) {
  182. segment.absoluteStart = parentSegment.absoluteStart
  183. segment.absoluteEnd = parentSegment.absoluteEnd
  184. segment.parent = &parentSegment
  185. break
  186. }
  187. }
  188. logging.Debug().Msgf("segment found: %#v", segment)
  189. segments = append(segments, segment)
  190. }
  191. return segments
  192. }
  193. // decoders tries a list of decoders and returns the first successful one
  194. func decodeValue(encodedValue string) string {
  195. for _, decoder := range decoders {
  196. decodedValue, err := decoder(encodedValue)
  197. if err == nil && len(decodedValue) > 0 && isASCII(decodedValue) {
  198. return string(decodedValue)
  199. }
  200. }
  201. return ""
  202. }
  203. func isASCII(b []byte) bool {
  204. for i := 0; i < len(b); i++ {
  205. if b[i] > unicode.MaxASCII || b[i] < '\t' {
  206. return false
  207. }
  208. }
  209. return true
  210. }
  211. // Skip a lot of method signatures and things at the risk of missing about
  212. // 1% of base64
  213. func isLikelyB64(s string) bool {
  214. for _, c := range s {
  215. if b64LikelyChars[c] != 0 {
  216. return true
  217. }
  218. }
  219. return false
  220. }
  221. // Find a segment where the decoded bounds overlaps a range
  222. func segmentWithDecodedOverlap(encodedSegments []EncodedSegment, start, end int) *EncodedSegment {
  223. for _, segment := range encodedSegments {
  224. if segment.decodedOverlaps(start, end) {
  225. return &segment
  226. }
  227. }
  228. return nil
  229. }
  230. func (s EncodedSegment) lineStartIndex(currentRaw string) int {
  231. for i := s.decodedStart; i > -1; i-- {
  232. c := currentRaw[i]
  233. if c == '\n' {
  234. return i
  235. }
  236. }
  237. return 0
  238. }
  239. func (s EncodedSegment) lineEndIndex(currentRaw string, matchLen int) int {
  240. for i := s.decodedStart; i < s.decodedStart+matchLen; i++ {
  241. c := currentRaw[i]
  242. if c == '\n' {
  243. return i
  244. }
  245. }
  246. return len(currentRaw) - 1
  247. }