| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161 |
- package codec
- import (
- "fmt"
- "math"
- "strings"
- "github.com/zricethezav/gitleaks/v8/regexp"
- )
- var (
- // encodingsRe is a regex built by combining all the encoding patterns
- // into named capture groups so that a single pass can detect multiple
- // encodings
- encodingsRe *regexp.Regexp
- // encodings contains all the encoding configurations for the detector.
- // The precedence is important. You want more specific encodings to
- // have a higher precedence or encodings that partially encode the
- // values (e.g. percent) unlike encodings that fully encode the string
- // (e.g. base64). If two encoding matches overlap the decoder will use
- // this order to determine which encoding should wait till the next pass.
- encodings = []*encoding{
- {
- kind: percentKind,
- pattern: `%[0-9A-Fa-f]{2}(?:.*%[0-9A-Fa-f]{2})?`,
- decode: decodePercent,
- },
- {
- kind: unicodeKind,
- pattern: `(?:(?:U\+[a-fA-F0-9]{4}(?:\s|$))+|(?i)(?:\\{1,2}u[a-fA-F0-9]{4})+)`,
- decode: decodeUnicode,
- },
- {
- kind: hexKind,
- pattern: `[0-9A-Fa-f]{32,}`,
- decode: decodeHex,
- },
- {
- kind: base64Kind,
- pattern: `[\w\/+-]{16,}={0,2}`,
- decode: decodeBase64,
- },
- }
- )
// encodingNames maps each encodingKind to its name. The entry at index i
// names the kind whose value is 1<<i, so the order here must match the order
// of the kind constants below.
var encodingNames = []string{
	"percent",
	"unicode",
	"hex",
	"base64",
}

// encodingKind identifies an encoding. Every kind occupies its own bit, so
// kinds can be or'd together to capture all of the unique encodings that
// were present in a segment.
type encodingKind int

// The kinds are successive powers of two; iota keeps that invariant when a
// new kind is appended (remember to add its name to encodingNames as well).
const (
	percentKind encodingKind = 1 << iota
	unicodeKind
	hexKind
	base64Kind
)

// String returns the name of the kind, or "" when there is no name for it
// (the value is zero, negative, or larger than any known kind). For a
// combination of kinds it returns the name of the highest-valued kind set.
func (e encodingKind) String() string {
	// Log2 of zero is -Inf and Log2 of a negative number is NaN. Converting
	// either to int does not give a usable index and previously led to an
	// out-of-range panic, so reject those values before doing the math.
	if e <= 0 {
		return ""
	}
	i := int(math.Log2(float64(e)))
	if i >= len(encodingNames) {
		return ""
	}
	return encodingNames[i]
}

// kinds returns the individual encodingKinds combined in e, ordered from the
// lowest value to the highest. Bits that have no entry in encodingNames are
// ignored. The result is empty, but never nil, when no known kind is set.
func (e encodingKind) kinds() []encodingKind {
	kinds := make([]encodingKind, 0, len(encodingNames))
	for i := range encodingNames {
		// Integer shift instead of float math.Pow: exact and cheaper.
		if bit := encodingKind(1) << i; e&bit != 0 {
			kinds = append(kinds, bit)
		}
	}
	return kinds
}
- // encodingMatch represents a match of an encoding in the text
- type encodingMatch struct {
- encoding *encoding
- startEnd
- }
- // encoding represent a type of coding supported by the decoder.
- type encoding struct {
- // the kind of decoding (e.g. base64, etc)
- kind encodingKind
- // the regex pattern that matches the encoding format
- pattern string
- // take the match and return the decoded value
- decode func(string) string
- // determine which encoding should win out when two overlap
- precedence int
- }
- func init() {
- count := len(encodings)
- namedPatterns := make([]string, count)
- for i, encoding := range encodings {
- encoding.precedence = count - i
- namedPatterns[i] = fmt.Sprintf(
- "(?P<%s>%s)",
- encoding.kind,
- encoding.pattern,
- )
- }
- encodingsRe = regexp.MustCompile(strings.Join(namedPatterns, "|"))
- }
- // findEncodingMatches finds as many encodings as it can for this pass
- func findEncodingMatches(data string) []encodingMatch {
- var all []encodingMatch
- for _, matchIndex := range encodingsRe.FindAllStringSubmatchIndex(data, -1) {
- // Add the encodingMatch with its proper encoding
- for i, j := 2, 0; i < len(matchIndex); i, j = i+2, j+1 {
- if matchIndex[i] > -1 {
- all = append(all, encodingMatch{
- encoding: encodings[j],
- startEnd: startEnd{
- start: matchIndex[i],
- end: matchIndex[i+1],
- },
- })
- }
- }
- }
- totalMatches := len(all)
- if totalMatches == 1 {
- return all
- }
- // filter out lower precedence ones that overlap their neigbors
- filtered := make([]encodingMatch, 0, len(all))
- for i, m := range all {
- if i > 0 {
- prev := all[i-1]
- if m.overlaps(prev.startEnd) && prev.encoding.precedence > m.encoding.precedence {
- continue // skip this one
- }
- }
- if i+1 < totalMatches {
- next := all[i+1]
- if m.overlaps(next.startEnd) && next.encoding.precedence > m.encoding.precedence {
- continue // skip this one
- }
- }
- filtered = append(filtered, m)
- }
- return filtered
- }
|