فهرست منبع

Percent/URL Decoding Support (#1831)

* Add initial percent decoding support

* Refactor multi encoding support

* Add detect tests to confirm positions

* Avoid a few extra passes during decoding

* Do multiple passes for finding encodings re

* Fix issue with overlapping encodings when doing separate passes
bplaxco 9 ماه پیش
والد
کامیت
badcfda739

+ 3 - 1
README.md

@@ -449,7 +449,9 @@ ways:
 
 Currently supported encodings:
 
-- `base64` (both standard and base64url)
+- **percent** - Percent (URL) encoded values that decode to printable ASCII
+- **hex** - Hex encoded values of 32 or more characters that decode to printable ASCII
+- **base64** - Base64 encoded values (both standard and base64url) of 16 or more characters that decode to printable ASCII
 
 #### Reporting
 

+ 34 - 0
detect/codec/ascii.go

@@ -0,0 +1,34 @@
+package codec
+
+// printableASCII is a lookup table indexed by byte value. An entry is true
+// when the byte lies strictly between 0x08 and 0x7f, i.e. from tab ('\t')
+// through tilde ('~').
+var printableASCII [256]bool
+
+func init() {
+	// Flag every value from '\t' (0x09) up to and including '~' (0x7e)
+	for c := '\t'; c <= '~'; c++ {
+		printableASCII[c] = true
+	}
+}
+
+// isPrintableASCII reports whether every byte in b is flagged in the
+// printableASCII table. An empty slice yields true.
+func isPrintableASCII(b []byte) bool {
+	for i := range b {
+		if !printableASCII[b[i]] {
+			return false
+		}
+	}
+
+	return true
+}
+
+// hasByte reports whether data contains at least one byte b for which
+// byteset[b] is true. Bytes whose value is beyond the end of byteset are
+// treated as not being in the set, so a byteset with fewer than 256 entries
+// is safe to pass.
+func hasByte(data string, byteset []bool) bool {
+	for i := 0; i < len(data); i++ {
+		// Guard the index so a short byteset cannot cause a panic
+		if c := int(data[i]); c < len(byteset) && byteset[c] {
+			return true
+		}
+	}
+	return false
+}

+ 39 - 0
detect/codec/base64.go

@@ -0,0 +1,39 @@
+package codec
+
+import (
+	"encoding/base64"
+)
+
+// likelyBase64Chars flags the bytes (digits and the symbols `+/-_`) of which
+// base64 data is expected to contain at least one. Skipping candidates that
+// have none of them risks missing about 1% of base64 encoded data, but avoids
+// trying to decode the many long, purely alphabetic symbols found in code.
+var likelyBase64Chars = make([]bool, 256)
+
+func init() {
+	const likely = `0123456789+/-_`
+	for i := 0; i < len(likely); i++ {
+		likelyBase64Chars[likely[i]] = true
+	}
+}
+
+// decodeBase64 returns the decoded form of encodedValue when it is valid
+// base64 whose decoded bytes are all printable ASCII. Standard base64 is
+// tried first, then unpadded base64url. An empty string is returned when the
+// value doesn't look like base64 or neither decoding succeeds.
+func decodeBase64(encodedValue string) string {
+	// Bail out early when none of the likely base64 characters are present
+	if !hasByte(encodedValue, likelyBase64Chars) {
+		return ""
+	}
+
+	// The order matters: standard first, base64url second
+	candidates := []*base64.Encoding{base64.StdEncoding, base64.RawURLEncoding}
+	for _, enc := range candidates {
+		raw, err := enc.DecodeString(encodedValue)
+		if err == nil && isPrintableASCII(raw) {
+			return string(raw)
+		}
+	}
+
+	return ""
+}

+ 102 - 0
detect/codec/decoder.go

@@ -0,0 +1,102 @@
+package codec
+
+import (
+	"bytes"
+	"github.com/zricethezav/gitleaks/v8/logging"
+)
+
+// Decoder decodes various types of encoded data in place.
+type Decoder struct {
+	// decodedMap holds the decoded value for each encoded value seen so far
+	decodedMap map[string]string
+}
+
+// NewDecoder returns a Decoder with an empty, ready to use decodedMap.
+func NewDecoder() *Decoder {
+	d := new(Decoder)
+	d.decodedMap = map[string]string{}
+	return d
+}
+
+// Decode replaces every encoded segment found in data with its decoded value
+// and returns the resulting text together with the segments that were
+// decoded, which serve as the predecessors for the next decoding pass. When
+// nothing is found, data is returned unchanged.
+func (d *Decoder) Decode(data string, predecessors []*EncodedSegment) (string, []*EncodedSegment) {
+	segments := d.findEncodedSegments(data, predecessors)
+	if len(segments) == 0 {
+		return data, segments
+	}
+
+	out := bytes.NewBuffer(make([]byte, 0, len(data)))
+	cursor := 0
+	for _, seg := range segments {
+		// Copy the untouched text before the segment, then the decoded value
+		out.WriteString(data[cursor:seg.encoded.start])
+		out.WriteString(seg.decodedValue)
+		cursor = seg.encoded.end
+	}
+	// Copy whatever follows the last segment
+	out.WriteString(data[cursor:])
+
+	return out.String(), segments
+}
+
+// findEncodedSegments locates every encoded span in data that decodes to a
+// non-empty value and returns one EncodedSegment per span, in match order.
+// predecessors are the segments from the previous decoding pass; they are
+// used to map positions back to the original text and to carry the depth and
+// encodings forward.
+func (d *Decoder) findEncodedSegments(data string, predecessors []*EncodedSegment) []*EncodedSegment {
+	if len(data) == 0 {
+		return []*EncodedSegment{}
+	}
+
+	// decodedShift accumulates how much the text has grown or shrunk so far
+	// from swapping earlier encoded values for their decoded values
+	decodedShift := 0
+	encodingMatches := findEncodingMatches(data)
+	segments := make([]*EncodedSegment, 0, len(encodingMatches))
+	for _, m := range encodingMatches {
+		encodedValue := data[m.start:m.end]
+		decodedValue, alreadyDecoded := d.decodedMap[encodedValue]
+
+		// Decode each distinct encoded value only once. The result is cached
+		// even when it is empty.
+		if !alreadyDecoded {
+			decodedValue = m.encoding.decode(encodedValue)
+			d.decodedMap[encodedValue] = decodedValue
+		}
+
+		// An empty decoded value means there is nothing to replace
+		if len(decodedValue) == 0 {
+			continue
+		}
+
+		segment := &EncodedSegment{
+			predecessors: predecessors,
+			original:     toOriginal(predecessors, m.startEnd),
+			encoded:      m.startEnd,
+			decoded: startEnd{
+				m.start + decodedShift,
+				m.start + decodedShift + len(decodedValue),
+			},
+			decodedValue: decodedValue,
+			encodings:    m.encoding.kind,
+			depth:        1,
+		}
+
+		// Shift decoded start and ends based on size changes
+		decodedShift += len(decodedValue) - len(encodedValue)
+
+		// Adjust depth and encoding if applicable
+		if len(segment.predecessors) != 0 {
+			// Set the depth based on the predecessors' depth in the previous pass
+			segment.depth = 1 + segment.predecessors[0].depth
+			// Inherit the encodings of every predecessor whose decoded range
+			// overlaps this segment's encoded range
+			for _, p := range segment.predecessors {
+				if segment.encoded.overlaps(p.decoded) {
+					segment.encodings |= p.encodings
+				}
+			}
+		}
+
+		segments = append(segments, segment)
+		logging.Debug().Msgf(
+			"segment found: original=%s pos=%s: %q -> %q",
+			segment.original,
+			segment.encoded,
+			encodedValue,
+			segment.decodedValue,
+		)
+	}
+
+	return segments
+}

+ 44 - 7
detect/decoder_test.go → detect/codec/decoder_test.go

@@ -1,9 +1,10 @@
-package detect
+package codec
 
 import (
-	"testing"
-
+	"encoding/hex"
 	"github.com/stretchr/testify/assert"
+	"net/url"
+	"testing"
 )
 
 func TestDecode(t *testing.T) {
@@ -66,8 +67,8 @@ func TestDecode(t *testing.T) {
 		},
 		{
 			name:     "b64-url-safe: hyphen url b64",
-			chunk:    `dHJ1ZmZsZWhvZz4-ZmluZHMtc2VjcmV0cw`,
-			expected: `trufflehog>>finds-secrets`,
+			chunk:    `Z2l0bGVha3M-PmZpbmRzLXNlY3JldHM`,
+			expected: `gitleaks>>finds-secrets`,
 		},
 		{
 			name:     "b64-url-safe: underscore url b64",
@@ -79,13 +80,49 @@ func TestDecode(t *testing.T) {
 			chunk:    `a3d3fa7c2bb99e469ba55e5834ce79ee4853a8a3`,
 			expected: `a3d3fa7c2bb99e469ba55e5834ce79ee4853a8a3`,
 		},
+		{
+			name:     "url encoded value",
+			chunk:    `secret%3D%22q%24%21%40%23%24%25%5E%26%2A%28%20asdf%22`,
+			expected: `secret="q$!@#$%^&*( asdf"`,
+		},
+		{
+			name:     "hex encoded value",
+			chunk:    `secret="466973684D617048756E6B79212121363334"`,
+			expected: `secret="FishMapHunky!!!634"`,
+		},
 	}
 
 	decoder := NewDecoder()
+	fullDecode := func(data string) string {
+		segments := []*EncodedSegment{}
+		for {
+			data, segments = decoder.Decode(data, segments)
+			if len(segments) == 0 {
+				return data
+			}
+		}
+	}
+
+	// Test value decoding
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			assert.Equal(t, tt.expected, fullDecode(tt.chunk))
+		})
+	}
+
+	// Percent encode the values to test percent decoding
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			encodedChunk := url.PathEscape(tt.chunk)
+			assert.Equal(t, tt.expected, fullDecode(encodedChunk))
+		})
+	}
+
+	// Hex encode the values to test hex decoding
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			decoded, _ := decoder.decode(tt.chunk, []EncodedSegment{})
-			assert.Equal(t, tt.expected, decoded)
+			encodedChunk := hex.EncodeToString([]byte(tt.chunk))
+			assert.Equal(t, tt.expected, fullDecode(encodedChunk))
 		})
 	}
 }

+ 153 - 0
detect/codec/encodings.go

@@ -0,0 +1,153 @@
+package codec
+
+import (
+	"fmt"
+	"math"
+	"regexp"
+	"strings"
+)
+
+var (
+	// encodingsRe is the regex compiled in init by joining every encoding
+	// pattern as a named capture group, so that one pass over the text can
+	// detect several encodings
+	encodingsRe *regexp.Regexp
+	// encodings lists the encoding configurations used by the detector.
+	// Order is significant: earlier entries get a higher precedence. Put
+	// the more specific encodings, and the ones that only partially encode
+	// a value (e.g. percent), ahead of those that encode the whole string
+	// (e.g. base64). When two matches overlap, the lower precedence one
+	// waits until the next pass.
+	encodings = []*encoding{
+		{
+			kind:    percentKind,
+			pattern: `%[0-9A-Fa-f]{2}(?:.*%[0-9A-Fa-f]{2})?`,
+			decode:  decodePercent,
+		},
+		{
+			kind:    hexKind,
+			pattern: `[0-9A-Fa-f]{32,}`,
+			decode:  decodeHex,
+		},
+		{
+			kind:    base64Kind,
+			pattern: `[\w\/+-]{16,}={0,2}`,
+			decode:  decodeBase64,
+		},
+	}
+)
+
+// encodingNames maps each encodingKind to its name. The name of a kind lives
+// at index log2(kind), so the order here must follow the bit order of the
+// kinds declared below.
+var encodingNames = []string{
+	"percent",
+	"hex",
+	"base64",
+}
+
+// encodingKind is a bit flag identifying an encoding. Kinds can be or'd
+// together to capture all of the unique encodings that were present in a
+// segment.
+type encodingKind int
+
+var (
+	// make sure these go up by powers of 2
+	percentKind = encodingKind(1)
+	hexKind     = encodingKind(2)
+	base64Kind  = encodingKind(4)
+)
+
+// String returns the name of the kind. For a combination of kinds it returns
+// the name of the highest bit that is set. It returns an empty string for
+// zero, negative or unknown values.
+func (e encodingKind) String() string {
+	// Log2 is only meaningful for positive values; without this guard a zero
+	// kind would produce a negative index below
+	if e <= 0 {
+		return ""
+	}
+	i := int(math.Log2(float64(e)))
+	if i >= len(encodingNames) {
+		return ""
+	}
+	return encodingNames[i]
+}
+
+// kinds returns the individual encodingKinds combined in this one, ordered
+// from the lowest bit to the highest. Bits without a name in encodingNames
+// are ignored.
+func (e encodingKind) kinds() []encodingKind {
+	kinds := []encodingKind{}
+
+	for i := range encodingNames {
+		// Test bit i directly instead of going through floating point
+		if kind := e & (1 << i); kind != 0 {
+			kinds = append(kinds, kind)
+		}
+	}
+
+	return kinds
+}
+
+// encodingMatch represents a match of an encoding in the text: the encoding
+// whose pattern matched plus the start/end indices of the match.
+type encodingMatch struct {
+	encoding *encoding
+	startEnd
+}
+
+// encoding represents a type of encoding supported by the decoder.
+type encoding struct {
+	// the kind of decoding (e.g. base64, etc)
+	kind encodingKind
+	// the regex pattern that matches the encoding format
+	pattern string
+	// take the match and return the decoded value
+	decode func(string) string
+	// determine which encoding should win out when two overlap; a higher
+	// value wins
+	precedence int
+}
+
+// init assigns each encoding its precedence (earlier entries in encodings
+// rank higher) and compiles encodingsRe as an alternation with one named
+// capture group per encoding, in the same order as encodings.
+func init() {
+	total := len(encodings)
+	groups := make([]string, 0, total)
+	for idx, enc := range encodings {
+		enc.precedence = total - idx
+		groups = append(groups, fmt.Sprintf("(?P<%s>%s)", enc.kind, enc.pattern))
+	}
+	encodingsRe = regexp.MustCompile(strings.Join(groups, "|"))
+}
+
+// findEncodingMatches finds as many encodings as it can for this pass. It
+// returns the matches of encodingsRe in data, each tagged with the encoding
+// whose capture group matched, after dropping any match that overlaps a
+// directly neighboring match of higher precedence.
+func findEncodingMatches(data string) []encodingMatch {
+	var all []encodingMatch
+	for _, matchIndex := range encodingsRe.FindAllStringSubmatchIndex(data, -1) {
+		// Add the encodingMatch with its proper encoding. Indices 0 and 1
+		// are the whole match, so start at 2; each following pair belongs
+		// to one capture group and j tracks the corresponding encoding.
+		for i, j := 2, 0; i < len(matchIndex); i, j = i+2, j+1 {
+			// A start of -1 means this group did not take part in the match
+			if matchIndex[i] > -1 {
+				all = append(all, encodingMatch{
+					encoding: encodings[j],
+					startEnd: startEnd{
+						start: matchIndex[i],
+						end:   matchIndex[i+1],
+					},
+				})
+			}
+		}
+	}
+
+	// A single match has no neighbors to compete with
+	totalMatches := len(all)
+	if totalMatches == 1 {
+		return all
+	}
+
+	// filter out lower precedence ones that overlap their neighbors
+	filtered := make([]encodingMatch, 0, len(all))
+	for i, m := range all {
+		if i > 0 {
+			prev := all[i-1]
+			if m.overlaps(prev.startEnd) && prev.encoding.precedence > m.encoding.precedence {
+				continue // skip this one
+			}
+		}
+		if i+1 < totalMatches {
+			next := all[i+1]
+			if m.overlaps(next.startEnd) && next.encoding.precedence > m.encoding.precedence {
+				continue // skip this one
+			}
+		}
+		filtered = append(filtered, m)
+	}
+
+	return filtered
+}

+ 60 - 0
detect/codec/hex.go

@@ -0,0 +1,60 @@
+package codec
+
+// hexMap is a precalculated map of hex nibbles, indexed by byte value: the bytes '0'-'9', 'A'-'F' and 'a'-'f' map to their nibble value (0x00-0x0f) and every other byte maps to 0xff
+const hexMap = "" +
+	"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" +
+	"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" +
+	"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" +
+	"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\xff\xff\xff\xff\xff\xff" +
+	"\xff\x0a\x0b\x0c\x0d\x0e\x0f\xff\xff\xff\xff\xff\xff\xff\xff\xff" +
+	"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" +
+	"\xff\x0a\x0b\x0c\x0d\x0e\x0f\xff\xff\xff\xff\xff\xff\xff\xff\xff" +
+	"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" +
+	"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" +
+	"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" +
+	"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" +
+	"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" +
+	"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" +
+	"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" +
+	"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" +
+	"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
+
+// likelyHexChars flags the decimal digits, of which hex encoded data is
+// expected to contain at least one. Skipping candidates without any digit
+// risks missing some hex data, but avoids trying to decode the many long,
+// purely alphabetic symbols found in code.
+var likelyHexChars = make([]bool, 256)
+
+func init() {
+	for c := '0'; c <= '9'; c++ {
+		likelyHexChars[c] = true
+	}
+}
+
+// decodeHex returns the bytes represented by the hex digits in encodedValue.
+// It returns an empty string when the length is odd, when no decimal digit is
+// present, when a non-hex character is found, or when any decoded byte is not
+// printable ASCII.
+func decodeHex(encodedValue string) string {
+	n := len(encodedValue)
+	// Two characters are needed per byte, and digit-free input is skipped
+	if n%2 != 0 || !hasByte(encodedValue, likelyHexChars) {
+		return ""
+	}
+
+	out := make([]byte, 0, n/2)
+	for i := 0; i+1 < n; i += 2 {
+		hi, lo := hexMap[encodedValue[i]], hexMap[encodedValue[i+1]]
+		// hexMap yields 0xff for anything that isn't a hex digit
+		if hi|lo == '\xff' {
+			return ""
+		}
+		c := byte(hi<<4 | lo)
+		if !printableASCII[c] {
+			return ""
+		}
+		out = append(out, c)
+	}
+
+	return string(out)
+}

+ 34 - 0
detect/codec/percent.go

@@ -0,0 +1,34 @@
+package codec
+
+// decodePercent replaces each `%XX` sequence (XX being two hex digits) in
+// encodedValue with the byte it represents and copies everything else as is.
+// It returns an empty string as soon as a sequence decodes to a byte that is
+// not printable ASCII.
+func decodePercent(encodedValue string) string {
+	total := len(encodedValue)
+	out := make([]byte, 0, total)
+
+	for pos := 0; pos < total; {
+		c := encodedValue[pos]
+		// Only treat '%' as an escape when two more characters follow it
+		if c == '%' && pos+2 < total {
+			hi := hexMap[encodedValue[pos+1]]
+			lo := hexMap[encodedValue[pos+2]]
+			// hexMap yields 0xff for anything that isn't a hex digit
+			if hi|lo != '\xff' {
+				decoded := byte(hi<<4 | lo)
+				if !printableASCII[decoded] {
+					return ""
+				}
+
+				out = append(out, decoded)
+				pos += 3
+				continue
+			}
+		}
+
+		// Not an escape sequence, so keep the character unchanged
+		out = append(out, c)
+		pos++
+	}
+
+	return string(out)
+}

+ 173 - 0
detect/codec/segment.go

@@ -0,0 +1,173 @@
+package codec
+
+import (
+	"fmt"
+)
+
+// EncodedSegment represents a portion of text that is encoded in some way,
+// along with where it sits before and after a decoding pass.
+type EncodedSegment struct {
+	// predecessors are all of the segments from the previous decoding pass
+	predecessors []*EncodedSegment
+
+	// original start/end indices before decoding
+	original startEnd
+
+	// encoded start/end indices relative to the previous decoding pass.
+	// If it's a top level segment, original and encoded will be the
+	// same.
+	encoded startEnd
+
+	// decoded start/end indices in this pass after decoding
+	decoded startEnd
+
+	// decodedValue contains the decoded string for this segment
+	decodedValue string
+
+	// encodings holds the encodings that make up this segment. encodingKind
+	// values can be or'd together to hold multiple encodings
+	encodings encodingKind
+
+	// depth is how many decoding passes it took to decode this segment
+	depth int
+}
+
+// Tags returns additional meta data tags related to the types of segments
+func Tags(segments []*EncodedSegment) []string {
+	// Return an empty list if we don't have any segments
+	if len(segments) == 0 {
+		return []string{}
+	}
+
+	// Since decoding is done in passes, the depth of all the segments
+	// should be the same
+	depth := segments[0].depth
+
+	// Collect the encodings from the segments
+	encodings := segments[0].encodings
+	for i := 1; i < len(segments); i++ {
+		encodings |= segments[i].encodings
+	}
+
+	kinds := encodings.kinds()
+	tags := make([]string, len(kinds)+1)
+
+	tags[len(tags)-1] = fmt.Sprintf("decode-depth:%d", depth)
+	for i, kind := range kinds {
+		tags[i] = fmt.Sprintf("decoded:%s", kind)
+	}
+
+	return tags
+}
+
+// CurrentLine returns the part of currentRaw that spans the lines covered by
+// the decoded ranges of the segments. The result begins at the nearest
+// newline at or before the start of the merged range (or at the beginning of
+// currentRaw when there is none) and stops just before the first newline at
+// or after its end (or at the end of currentRaw). currentRaw is returned
+// unchanged when no segments are provided.
+func CurrentLine(segments []*EncodedSegment, currentRaw string) string {
+	if len(segments) == 0 {
+		return currentRaw
+	}
+
+	// Collapse all of the decoded ranges into a single span
+	span := segments[0].decoded
+	for _, seg := range segments[1:] {
+		span = span.merge(seg.decoded)
+	}
+
+	// Walk backwards to the newline that opens the range
+	lineStart := 0
+	for i := span.start; i >= 0; i-- {
+		if currentRaw[i] == '\n' {
+			lineStart = i
+			break
+		}
+	}
+
+	// Walk forwards to the newline that closes the range
+	lineEnd := len(currentRaw)
+	for i := span.end; i < len(currentRaw); i++ {
+		if currentRaw[i] == '\n' {
+			lineEnd = i
+			break
+		}
+	}
+
+	return currentRaw[lineStart:lineEnd]
+}
+
+// AdjustMatchIndex maps a match index (start, end) from the current decode
+// pass back to its location in the original text. The index is returned as
+// is when no segments are provided.
+func AdjustMatchIndex(segments []*EncodedSegment, matchIndex []int) []int {
+	if len(segments) == 0 {
+		return matchIndex
+	}
+
+	// Map the match to its original location
+	origin := toOriginal(segments, startEnd{start: matchIndex[0], end: matchIndex[1]})
+
+	return []int{origin.start, origin.end}
+}
+
+// SegmentsWithDecodedOverlap returns the segments whose decoded range
+// overlaps the range given by start and end. The result is empty, not nil,
+// when nothing overlaps.
+func SegmentsWithDecodedOverlap(segments []*EncodedSegment, start, end int) []*EncodedSegment {
+	window := startEnd{start: start, end: end}
+	matched := []*EncodedSegment{}
+
+	for i := range segments {
+		if seg := segments[i]; seg.decoded.overlaps(window) {
+			matched = append(matched, seg)
+		}
+	}
+
+	return matched
+}
+
+// toOriginal maps a start/end to its start/end in the original text.
+// The provided start/end should be relative to the decoded text produced by
+// the pass that created predecessors. With no predecessors the start/end is
+// already relative to the original text and is returned unchanged.
+func toOriginal(predecessors []*EncodedSegment, decoded startEnd) startEnd {
+	if len(predecessors) == 0 {
+		return decoded
+	}
+
+	// Map the decoded value one level up where it was encoded. A zero end
+	// doubles as the marker for "nothing mapped yet".
+	encoded := startEnd{}
+
+	for _, p := range predecessors {
+		if !p.decoded.overlaps(decoded) {
+			continue // Not in scope
+		}
+
+		// If fully contained, return the segment's original start/end
+		if p.decoded.contains(decoded) {
+			return p.original
+		}
+
+		// Map the value to be relative to the predecessor's encoded range,
+		// widened by however far it sticks out of the decoded range
+		if encoded.end == 0 {
+			encoded = p.encoded.add(p.decoded.overflow(decoded))
+		} else {
+			encoded = encoded.merge(p.encoded.add(p.decoded.overflow(decoded)))
+		}
+	}
+
+	// Should only get here if the thing passed in wasn't in a decoded
+	// value. This shouldn't be the case
+	if encoded.end == 0 {
+		return decoded
+	}
+
+	// Climb up another level
+	// (NOTE: each segment references all the predecessors)
+	return toOriginal(predecessors[0].predecessors, encoded)
+}

+ 57 - 0
detect/codec/start_end.go

@@ -0,0 +1,57 @@
+package codec
+
+import (
+	"fmt"
+)
+
+// startEnd holds the start and end indices of some data. It mainly exists as
+// a helper for passing and comparing ranges.
+type startEnd struct {
+	start int
+	end   int
+}
+
+// sub returns s with the start and end of o subtracted from its own.
+func (s startEnd) sub(o startEnd) startEnd {
+	s.start -= o.start
+	s.end -= o.end
+	return s
+}
+
+// add returns s with the start and end of o added to its own.
+func (s startEnd) add(o startEnd) startEnd {
+	s.start += o.start
+	s.end += o.end
+	return s
+}
+
+// overlaps reports whether the two startEnds overlap. Ranges that merely
+// touch (one ends where the other starts) count as overlapping.
+func (s startEnd) overlaps(o startEnd) bool {
+	return !(o.start > s.end || o.end < s.start)
+}
+
+// contains reports whether o lies entirely within s.
+func (s startEnd) contains(o startEnd) bool {
+	startsInside := s.start <= o.start
+	endsInside := o.end <= s.end
+	return startsInside && endsInside
+}
+
+// overflow returns how far o extends beyond the bounds of s: a start that is
+// zero or negative and an end that is zero or positive.
+func (s startEnd) overflow(o startEnd) startEnd {
+	union := s.merge(o)
+	return union.sub(s)
+}
+
+// merge returns the smallest startEnd that encompasses both s and o.
+func (s startEnd) merge(o startEnd) startEnd {
+	return startEnd{
+		start: min(s.start, o.start),
+		end:   max(s.end, o.end),
+	}
+}
+
+// String formats the startEnd as "[start,end]" for clearer debugging.
+func (s startEnd) String() string {
+	return fmt.Sprintf("[%d,%d]", s.start, s.end)
+}

+ 0 - 306
detect/decoder.go

@@ -1,306 +0,0 @@
-package detect
-
-import (
-	"bytes"
-	"encoding/base64"
-	"fmt"
-	"regexp"
-	"unicode"
-
-	"github.com/zricethezav/gitleaks/v8/logging"
-)
-
-var b64LikelyChars [128]byte
-var b64Regexp = regexp.MustCompile(`[\w/+-]{16,}={0,3}`)
-var decoders = []func(string) ([]byte, error){
-	base64.StdEncoding.DecodeString,
-	base64.RawURLEncoding.DecodeString,
-}
-
-func init() {
-	// Basically look for anything that isn't just letters
-	for _, c := range `0123456789+/-_` {
-		b64LikelyChars[c] = 1
-	}
-}
-
-// EncodedSegment represents a portion of text that is encoded in some way.
-// `decode` supports recusive decoding and can result in "segment trees".
-// There can be multiple segments in the original text, so each can be thought
-// of as its own tree with the root being the original segment.
-type EncodedSegment struct {
-	// The parent segment in a segment tree. If nil, it is a root segment
-	parent *EncodedSegment
-
-	// Relative start/end are the bounds of the encoded value in the current pass.
-	relativeStart int
-	relativeEnd   int
-
-	// Absolute start/end refer to the bounds of the root segment in this segment
-	// tree
-	absoluteStart int
-	absoluteEnd   int
-
-	// Decoded start/end refer to the bounds of the decoded value in the current
-	// pass. These can differ from relative values because decoding can shrink
-	// or grow the size of the segment.
-	decodedStart int
-	decodedEnd   int
-
-	// This is the actual decoded content in the segment
-	decodedValue string
-
-	// This is the type of encoding
-	encoding string
-}
-
-// isChildOf inspects the bounds of two segments to determine
-// if one should be the child of another
-func (s EncodedSegment) isChildOf(parent EncodedSegment) bool {
-	return parent.decodedStart <= s.relativeStart && parent.decodedEnd >= s.relativeEnd
-}
-
-// decodedOverlaps checks if the decoded bounds of the segment overlaps a range
-func (s EncodedSegment) decodedOverlaps(start, end int) bool {
-	return start <= s.decodedEnd && end >= s.decodedStart
-}
-
-// adjustMatchIndex takes the matchIndex from the current decoding pass and
-// updates it to match the absolute matchIndex in the original text.
-func (s EncodedSegment) adjustMatchIndex(matchIndex []int) []int {
-	// The match is within the bounds of the segment so we just return
-	// the absolute start and end of the root segment.
-	if s.decodedStart <= matchIndex[0] && matchIndex[1] <= s.decodedEnd {
-		return []int{
-			s.absoluteStart,
-			s.absoluteEnd,
-		}
-	}
-
-	// Since it overlaps one side and/or the other, we're going to have to adjust
-	// and climb parents until we're either at the root or we've determined
-	// we're fully inside one of the parent segments.
-	adjustedMatchIndex := make([]int, 2)
-
-	if matchIndex[0] < s.decodedStart {
-		// It starts before the encoded segment so adjust the start to match
-		// the location before it was decoded
-		matchStartDelta := s.decodedStart - matchIndex[0]
-		adjustedMatchIndex[0] = s.relativeStart - matchStartDelta
-	} else {
-		// It starts within the encoded segment so set the bound to the
-		// relative start
-		adjustedMatchIndex[0] = s.relativeStart
-	}
-
-	if matchIndex[1] > s.decodedEnd {
-		// It ends after the encoded segment so adjust the end to match
-		// the location before it was decoded
-		matchEndDelta := matchIndex[1] - s.decodedEnd
-		adjustedMatchIndex[1] = s.relativeEnd + matchEndDelta
-	} else {
-		// It ends within the encoded segment so set the bound to the relative end
-		adjustedMatchIndex[1] = s.relativeEnd
-	}
-
-	// We're still not at a root segment so we'll need to keep on adjusting
-	if s.parent != nil {
-		return s.parent.adjustMatchIndex(adjustedMatchIndex)
-	}
-
-	return adjustedMatchIndex
-}
-
-// depth reports how many levels of decoding needed to be done (default is 1)
-func (s EncodedSegment) depth() int {
-	depth := 1
-
-	// Climb the tree and increment the depth
-	for current := &s; current.parent != nil; current = current.parent {
-		depth++
-	}
-
-	return depth
-}
-
-// tags returns additional meta data tags related to the types of segments
-func (s EncodedSegment) tags() []string {
-	return []string{
-		fmt.Sprintf("decoded:%s", s.encoding),
-		fmt.Sprintf("decode-depth:%d", s.depth()),
-	}
-}
-
-// Decoder decodes various types of data in place
-type Decoder struct {
-	decodedMap map[string]string
-}
-
-// NewDecoder creates a default decoder struct
-func NewDecoder() *Decoder {
-	return &Decoder{
-		decodedMap: make(map[string]string),
-	}
-}
-
-// decode returns the data with the values decoded in-place
-func (d *Decoder) decode(data string, parentSegments []EncodedSegment) (string, []EncodedSegment) {
-	segments := d.findEncodedSegments(data, parentSegments)
-
-	if len(segments) > 0 {
-		result := bytes.NewBuffer(make([]byte, 0, len(data)))
-
-		relativeStart := 0
-		for _, segment := range segments {
-			result.WriteString(data[relativeStart:segment.relativeStart])
-			result.WriteString(segment.decodedValue)
-			relativeStart = segment.relativeEnd
-		}
-		result.WriteString(data[relativeStart:])
-
-		return result.String(), segments
-	}
-
-	return data, segments
-}
-
-// findEncodedSegments finds the encoded segments in the data and updates the
-// segment tree for this pass
-func (d *Decoder) findEncodedSegments(data string, parentSegments []EncodedSegment) []EncodedSegment {
-	if len(data) == 0 {
-		return []EncodedSegment{}
-	}
-
-	matchIndices := b64Regexp.FindAllStringIndex(data, -1)
-	if matchIndices == nil {
-		return []EncodedSegment{}
-	}
-
-	segments := make([]EncodedSegment, 0, len(matchIndices))
-
-	// Keeps up with offsets from the text changing size as things are decoded
-	decodedShift := 0
-
-	for _, matchIndex := range matchIndices {
-		encodedValue := data[matchIndex[0]:matchIndex[1]]
-
-		if !isLikelyB64(encodedValue) {
-			d.decodedMap[encodedValue] = ""
-			continue
-		}
-
-		decodedValue, alreadyDecoded := d.decodedMap[encodedValue]
-
-		// We haven't decoded this yet, so go ahead and decode it
-		if !alreadyDecoded {
-			decodedValue = decodeValue(encodedValue)
-			d.decodedMap[encodedValue] = decodedValue
-		}
-
-		// Skip this segment because there was nothing to check
-		if len(decodedValue) == 0 {
-			continue
-		}
-
-		// Create a segment for the encoded data
-		segment := EncodedSegment{
-			relativeStart: matchIndex[0],
-			relativeEnd:   matchIndex[1],
-			absoluteStart: matchIndex[0],
-			absoluteEnd:   matchIndex[1],
-			decodedStart:  matchIndex[0] + decodedShift,
-			decodedEnd:    matchIndex[0] + decodedShift + len(decodedValue),
-			decodedValue:  decodedValue,
-			encoding:      "base64",
-		}
-
-		// Shift decoded start and ends based on size changes
-		decodedShift += len(decodedValue) - len(encodedValue)
-
-		// Adjust the absolute position of segments contained in parent segments
-		for _, parentSegment := range parentSegments {
-			if segment.isChildOf(parentSegment) {
-				segment.absoluteStart = parentSegment.absoluteStart
-				segment.absoluteEnd = parentSegment.absoluteEnd
-				segment.parent = &parentSegment
-				break
-			}
-		}
-
-		logging.Debug().Msgf("segment found: %#v", segment)
-		segments = append(segments, segment)
-	}
-
-	return segments
-}
-
-// decoders tries a list of decoders and returns the first successful one
-func decodeValue(encodedValue string) string {
-	for _, decoder := range decoders {
-		decodedValue, err := decoder(encodedValue)
-
-		if err == nil && len(decodedValue) > 0 && isASCII(decodedValue) {
-			return string(decodedValue)
-		}
-	}
-
-	return ""
-}
-
-func isASCII(b []byte) bool {
-	for i := 0; i < len(b); i++ {
-		if b[i] > unicode.MaxASCII || b[i] < '\t' {
-			return false
-		}
-	}
-
-	return true
-}
-
-// Skip a lot of method signatures and things at the risk of missing about
-// 1% of base64
-func isLikelyB64(s string) bool {
-	for _, c := range s {
-		if b64LikelyChars[c] != 0 {
-			return true
-		}
-	}
-
-	return false
-}
-
-// Find a segment where the decoded bounds overlaps a range
-func segmentWithDecodedOverlap(encodedSegments []EncodedSegment, start, end int) *EncodedSegment {
-	for _, segment := range encodedSegments {
-		if segment.decodedOverlaps(start, end) {
-			return &segment
-		}
-	}
-
-	return nil
-}
-
-func (s EncodedSegment) currentLine(currentRaw string) string {
-	start := 0
-	end := len(currentRaw)
-
-	// Find the start of the range
-	for i := s.decodedStart; i > -1; i-- {
-		c := currentRaw[i]
-		if c == '\n' {
-			start = i
-			break
-		}
-	}
-
-	// Find the end of the range
-	for i := s.decodedEnd; i < end; i++ {
-		c := currentRaw[i]
-		if c == '\n' {
-			end = i
-			break
-		}
-	}
-
-	return currentRaw[start:end]
-}

+ 11 - 9
detect/detect.go

@@ -12,6 +12,7 @@ import (
 	"time"
 
 	"github.com/zricethezav/gitleaks/v8/config"
+	"github.com/zricethezav/gitleaks/v8/detect/codec"
 	"github.com/zricethezav/gitleaks/v8/logging"
 	"github.com/zricethezav/gitleaks/v8/regexp"
 	"github.com/zricethezav/gitleaks/v8/report"
@@ -247,9 +248,9 @@ func (d *Detector) Detect(fragment Fragment) []report.Finding {
 
 	// setup variables to handle different decoding passes
 	currentRaw := fragment.Raw
-	encodedSegments := []EncodedSegment{}
+	encodedSegments := []*codec.EncodedSegment{}
 	currentDecodeDepth := 0
-	decoder := NewDecoder()
+	decoder := codec.NewDecoder()
 
 	for {
 		// build keyword map for prefiltering rules
@@ -286,7 +287,7 @@ func (d *Detector) Detect(fragment Fragment) []report.Finding {
 		}
 
 		// decode the currentRaw for the next pass
-		currentRaw, encodedSegments = decoder.decode(currentRaw, encodedSegments)
+		currentRaw, encodedSegments = decoder.Decode(currentRaw, encodedSegments)
 
 		// stop the loop when there's nothing else to decode
 		if len(encodedSegments) == 0 {
@@ -298,7 +299,7 @@ func (d *Detector) Detect(fragment Fragment) []report.Finding {
 }
 
 // detectRule scans the given fragment for the given rule and returns a list of findings
-func (d *Detector) detectRule(fragment Fragment, currentRaw string, r config.Rule, encodedSegments []EncodedSegment) []report.Finding {
+func (d *Detector) detectRule(fragment Fragment, currentRaw string, r config.Rule, encodedSegments []*codec.EncodedSegment) []report.Finding {
 	var (
 		findings []report.Finding
 		logger   = func() zerolog.Logger {
@@ -370,14 +371,15 @@ func (d *Detector) detectRule(fragment Fragment, currentRaw string, r config.Rul
 		// Check if the decoded portions of the segment overlap with the match
 		// to see if its potentially a new match
 		if len(encodedSegments) > 0 {
-			if segment := segmentWithDecodedOverlap(encodedSegments, matchIndex[0], matchIndex[1]); segment != nil {
-				matchIndex = segment.adjustMatchIndex(matchIndex)
-				metaTags = append(metaTags, segment.tags()...)
-				currentLine = segment.currentLine(currentRaw)
-			} else {
+			segments := codec.SegmentsWithDecodedOverlap(encodedSegments, matchIndex[0], matchIndex[1])
+			if len(segments) == 0 {
 				// This item has already been added to a finding
 				continue
 			}
+
+			matchIndex = codec.AdjustMatchIndex(segments, matchIndex)
+			metaTags = append(metaTags, codec.Tags(segments)...)
+			currentLine = codec.CurrentLine(segments, currentRaw)
 		} else {
 			// Fixes: https://github.com/gitleaks/gitleaks/issues/1352
 			// removes the incorrectly following line that was detected by regex expression '\n'

+ 229 - 44
detect/detect_test.go

@@ -17,6 +17,7 @@ import (
 
 	"github.com/zricethezav/gitleaks/v8/cmd/scm"
 	"github.com/zricethezav/gitleaks/v8/config"
+	"github.com/zricethezav/gitleaks/v8/detect/codec"
 	"github.com/zricethezav/gitleaks/v8/logging"
 	"github.com/zricethezav/gitleaks/v8/regexp"
 	"github.com/zricethezav/gitleaks/v8/report"
@@ -26,7 +27,7 @@ import (
 const maxDecodeDepth = 8
 const configPath = "../testdata/config/"
 const repoBasePath = "../testdata/repos/"
-const b64TestValues = `
+const encodedTestValues = `
 # Decoded
 -----BEGIN PRIVATE KEY-----
 135f/bRUBHrbHqLY/xS3I7Oth+8rgG+0tBwfMcbk05Sgxq6QUzSYIQAop+WvsTwk2sR+C38g0Mnb
@@ -44,12 +45,43 @@ eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwiY29uZmlnIjoiVzJ
 c21hbGwtc2VjcmV0
 
 # This tests how it handles when the match bounds go outside the decoded value
-secret=ZGVjb2RlZC1zZWNyZXQtdmFsdWU=
+secret=ZGVjb2RlZC1zZWNyZXQtdmFsdWUwMA==
 # The above encoded again
 c2VjcmV0PVpHVmpiMlJsWkMxelpXTnlaWFF0ZG1Gc2RXVT0=
 
 # Confirm you can ignore on the decoded value
 password="bFJxQkstejVrZjQtcGxlYXNlLWlnbm9yZS1tZS1YLVhJSk0yUGRkdw=="
+
+# This tests that it can do hex encoded data
+secret=6465636F6465642D7365637265742D76616C756576484558
+
+# This tests that it can do percent encoded data
+## partial encoded data
+secret=decoded-%73%65%63%72%65%74-valuev2
+## scattered encoded
+secret=%64%65coded-%73%65%63%72%65%74-valuev3
+
+# Test multi levels of encoding where the source is a partal encoding
+# it is important that the bounds of the predecessors are properly
+# considered
+## single percent encoding in the middle of multi layer b64
+c2VjcmV0PVpHVmpiMl%4AsWkMxelpXTnlaWFF0ZG1Gc2RXVjJOQT09
+## single percent encoding at the beginning of hex
+secret%3d6465636F6465642D7365637265742D76616C75657635
+## multiple percent encodings in a single layer base64
+secret=ZGVjb2%52lZC1zZWNyZXQtdm%46sdWV4ODY=  # ends in x86
+## base64 encoded partially percent encoded value
+secret=ZGVjb2RlZC0lNzMlNjUlNjMlNzIlNjUlNzQtdmFsdWU=
+## one of the lines above that went through... a lot
+## and there's surrounding text around it
+Look at this value: %4EjMzMjU2NkE2MzZENTYzMDUwNTY3MDQ4%4eTY2RDcwNjk0RDY5NTUzMTRENkQ3ODYx%25%34%65TE3QTQ2MzY1NzZDNjQ0RjY1NTY3MDU5NTU1ODUyNkI2MjUzNTUzMDRFNkU0RTZCNTYzMTU1MzkwQQ== # isn't it crazy?
+## Multi percent encode two random characters close to the bounds of the base64
+## encoded data to make sure that the bounds are still correctly calculated
+secret=ZG%25%32%35%25%33%32%25%33%35%25%32%35%25%33%33%25%33%35%25%32%35%25%33%33%25%33%36%25%32%35%25%33%32%25%33%35%25%32%35%25%33%33%25%33%36%25%32%35%25%33%36%25%33%31%25%32%35%25%33%32%25%33%35%25%32%35%25%33%33%25%33%36%25%32%35%25%33%33%25%33%322RlZC1zZWNyZXQtd%25%36%64%25%34%36%25%37%33dWU=
+## The similar to the above but also touching the edge of the base64
+secret=%25%35%61%25%34%37%25%35%36jb2RlZC1zZWNyZXQtdmFsdWU%25%32%35%25%33%33%25%36%34
+## The similar to the above but also touching and overlapping the base64
+secret%3D%25%35%61%25%34%37%25%35%36jb2RlZC1zZWNyZXQtdmFsdWU%25%32%35%25%33%33%25%36%34
 `
 
 func TestDetect(t *testing.T) {
@@ -390,12 +422,11 @@ const token = "mockSecret";
 				FilePath: "tmp.go",
 			},
 		},
-
-		// Base64-decoding
-		"detect base64": {
-			cfgName: "base64_encoded",
+		// Decoding
+		"detect encoded": {
+			cfgName: "encoded",
 			fragment: Fragment{
-				Raw:      b64TestValues,
+				Raw:      encodedTestValues,
 				FilePath: "tmp.go",
 			},
 			expectedFindings: []report.Finding{
@@ -441,6 +472,90 @@ const token = "mockSecret";
 					EndColumn:   207,
 					Entropy:     5.350665,
 				},
+				{ // Encoded Small secret at the end to make sure it's picked up by the decoding
+					Description: "Small Secret",
+					Secret:      "small-secret",
+					Match:       "small-secret",
+					File:        "tmp.go",
+					Line:        "\nc21hbGwtc2VjcmV0",
+					RuleID:      "small-secret",
+					Tags:        []string{"small", "secret", "decoded:base64", "decode-depth:1"},
+					StartLine:   15,
+					EndLine:     15,
+					StartColumn: 2,
+					EndColumn:   17,
+					Entropy:     3.0849626,
+				},
+				{ // Secret where the decoded match goes outside the encoded value
+					Description: "Overlapping",
+					Secret:      "decoded-secret-value00",
+					Match:       "secret=decoded-secret-value00",
+					File:        "tmp.go",
+					Line:        "\nsecret=ZGVjb2RlZC1zZWNyZXQtdmFsdWUwMA==",
+					RuleID:      "overlapping",
+					Tags:        []string{"overlapping", "decoded:base64", "decode-depth:1"},
+					StartLine:   18,
+					EndLine:     18,
+					StartColumn: 2,
+					EndColumn:   40,
+					Entropy:     3.4428623,
+				},
+				{ // This just confirms that with no allowlist the pattern is detected (i.e. the regex is good)
+					Description: "Make sure this would be detected with no allowlist",
+					Secret:      "lRqBK-z5kf4-please-ignore-me-X-XIJM2Pddw",
+					Match:       "password=\"lRqBK-z5kf4-please-ignore-me-X-XIJM2Pddw\"",
+					File:        "tmp.go",
+					Line:        "\npassword=\"bFJxQkstejVrZjQtcGxlYXNlLWlnbm9yZS1tZS1YLVhJSk0yUGRkdw==\"",
+					RuleID:      "decoded-password-dont-ignore",
+					Tags:        []string{"decode-ignore", "decoded:base64", "decode-depth:1"},
+					StartLine:   23,
+					EndLine:     23,
+					StartColumn: 2,
+					EndColumn:   68,
+					Entropy:     4.5841837,
+				},
+				{ // Hex encoded data check
+					Description: "Overlapping",
+					Secret:      "decoded-secret-valuevHEX",
+					Match:       "secret=decoded-secret-valuevHEX",
+					File:        "tmp.go",
+					Line:        "\nsecret=6465636F6465642D7365637265742D76616C756576484558",
+					RuleID:      "overlapping",
+					Tags:        []string{"overlapping", "decoded:hex", "decode-depth:1"},
+					StartLine:   26,
+					EndLine:     26,
+					StartColumn: 2,
+					EndColumn:   56,
+					Entropy:     3.6531072,
+				},
+				{ // handle partial encoded percent data
+					Description: "Overlapping",
+					Secret:      "decoded-secret-valuev2",
+					Match:       "secret=decoded-secret-valuev2",
+					File:        "tmp.go",
+					Line:        "\nsecret=decoded-%73%65%63%72%65%74-valuev2",
+					RuleID:      "overlapping",
+					Tags:        []string{"overlapping", "decoded:percent", "decode-depth:1"},
+					StartLine:   30,
+					EndLine:     30,
+					StartColumn: 2,
+					EndColumn:   42,
+					Entropy:     3.4428623,
+				},
+				{ // handle scattered percent encoded data
+					Description: "Overlapping",
+					Secret:      "decoded-secret-valuev3",
+					Match:       "secret=decoded-secret-valuev3",
+					File:        "tmp.go",
+					Line:        "\nsecret=%64%65coded-%73%65%63%72%65%74-valuev3",
+					RuleID:      "overlapping",
+					Tags:        []string{"overlapping", "decoded:percent", "decode-depth:1"},
+					StartLine:   32,
+					EndLine:     32,
+					StartColumn: 2,
+					EndColumn:   46,
+					Entropy:     3.4428623,
+				},
 				{ // Encoded AWS config with a access key id inside a JWT
 					Description: "AWS IAM Unique Identifier",
 					Secret:      "ASIAIOSFODNN7LXM10JI",
@@ -469,61 +584,131 @@ const token = "mockSecret";
 					EndColumn:   344,
 					Entropy:     4.721928,
 				},
-				{ // Encoded Small secret at the end to make sure it's picked up by the decoding
-					Description: "Small Secret",
-					Secret:      "small-secret",
-					Match:       "small-secret",
+				{ // Secret where the decoded match goes outside the encoded value and then encoded again
+					Description: "Overlapping",
+					Secret:      "decoded-secret-value",
+					Match:       "secret=decoded-secret-value",
 					File:        "tmp.go",
-					Line:        "\nc21hbGwtc2VjcmV0",
-					RuleID:      "small-secret",
-					Tags:        []string{"small", "secret", "decoded:base64", "decode-depth:1"},
-					StartLine:   15,
-					EndLine:     15,
+					Line:        "\nc2VjcmV0PVpHVmpiMlJsWkMxelpXTnlaWFF0ZG1Gc2RXVT0=",
+					RuleID:      "overlapping",
+					Tags:        []string{"overlapping", "decoded:base64", "decode-depth:2"},
+					StartLine:   20,
+					EndLine:     20,
 					StartColumn: 2,
-					EndColumn:   17,
-					Entropy:     3.0849626,
+					EndColumn:   49,
+					Entropy:     3.3037016,
 				},
-				{ // Secret where the decoded match goes outside the encoded value
+				{ // handle encodings that touch each other
+					Description: "Overlapping",
+					Secret:      "decoded-secret-valuev5",
+					Match:       "secret=decoded-secret-valuev5",
+					File:        "tmp.go",
+					Line:        "\nsecret%3d6465636F6465642D7365637265742D76616C75657635",
+					RuleID:      "overlapping",
+					Tags:        []string{"overlapping", "decoded:percent", "decoded:hex", "decode-depth:2"},
+					StartLine:   40,
+					EndLine:     40,
+					StartColumn: 2,
+					EndColumn:   54,
+					Entropy:     3.4428623,
+				},
+				{ // single percent encoding in the middle of multi layer b64
+					Description: "Overlapping",
+					Secret:      "decoded-secret-valuev4",
+					Match:       "secret=decoded-secret-valuev4",
+					File:        "tmp.go",
+					Line:        "\nc2VjcmV0PVpHVmpiMl%4AsWkMxelpXTnlaWFF0ZG1Gc2RXVjJOQT09",
+					RuleID:      "overlapping",
+					Tags:        []string{"overlapping", "decoded:percent", "decoded:base64", "decode-depth:3"},
+					StartLine:   38,
+					EndLine:     38,
+					StartColumn: 2,
+					EndColumn:   55,
+					Entropy:     3.4428623,
+				},
+				{ // multiple percent encodings in a single layer base64
+					Description: "Overlapping",
+					Secret:      "decoded-secret-valuex86",
+					Match:       "secret=decoded-secret-valuex86",
+					File:        "tmp.go",
+					Line:        "\nsecret=ZGVjb2%52lZC1zZWNyZXQtdm%46sdWV4ODY=  # ends in x86",
+					RuleID:      "overlapping",
+					Tags:        []string{"overlapping", "decoded:percent", "decoded:base64", "decode-depth:2"},
+					StartLine:   42,
+					EndLine:     42,
+					StartColumn: 2,
+					EndColumn:   44,
+					Entropy:     3.6381476,
+				},
+				{ // base64 encoded partially percent encoded value
 					Description: "Overlapping",
 					Secret:      "decoded-secret-value",
 					Match:       "secret=decoded-secret-value",
 					File:        "tmp.go",
-					Line:        "\nsecret=ZGVjb2RlZC1zZWNyZXQtdmFsdWU=",
+					Line:        "\nsecret=ZGVjb2RlZC0lNzMlNjUlNjMlNzIlNjUlNzQtdmFsdWU=",
 					RuleID:      "overlapping",
-					Tags:        []string{"overlapping", "decoded:base64", "decode-depth:1"},
-					StartLine:   18,
-					EndLine:     18,
+					Tags:        []string{"overlapping", "decoded:percent", "decoded:base64", "decode-depth:2"},
+					StartLine:   44,
+					EndLine:     44,
 					StartColumn: 2,
-					EndColumn:   36,
+					EndColumn:   52,
 					Entropy:     3.3037016,
 				},
-				{ // Secret where the decoded match goes outside the encoded value and then encoded again
+				{ // one of the lines above that went through... a lot
 					Description: "Overlapping",
 					Secret:      "decoded-secret-value",
 					Match:       "secret=decoded-secret-value",
 					File:        "tmp.go",
-					Line:        "\nc2VjcmV0PVpHVmpiMlJsWkMxelpXTnlaWFF0ZG1Gc2RXVT0=",
+					Line:        "\nLook at this value: %4EjMzMjU2NkE2MzZENTYzMDUwNTY3MDQ4%4eTY2RDcwNjk0RDY5NTUzMTRENkQ3ODYx%25%34%65TE3QTQ2MzY1NzZDNjQ0RjY1NTY3MDU5NTU1ODUyNkI2MjUzNTUzMDRFNkU0RTZCNTYzMTU1MzkwQQ== # isn't it crazy?",
 					RuleID:      "overlapping",
-					Tags:        []string{"overlapping", "decoded:base64", "decode-depth:2"},
-					StartLine:   20,
-					EndLine:     20,
+					Tags:        []string{"overlapping", "decoded:percent", "decoded:hex", "decoded:base64", "decode-depth:7"},
+					StartLine:   47,
+					EndLine:     47,
+					StartColumn: 22,
+					EndColumn:   177,
+					Entropy:     3.3037016,
+				},
+				{ // Multi percent encode two random characters close to the bounds of the base64
+					Description: "Overlapping",
+					Secret:      "decoded-secret-value",
+					Match:       "secret=decoded-secret-value",
+					File:        "tmp.go",
+					Line:        "\nsecret=ZG%25%32%35%25%33%32%25%33%35%25%32%35%25%33%33%25%33%35%25%32%35%25%33%33%25%33%36%25%32%35%25%33%32%25%33%35%25%32%35%25%33%33%25%33%36%25%32%35%25%33%36%25%33%31%25%32%35%25%33%32%25%33%35%25%32%35%25%33%33%25%33%36%25%32%35%25%33%33%25%33%322RlZC1zZWNyZXQtd%25%36%64%25%34%36%25%37%33dWU=",
+					RuleID:      "overlapping",
+					Tags:        []string{"overlapping", "decoded:percent", "decoded:base64", "decode-depth:5"},
+					StartLine:   50,
+					EndLine:     50,
 					StartColumn: 2,
-					EndColumn:   49,
+					EndColumn:   300,
 					Entropy:     3.3037016,
 				},
-				{ // This just confirms that with no allowlist the pattern is detected (i.e. the regex is good)
-					Description: "Make sure this would be detected with no allowlist",
-					Secret:      "lRqBK-z5kf4-please-ignore-me-X-XIJM2Pddw",
-					Match:       "password=\"lRqBK-z5kf4-please-ignore-me-X-XIJM2Pddw\"",
+				{ // Similar to the above but also touching the edge of the base64
+					Description: "Overlapping",
+					Secret:      "decoded-secret-value",
+					Match:       "secret=decoded-secret-value",
 					File:        "tmp.go",
-					Line:        "\npassword=\"bFJxQkstejVrZjQtcGxlYXNlLWlnbm9yZS1tZS1YLVhJSk0yUGRkdw==\"",
-					RuleID:      "decoded-password-dont-ignore",
-					Tags:        []string{"decode-ignore", "decoded:base64", "decode-depth:1"},
-					StartLine:   23,
-					EndLine:     23,
+					Line:        "\nsecret=%25%35%61%25%34%37%25%35%36jb2RlZC1zZWNyZXQtdmFsdWU%25%32%35%25%33%33%25%36%34",
+					RuleID:      "overlapping",
+					Tags:        []string{"overlapping", "decoded:percent", "decoded:base64", "decode-depth:4"},
+					StartLine:   52,
+					EndLine:     52,
 					StartColumn: 2,
-					EndColumn:   68,
-					Entropy:     4.5841837,
+					EndColumn:   86,
+					Entropy:     3.3037016,
+				},
+				{ // Similar to the above but also touching and overlapping the base64
+					Description: "Overlapping",
+					Secret:      "decoded-secret-value",
+					Match:       "secret=decoded-secret-value",
+					File:        "tmp.go",
+					Line:        "\nsecret%3D%25%35%61%25%34%37%25%35%36jb2RlZC1zZWNyZXQtdmFsdWU%25%32%35%25%33%33%25%36%34",
+					RuleID:      "overlapping",
+					Tags:        []string{"overlapping", "decoded:percent", "decoded:base64", "decode-depth:4"},
+					StartLine:   54,
+					EndLine:     54,
+					StartColumn: 2,
+					EndColumn:   88,
+					Entropy:     3.3037016,
 				},
 			},
 		},
@@ -1174,7 +1359,7 @@ let password = 'Summer2024!';`
 
 			f := tc.fragment
 			f.Raw = raw
-			actual := d.detectRule(f, raw, rule, []EncodedSegment{})
+			actual := d.detectRule(f, raw, rule, []*codec.EncodedSegment{})
 			if diff := cmp.Diff(tc.expected, actual); diff != "" {
 				t.Errorf("diff: (-want +got)\n%s", diff)
 			}
@@ -1335,7 +1520,7 @@ func TestWindowsFileSeparator_RulePath(t *testing.T) {
 	require.NoError(t, err)
 	for name, test := range tests {
 		t.Run(name, func(t *testing.T) {
-			actual := d.detectRule(test.fragment, test.fragment.Raw, test.rule, []EncodedSegment{})
+			actual := d.detectRule(test.fragment, test.fragment.Raw, test.rule, []*codec.EncodedSegment{})
 			if diff := cmp.Diff(test.expected, actual); diff != "" {
 				t.Errorf("diff: (-want +got)\n%s", diff)
 			}
@@ -1521,7 +1706,7 @@ func TestWindowsFileSeparator_RuleAllowlistPaths(t *testing.T) {
 	require.NoError(t, err)
 	for name, test := range tests {
 		t.Run(name, func(t *testing.T) {
-			actual := d.detectRule(test.fragment, test.fragment.Raw, test.rule, []EncodedSegment{})
+			actual := d.detectRule(test.fragment, test.fragment.Raw, test.rule, []*codec.EncodedSegment{})
 			if diff := cmp.Diff(test.expected, actual); diff != "" {
 				t.Errorf("diff: (-want +got)\n%s", diff)
 			}

+ 1 - 1
testdata/config/base64_encoded.toml → testdata/config/encoded.toml

@@ -70,7 +70,7 @@
   # goes outside the bounds of the encoded value
   id = 'overlapping'
   description = 'Overlapping'
-  regex = '''secret=(decoded-secret-value)'''
+  regex = '''secret=(decoded-secret-value\w*)'''
   tags = ['overlapping']
   secretGroup = 1