LBP
/
gitleaks
зеркало из https://github.com/gitleaks/gitleaks.git


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261
							package codec

import (
	"bytes"
	"strconv"
	"strings"
	"unicode/utf8"

	"github.com/zricethezav/gitleaks/v8/regexp"
)

var (
	// unicodeCodePointPat matches a single code point in standard Unicode
	// notation (e.g., U+1234). Capture group 1 holds the four hex digits. The
	// trailing `.?` lets the match also take in one following character, if
	// there is one, so the overall match can be one character longer than the
	// code point itself.
	unicodeCodePointPat = regexp.MustCompile(`U\+([a-fA-F0-9]{4}).?`)

	// unicodeMultiCodePointPat matches a run of one or more code points such as
	// "U+0074 U+006F U+006B...". Every code point in the run must be followed by
	// a whitespace character or by the end of the input; that whitespace is part
	// of the match.
	unicodeMultiCodePointPat = regexp.MustCompile(`(?:U\+[a-fA-F0-9]{4}(?:\s|$))+`)

	// unicodeEscapePat matches a single escape sequence as used in programming
	// languages (e.g., \u1234). It is case-insensitive and accepts one or two
	// leading backslashes. Capture group 1 holds the four hex digits.
	unicodeEscapePat = regexp.MustCompile(`(?i)\\{1,2}u([a-fA-F0-9]{4})`)

	// unicodeMultiEscapePat matches a run of one or more escape sequences with
	// nothing between them, such as "\u0074\u006F\u006B...".
	unicodeMultiEscapePat = regexp.MustCompile(`(?i)(?:\\{1,2}u[a-fA-F0-9]{4})+`)
)

// maxBytesPerRune is the size of the scratch buffer used when a rune is encoded
// as UTF-8: Unicode characters are encoded as 1 to 4 bytes per rune.
const maxBytesPerRune = 4

// decodeUnicode decodes Unicode code points (U+1234) and Unicode escape
// sequences (\u1234) found in encodedValue and returns the resulting string.
//
// Decoding is staged, and only the first stage that finds a match runs:
//
//  1. Runs of whitespace-separated code points (unicodeMultiCodePointPat) are
//     decoded with decodeMultiCodePoint.
//  2. Runs of back-to-back escapes (unicodeMultiEscapePat) are decoded with
//     decodeMultiEscape.
//  3. Otherwise individual code points are decoded, followed by individual
//     escapes on that result.
//
// Text outside the matched spans is copied through untouched. If nothing is
// decoded, encodedValue itself is returned.
func decodeUnicode(encodedValue string) string {
	// Stage 1: continuous sequences of code points.
	if spans := unicodeMultiCodePointPat.FindAllStringIndex(encodedValue, -1); len(spans) > 0 {
		return replaceUnicodeSpans(encodedValue, spans, decodeMultiCodePoint)
	}

	// Stage 2: continuous sequences of escape sequences.
	if spans := unicodeMultiEscapePat.FindAllStringIndex(encodedValue, -1); len(spans) > 0 {
		return replaceUnicodeSpans(encodedValue, spans, decodeMultiEscape)
	}

	// Stage 3: no multi-pattern matched, so fall back to decoding individual
	// code points and then individual escape sequences.
	data := []byte(encodedValue)
	result := data
	if unicodeCodePointPat.Match(data) {
		result = decodeIndividualCodePoints(data)
	}

	// decodeIndividualEscapes returns its input unchanged when it finds no
	// escapes, so it is safe to call unconditionally on the current result.
	result = decodeIndividualEscapes(result)

	// If nothing was decoded, hand back the original string.
	if len(result) == 0 || bytes.Equal(result, data) {
		return encodedValue
	}

	return string(result)
}

// replaceUnicodeSpans rebuilds s with every [start, end) span in spans replaced
// by decode(s[start:end]). Spans must be ordered and non-overlapping, as
// returned by FindAllStringIndex.
//
// Splicing by index, rather than searching for the matched text again,
// guarantees that the span the pattern matched is the one that gets replaced.
// With a textual search, "U+0041x U+0041" (where only the final token matches)
// would have its first token rewritten instead.
func replaceUnicodeSpans(s string, spans [][]int, decode func(string) string) string {
	var out strings.Builder
	out.Grow(len(s))

	prev := 0
	for _, span := range spans {
		start, end := span[0], span[1]
		match := s[start:end]

		// Copy the untouched text between the previous span and this one.
		out.WriteString(s[prev:start])

		// An empty result means the decoder produced nothing usable; keep the
		// original text in that case.
		decoded := decode(match)
		if decoded == "" {
			decoded = match
		}
		out.WriteString(decoded)

		prev = end
	}
	out.WriteString(s[prev:])

	return out.String()
}

// decodeMultiCodePoint decodes a whitespace-separated sequence of Unicode code
// points in U+XXXX notation (e.g. "U+0074 U+006F") into the string they spell
// ("to").
//
// Fields that do not start with "U+", carry fewer than four characters after
// the prefix, or whose hex value cannot be parsed are skipped. A value that is
// not a valid Unicode code point (for example a surrogate half) is emitted as
// U+FFFD. If no field could be decoded, sequence is returned unchanged; an
// empty sequence yields an empty string.
func decodeMultiCodePoint(sequence string) string {
	// Split the sequence on whitespace to get the individual code points.
	// This also covers the empty string, which is returned as-is.
	fields := strings.Fields(sequence)
	if len(fields) == 0 {
		return sequence
	}

	var decoded strings.Builder
	decoded.Grow(len(fields))
	for _, field := range fields {
		// Check that the field follows the U+XXXX pattern.
		if !strings.HasPrefix(field, "U+") || len(field) < 6 {
			continue
		}
		hexValue := field[2:]

		// strconv.ParseInt accepts a leading sign, but "U+-041" or "U++041"
		// is not a code point, so reject signed values up front.
		if hexValue[0] == '+' || hexValue[0] == '-' {
			continue
		}

		codePoint, err := strconv.ParseInt(hexValue, 16, 32)
		if err != nil {
			continue
		}

		decoded.WriteRune(rune(codePoint))
	}

	// Every decoded rune writes at least one byte, so an empty builder means
	// nothing was decoded: return the original sequence.
	if decoded.Len() == 0 {
		return sequence
	}

	return decoded.String()
}

// decodeMultiEscape decodes a sequence of Unicode escape sequences (\uXXXX
// format) into the string they spell.
//
// Every escape found by unicodeEscapePat is decoded in order and the results
// are concatenated; any text in sequence that is not part of an escape is not
// carried over. A high surrogate (D800-DBFF) immediately followed in that
// order by a low surrogate (DC00-DFFF) is combined into the single code point
// the pair represents, so "\uD83D\uDE00" yields U+1F600. An unpaired surrogate
// is emitted as U+FFFD.
//
// If sequence contains no escapes it is returned unchanged; an empty sequence
// yields an empty string.
func decodeMultiEscape(sequence string) string {
	const (
		highSurrogateMin = 0xD800
		lowSurrogateMin  = 0xDC00
		lowSurrogateEnd  = 0xE000 // exclusive
		supplementaryMin = 0x10000
	)

	if sequence == "" {
		return ""
	}

	// Find all escape sequences.
	escapes := unicodeEscapePat.FindAllStringSubmatch(sequence, -1)
	if len(escapes) == 0 {
		return sequence
	}

	// First pass: parse the hex digits of every escape into its numeric value.
	units := make([]rune, 0, len(escapes))
	for _, esc := range escapes {
		value, err := strconv.ParseInt(esc[1], 16, 32)
		if err != nil {
			continue
		}
		units = append(units, rune(value))
	}

	// If nothing could be parsed, return the original sequence.
	if len(units) == 0 {
		return sequence
	}

	// Second pass: merge surrogate pairs. Escapes only carry 16 bits each, so
	// code points above U+FFFF arrive as a high/low pair.
	decodedRunes := make([]rune, 0, len(units))
	for i := 0; i < len(units); i++ {
		r := units[i]
		if r >= highSurrogateMin && r < lowSurrogateMin && i+1 < len(units) {
			if low := units[i+1]; low >= lowSurrogateMin && low < lowSurrogateEnd {
				r = supplementaryMin + ((r - highSurrogateMin) << 10) + (low - lowSurrogateMin)
				i++ // the low half has been consumed
			}
		}
		decodedRunes = append(decodedRunes, r)
	}

	return string(decodedRunes)
}

// decodeIndividualCodePoints decodes individual Unicode code points (U+1234
// format) in input and returns the result as UTF-8 bytes.
// This is a fallback for when we don't have a continuous sequence of code points.
//
// unicodeCodePointPat may include one character after the hex digits in its
// match. That character is handled as follows:
//   - a space that is not the last byte of input is kept in the output;
//   - any other trailing character, or a space that ends input, is replaced
//     together with the code point.
//
// NOTE(review): the previous comment on this logic described the opposite
// (consume the space between code points, keep everything else). The code
// path is preserved here as-is; confirm which behaviour is intended.
//
// When input contains no code points it is returned as-is. Otherwise a newly
// allocated slice is returned and input is left unmodified.
func decodeIndividualCodePoints(input []byte) []byte {
	// Find all Unicode code point sequences in the input byte slice.
	indices := unicodeCodePointPat.FindAllSubmatchIndex(input, -1)

	// If none found, return original input.
	if len(indices) == 0 {
		return input
	}

	// Build the output front to back in a fresh buffer. Every replacement is
	// shorter than the text it replaces, so len(input) is enough capacity.
	output := make([]byte, 0, len(input))
	var scratch [maxBytesPerRune]byte
	prev := 0
	for _, loc := range indices {
		start, end := loc[0], loc[1]
		hexStart, hexEnd := loc[2], loc[3]

		// end != hexEnd means a trailing character was matched; leave it in
		// place when it is a space with more input after it.
		if end != hexEnd && end < len(input) && input[end-1] == ' ' {
			end--
		}

		// Parse the hexadecimal value to an integer.
		codePoint, err := strconv.ParseInt(string(input[hexStart:hexEnd]), 16, 32)
		if err != nil {
			// Leave this match alone; prev is not advanced, so its text is
			// copied verbatim with the next chunk.
			continue
		}

		// Copy the text before the match, then the UTF-8 form of the rune.
		output = append(output, input[prev:start]...)
		n := utf8.EncodeRune(scratch[:], rune(codePoint))
		output = append(output, scratch[:n]...)
		prev = end
	}

	// Copy whatever follows the last match.
	return append(output, input[prev:]...)
}

// decodeIndividualEscapes decodes individual Unicode escape sequences (\u1234
// format) in input and returns the result as UTF-8 bytes.
// This is a fallback for when we don't have a continuous sequence of escape sequences.
//
// Each escape is decoded on its own: a value that utf8.EncodeRune cannot
// encode, such as a UTF-16 surrogate half, is written as U+FFFD.
//
// When input contains no escapes it is returned as-is. Otherwise a newly
// allocated slice is returned and input is left unmodified.
func decodeIndividualEscapes(input []byte) []byte {
	// Find all Unicode escape sequences in the input byte slice.
	indices := unicodeEscapePat.FindAllSubmatchIndex(input, -1)

	// If none found, return original input.
	if len(indices) == 0 {
		return input
	}

	// Build the output front to back in a fresh buffer. Every replacement is
	// shorter than the text it replaces, so len(input) is enough capacity.
	output := make([]byte, 0, len(input))
	var scratch [maxBytesPerRune]byte
	prev := 0
	for _, loc := range indices {
		// The pattern ends with the hex group, so the end of group 1 is also
		// the end of the whole escape.
		start, hexStart, end := loc[0], loc[2], loc[3]

		// Parse the hexadecimal value to an integer.
		codePoint, err := strconv.ParseInt(string(input[hexStart:end]), 16, 32)
		if err != nil {
			// Leave this match alone; prev is not advanced, so its text is
			// copied verbatim with the next chunk.
			continue
		}

		// Copy the text before the escape, then the UTF-8 form of the rune.
		output = append(output, input[prev:start]...)
		n := utf8.EncodeRune(scratch[:], rune(codePoint))
		output = append(output, scratch[:n]...)
		prev = end
	}

	// Copy whatever follows the last escape.
	return append(output, input[prev:]...)
}