| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261 |
- package codec
- import (
- "bytes"
- "strconv"
- "strings"
- "unicode/utf8"
- "github.com/zricethezav/gitleaks/v8/regexp"
- )
// Patterns for the "U+XXXX" code point notation.
var (
	// unicodeCodePointPat matches a single code point: "U+" followed by
	// exactly four hexadecimal digits (capture group 1) and, if present, one
	// more character after them.
	unicodeCodePointPat = regexp.MustCompile(`U\+([a-fA-F0-9]{4}).?`)

	// unicodeMultiCodePointPat matches a run of one or more code points in
	// which every item is followed by a whitespace character or by the end of
	// the text, e.g. "U+0074 U+006F U+006B".
	unicodeMultiCodePointPat = regexp.MustCompile(`(?:U\+[a-fA-F0-9]{4}(?:\s|$))+`)
)

// Patterns for the "\uXXXX" escape notation used by many programming
// languages. Both are case-insensitive and accept one or two backslashes.
var (
	// unicodeEscapePat matches a single escape; the four hexadecimal digits
	// are capture group 1.
	unicodeEscapePat = regexp.MustCompile(`(?i)\\{1,2}u([a-fA-F0-9]{4})`)

	// unicodeMultiEscapePat matches a run of one or more escapes with nothing
	// between them, e.g. "\u0074\u006F\u006B".
	unicodeMultiEscapePat = regexp.MustCompile(`(?i)(?:\\{1,2}u[a-fA-F0-9]{4})+`)
)
// maxBytesPerRune is the size of the scratch buffer needed to UTF-8 encode
// any single rune; UTF-8 uses between 1 and 4 bytes per rune.
const maxBytesPerRune = utf8.UTFMax
- // decodeUnicode decodes Unicode escape sequences in the given string
- func decodeUnicode(encodedValue string) string {
- // First, check if we have a continuous sequence of Unicode code points
- if matches := unicodeMultiCodePointPat.FindAllString(encodedValue, -1); len(matches) > 0 {
- // For each detected sequence of code points
- for _, match := range matches {
- // Decode the entire sequence at once
- decodedSequence := decodeMultiCodePoint(match)
- // If we successfully decoded something, replace it in the original string
- if decodedSequence != "" && decodedSequence != match {
- encodedValue = strings.Replace(encodedValue, match, decodedSequence, 1)
- }
- }
- return encodedValue
- }
- // Next, check if we have a continuous sequence of escape sequences
- if matches := unicodeMultiEscapePat.FindAllString(encodedValue, -1); len(matches) > 0 {
- // For each detected sequence of escape sequences
- for _, match := range matches {
- // Decode the entire sequence at once
- decodedSequence := decodeMultiEscape(match)
- // If we successfully decoded something, replace it in the original string
- if decodedSequence != "" && decodedSequence != match {
- encodedValue = strings.Replace(encodedValue, match, decodedSequence, 1)
- }
- }
- return encodedValue
- }
- // If no multi-patterns were matched, fall back to the original implementation
- // for individual code points and escape sequences
- // Create a copy of the input to work with
- data := []byte(encodedValue)
- // Store the result
- var result []byte
- // Check and decode Unicode code points (U+1234 format)
- if unicodeCodePointPat.Match(data) {
- result = decodeIndividualCodePoints(data)
- }
- // If no code points were found or we have a mix of formats,
- // also check for Unicode escape sequences (\u1234 format)
- if len(result) == 0 || unicodeEscapePat.Match(data) {
- // If we already have some result from code point decoding,
- // continue decoding escape sequences on that result
- if len(result) > 0 {
- result = decodeIndividualEscapes(result)
- } else {
- result = decodeIndividualEscapes(data)
- }
- }
- // If nothing was decoded, return original string
- if len(result) == 0 || bytes.Equal(result, data) {
- return encodedValue
- }
- return string(result)
- }
// decodeMultiCodePoint decodes a whitespace-separated run of code points in
// "U+XXXX" notation, e.g. "U+0074 U+006F U+006B" becomes "tok".
//
// The whitespace between (and around) the items is not part of the result.
// Items that are not "U+" followed by at least four characters forming an
// unsigned hexadecimal number, or whose value is not a valid Unicode code
// point (surrogates, values above U+10FFFF), are skipped.
//
// It returns "" for an empty sequence and the sequence itself when no item
// could be decoded.
func decodeMultiCodePoint(sequence string) string {
	if sequence == "" {
		return ""
	}

	const prefix = "U+"
	const minHexDigits = 4

	var decoded strings.Builder
	for _, item := range strings.Fields(sequence) {
		if !strings.HasPrefix(item, prefix) || len(item) < len(prefix)+minHexDigits {
			continue
		}

		// ParseUint, unlike ParseInt, rejects a leading sign, so text such as
		// "U+-041" is not mistaken for a code point.
		value, err := strconv.ParseUint(item[len(prefix):], 16, 32)
		if err != nil {
			continue
		}

		// Values that do not fit a rune wrap to a negative number here and
		// are rejected by ValidRune together with surrogates and values
		// above utf8.MaxRune.
		r := rune(value)
		if !utf8.ValidRune(r) {
			continue
		}
		decoded.WriteRune(r)
	}

	// Every decoded rune contributes at least one byte, so an empty builder
	// means nothing was decoded.
	if decoded.Len() == 0 {
		return sequence
	}
	return decoded.String()
}
- // decodeMultiEscape decodes a continuous sequence of Unicode escape sequences (\uXXXX format)
- func decodeMultiEscape(sequence string) string {
- // If the sequence is empty, return empty string
- if sequence == "" {
- return ""
- }
- // Find all escape sequences
- escapes := unicodeEscapePat.FindAllStringSubmatch(sequence, -1)
- if len(escapes) == 0 {
- return sequence
- }
- // Decode each escape sequence and build the result
- var decodedRunes []rune
- for _, esc := range escapes {
- // Extract the hexadecimal value
- hexValue := esc[1]
- // Parse the hexadecimal value to an integer
- unicodeInt, err := strconv.ParseInt(hexValue, 16, 32)
- if err != nil {
- continue
- }
- // Convert to rune and add to result
- decodedRunes = append(decodedRunes, rune(unicodeInt))
- }
- // If we didn't decode anything, return the original sequence
- if len(decodedRunes) == 0 {
- return sequence
- }
- // Return the decoded string
- return string(decodedRunes)
- }
- // decodeIndividualCodePoints decodes individual Unicode code points (U+1234 format)
- // This is a fallback for when we don't have a continuous sequence of code points
- func decodeIndividualCodePoints(input []byte) []byte {
- // Find all Unicode code point sequences in the input byte slice
- indices := unicodeCodePointPat.FindAllSubmatchIndex(input, -1)
- // If none found, return original input
- if len(indices) == 0 {
- return input
- }
- // Iterate over found indices in reverse order to avoid modifying the slice length
- utf8Bytes := make([]byte, maxBytesPerRune)
- for i := len(indices) - 1; i >= 0; i-- {
- matches := indices[i]
- startIndex := matches[0]
- endIndex := matches[1]
- hexStartIndex := matches[2]
- hexEndIndex := matches[3]
- // If the input is like `U+1234 U+5678` we should replace `U+1234 `.
- // Otherwise, we should only replace `U+1234`.
- if endIndex != hexEndIndex && endIndex < len(input) && input[endIndex-1] == ' ' {
- endIndex = endIndex - 1
- }
- // Extract the hexadecimal value from the escape sequence
- hexValue := string(input[hexStartIndex:hexEndIndex])
- // Parse the hexadecimal value to an integer
- unicodeInt, err := strconv.ParseInt(hexValue, 16, 32)
- if err != nil {
- // If there's an error, continue to the next escape sequence
- continue
- }
- // Convert the Unicode code point to a UTF-8 representation
- utf8Len := utf8.EncodeRune(utf8Bytes, rune(unicodeInt))
- // Replace the escape sequence with the UTF-8 representation
- input = append(input[:startIndex], append(utf8Bytes[:utf8Len], input[endIndex:]...)...)
- }
- return input
- }
- // decodeIndividualEscapes decodes individual Unicode escape sequences (\u1234 format)
- // This is a fallback for when we don't have a continuous sequence of escape sequences
- func decodeIndividualEscapes(input []byte) []byte {
- // Find all Unicode escape sequences in the input byte slice
- indices := unicodeEscapePat.FindAllSubmatchIndex(input, -1)
- // If none found, return original input
- if len(indices) == 0 {
- return input
- }
- // Iterate over found indices in reverse order to avoid modifying the slice length
- utf8Bytes := make([]byte, maxBytesPerRune)
- for i := len(indices) - 1; i >= 0; i-- {
- matches := indices[i]
- startIndex := matches[0]
- hexStartIndex := matches[2]
- endIndex := matches[3]
- // Extract the hexadecimal value from the escape sequence
- hexValue := string(input[hexStartIndex:endIndex])
- // Parse the hexadecimal value to an integer
- unicodeInt, err := strconv.ParseInt(hexValue, 16, 32)
- if err != nil {
- // If there's an error, continue to the next escape sequence
- continue
- }
- // Convert the Unicode code point to a UTF-8 representation
- utf8Len := utf8.EncodeRune(utf8Bytes, rune(unicodeInt))
- // Replace the escape sequence with the UTF-8 representation
- input = append(input[:startIndex], append(utf8Bytes[:utf8Len], input[endIndex:]...)...)
- }
- return input
- }
|