4
0

unicode.go 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261
  1. package codec
  2. import (
  3. "bytes"
  4. "strconv"
  5. "strings"
  6. "unicode/utf8"
  7. "github.com/zricethezav/gitleaks/v8/regexp"
  8. )
  9. var (
  10. // Standard Unicode notation (e.g., U+1234)
  11. unicodeCodePointPat = regexp.MustCompile(`U\+([a-fA-F0-9]{4}).?`)
  12. // Multiple code points pattern - used for continuous sequences like "U+0074 U+006F U+006B..."
  13. unicodeMultiCodePointPat = regexp.MustCompile(`(?:U\+[a-fA-F0-9]{4}(?:\s|$))+`)
  14. // Common escape sequence used in programming languages (e.g., \u1234)
  15. unicodeEscapePat = regexp.MustCompile(`(?i)\\{1,2}u([a-fA-F0-9]{4})`)
  16. // Multiple escape sequences pattern - used for continuous sequences like "\u0074\u006F\u006B..."
  17. unicodeMultiEscapePat = regexp.MustCompile(`(?i)(?:\\{1,2}u[a-fA-F0-9]{4})+`)
  18. )
  19. // Unicode characters are encoded as 1 to 4 bytes per rune.
  20. const maxBytesPerRune = 4
  21. // decodeUnicode decodes Unicode escape sequences in the given string
  22. func decodeUnicode(encodedValue string) string {
  23. // First, check if we have a continuous sequence of Unicode code points
  24. if matches := unicodeMultiCodePointPat.FindAllString(encodedValue, -1); len(matches) > 0 {
  25. // For each detected sequence of code points
  26. for _, match := range matches {
  27. // Decode the entire sequence at once
  28. decodedSequence := decodeMultiCodePoint(match)
  29. // If we successfully decoded something, replace it in the original string
  30. if decodedSequence != "" && decodedSequence != match {
  31. encodedValue = strings.Replace(encodedValue, match, decodedSequence, 1)
  32. }
  33. }
  34. return encodedValue
  35. }
  36. // Next, check if we have a continuous sequence of escape sequences
  37. if matches := unicodeMultiEscapePat.FindAllString(encodedValue, -1); len(matches) > 0 {
  38. // For each detected sequence of escape sequences
  39. for _, match := range matches {
  40. // Decode the entire sequence at once
  41. decodedSequence := decodeMultiEscape(match)
  42. // If we successfully decoded something, replace it in the original string
  43. if decodedSequence != "" && decodedSequence != match {
  44. encodedValue = strings.Replace(encodedValue, match, decodedSequence, 1)
  45. }
  46. }
  47. return encodedValue
  48. }
  49. // If no multi-patterns were matched, fall back to the original implementation
  50. // for individual code points and escape sequences
  51. // Create a copy of the input to work with
  52. data := []byte(encodedValue)
  53. // Store the result
  54. var result []byte
  55. // Check and decode Unicode code points (U+1234 format)
  56. if unicodeCodePointPat.Match(data) {
  57. result = decodeIndividualCodePoints(data)
  58. }
  59. // If no code points were found or we have a mix of formats,
  60. // also check for Unicode escape sequences (\u1234 format)
  61. if len(result) == 0 || unicodeEscapePat.Match(data) {
  62. // If we already have some result from code point decoding,
  63. // continue decoding escape sequences on that result
  64. if len(result) > 0 {
  65. result = decodeIndividualEscapes(result)
  66. } else {
  67. result = decodeIndividualEscapes(data)
  68. }
  69. }
  70. // If nothing was decoded, return original string
  71. if len(result) == 0 || bytes.Equal(result, data) {
  72. return encodedValue
  73. }
  74. return string(result)
  75. }
  76. // decodeMultiCodePoint decodes a continuous sequence of Unicode code points (U+XXXX format)
  77. func decodeMultiCodePoint(sequence string) string {
  78. // If the sequence is empty, return empty string
  79. if sequence == "" {
  80. return ""
  81. }
  82. // Split the sequence by whitespace to get individual code points
  83. codePoints := strings.Fields(sequence)
  84. if len(codePoints) == 0 {
  85. return sequence
  86. }
  87. // Decode each code point and build the result
  88. var decodedRunes []rune
  89. for _, cp := range codePoints {
  90. // Check if it follows the U+XXXX pattern
  91. if !strings.HasPrefix(cp, "U+") || len(cp) < 6 {
  92. continue
  93. }
  94. // Extract the hexadecimal value
  95. hexValue := cp[2:]
  96. // Parse the hexadecimal value to an integer
  97. unicodeInt, err := strconv.ParseInt(hexValue, 16, 32)
  98. if err != nil {
  99. continue
  100. }
  101. // Convert to rune and add to result
  102. decodedRunes = append(decodedRunes, rune(unicodeInt))
  103. }
  104. // If we didn't decode anything, return the original sequence
  105. if len(decodedRunes) == 0 {
  106. return sequence
  107. }
  108. // Return the decoded string
  109. return string(decodedRunes)
  110. }
  111. // decodeMultiEscape decodes a continuous sequence of Unicode escape sequences (\uXXXX format)
  112. func decodeMultiEscape(sequence string) string {
  113. // If the sequence is empty, return empty string
  114. if sequence == "" {
  115. return ""
  116. }
  117. // Find all escape sequences
  118. escapes := unicodeEscapePat.FindAllStringSubmatch(sequence, -1)
  119. if len(escapes) == 0 {
  120. return sequence
  121. }
  122. // Decode each escape sequence and build the result
  123. var decodedRunes []rune
  124. for _, esc := range escapes {
  125. // Extract the hexadecimal value
  126. hexValue := esc[1]
  127. // Parse the hexadecimal value to an integer
  128. unicodeInt, err := strconv.ParseInt(hexValue, 16, 32)
  129. if err != nil {
  130. continue
  131. }
  132. // Convert to rune and add to result
  133. decodedRunes = append(decodedRunes, rune(unicodeInt))
  134. }
  135. // If we didn't decode anything, return the original sequence
  136. if len(decodedRunes) == 0 {
  137. return sequence
  138. }
  139. // Return the decoded string
  140. return string(decodedRunes)
  141. }
  142. // decodeIndividualCodePoints decodes individual Unicode code points (U+1234 format)
  143. // This is a fallback for when we don't have a continuous sequence of code points
  144. func decodeIndividualCodePoints(input []byte) []byte {
  145. // Find all Unicode code point sequences in the input byte slice
  146. indices := unicodeCodePointPat.FindAllSubmatchIndex(input, -1)
  147. // If none found, return original input
  148. if len(indices) == 0 {
  149. return input
  150. }
  151. // Iterate over found indices in reverse order to avoid modifying the slice length
  152. utf8Bytes := make([]byte, maxBytesPerRune)
  153. for i := len(indices) - 1; i >= 0; i-- {
  154. matches := indices[i]
  155. startIndex := matches[0]
  156. endIndex := matches[1]
  157. hexStartIndex := matches[2]
  158. hexEndIndex := matches[3]
  159. // If the input is like `U+1234 U+5678` we should replace `U+1234 `.
  160. // Otherwise, we should only replace `U+1234`.
  161. if endIndex != hexEndIndex && endIndex < len(input) && input[endIndex-1] == ' ' {
  162. endIndex = endIndex - 1
  163. }
  164. // Extract the hexadecimal value from the escape sequence
  165. hexValue := string(input[hexStartIndex:hexEndIndex])
  166. // Parse the hexadecimal value to an integer
  167. unicodeInt, err := strconv.ParseInt(hexValue, 16, 32)
  168. if err != nil {
  169. // If there's an error, continue to the next escape sequence
  170. continue
  171. }
  172. // Convert the Unicode code point to a UTF-8 representation
  173. utf8Len := utf8.EncodeRune(utf8Bytes, rune(unicodeInt))
  174. // Replace the escape sequence with the UTF-8 representation
  175. input = append(input[:startIndex], append(utf8Bytes[:utf8Len], input[endIndex:]...)...)
  176. }
  177. return input
  178. }
  179. // decodeIndividualEscapes decodes individual Unicode escape sequences (\u1234 format)
  180. // This is a fallback for when we don't have a continuous sequence of escape sequences
  181. func decodeIndividualEscapes(input []byte) []byte {
  182. // Find all Unicode escape sequences in the input byte slice
  183. indices := unicodeEscapePat.FindAllSubmatchIndex(input, -1)
  184. // If none found, return original input
  185. if len(indices) == 0 {
  186. return input
  187. }
  188. // Iterate over found indices in reverse order to avoid modifying the slice length
  189. utf8Bytes := make([]byte, maxBytesPerRune)
  190. for i := len(indices) - 1; i >= 0; i-- {
  191. matches := indices[i]
  192. startIndex := matches[0]
  193. hexStartIndex := matches[2]
  194. endIndex := matches[3]
  195. // Extract the hexadecimal value from the escape sequence
  196. hexValue := string(input[hexStartIndex:endIndex])
  197. // Parse the hexadecimal value to an integer
  198. unicodeInt, err := strconv.ParseInt(hexValue, 16, 32)
  199. if err != nil {
  200. // If there's an error, continue to the next escape sequence
  201. continue
  202. }
  203. // Convert the Unicode code point to a UTF-8 representation
  204. utf8Len := utf8.EncodeRune(utf8Bytes, rune(unicodeInt))
  205. // Replace the escape sequence with the UTF-8 representation
  206. input = append(input[:startIndex], append(utf8Bytes[:utf8Len], input[endIndex:]...)...)
  207. }
  208. return input
  209. }