encoding_test.go 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package encoding // import "miniflux.app/v2/internal/reader/encoding"
  4. import (
  5. "bytes"
  6. "io"
  7. "os"
  8. "testing"
  9. "unicode/utf8"
  10. )
  11. func TestCharsetReaderWithUTF8(t *testing.T) {
  12. file := "testdata/utf8.xml"
  13. f, err := os.Open(file)
  14. if err != nil {
  15. t.Fatalf("Unable to open file: %v", err)
  16. }
  17. reader, err := CharsetReader("UTF-8", f)
  18. if err != nil {
  19. t.Fatalf("Unable to create reader: %v", err)
  20. }
  21. data, err := io.ReadAll(reader)
  22. if err != nil {
  23. t.Fatalf("Unable to read data: %v", err)
  24. }
  25. if !utf8.Valid(data) {
  26. t.Fatalf("Data is not valid UTF-8")
  27. }
  28. expectedUnicodeString := "Café"
  29. if !bytes.Contains(data, []byte(expectedUnicodeString)) {
  30. t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
  31. }
  32. }
  33. func TestCharsetReaderWithISO88591(t *testing.T) {
  34. file := "testdata/iso-8859-1.xml"
  35. f, err := os.Open(file)
  36. if err != nil {
  37. t.Fatalf("Unable to open file: %v", err)
  38. }
  39. reader, err := CharsetReader("ISO-8859-1", f)
  40. if err != nil {
  41. t.Fatalf("Unable to create reader: %v", err)
  42. }
  43. data, err := io.ReadAll(reader)
  44. if err != nil {
  45. t.Fatalf("Unable to read data: %v", err)
  46. }
  47. if !utf8.Valid(data) {
  48. t.Fatalf("Data is not valid UTF-8")
  49. }
  50. expectedUnicodeString := "Café"
  51. if !bytes.Contains(data, []byte(expectedUnicodeString)) {
  52. t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
  53. }
  54. }
  55. func TestCharsetReaderWithWindows1252(t *testing.T) {
  56. file := "testdata/windows-1252.xml"
  57. f, err := os.Open(file)
  58. if err != nil {
  59. t.Fatalf("Unable to open file: %v", err)
  60. }
  61. reader, err := CharsetReader("windows-1252", f)
  62. if err != nil {
  63. t.Fatalf("Unable to create reader: %v", err)
  64. }
  65. data, err := io.ReadAll(reader)
  66. if err != nil {
  67. t.Fatalf("Unable to read data: %v", err)
  68. }
  69. if !utf8.Valid(data) {
  70. t.Fatalf("Data is not valid UTF-8")
  71. }
  72. expectedUnicodeString := "Euro €"
  73. if !bytes.Contains(data, []byte(expectedUnicodeString)) {
  74. t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
  75. }
  76. }
  77. func TestCharsetReaderWithInvalidProlog(t *testing.T) {
  78. file := "testdata/invalid-prolog.xml"
  79. f, err := os.Open(file)
  80. if err != nil {
  81. t.Fatalf("Unable to open file: %v", err)
  82. }
  83. reader, err := CharsetReader("invalid", f)
  84. if err != nil {
  85. t.Fatalf("Unable to create reader: %v", err)
  86. }
  87. data, err := io.ReadAll(reader)
  88. if err != nil {
  89. t.Fatalf("Unable to read data: %v", err)
  90. }
  91. if !utf8.Valid(data) {
  92. t.Fatalf("Data is not valid UTF-8")
  93. }
  94. expectedUnicodeString := "Café"
  95. if !bytes.Contains(data, []byte(expectedUnicodeString)) {
  96. t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
  97. }
  98. }
  99. func TestCharsetReaderWithUTF8DocumentWithIncorrectProlog(t *testing.T) {
  100. file := "testdata/utf8-incorrect-prolog.xml"
  101. f, err := os.Open(file)
  102. if err != nil {
  103. t.Fatalf("Unable to open file: %v", err)
  104. }
  105. reader, err := CharsetReader("ISO-8859-1", f)
  106. if err != nil {
  107. t.Fatalf("Unable to create reader: %v", err)
  108. }
  109. data, err := io.ReadAll(reader)
  110. if err != nil {
  111. t.Fatalf("Unable to read data: %v", err)
  112. }
  113. if !utf8.Valid(data) {
  114. t.Fatalf("Data is not valid UTF-8")
  115. }
  116. expectedUnicodeString := "Café"
  117. if !bytes.Contains(data, []byte(expectedUnicodeString)) {
  118. t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
  119. }
  120. }
  121. func TestCharsetReaderWithWindows1252DocumentWithIncorrectProlog(t *testing.T) {
  122. file := "testdata/windows-1252-incorrect-prolog.xml"
  123. f, err := os.Open(file)
  124. if err != nil {
  125. t.Fatalf("Unable to open file: %v", err)
  126. }
  127. reader, err := CharsetReader("windows-1252", f)
  128. if err != nil {
  129. t.Fatalf("Unable to create reader: %v", err)
  130. }
  131. data, err := io.ReadAll(reader)
  132. if err != nil {
  133. t.Fatalf("Unable to read data: %v", err)
  134. }
  135. if !utf8.Valid(data) {
  136. t.Fatalf("Data is not valid UTF-8")
  137. }
  138. expectedUnicodeString := "Euro €"
  139. if !bytes.Contains(data, []byte(expectedUnicodeString)) {
  140. t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
  141. }
  142. }
  143. func TestNewReaderWithUTF8Document(t *testing.T) {
  144. file := "testdata/utf8.html"
  145. f, err := os.Open(file)
  146. if err != nil {
  147. t.Fatalf("Unable to open file: %v", err)
  148. }
  149. reader, err := NewCharsetReader(f, "text/html; charset=UTF-8")
  150. if err != nil {
  151. t.Fatalf("Unable to create reader: %v", err)
  152. }
  153. data, err := io.ReadAll(reader)
  154. if err != nil {
  155. t.Fatalf("Unable to read data: %v", err)
  156. }
  157. if !utf8.Valid(data) {
  158. t.Fatalf("Data is not valid UTF-8")
  159. }
  160. expectedUnicodeString := "Café"
  161. if !bytes.Contains(data, []byte(expectedUnicodeString)) {
  162. t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
  163. }
  164. }
  165. func TestNewReaderWithUTF8DocumentAndNoContentEncoding(t *testing.T) {
  166. file := "testdata/utf8.html"
  167. f, err := os.Open(file)
  168. if err != nil {
  169. t.Fatalf("Unable to open file: %v", err)
  170. }
  171. reader, err := NewCharsetReader(f, "text/html")
  172. if err != nil {
  173. t.Fatalf("Unable to create reader: %v", err)
  174. }
  175. data, err := io.ReadAll(reader)
  176. if err != nil {
  177. t.Fatalf("Unable to read data: %v", err)
  178. }
  179. if !utf8.Valid(data) {
  180. t.Fatalf("Data is not valid UTF-8")
  181. }
  182. expectedUnicodeString := "Café"
  183. if !bytes.Contains(data, []byte(expectedUnicodeString)) {
  184. t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
  185. }
  186. }
  187. func TestNewReaderWithISO88591Document(t *testing.T) {
  188. file := "testdata/iso-8859-1.xml"
  189. f, err := os.Open(file)
  190. if err != nil {
  191. t.Fatalf("Unable to open file: %v", err)
  192. }
  193. reader, err := NewCharsetReader(f, "text/html; charset=ISO-8859-1")
  194. if err != nil {
  195. t.Fatalf("Unable to create reader: %v", err)
  196. }
  197. data, err := io.ReadAll(reader)
  198. if err != nil {
  199. t.Fatalf("Unable to read data: %v", err)
  200. }
  201. if !utf8.Valid(data) {
  202. t.Fatalf("Data is not valid UTF-8")
  203. }
  204. expectedUnicodeString := "Café"
  205. if !bytes.Contains(data, []byte(expectedUnicodeString)) {
  206. t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
  207. }
  208. }
  209. func TestNewReaderWithISO88591DocumentAndNoContentType(t *testing.T) {
  210. file := "testdata/iso-8859-1.xml"
  211. f, err := os.Open(file)
  212. if err != nil {
  213. t.Fatalf("Unable to open file: %v", err)
  214. }
  215. reader, err := NewCharsetReader(f, "")
  216. if err != nil {
  217. t.Fatalf("Unable to create reader: %v", err)
  218. }
  219. data, err := io.ReadAll(reader)
  220. if err != nil {
  221. t.Fatalf("Unable to read data: %v", err)
  222. }
  223. if !utf8.Valid(data) {
  224. t.Fatalf("Data is not valid UTF-8")
  225. }
  226. expectedUnicodeString := "Café"
  227. if !bytes.Contains(data, []byte(expectedUnicodeString)) {
  228. t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
  229. }
  230. }
  231. func TestNewReaderWithISO88591DocumentWithMetaAfter1024Bytes(t *testing.T) {
  232. file := "testdata/iso-8859-1-meta-after-1024.html"
  233. f, err := os.Open(file)
  234. if err != nil {
  235. t.Fatalf("Unable to open file: %v", err)
  236. }
  237. reader, err := NewCharsetReader(f, "text/html")
  238. if err != nil {
  239. t.Fatalf("Unable to create reader: %v", err)
  240. }
  241. data, err := io.ReadAll(reader)
  242. if err != nil {
  243. t.Fatalf("Unable to read data: %v", err)
  244. }
  245. if !utf8.Valid(data) {
  246. t.Fatalf("Data is not valid UTF-8")
  247. }
  248. expectedUnicodeString := "Café"
  249. if !bytes.Contains(data, []byte(expectedUnicodeString)) {
  250. t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
  251. }
  252. }
  253. func TestNewReaderWithUTF8DocumentWithMetaAfter1024Bytes(t *testing.T) {
  254. file := "testdata/utf8-meta-after-1024.html"
  255. f, err := os.Open(file)
  256. if err != nil {
  257. t.Fatalf("Unable to open file: %v", err)
  258. }
  259. reader, err := NewCharsetReader(f, "text/html")
  260. if err != nil {
  261. t.Fatalf("Unable to create reader: %v", err)
  262. }
  263. data, err := io.ReadAll(reader)
  264. if err != nil {
  265. t.Fatalf("Unable to read data: %v", err)
  266. }
  267. if !utf8.Valid(data) {
  268. t.Fatalf("Data is not valid UTF-8")
  269. }
  270. expectedUnicodeString := "Café"
  271. if !bytes.Contains(data, []byte(expectedUnicodeString)) {
  272. t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
  273. }
  274. }