encoding_test.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package encoding // import "miniflux.app/v2/internal/reader/encoding"
  4. import (
  5. "bytes"
  6. "io"
  7. "os"
  8. "testing"
  9. "unicode/utf8"
  10. "golang.org/x/text/encoding/charmap"
  11. )
  12. func TestCharsetReaderWithUTF8(t *testing.T) {
  13. file := "testdata/utf8.xml"
  14. f, err := os.Open(file)
  15. if err != nil {
  16. t.Fatalf("Unable to open file: %v", err)
  17. }
  18. reader, err := CharsetReader("UTF-8", f)
  19. if err != nil {
  20. t.Fatalf("Unable to create reader: %v", err)
  21. }
  22. data, err := io.ReadAll(reader)
  23. if err != nil {
  24. t.Fatalf("Unable to read data: %v", err)
  25. }
  26. if !utf8.Valid(data) {
  27. t.Fatalf("Data is not valid UTF-8")
  28. }
  29. expectedUnicodeString := "Café"
  30. if !bytes.Contains(data, []byte(expectedUnicodeString)) {
  31. t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
  32. }
  33. }
  34. func TestCharsetReaderWithISO88591(t *testing.T) {
  35. file := "testdata/iso-8859-1.xml"
  36. f, err := os.Open(file)
  37. if err != nil {
  38. t.Fatalf("Unable to open file: %v", err)
  39. }
  40. reader, err := CharsetReader("ISO-8859-1", f)
  41. if err != nil {
  42. t.Fatalf("Unable to create reader: %v", err)
  43. }
  44. data, err := io.ReadAll(reader)
  45. if err != nil {
  46. t.Fatalf("Unable to read data: %v", err)
  47. }
  48. if !utf8.Valid(data) {
  49. t.Fatalf("Data is not valid UTF-8")
  50. }
  51. expectedUnicodeString := "Café"
  52. if !bytes.Contains(data, []byte(expectedUnicodeString)) {
  53. t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
  54. }
  55. }
  56. func TestCharsetReaderWithWindows1252(t *testing.T) {
  57. file := "testdata/windows-1252.xml"
  58. f, err := os.Open(file)
  59. if err != nil {
  60. t.Fatalf("Unable to open file: %v", err)
  61. }
  62. reader, err := CharsetReader("windows-1252", f)
  63. if err != nil {
  64. t.Fatalf("Unable to create reader: %v", err)
  65. }
  66. data, err := io.ReadAll(reader)
  67. if err != nil {
  68. t.Fatalf("Unable to read data: %v", err)
  69. }
  70. if !utf8.Valid(data) {
  71. t.Fatalf("Data is not valid UTF-8")
  72. }
  73. expectedUnicodeString := "Euro €"
  74. if !bytes.Contains(data, []byte(expectedUnicodeString)) {
  75. t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
  76. }
  77. }
  78. func TestCharsetReaderWithInvalidProlog(t *testing.T) {
  79. file := "testdata/invalid-prolog.xml"
  80. f, err := os.Open(file)
  81. if err != nil {
  82. t.Fatalf("Unable to open file: %v", err)
  83. }
  84. reader, err := CharsetReader("invalid", f)
  85. if err != nil {
  86. t.Fatalf("Unable to create reader: %v", err)
  87. }
  88. data, err := io.ReadAll(reader)
  89. if err != nil {
  90. t.Fatalf("Unable to read data: %v", err)
  91. }
  92. if !utf8.Valid(data) {
  93. t.Fatalf("Data is not valid UTF-8")
  94. }
  95. expectedUnicodeString := "Café"
  96. if !bytes.Contains(data, []byte(expectedUnicodeString)) {
  97. t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
  98. }
  99. }
  100. func TestCharsetReaderWithUTF8DocumentWithIncorrectProlog(t *testing.T) {
  101. file := "testdata/utf8-incorrect-prolog.xml"
  102. f, err := os.Open(file)
  103. if err != nil {
  104. t.Fatalf("Unable to open file: %v", err)
  105. }
  106. reader, err := CharsetReader("ISO-8859-1", f)
  107. if err != nil {
  108. t.Fatalf("Unable to create reader: %v", err)
  109. }
  110. data, err := io.ReadAll(reader)
  111. if err != nil {
  112. t.Fatalf("Unable to read data: %v", err)
  113. }
  114. if !utf8.Valid(data) {
  115. t.Fatalf("Data is not valid UTF-8")
  116. }
  117. expectedUnicodeString := "Café"
  118. if !bytes.Contains(data, []byte(expectedUnicodeString)) {
  119. t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
  120. }
  121. }
  122. func TestCharsetReaderWithWindows1252DocumentWithIncorrectProlog(t *testing.T) {
  123. file := "testdata/windows-1252-incorrect-prolog.xml"
  124. f, err := os.Open(file)
  125. if err != nil {
  126. t.Fatalf("Unable to open file: %v", err)
  127. }
  128. reader, err := CharsetReader("windows-1252", f)
  129. if err != nil {
  130. t.Fatalf("Unable to create reader: %v", err)
  131. }
  132. data, err := io.ReadAll(reader)
  133. if err != nil {
  134. t.Fatalf("Unable to read data: %v", err)
  135. }
  136. if !utf8.Valid(data) {
  137. t.Fatalf("Data is not valid UTF-8")
  138. }
  139. expectedUnicodeString := "Euro €"
  140. if !bytes.Contains(data, []byte(expectedUnicodeString)) {
  141. t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
  142. }
  143. }
  144. func TestNewReaderWithUTF8Document(t *testing.T) {
  145. file := "testdata/utf8.html"
  146. f, err := os.Open(file)
  147. if err != nil {
  148. t.Fatalf("Unable to open file: %v", err)
  149. }
  150. reader, err := NewCharsetReader(f, "text/html; charset=UTF-8")
  151. if err != nil {
  152. t.Fatalf("Unable to create reader: %v", err)
  153. }
  154. data, err := io.ReadAll(reader)
  155. if err != nil {
  156. t.Fatalf("Unable to read data: %v", err)
  157. }
  158. if !utf8.Valid(data) {
  159. t.Fatalf("Data is not valid UTF-8")
  160. }
  161. expectedUnicodeString := "Café"
  162. if !bytes.Contains(data, []byte(expectedUnicodeString)) {
  163. t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
  164. }
  165. }
  166. func TestNewReaderWithUTF8DocumentAndNoContentEncoding(t *testing.T) {
  167. file := "testdata/utf8.html"
  168. f, err := os.Open(file)
  169. if err != nil {
  170. t.Fatalf("Unable to open file: %v", err)
  171. }
  172. reader, err := NewCharsetReader(f, "text/html")
  173. if err != nil {
  174. t.Fatalf("Unable to create reader: %v", err)
  175. }
  176. data, err := io.ReadAll(reader)
  177. if err != nil {
  178. t.Fatalf("Unable to read data: %v", err)
  179. }
  180. if !utf8.Valid(data) {
  181. t.Fatalf("Data is not valid UTF-8")
  182. }
  183. expectedUnicodeString := "Café"
  184. if !bytes.Contains(data, []byte(expectedUnicodeString)) {
  185. t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
  186. }
  187. }
  188. func TestNewReaderWithISO88591Document(t *testing.T) {
  189. file := "testdata/iso-8859-1.xml"
  190. f, err := os.Open(file)
  191. if err != nil {
  192. t.Fatalf("Unable to open file: %v", err)
  193. }
  194. reader, err := NewCharsetReader(f, "text/html; charset=ISO-8859-1")
  195. if err != nil {
  196. t.Fatalf("Unable to create reader: %v", err)
  197. }
  198. data, err := io.ReadAll(reader)
  199. if err != nil {
  200. t.Fatalf("Unable to read data: %v", err)
  201. }
  202. if !utf8.Valid(data) {
  203. t.Fatalf("Data is not valid UTF-8")
  204. }
  205. expectedUnicodeString := "Café"
  206. if !bytes.Contains(data, []byte(expectedUnicodeString)) {
  207. t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
  208. }
  209. }
  210. func TestNewReaderWithISO88591DocumentAndNoContentType(t *testing.T) {
  211. file := "testdata/iso-8859-1.xml"
  212. f, err := os.Open(file)
  213. if err != nil {
  214. t.Fatalf("Unable to open file: %v", err)
  215. }
  216. reader, err := NewCharsetReader(f, "")
  217. if err != nil {
  218. t.Fatalf("Unable to create reader: %v", err)
  219. }
  220. data, err := io.ReadAll(reader)
  221. if err != nil {
  222. t.Fatalf("Unable to read data: %v", err)
  223. }
  224. if !utf8.Valid(data) {
  225. t.Fatalf("Data is not valid UTF-8")
  226. }
  227. expectedUnicodeString := "Café"
  228. if !bytes.Contains(data, []byte(expectedUnicodeString)) {
  229. t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
  230. }
  231. }
  232. func TestNewReaderWithISO88591DocumentWithMetaAfter1024Bytes(t *testing.T) {
  233. file := "testdata/iso-8859-1-meta-after-1024.html"
  234. f, err := os.Open(file)
  235. if err != nil {
  236. t.Fatalf("Unable to open file: %v", err)
  237. }
  238. reader, err := NewCharsetReader(f, "text/html")
  239. if err != nil {
  240. t.Fatalf("Unable to create reader: %v", err)
  241. }
  242. data, err := io.ReadAll(reader)
  243. if err != nil {
  244. t.Fatalf("Unable to read data: %v", err)
  245. }
  246. if !utf8.Valid(data) {
  247. t.Fatalf("Data is not valid UTF-8")
  248. }
  249. expectedUnicodeString := "Café"
  250. if !bytes.Contains(data, []byte(expectedUnicodeString)) {
  251. t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
  252. }
  253. }
  254. func TestNewReaderWithUTF8DocumentWithMetaAfter1024Bytes(t *testing.T) {
  255. file := "testdata/utf8-meta-after-1024.html"
  256. f, err := os.Open(file)
  257. if err != nil {
  258. t.Fatalf("Unable to open file: %v", err)
  259. }
  260. reader, err := NewCharsetReader(f, "text/html")
  261. if err != nil {
  262. t.Fatalf("Unable to create reader: %v", err)
  263. }
  264. data, err := io.ReadAll(reader)
  265. if err != nil {
  266. t.Fatalf("Unable to read data: %v", err)
  267. }
  268. if !utf8.Valid(data) {
  269. t.Fatalf("Data is not valid UTF-8")
  270. }
  271. expectedUnicodeString := "Café"
  272. if !bytes.Contains(data, []byte(expectedUnicodeString)) {
  273. t.Fatalf("Data does not contain expected unicode string: %s", expectedUnicodeString)
  274. }
  275. }
  276. func TestCharsetReaderWithKOI8RLabel(t *testing.T) {
  277. expectedUnicodeString := "Привет мир"
  278. input, err := charmap.KOI8R.NewEncoder().Bytes([]byte(expectedUnicodeString))
  279. if err != nil {
  280. t.Fatalf("Unable to build KOI8-R input: %v", err)
  281. }
  282. reader, err := CharsetReader("koi8-r", bytes.NewReader(input))
  283. if err != nil {
  284. t.Fatalf("Unable to create reader: %v", err)
  285. }
  286. data, err := io.ReadAll(reader)
  287. if err != nil {
  288. t.Fatalf("Unable to read data: %v", err)
  289. }
  290. if !utf8.Valid(data) {
  291. t.Fatalf("Data is not valid UTF-8")
  292. }
  293. if string(data) != expectedUnicodeString {
  294. t.Fatalf("Data does not match expected unicode string, got %q expected %q", string(data), expectedUnicodeString)
  295. }
  296. }
  297. func TestCharsetReaderWithUppercaseKOI8RLabel(t *testing.T) {
  298. expectedUnicodeString := "Привет мир"
  299. input, err := charmap.KOI8R.NewEncoder().Bytes([]byte(expectedUnicodeString))
  300. if err != nil {
  301. t.Fatalf("Unable to build KOI8-R input: %v", err)
  302. }
  303. reader, err := CharsetReader("KOI8-R", bytes.NewReader(input))
  304. if err != nil {
  305. t.Fatalf("Unable to create reader: %v", err)
  306. }
  307. data, err := io.ReadAll(reader)
  308. if err != nil {
  309. t.Fatalf("Unable to read data: %v", err)
  310. }
  311. if !utf8.Valid(data) {
  312. t.Fatalf("Data is not valid UTF-8")
  313. }
  314. if string(data) != expectedUnicodeString {
  315. t.Fatalf("Data does not match expected unicode string, got %q expected %q", string(data), expectedUnicodeString)
  316. }
  317. }
  318. func TestCharsetReaderWithKOI8RFeedFixture(t *testing.T) {
  319. file := "testdata/koi8r.xml"
  320. f, err := os.Open(file)
  321. if err != nil {
  322. t.Fatalf("Unable to open file: %v", err)
  323. }
  324. reader, err := CharsetReader("KOI8-R", f)
  325. if err != nil {
  326. t.Fatalf("Unable to create reader: %v", err)
  327. }
  328. data, err := io.ReadAll(reader)
  329. if err != nil {
  330. t.Fatalf("Unable to read data: %v", err)
  331. }
  332. if !utf8.Valid(data) {
  333. t.Fatalf("Data is not valid UTF-8")
  334. }
  335. if !bytes.Contains(data, []byte("Пример RSS ленты")) {
  336. t.Fatalf("Data does not contain expected unicode string: %s", "Пример RSS ленты")
  337. }
  338. if !bytes.Contains(data, []byte("Привет мир! Ёжик, чай, Москва, Санкт-Петербург.")) {
  339. t.Fatalf("Data does not contain expected unicode string: %s", "Привет мир! Ёжик, чай, Москва, Санкт-Петербург.")
  340. }
  341. }
  342. func TestNewCharsetReaderWithKOI8RContentType(t *testing.T) {
  343. expectedUnicodeString := "Привет мир"
  344. input, err := charmap.KOI8R.NewEncoder().Bytes([]byte(expectedUnicodeString))
  345. if err != nil {
  346. t.Fatalf("Unable to build KOI8-R input: %v", err)
  347. }
  348. reader, err := NewCharsetReader(bytes.NewReader(input), "text/xml; charset=koi8-r")
  349. if err != nil {
  350. t.Fatalf("Unable to create reader: %v", err)
  351. }
  352. data, err := io.ReadAll(reader)
  353. if err != nil {
  354. t.Fatalf("Unable to read data: %v", err)
  355. }
  356. if !utf8.Valid(data) {
  357. t.Fatalf("Data is not valid UTF-8")
  358. }
  359. if string(data) != expectedUnicodeString {
  360. t.Fatalf("Data does not match expected unicode string, got %q expected %q", string(data), expectedUnicodeString)
  361. }
  362. }
  363. func TestNewCharsetReaderWithKOI8RFeedFixtureAndContentType(t *testing.T) {
  364. file := "testdata/koi8r.xml"
  365. f, err := os.Open(file)
  366. if err != nil {
  367. t.Fatalf("Unable to open file: %v", err)
  368. }
  369. reader, err := NewCharsetReader(f, "application/rss+xml; charset=KOI8-R")
  370. if err != nil {
  371. t.Fatalf("Unable to create reader: %v", err)
  372. }
  373. data, err := io.ReadAll(reader)
  374. if err != nil {
  375. t.Fatalf("Unable to read data: %v", err)
  376. }
  377. if !utf8.Valid(data) {
  378. t.Fatalf("Data is not valid UTF-8")
  379. }
  380. if !bytes.Contains(data, []byte("Тестовая лента в кодировке KOI8-R")) {
  381. t.Fatalf("Data does not contain expected unicode string: %s", "Тестовая лента в кодировке KOI8-R")
  382. }
  383. if !bytes.Contains(data, []byte("Проверка специальных символов")) {
  384. t.Fatalf("Data does not contain expected unicode string: %s", "Проверка специальных символов")
  385. }
  386. }