parser_test.go 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package parser // import "miniflux.app/v2/internal/reader/parser"
  4. import (
  5. "bytes"
  6. "os"
  7. "testing"
  8. "miniflux.app/v2/internal/http/client"
  9. )
  10. func TestParseAtom(t *testing.T) {
  11. data := `<?xml version="1.0" encoding="utf-8"?>
  12. <feed xmlns="http://www.w3.org/2005/Atom">
  13. <title>Example Feed</title>
  14. <link href="http://example.org/"/>
  15. <updated>2003-12-13T18:30:02Z</updated>
  16. <author>
  17. <name>John Doe</name>
  18. </author>
  19. <id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id>
  20. <entry>
  21. <title>Atom-Powered Robots Run Amok</title>
  22. <link href="http://example.org/2003/12/13/atom03"/>
  23. <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
  24. <updated>2003-12-13T18:30:02Z</updated>
  25. <summary>Some text.</summary>
  26. </entry>
  27. </feed>`
  28. feed, err := ParseFeed("https://example.org/", data)
  29. if err != nil {
  30. t.Error(err)
  31. }
  32. if feed.Title != "Example Feed" {
  33. t.Errorf("Incorrect title, got: %s", feed.Title)
  34. }
  35. }
  36. func TestParseAtomFeedWithRelativeURL(t *testing.T) {
  37. data := `<?xml version="1.0" encoding="utf-8"?>
  38. <feed xmlns="http://www.w3.org/2005/Atom">
  39. <title>Example Feed</title>
  40. <link href="/blog/atom.xml" rel="self" type="application/atom+xml"/>
  41. <link href="/blog"/>
  42. <entry>
  43. <title>Test</title>
  44. <link href="/blog/article.html"/>
  45. <link href="/blog/article.html" rel="alternate" type="text/html"/>
  46. <id>/blog/article.html</id>
  47. <updated>2003-12-13T18:30:02Z</updated>
  48. <summary>Some text.</summary>
  49. </entry>
  50. </feed>`
  51. feed, err := ParseFeed("https://example.org/blog/atom.xml", data)
  52. if err != nil {
  53. t.Fatal(err)
  54. }
  55. if feed.FeedURL != "https://example.org/blog/atom.xml" {
  56. t.Errorf("Incorrect feed URL, got: %s", feed.FeedURL)
  57. }
  58. if feed.SiteURL != "https://example.org/blog" {
  59. t.Errorf("Incorrect site URL, got: %s", feed.SiteURL)
  60. }
  61. if feed.Entries[0].URL != "https://example.org/blog/article.html" {
  62. t.Errorf("Incorrect entry URL, got: %s", feed.Entries[0].URL)
  63. }
  64. }
  65. func TestParseRSS(t *testing.T) {
  66. data := `<?xml version="1.0"?>
  67. <rss version="2.0">
  68. <channel>
  69. <title>Liftoff News</title>
  70. <link>http://liftoff.msfc.nasa.gov/</link>
  71. <item>
  72. <title>Star City</title>
  73. <link>http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp</link>
  74. <description>How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia's &lt;a href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm"&gt;Star City&lt;/a&gt;.</description>
  75. <pubDate>Tue, 03 Jun 2003 09:39:21 GMT</pubDate>
  76. <guid>http://liftoff.msfc.nasa.gov/2003/06/03.html#item573</guid>
  77. </item>
  78. </channel>
  79. </rss>`
  80. feed, err := ParseFeed("http://liftoff.msfc.nasa.gov/", data)
  81. if err != nil {
  82. t.Error(err)
  83. }
  84. if feed.Title != "Liftoff News" {
  85. t.Errorf("Incorrect title, got: %s", feed.Title)
  86. }
  87. }
  88. func TestParseRSSFeedWithRelativeURL(t *testing.T) {
  89. data := `<?xml version="1.0"?>
  90. <rss version="2.0">
  91. <channel>
  92. <title>Example Feed</title>
  93. <link>/blog</link>
  94. <item>
  95. <title>Example Entry</title>
  96. <link>/blog/article.html</link>
  97. <description>Something</description>
  98. <pubDate>Tue, 03 Jun 2003 09:39:21 GMT</pubDate>
  99. <guid>1234</guid>
  100. </item>
  101. </channel>
  102. </rss>`
  103. feed, err := ParseFeed("http://example.org/rss.xml", data)
  104. if err != nil {
  105. t.Error(err)
  106. }
  107. if feed.Title != "Example Feed" {
  108. t.Errorf("Incorrect title, got: %s", feed.Title)
  109. }
  110. if feed.FeedURL != "http://example.org/rss.xml" {
  111. t.Errorf("Incorrect feed URL, got: %s", feed.FeedURL)
  112. }
  113. if feed.SiteURL != "http://example.org/blog" {
  114. t.Errorf("Incorrect site URL, got: %s", feed.SiteURL)
  115. }
  116. if feed.Entries[0].URL != "http://example.org/blog/article.html" {
  117. t.Errorf("Incorrect entry URL, got: %s", feed.Entries[0].URL)
  118. }
  119. }
  120. func TestParseRDF(t *testing.T) {
  121. data := `<?xml version="1.0" encoding="utf-8"?>
  122. <rdf:RDF
  123. xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
  124. xmlns="http://purl.org/rss/1.0/"
  125. >
  126. <channel>
  127. <title>RDF Example</title>
  128. <link>http://example.org/</link>
  129. </channel>
  130. <item>
  131. <title>Title</title>
  132. <link>http://example.org/item</link>
  133. <description>Test</description>
  134. </item>
  135. </rdf:RDF>`
  136. feed, err := ParseFeed("http://example.org/", data)
  137. if err != nil {
  138. t.Error(err)
  139. }
  140. if feed.Title != "RDF Example" {
  141. t.Errorf("Incorrect title, got: %s", feed.Title)
  142. }
  143. }
  144. func TestParseRDFWithRelativeURL(t *testing.T) {
  145. data := `<?xml version="1.0" encoding="utf-8"?>
  146. <rdf:RDF
  147. xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
  148. xmlns="http://purl.org/rss/1.0/"
  149. >
  150. <channel>
  151. <title>RDF Example</title>
  152. <link>/blog</link>
  153. </channel>
  154. <item>
  155. <title>Title</title>
  156. <link>/blog/article.html</link>
  157. <description>Test</description>
  158. </item>
  159. </rdf:RDF>`
  160. feed, err := ParseFeed("http://example.org/rdf.xml", data)
  161. if err != nil {
  162. t.Error(err)
  163. }
  164. if feed.FeedURL != "http://example.org/rdf.xml" {
  165. t.Errorf("Incorrect feed URL, got: %s", feed.FeedURL)
  166. }
  167. if feed.SiteURL != "http://example.org/blog" {
  168. t.Errorf("Incorrect site URL, got: %s", feed.SiteURL)
  169. }
  170. if feed.Entries[0].URL != "http://example.org/blog/article.html" {
  171. t.Errorf("Incorrect entry URL, got: %s", feed.Entries[0].URL)
  172. }
  173. }
  174. func TestParseJson(t *testing.T) {
  175. data := `{
  176. "version": "https://jsonfeed.org/version/1",
  177. "title": "My Example Feed",
  178. "home_page_url": "https://example.org/",
  179. "feed_url": "https://example.org/feed.json",
  180. "items": [
  181. {
  182. "id": "2",
  183. "content_text": "This is a second item.",
  184. "url": "https://example.org/second-item"
  185. },
  186. {
  187. "id": "1",
  188. "content_html": "<p>Hello, world!</p>",
  189. "url": "https://example.org/initial-post"
  190. }
  191. ]
  192. }`
  193. feed, err := ParseFeed("https://example.org/feed.json", data)
  194. if err != nil {
  195. t.Error(err)
  196. }
  197. if feed.Title != "My Example Feed" {
  198. t.Errorf("Incorrect title, got: %s", feed.Title)
  199. }
  200. }
  201. func TestParseJsonFeedWithRelativeURL(t *testing.T) {
  202. data := `{
  203. "version": "https://jsonfeed.org/version/1",
  204. "title": "My Example Feed",
  205. "home_page_url": "/blog",
  206. "feed_url": "/blog/feed.json",
  207. "items": [
  208. {
  209. "id": "2",
  210. "content_text": "This is a second item.",
  211. "url": "/blog/article.html"
  212. }
  213. ]
  214. }`
  215. feed, err := ParseFeed("https://example.org/blog/feed.json", data)
  216. if err != nil {
  217. t.Error(err)
  218. }
  219. if feed.Title != "My Example Feed" {
  220. t.Errorf("Incorrect title, got: %s", feed.Title)
  221. }
  222. if feed.FeedURL != "https://example.org/blog/feed.json" {
  223. t.Errorf("Incorrect feed URL, got: %s", feed.FeedURL)
  224. }
  225. if feed.SiteURL != "https://example.org/blog" {
  226. t.Errorf("Incorrect site URL, got: %s", feed.SiteURL)
  227. }
  228. if feed.Entries[0].URL != "https://example.org/blog/article.html" {
  229. t.Errorf("Incorrect entry URL, got: %s", feed.Entries[0].URL)
  230. }
  231. }
  232. func TestParseUnknownFeed(t *testing.T) {
  233. data := `
  234. <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
  235. <html xmlns="http://www.w3.org/1999/xhtml">
  236. <head>
  237. <title>Title of document</title>
  238. </head>
  239. <body>
  240. some content
  241. </body>
  242. </html>
  243. `
  244. _, err := ParseFeed("https://example.org/", data)
  245. if err == nil {
  246. t.Error("ParseFeed must returns an error")
  247. }
  248. }
  249. func TestParseEmptyFeed(t *testing.T) {
  250. _, err := ParseFeed("", "")
  251. if err == nil {
  252. t.Error("ParseFeed must returns an error")
  253. }
  254. }
  255. func TestDifferentEncodingWithResponse(t *testing.T) {
  256. var unicodeTestCases = []struct {
  257. filename, contentType string
  258. index int
  259. title string
  260. }{
  261. // Arabic language encoded in UTF-8.
  262. {"urdu_UTF8.xml", "text/xml; charset=utf-8", 0, "امریکی عسکری امداد کی بندش کی وجوہات: انڈیا سے جنگ، جوہری پروگرام اور اب دہشت گردوں کی پشت پناہی"},
  263. // Windows-1251 encoding and not charset in HTTP header.
  264. {"encoding_WINDOWS-1251.xml", "text/xml", 0, "Цитата #17703"},
  265. // No encoding in XML, but defined in HTTP Content-Type header.
  266. {"no_encoding_ISO-8859-1.xml", "application/xml; charset=ISO-8859-1", 2, "La criminalité liée surtout à... l'ennui ?"},
  267. // ISO-8859-1 encoding defined in XML and HTTP header.
  268. {"encoding_ISO-8859-1.xml", "application/rss+xml; charset=ISO-8859-1", 5, "Projekt Jedi: Microsoft will weiter mit US-Militär zusammenarbeiten"},
  269. // UTF-8 encoding defined in RDF document and HTTP header.
  270. {"rdf_UTF8.xml", "application/rss+xml; charset=utf-8", 1, "Mega-Deal: IBM übernimmt Red Hat"},
  271. // UTF-8 encoding defined only in RDF document.
  272. {"rdf_UTF8.xml", "application/rss+xml", 1, "Mega-Deal: IBM übernimmt Red Hat"},
  273. }
  274. for _, tc := range unicodeTestCases {
  275. content, err := os.ReadFile("testdata/" + tc.filename)
  276. if err != nil {
  277. t.Fatalf(`Unable to read file %q: %v`, tc.filename, err)
  278. }
  279. r := &client.Response{Body: bytes.NewReader(content), ContentType: tc.contentType}
  280. if encodingErr := r.EnsureUnicodeBody(); encodingErr != nil {
  281. t.Fatalf(`Encoding error for %q: %v`, tc.filename, encodingErr)
  282. }
  283. feed, parseErr := ParseFeed("https://example.org/", r.BodyAsString())
  284. if parseErr != nil {
  285. t.Fatalf(`Parsing error for %q - %q: %v`, tc.filename, tc.contentType, parseErr)
  286. }
  287. if feed.Entries[tc.index].Title != tc.title {
  288. t.Errorf(`Unexpected title, got %q instead of %q`, feed.Entries[tc.index].Title, tc.title)
  289. }
  290. }
  291. }