parser_test.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package parser // import "miniflux.app/v2/internal/reader/parser"
  4. import (
  5. "os"
  6. "strings"
  7. "testing"
  8. )
  9. func BenchmarkParse(b *testing.B) {
  10. var testCases = map[string][]string{
  11. "large_atom.xml": {"https://dustri.org/b", ""},
  12. "large_rss.xml": {"https://dustri.org/b", ""},
  13. "small_atom.xml": {"https://github.com/miniflux/v2/commits/main", ""},
  14. }
  15. for filename := range testCases {
  16. data, err := os.ReadFile("./testdata/" + filename)
  17. if err != nil {
  18. b.Fatalf(`Unable to read file %q: %v`, filename, err)
  19. }
  20. testCases[filename][1] = string(data)
  21. }
  22. for b.Loop() {
  23. for _, v := range testCases {
  24. ParseFeed(v[0], strings.NewReader(v[1]))
  25. }
  26. }
  27. }
  28. func FuzzParse(f *testing.F) {
  29. f.Add("https://z.org", `<?xml version="1.0" encoding="utf-8"?>
  30. <feed xmlns="http://www.w3.org/2005/Atom">
  31. <title>Example Feed</title>
  32. <link href="http://z.org/"/>
  33. <link href="/k"/>
  34. <updated>2003-12-13T18:30:02Z</updated>
  35. <author><name>John Doe</name></author>
  36. <id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id>
  37. <entry>
  38. <title>a</title>
  39. <link href="http://example.org/b"/>
  40. <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
  41. <updated>2003-12-13T18:30:02Z</updated>
  42. <summary>c</summary>
  43. </entry>
  44. </feed>`)
  45. f.Add("https://z.org", `<?xml version="1.0"?>
  46. <rss version="2.0">
  47. <channel>
  48. <title>a</title>
  49. <link>http://z.org</link>
  50. <item>
  51. <title>a</title>
  52. <link>http://z.org</link>
  53. <description>d</description>
  54. <pubDate>Tue, 03 Jun 2003 09:39:21 GMT</pubDate>
  55. <guid>l</guid>
  56. </item>
  57. </channel>
  58. </rss>`)
  59. f.Add("https://z.org", `<?xml version="1.0" encoding="utf-8"?>
  60. <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/">
  61. <channel>
  62. <title>a</title>
  63. <link>http://z.org/</link>
  64. </channel>
  65. <item>
  66. <title>a</title>
  67. <link>/</link>
  68. <description>c</description>
  69. </item>
  70. </rdf:RDF>`)
  71. f.Add("http://z.org", `{
  72. "version": "http://jsonfeed.org/version/1",
  73. "title": "a",
  74. "home_page_url": "http://z.org/",
  75. "feed_url": "http://z.org/a.json",
  76. "items": [
  77. {"id": "2","content_text": "a","url": "https://z.org/2"},
  78. {"id": "1","content_html": "<a","url":"http://z.org/1"}]}`)
  79. f.Fuzz(func(t *testing.T, url string, data string) {
  80. ParseFeed(url, strings.NewReader(data))
  81. })
  82. }
  83. func TestParseAtom03Feed(t *testing.T) {
  84. data := `<?xml version="1.0" encoding="utf-8"?>
  85. <feed version="0.3" xmlns="http://purl.org/atom/ns#">
  86. <title>dive into mark</title>
  87. <link rel="alternate" type="text/html" href="http://diveintomark.org/"/>
  88. <modified>2003-12-13T18:30:02Z</modified>
  89. <author><name>Mark Pilgrim</name></author>
  90. <entry>
  91. <title>Atom 0.3 snapshot</title>
  92. <link rel="alternate" type="text/html" href="http://diveintomark.org/2003/12/13/atom03"/>
  93. <id>tag:diveintomark.org,2003:3.2397</id>
  94. <issued>2003-12-13T08:29:29-04:00</issued>
  95. <modified>2003-12-13T18:30:02Z</modified>
  96. <summary type="text/plain">It&apos;s a test</summary>
  97. <content type="text/html" mode="escaped"><![CDATA[<p>HTML content</p>]]></content>
  98. </entry>
  99. </feed>`
  100. feed, err := ParseFeed("https://example.org/", strings.NewReader(data))
  101. if err != nil {
  102. t.Error(err)
  103. }
  104. if feed.Title != "dive into mark" {
  105. t.Errorf("Incorrect title, got: %s", feed.Title)
  106. }
  107. }
  108. func TestParseAtom10Feed(t *testing.T) {
  109. data := `<?xml version="1.0" encoding="utf-8"?>
  110. <feed xmlns="http://www.w3.org/2005/Atom">
  111. <title>Example Feed</title>
  112. <link href="http://example.org/"/>
  113. <updated>2003-12-13T18:30:02Z</updated>
  114. <author>
  115. <name>John Doe</name>
  116. </author>
  117. <id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id>
  118. <entry>
  119. <title>Atom-Powered Robots Run Amok</title>
  120. <link href="http://example.org/2003/12/13/atom03"/>
  121. <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
  122. <updated>2003-12-13T18:30:02Z</updated>
  123. <summary>Some text.</summary>
  124. </entry>
  125. </feed>`
  126. feed, err := ParseFeed("https://example.org/", strings.NewReader(data))
  127. if err != nil {
  128. t.Error(err)
  129. }
  130. if feed.Title != "Example Feed" {
  131. t.Errorf("Incorrect title, got: %s", feed.Title)
  132. }
  133. }
  134. func TestParseAtomFeedWithRelativeURL(t *testing.T) {
  135. data := `<?xml version="1.0" encoding="utf-8"?>
  136. <feed xmlns="http://www.w3.org/2005/Atom">
  137. <title>Example Feed</title>
  138. <link href="/blog/atom.xml" rel="self" type="application/atom+xml"/>
  139. <link href="/blog"/>
  140. <entry>
  141. <title>Test</title>
  142. <link href="/blog/article.html"/>
  143. <link href="/blog/article.html" rel="alternate" type="text/html"/>
  144. <id>/blog/article.html</id>
  145. <updated>2003-12-13T18:30:02Z</updated>
  146. <summary>Some text.</summary>
  147. </entry>
  148. </feed>`
  149. feed, err := ParseFeed("https://example.org/blog/atom.xml", strings.NewReader(data))
  150. if err != nil {
  151. t.Fatal(err)
  152. }
  153. if feed.FeedURL != "https://example.org/blog/atom.xml" {
  154. t.Errorf("Incorrect feed URL, got: %s", feed.FeedURL)
  155. }
  156. if feed.SiteURL != "https://example.org/blog" {
  157. t.Errorf("Incorrect site URL, got: %s", feed.SiteURL)
  158. }
  159. if feed.Entries[0].URL != "https://example.org/blog/article.html" {
  160. t.Errorf("Incorrect entry URL, got: %s", feed.Entries[0].URL)
  161. }
  162. }
  163. func TestParseRSS(t *testing.T) {
  164. data := `<?xml version="1.0"?>
  165. <rss version="2.0">
  166. <channel>
  167. <title>Liftoff News</title>
  168. <link>http://liftoff.msfc.nasa.gov/</link>
  169. <item>
  170. <title>Star City</title>
  171. <link>http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp</link>
  172. <description>How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia's &lt;a href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm"&gt;Star City&lt;/a&gt;.</description>
  173. <pubDate>Tue, 03 Jun 2003 09:39:21 GMT</pubDate>
  174. <guid>http://liftoff.msfc.nasa.gov/2003/06/03.html#item573</guid>
  175. </item>
  176. </channel>
  177. </rss>`
  178. feed, err := ParseFeed("http://liftoff.msfc.nasa.gov/", strings.NewReader(data))
  179. if err != nil {
  180. t.Error(err)
  181. }
  182. if feed.Title != "Liftoff News" {
  183. t.Errorf("Incorrect title, got: %s", feed.Title)
  184. }
  185. }
  186. func TestParseRSSFeedWithRelativeURL(t *testing.T) {
  187. data := `<?xml version="1.0"?>
  188. <rss version="2.0">
  189. <channel>
  190. <title>Example Feed</title>
  191. <link>/blog</link>
  192. <item>
  193. <title>Example Entry</title>
  194. <link>/blog/article.html</link>
  195. <description>Something</description>
  196. <pubDate>Tue, 03 Jun 2003 09:39:21 GMT</pubDate>
  197. <guid>1234</guid>
  198. </item>
  199. </channel>
  200. </rss>`
  201. feed, err := ParseFeed("http://example.org/rss.xml", strings.NewReader(data))
  202. if err != nil {
  203. t.Error(err)
  204. }
  205. if feed.Title != "Example Feed" {
  206. t.Errorf("Incorrect title, got: %s", feed.Title)
  207. }
  208. if feed.FeedURL != "http://example.org/rss.xml" {
  209. t.Errorf("Incorrect feed URL, got: %s", feed.FeedURL)
  210. }
  211. if feed.SiteURL != "http://example.org/blog" {
  212. t.Errorf("Incorrect site URL, got: %s", feed.SiteURL)
  213. }
  214. if feed.Entries[0].URL != "http://example.org/blog/article.html" {
  215. t.Errorf("Incorrect entry URL, got: %s", feed.Entries[0].URL)
  216. }
  217. }
  218. func TestParseRDF(t *testing.T) {
  219. data := `<?xml version="1.0" encoding="utf-8"?>
  220. <rdf:RDF
  221. xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
  222. xmlns="http://purl.org/rss/1.0/"
  223. >
  224. <channel>
  225. <title>RDF Example</title>
  226. <link>http://example.org/</link>
  227. </channel>
  228. <item>
  229. <title>Title</title>
  230. <link>http://example.org/item</link>
  231. <description>Test</description>
  232. </item>
  233. </rdf:RDF>`
  234. feed, err := ParseFeed("http://example.org/", strings.NewReader(data))
  235. if err != nil {
  236. t.Error(err)
  237. }
  238. if feed.Title != "RDF Example" {
  239. t.Errorf("Incorrect title, got: %s", feed.Title)
  240. }
  241. }
  242. func TestParseRDFWithRelativeURL(t *testing.T) {
  243. data := `<?xml version="1.0" encoding="utf-8"?>
  244. <rdf:RDF
  245. xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
  246. xmlns="http://purl.org/rss/1.0/"
  247. >
  248. <channel>
  249. <title>RDF Example</title>
  250. <link>/blog</link>
  251. </channel>
  252. <item>
  253. <title>Title</title>
  254. <link>/blog/article.html</link>
  255. <description>Test</description>
  256. </item>
  257. </rdf:RDF>`
  258. feed, err := ParseFeed("http://example.org/rdf.xml", strings.NewReader(data))
  259. if err != nil {
  260. t.Error(err)
  261. }
  262. if feed.FeedURL != "http://example.org/rdf.xml" {
  263. t.Errorf("Incorrect feed URL, got: %s", feed.FeedURL)
  264. }
  265. if feed.SiteURL != "http://example.org/blog" {
  266. t.Errorf("Incorrect site URL, got: %s", feed.SiteURL)
  267. }
  268. if feed.Entries[0].URL != "http://example.org/blog/article.html" {
  269. t.Errorf("Incorrect entry URL, got: %s", feed.Entries[0].URL)
  270. }
  271. }
  272. func TestParseJson(t *testing.T) {
  273. data := `{
  274. "version": "https://jsonfeed.org/version/1",
  275. "title": "My Example Feed",
  276. "home_page_url": "https://example.org/",
  277. "feed_url": "https://example.org/feed.json",
  278. "items": [
  279. {
  280. "id": "2",
  281. "content_text": "This is a second item.",
  282. "url": "https://example.org/second-item"
  283. },
  284. {
  285. "id": "1",
  286. "content_html": "<p>Hello, world!</p>",
  287. "url": "https://example.org/initial-post"
  288. }
  289. ]
  290. }`
  291. feed, err := ParseFeed("https://example.org/feed.json", strings.NewReader(data))
  292. if err != nil {
  293. t.Error(err)
  294. }
  295. if feed.Title != "My Example Feed" {
  296. t.Errorf("Incorrect title, got: %s", feed.Title)
  297. }
  298. }
  299. func TestParseJsonFeedWithRelativeURL(t *testing.T) {
  300. data := `{
  301. "version": "https://jsonfeed.org/version/1",
  302. "title": "My Example Feed",
  303. "home_page_url": "/blog",
  304. "feed_url": "/blog/feed.json",
  305. "items": [
  306. {
  307. "id": "2",
  308. "content_text": "This is a second item.",
  309. "url": "/blog/article.html"
  310. }
  311. ]
  312. }`
  313. feed, err := ParseFeed("https://example.org/blog/feed.json", strings.NewReader(data))
  314. if err != nil {
  315. t.Error(err)
  316. }
  317. if feed.Title != "My Example Feed" {
  318. t.Errorf("Incorrect title, got: %s", feed.Title)
  319. }
  320. if feed.FeedURL != "https://example.org/blog/feed.json" {
  321. t.Errorf("Incorrect feed URL, got: %s", feed.FeedURL)
  322. }
  323. if feed.SiteURL != "https://example.org/blog" {
  324. t.Errorf("Incorrect site URL, got: %s", feed.SiteURL)
  325. }
  326. if feed.Entries[0].URL != "https://example.org/blog/article.html" {
  327. t.Errorf("Incorrect entry URL, got: %s", feed.Entries[0].URL)
  328. }
  329. }
  330. func TestParseUnknownFeed(t *testing.T) {
  331. data := `
  332. <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
  333. <html xmlns="http://www.w3.org/1999/xhtml">
  334. <head>
  335. <title>Title of document</title>
  336. </head>
  337. <body>
  338. some content
  339. </body>
  340. </html>
  341. `
  342. _, err := ParseFeed("https://example.org/", strings.NewReader(data))
  343. if err == nil {
  344. t.Error("ParseFeed must returns an error")
  345. }
  346. }
  347. func TestParseEmptyFeed(t *testing.T) {
  348. _, err := ParseFeed("", strings.NewReader(""))
  349. if err == nil {
  350. t.Error("ParseFeed must returns an error")
  351. }
  352. }