parser_test.go 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351
  1. // Copyright 2017 Frédéric Guillot. All rights reserved.
  2. // Use of this source code is governed by the Apache 2.0
  3. // license that can be found in the LICENSE file.
  4. package parser // import "miniflux.app/reader/parser"
  5. import (
  6. "bytes"
  7. "os"
  8. "testing"
  9. "miniflux.app/http/client"
  10. )
  11. func TestParseAtom(t *testing.T) {
  12. data := `<?xml version="1.0" encoding="utf-8"?>
  13. <feed xmlns="http://www.w3.org/2005/Atom">
  14. <title>Example Feed</title>
  15. <link href="http://example.org/"/>
  16. <updated>2003-12-13T18:30:02Z</updated>
  17. <author>
  18. <name>John Doe</name>
  19. </author>
  20. <id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id>
  21. <entry>
  22. <title>Atom-Powered Robots Run Amok</title>
  23. <link href="http://example.org/2003/12/13/atom03"/>
  24. <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
  25. <updated>2003-12-13T18:30:02Z</updated>
  26. <summary>Some text.</summary>
  27. </entry>
  28. </feed>`
  29. feed, err := ParseFeed("https://example.org/", data)
  30. if err != nil {
  31. t.Error(err)
  32. }
  33. if feed.Title != "Example Feed" {
  34. t.Errorf("Incorrect title, got: %s", feed.Title)
  35. }
  36. }
  37. func TestParseAtomFeedWithRelativeURL(t *testing.T) {
  38. data := `<?xml version="1.0" encoding="utf-8"?>
  39. <feed xmlns="http://www.w3.org/2005/Atom">
  40. <title>Example Feed</title>
  41. <link href="/blog/atom.xml" rel="self" type="application/atom+xml"/>
  42. <link href="/blog"/>
  43. <entry>
  44. <title>Test</title>
  45. <link href="/blog/article.html"/>
  46. <link href="/blog/article.html" rel="alternate" type="text/html"/>
  47. <id>/blog/article.html</id>
  48. <updated>2003-12-13T18:30:02Z</updated>
  49. <summary>Some text.</summary>
  50. </entry>
  51. </feed>`
  52. feed, err := ParseFeed("https://example.org/blog/atom.xml", data)
  53. if err != nil {
  54. t.Fatal(err)
  55. }
  56. if feed.FeedURL != "https://example.org/blog/atom.xml" {
  57. t.Errorf("Incorrect feed URL, got: %s", feed.FeedURL)
  58. }
  59. if feed.SiteURL != "https://example.org/blog" {
  60. t.Errorf("Incorrect site URL, got: %s", feed.SiteURL)
  61. }
  62. if feed.Entries[0].URL != "https://example.org/blog/article.html" {
  63. t.Errorf("Incorrect entry URL, got: %s", feed.Entries[0].URL)
  64. }
  65. }
  66. func TestParseRSS(t *testing.T) {
  67. data := `<?xml version="1.0"?>
  68. <rss version="2.0">
  69. <channel>
  70. <title>Liftoff News</title>
  71. <link>http://liftoff.msfc.nasa.gov/</link>
  72. <item>
  73. <title>Star City</title>
  74. <link>http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp</link>
  75. <description>How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia's &lt;a href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm"&gt;Star City&lt;/a&gt;.</description>
  76. <pubDate>Tue, 03 Jun 2003 09:39:21 GMT</pubDate>
  77. <guid>http://liftoff.msfc.nasa.gov/2003/06/03.html#item573</guid>
  78. </item>
  79. </channel>
  80. </rss>`
  81. feed, err := ParseFeed("http://liftoff.msfc.nasa.gov/", data)
  82. if err != nil {
  83. t.Error(err)
  84. }
  85. if feed.Title != "Liftoff News" {
  86. t.Errorf("Incorrect title, got: %s", feed.Title)
  87. }
  88. }
  89. func TestParseRSSFeedWithRelativeURL(t *testing.T) {
  90. data := `<?xml version="1.0"?>
  91. <rss version="2.0">
  92. <channel>
  93. <title>Example Feed</title>
  94. <link>/blog</link>
  95. <item>
  96. <title>Example Entry</title>
  97. <link>/blog/article.html</link>
  98. <description>Something</description>
  99. <pubDate>Tue, 03 Jun 2003 09:39:21 GMT</pubDate>
  100. <guid>1234</guid>
  101. </item>
  102. </channel>
  103. </rss>`
  104. feed, err := ParseFeed("http://example.org/rss.xml", data)
  105. if err != nil {
  106. t.Error(err)
  107. }
  108. if feed.Title != "Example Feed" {
  109. t.Errorf("Incorrect title, got: %s", feed.Title)
  110. }
  111. if feed.FeedURL != "http://example.org/rss.xml" {
  112. t.Errorf("Incorrect feed URL, got: %s", feed.FeedURL)
  113. }
  114. if feed.SiteURL != "http://example.org/blog" {
  115. t.Errorf("Incorrect site URL, got: %s", feed.SiteURL)
  116. }
  117. if feed.Entries[0].URL != "http://example.org/blog/article.html" {
  118. t.Errorf("Incorrect entry URL, got: %s", feed.Entries[0].URL)
  119. }
  120. }
  121. func TestParseRDF(t *testing.T) {
  122. data := `<?xml version="1.0" encoding="utf-8"?>
  123. <rdf:RDF
  124. xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
  125. xmlns="http://purl.org/rss/1.0/"
  126. >
  127. <channel>
  128. <title>RDF Example</title>
  129. <link>http://example.org/</link>
  130. </channel>
  131. <item>
  132. <title>Title</title>
  133. <link>http://example.org/item</link>
  134. <description>Test</description>
  135. </item>
  136. </rdf:RDF>`
  137. feed, err := ParseFeed("http://example.org/", data)
  138. if err != nil {
  139. t.Error(err)
  140. }
  141. if feed.Title != "RDF Example" {
  142. t.Errorf("Incorrect title, got: %s", feed.Title)
  143. }
  144. }
  145. func TestParseRDFWithRelativeURL(t *testing.T) {
  146. data := `<?xml version="1.0" encoding="utf-8"?>
  147. <rdf:RDF
  148. xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
  149. xmlns="http://purl.org/rss/1.0/"
  150. >
  151. <channel>
  152. <title>RDF Example</title>
  153. <link>/blog</link>
  154. </channel>
  155. <item>
  156. <title>Title</title>
  157. <link>/blog/article.html</link>
  158. <description>Test</description>
  159. </item>
  160. </rdf:RDF>`
  161. feed, err := ParseFeed("http://example.org/rdf.xml", data)
  162. if err != nil {
  163. t.Error(err)
  164. }
  165. if feed.FeedURL != "http://example.org/rdf.xml" {
  166. t.Errorf("Incorrect feed URL, got: %s", feed.FeedURL)
  167. }
  168. if feed.SiteURL != "http://example.org/blog" {
  169. t.Errorf("Incorrect site URL, got: %s", feed.SiteURL)
  170. }
  171. if feed.Entries[0].URL != "http://example.org/blog/article.html" {
  172. t.Errorf("Incorrect entry URL, got: %s", feed.Entries[0].URL)
  173. }
  174. }
  175. func TestParseJson(t *testing.T) {
  176. data := `{
  177. "version": "https://jsonfeed.org/version/1",
  178. "title": "My Example Feed",
  179. "home_page_url": "https://example.org/",
  180. "feed_url": "https://example.org/feed.json",
  181. "items": [
  182. {
  183. "id": "2",
  184. "content_text": "This is a second item.",
  185. "url": "https://example.org/second-item"
  186. },
  187. {
  188. "id": "1",
  189. "content_html": "<p>Hello, world!</p>",
  190. "url": "https://example.org/initial-post"
  191. }
  192. ]
  193. }`
  194. feed, err := ParseFeed("https://example.org/feed.json", data)
  195. if err != nil {
  196. t.Error(err)
  197. }
  198. if feed.Title != "My Example Feed" {
  199. t.Errorf("Incorrect title, got: %s", feed.Title)
  200. }
  201. }
  202. func TestParseJsonFeedWithRelativeURL(t *testing.T) {
  203. data := `{
  204. "version": "https://jsonfeed.org/version/1",
  205. "title": "My Example Feed",
  206. "home_page_url": "/blog",
  207. "feed_url": "/blog/feed.json",
  208. "items": [
  209. {
  210. "id": "2",
  211. "content_text": "This is a second item.",
  212. "url": "/blog/article.html"
  213. }
  214. ]
  215. }`
  216. feed, err := ParseFeed("https://example.org/blog/feed.json", data)
  217. if err != nil {
  218. t.Error(err)
  219. }
  220. if feed.Title != "My Example Feed" {
  221. t.Errorf("Incorrect title, got: %s", feed.Title)
  222. }
  223. if feed.FeedURL != "https://example.org/blog/feed.json" {
  224. t.Errorf("Incorrect feed URL, got: %s", feed.FeedURL)
  225. }
  226. if feed.SiteURL != "https://example.org/blog" {
  227. t.Errorf("Incorrect site URL, got: %s", feed.SiteURL)
  228. }
  229. if feed.Entries[0].URL != "https://example.org/blog/article.html" {
  230. t.Errorf("Incorrect entry URL, got: %s", feed.Entries[0].URL)
  231. }
  232. }
  233. func TestParseUnknownFeed(t *testing.T) {
  234. data := `
  235. <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
  236. <html xmlns="http://www.w3.org/1999/xhtml">
  237. <head>
  238. <title>Title of document</title>
  239. </head>
  240. <body>
  241. some content
  242. </body>
  243. </html>
  244. `
  245. _, err := ParseFeed("https://example.org/", data)
  246. if err == nil {
  247. t.Error("ParseFeed must returns an error")
  248. }
  249. }
  250. func TestParseEmptyFeed(t *testing.T) {
  251. _, err := ParseFeed("", "")
  252. if err == nil {
  253. t.Error("ParseFeed must returns an error")
  254. }
  255. }
  256. func TestDifferentEncodingWithResponse(t *testing.T) {
  257. var unicodeTestCases = []struct {
  258. filename, contentType string
  259. index int
  260. title string
  261. }{
  262. // Arabic language encoded in UTF-8.
  263. {"urdu_UTF8.xml", "text/xml; charset=utf-8", 0, "امریکی عسکری امداد کی بندش کی وجوہات: انڈیا سے جنگ، جوہری پروگرام اور اب دہشت گردوں کی پشت پناہی"},
  264. // Windows-1251 encoding and not charset in HTTP header.
  265. {"encoding_WINDOWS-1251.xml", "text/xml", 0, "Цитата #17703"},
  266. // No encoding in XML, but defined in HTTP Content-Type header.
  267. {"no_encoding_ISO-8859-1.xml", "application/xml; charset=ISO-8859-1", 2, "La criminalité liée surtout à... l'ennui ?"},
  268. // ISO-8859-1 encoding defined in XML and HTTP header.
  269. {"encoding_ISO-8859-1.xml", "application/rss+xml; charset=ISO-8859-1", 5, "Projekt Jedi: Microsoft will weiter mit US-Militär zusammenarbeiten"},
  270. // UTF-8 encoding defined in RDF document and HTTP header.
  271. {"rdf_UTF8.xml", "application/rss+xml; charset=utf-8", 1, "Mega-Deal: IBM übernimmt Red Hat"},
  272. // UTF-8 encoding defined only in RDF document.
  273. {"rdf_UTF8.xml", "application/rss+xml", 1, "Mega-Deal: IBM übernimmt Red Hat"},
  274. }
  275. for _, tc := range unicodeTestCases {
  276. content, err := os.ReadFile("testdata/" + tc.filename)
  277. if err != nil {
  278. t.Fatalf(`Unable to read file %q: %v`, tc.filename, err)
  279. }
  280. r := &client.Response{Body: bytes.NewReader(content), ContentType: tc.contentType}
  281. if encodingErr := r.EnsureUnicodeBody(); encodingErr != nil {
  282. t.Fatalf(`Encoding error for %q: %v`, tc.filename, encodingErr)
  283. }
  284. feed, parseErr := ParseFeed("https://example.org/", r.BodyAsString())
  285. if parseErr != nil {
  286. t.Fatalf(`Parsing error for %q - %q: %v`, tc.filename, tc.contentType, parseErr)
  287. }
  288. if feed.Entries[tc.index].Title != tc.title {
  289. t.Errorf(`Unexpected title, got %q instead of %q`, feed.Entries[tc.index].Title, tc.title)
  290. }
  291. }
  292. }