readability_test.go 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package readability // import "miniflux.app/v2/internal/reader/readability"
  4. import (
  5. "bytes"
  6. "os"
  7. "strings"
  8. "testing"
  9. )
  10. func TestBaseURL(t *testing.T) {
  11. html := `
  12. <html>
  13. <head>
  14. <base href="https://example.org/ ">
  15. </head>
  16. <body>
  17. <article>
  18. Some content
  19. </article>
  20. </body>
  21. </html>`
  22. baseURL, _, err := ExtractContent(strings.NewReader(html))
  23. if err != nil {
  24. t.Fatal(err)
  25. }
  26. if baseURL != "https://example.org/" {
  27. t.Errorf(`Unexpected base URL, got %q instead of "https://example.org/"`, baseURL)
  28. }
  29. }
  30. func TestMultipleBaseURL(t *testing.T) {
  31. html := `
  32. <html>
  33. <head>
  34. <base href="https://example.org/ ">
  35. <base href="https://example.com/ ">
  36. </head>
  37. <body>
  38. <article>
  39. Some content
  40. </article>
  41. </body>
  42. </html>`
  43. baseURL, _, err := ExtractContent(strings.NewReader(html))
  44. if err != nil {
  45. t.Fatal(err)
  46. }
  47. if baseURL != "https://example.org/" {
  48. t.Errorf(`Unexpected base URL, got %q instead of "https://example.org/"`, baseURL)
  49. }
  50. }
  51. func TestRelativeBaseURL(t *testing.T) {
  52. html := `
  53. <html>
  54. <head>
  55. <base href="/test/ ">
  56. </head>
  57. <body>
  58. <article>
  59. Some content
  60. </article>
  61. </body>
  62. </html>`
  63. baseURL, _, err := ExtractContent(strings.NewReader(html))
  64. if err != nil {
  65. t.Fatal(err)
  66. }
  67. if baseURL != "" {
  68. t.Errorf(`Unexpected base URL, got %q`, baseURL)
  69. }
  70. }
  71. func TestWithoutBaseURL(t *testing.T) {
  72. html := `
  73. <html>
  74. <head>
  75. <title>Test</title>
  76. </head>
  77. <body>
  78. <article>
  79. Some content
  80. </article>
  81. </body>
  82. </html>`
  83. baseURL, _, err := ExtractContent(strings.NewReader(html))
  84. if err != nil {
  85. t.Fatal(err)
  86. }
  87. if baseURL != "" {
  88. t.Errorf(`Unexpected base URL, got %q instead of ""`, baseURL)
  89. }
  90. }
  91. func TestRemoveStyleScript(t *testing.T) {
  92. html := `
  93. <html>
  94. <head>
  95. <title>Test</title>
  96. <script src="tololo.js"></script>
  97. </head>
  98. <body>
  99. <script src="tololo.js"></script>
  100. <style>
  101. h1 {color:red;}
  102. p {color:blue;}
  103. </style>
  104. <article>Some content</article>
  105. </body>
  106. </html>`
  107. want := `<div><div><article>Somecontent</article></div></div>`
  108. _, content, err := ExtractContent(strings.NewReader(html))
  109. if err != nil {
  110. t.Fatal(err)
  111. }
  112. content = strings.ReplaceAll(content, "\n", "")
  113. content = strings.ReplaceAll(content, " ", "")
  114. content = strings.ReplaceAll(content, "\t", "")
  115. if content != want {
  116. t.Errorf(`Invalid content, got %s instead of %s`, content, want)
  117. }
  118. }
  119. func TestRemoveBlacklist(t *testing.T) {
  120. html := `
  121. <html>
  122. <head>
  123. <title>Test</title>
  124. </head>
  125. <body>
  126. <article class="super-ad">Some content</article>
  127. <article class="g-plus-crap">Some other thing</article>
  128. <article class="stuff popupbody">And more</article>
  129. <article class="legit">Valid!</article>
  130. </body>
  131. </html>`
  132. want := `<div><div><articleclass="legit">Valid!</article></div></div>`
  133. _, content, err := ExtractContent(strings.NewReader(html))
  134. if err != nil {
  135. t.Fatal(err)
  136. }
  137. content = strings.ReplaceAll(content, "\n", "")
  138. content = strings.ReplaceAll(content, " ", "")
  139. content = strings.ReplaceAll(content, "\t", "")
  140. if content != want {
  141. t.Errorf(`Invalid content, got %s instead of %s`, content, want)
  142. }
  143. }
  144. func TestNestedSpanInCodeBlock(t *testing.T) {
  145. html := `
  146. <html>
  147. <head>
  148. <title>Test</title>
  149. </head>
  150. <body>
  151. <article><p>Some content</p><pre><code class="hljs-built_in">Code block with <span class="hljs-built_in">nested span</span> <span class="hljs-comment"># exit 1</span></code></pre></article>
  152. </body>
  153. </html>`
  154. want := `<div><div><p>Some content</p><pre><code class="hljs-built_in">Code block with <span class="hljs-built_in">nested span</span> <span class="hljs-comment"># exit 1</span></code></pre></div></div>`
  155. _, result, err := ExtractContent(strings.NewReader(html))
  156. if err != nil {
  157. t.Fatal(err)
  158. }
  159. if result != want {
  160. t.Errorf(`Invalid content, got %s instead of %s`, result, want)
  161. }
  162. }
  163. func BenchmarkExtractContent(b *testing.B) {
  164. var testCases = map[string][]byte{
  165. "miniflux_github.html": {},
  166. "miniflux_wikipedia.html": {},
  167. }
  168. for filename := range testCases {
  169. data, err := os.ReadFile("testdata/" + filename)
  170. if err != nil {
  171. b.Fatalf(`Unable to read file %q: %v`, filename, err)
  172. }
  173. testCases[filename] = data
  174. }
  175. for range b.N {
  176. for _, v := range testCases {
  177. ExtractContent(bytes.NewReader(v))
  178. }
  179. }
  180. }