readability_test.go 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163
// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
  3. package readability // import "miniflux.app/v2/internal/reader/readability"
  4. import (
  5. "strings"
  6. "testing"
  7. )
  8. func TestBaseURL(t *testing.T) {
  9. html := `
  10. <html>
  11. <head>
  12. <base href="https://example.org/ ">
  13. </head>
  14. <body>
  15. <article>
  16. Some content
  17. </article>
  18. </body>
  19. </html>`
  20. baseURL, _, err := ExtractContent(strings.NewReader(html))
  21. if err != nil {
  22. t.Fatal(err)
  23. }
  24. if baseURL != "https://example.org/" {
  25. t.Errorf(`Unexpected base URL, got %q instead of "https://example.org/"`, baseURL)
  26. }
  27. }
  28. func TestMultipleBaseURL(t *testing.T) {
  29. html := `
  30. <html>
  31. <head>
  32. <base href="https://example.org/ ">
  33. <base href="https://example.com/ ">
  34. </head>
  35. <body>
  36. <article>
  37. Some content
  38. </article>
  39. </body>
  40. </html>`
  41. baseURL, _, err := ExtractContent(strings.NewReader(html))
  42. if err != nil {
  43. t.Fatal(err)
  44. }
  45. if baseURL != "https://example.org/" {
  46. t.Errorf(`Unexpected base URL, got %q instead of "https://example.org/"`, baseURL)
  47. }
  48. }
  49. func TestRelativeBaseURL(t *testing.T) {
  50. html := `
  51. <html>
  52. <head>
  53. <base href="/test/ ">
  54. </head>
  55. <body>
  56. <article>
  57. Some content
  58. </article>
  59. </body>
  60. </html>`
  61. baseURL, _, err := ExtractContent(strings.NewReader(html))
  62. if err != nil {
  63. t.Fatal(err)
  64. }
  65. if baseURL != "" {
  66. t.Errorf(`Unexpected base URL, got %q`, baseURL)
  67. }
  68. }
  69. func TestWithoutBaseURL(t *testing.T) {
  70. html := `
  71. <html>
  72. <head>
  73. <title>Test</title>
  74. </head>
  75. <body>
  76. <article>
  77. Some content
  78. </article>
  79. </body>
  80. </html>`
  81. baseURL, _, err := ExtractContent(strings.NewReader(html))
  82. if err != nil {
  83. t.Fatal(err)
  84. }
  85. if baseURL != "" {
  86. t.Errorf(`Unexpected base URL, got %q instead of ""`, baseURL)
  87. }
  88. }
  89. func TestRemoveStyleScript(t *testing.T) {
  90. html := `
  91. <html>
  92. <head>
  93. <title>Test</title>
  94. <script src="tololo.js"></script>
  95. </head>
  96. <body>
  97. <script src="tololo.js"></script>
  98. <style>
  99. h1 {color:red;}
  100. p {color:blue;}
  101. </style>
  102. <article>Some content</article>
  103. </body>
  104. </html>`
  105. want := `<div><div><article>Somecontent</article></div></div>`
  106. _, content, err := ExtractContent(strings.NewReader(html))
  107. if err != nil {
  108. t.Fatal(err)
  109. }
  110. content = strings.ReplaceAll(content, "\n", "")
  111. content = strings.ReplaceAll(content, " ", "")
  112. content = strings.ReplaceAll(content, "\t", "")
  113. if content != want {
  114. t.Errorf(`Invalid content, got %s instead of %s`, content, want)
  115. }
  116. }
  117. func TestRemoveBlacklist(t *testing.T) {
  118. html := `
  119. <html>
  120. <head>
  121. <title>Test</title>
  122. </head>
  123. <body>
  124. <article class="super-ad">Some content</article>
  125. <article class="g-plus-crap">Some other thing</article>
  126. <article class="stuff popupbody">And more</article>
  127. <article class="legit">Valid!</article>
  128. </body>
  129. </html>`
  130. want := `<div><div><articleclass="legit">Valid!</article></div></div>`
  131. _, content, err := ExtractContent(strings.NewReader(html))
  132. if err != nil {
  133. t.Fatal(err)
  134. }
  135. content = strings.ReplaceAll(content, "\n", "")
  136. content = strings.ReplaceAll(content, " ", "")
  137. content = strings.ReplaceAll(content, "\t", "")
  138. if content != want {
  139. t.Errorf(`Invalid content, got %s instead of %s`, content, want)
  140. }
  141. }