truncate_test.go 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package sanitizer
  4. import (
  5. "os"
  6. "strconv"
  7. "testing"
  8. )
  9. func TestTruncateHTML(t *testing.T) {
  10. tests := []struct {
  11. name string
  12. input string
  13. maxLen int
  14. expected string
  15. }{
  16. {
  17. name: "text lower than limit",
  18. input: "This is a <strong>bug 🐛</strong>.",
  19. maxLen: 50,
  20. expected: "This is a bug 🐛.",
  21. },
  22. {
  23. name: "text above limit",
  24. input: "This is <strong>HTML</strong>.",
  25. maxLen: 4,
  26. expected: "This…",
  27. },
  28. {
  29. name: "unicode text above limit",
  30. input: "This is a <strong>bike 🚲</strong>.",
  31. maxLen: 4,
  32. expected: "This…",
  33. },
  34. {
  35. name: "multiline text above limit",
  36. input: "\n\t\tThis is a <strong>bike\n\t\t🚲</strong>.\n\n\t",
  37. maxLen: 15,
  38. expected: "This is a bike…",
  39. },
  40. {
  41. name: "multiline text lower than limit",
  42. input: "\n\t\tThis is a <strong>bike\n 🚲</strong>.\n\n\t",
  43. maxLen: 20,
  44. expected: "This is a bike 🚲.",
  45. },
  46. {
  47. name: "multiple spaces",
  48. input: "hello world test",
  49. maxLen: 20,
  50. expected: "hello world test",
  51. },
  52. {
  53. name: "tabs and newlines",
  54. input: "hello\t\tworld\n\ntest",
  55. maxLen: 20,
  56. expected: "hello world test",
  57. },
  58. {
  59. name: "truncation with unicode",
  60. input: "hello world 你好",
  61. maxLen: 11,
  62. expected: "hello world…",
  63. },
  64. {
  65. name: "html stripping",
  66. input: "<p>hello <b>world</b> test</p>",
  67. maxLen: 20,
  68. expected: "hello world test",
  69. },
  70. {
  71. name: "no truncation needed",
  72. input: "hello world",
  73. maxLen: 20,
  74. expected: "hello world",
  75. },
  76. {
  77. name: "just enough characters",
  78. input: "Hello",
  79. maxLen: 5,
  80. expected: "Hello",
  81. },
  82. {
  83. name: "just enough unicode characters",
  84. input: "Привет",
  85. maxLen: 6,
  86. expected: "Привет",
  87. },
  88. {
  89. name: "spaces around tag",
  90. input: "hello <br/> world",
  91. maxLen: 20,
  92. expected: "hello world",
  93. },
  94. {
  95. name: "leading spaces",
  96. input: " hello world",
  97. maxLen: 5,
  98. expected: "hello…",
  99. },
  100. {
  101. name: "text above limit with space at the end",
  102. input: "hello world",
  103. maxLen: 6,
  104. expected: "hello…",
  105. },
  106. {
  107. name: "leading space before tag",
  108. input: " <a>hello</a>",
  109. maxLen: 15,
  110. expected: "hello",
  111. },
  112. {
  113. name: "space-only tokens in between tags",
  114. input: "hello <br/>\t<a> </a>world",
  115. maxLen: 15,
  116. expected: "hello world",
  117. },
  118. {
  119. name: "truncate mid-word",
  120. input: "hello world",
  121. maxLen: 8,
  122. expected: "hello wo…",
  123. },
  124. {
  125. name: "truncate mid-word with unicode",
  126. input: "Съешь ещё этих мягких французских булок, да выпей же чаю",
  127. maxLen: 25,
  128. expected: "Съешь ещё этих мягких фра…",
  129. },
  130. {
  131. name: "negative limit",
  132. input: "whatever",
  133. maxLen: -10,
  134. expected: "…",
  135. },
  136. {
  137. name: "zero limit",
  138. input: "whatever",
  139. maxLen: 0,
  140. expected: "…",
  141. },
  142. }
  143. for _, tt := range tests {
  144. t.Run(tt.name, func(t *testing.T) {
  145. result := TruncateHTML(tt.input, tt.maxLen)
  146. if result != tt.expected {
  147. t.Errorf("TruncateHTML(%q, %d) = %q, want %q",
  148. tt.input, tt.maxLen, result, tt.expected)
  149. }
  150. })
  151. }
  152. }
  153. func BenchmarkTruncateHTML(b *testing.B) {
  154. benches := []struct {
  155. filename string
  156. limit int
  157. }{
  158. {
  159. filename: "miniflux_github.html",
  160. limit: 100,
  161. },
  162. {
  163. filename: "miniflux_github.html",
  164. limit: 10_000,
  165. },
  166. {
  167. filename: "miniflux_wikipedia.html",
  168. limit: 100,
  169. },
  170. {
  171. filename: "miniflux_wikipedia.html",
  172. limit: 100_000,
  173. },
  174. }
  175. for _, f := range benches {
  176. data, err := os.ReadFile("testdata/" + f.filename)
  177. if err != nil {
  178. b.Fatalf(`Unable to read file %q: %v`, f.filename, err)
  179. }
  180. b.Run(f.filename+"_"+strconv.Itoa(f.limit), func(b *testing.B) {
  181. var junk string
  182. str := string(data)
  183. for b.Loop() {
  184. junk = TruncateHTML(str, 100)
  185. }
  186. _ = junk
  187. })
  188. }
  189. }