readability_test.go 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package readability // import "miniflux.app/v2/internal/reader/readability"
  4. import (
  5. "bytes"
  6. "os"
  7. "strings"
  8. "testing"
  9. "github.com/PuerkitoBio/goquery"
  10. )
  11. func TestBaseURL(t *testing.T) {
  12. html := `
  13. <html>
  14. <head>
  15. <base href="https://example.org/ ">
  16. </head>
  17. <body>
  18. <article>
  19. Some content
  20. </article>
  21. </body>
  22. </html>`
  23. baseURL, _, err := ExtractContent(strings.NewReader(html))
  24. if err != nil {
  25. t.Fatal(err)
  26. }
  27. if baseURL != "https://example.org/" {
  28. t.Errorf(`Unexpected base URL, got %q instead of "https://example.org/"`, baseURL)
  29. }
  30. }
  31. func TestMultipleBaseURL(t *testing.T) {
  32. html := `
  33. <html>
  34. <head>
  35. <base href="https://example.org/ ">
  36. <base href="https://example.com/ ">
  37. </head>
  38. <body>
  39. <article>
  40. Some content
  41. </article>
  42. </body>
  43. </html>`
  44. baseURL, _, err := ExtractContent(strings.NewReader(html))
  45. if err != nil {
  46. t.Fatal(err)
  47. }
  48. if baseURL != "https://example.org/" {
  49. t.Errorf(`Unexpected base URL, got %q instead of "https://example.org/"`, baseURL)
  50. }
  51. }
  52. func TestRelativeBaseURL(t *testing.T) {
  53. html := `
  54. <html>
  55. <head>
  56. <base href="/test/ ">
  57. </head>
  58. <body>
  59. <article>
  60. Some content
  61. </article>
  62. </body>
  63. </html>`
  64. baseURL, _, err := ExtractContent(strings.NewReader(html))
  65. if err != nil {
  66. t.Fatal(err)
  67. }
  68. if baseURL != "" {
  69. t.Errorf(`Unexpected base URL, got %q`, baseURL)
  70. }
  71. }
  72. func TestWithoutBaseURL(t *testing.T) {
  73. html := `
  74. <html>
  75. <head>
  76. <title>Test</title>
  77. </head>
  78. <body>
  79. <article>
  80. Some content
  81. </article>
  82. </body>
  83. </html>`
  84. baseURL, _, err := ExtractContent(strings.NewReader(html))
  85. if err != nil {
  86. t.Fatal(err)
  87. }
  88. if baseURL != "" {
  89. t.Errorf(`Unexpected base URL, got %q instead of ""`, baseURL)
  90. }
  91. }
  92. func TestRemoveStyleScript(t *testing.T) {
  93. html := `
  94. <html>
  95. <head>
  96. <title>Test</title>
  97. <script src="tololo.js"></script>
  98. </head>
  99. <body>
  100. <script src="tololo.js"></script>
  101. <style>
  102. h1 {color:red;}
  103. p {color:blue;}
  104. </style>
  105. <article>Some content</article>
  106. </body>
  107. </html>`
  108. want := `<div><div><article>Somecontent</article></div></div>`
  109. _, content, err := ExtractContent(strings.NewReader(html))
  110. if err != nil {
  111. t.Fatal(err)
  112. }
  113. content = strings.ReplaceAll(content, "\n", "")
  114. content = strings.ReplaceAll(content, " ", "")
  115. content = strings.ReplaceAll(content, "\t", "")
  116. if content != want {
  117. t.Errorf(`Invalid content, got %s instead of %s`, content, want)
  118. }
  119. }
  120. func TestRemoveBlacklist(t *testing.T) {
  121. html := `
  122. <html>
  123. <head>
  124. <title>Test</title>
  125. </head>
  126. <body>
  127. <article class="super-ad">Some content</article>
  128. <article class="g-plus-crap">Some other thing</article>
  129. <article class="stuff popupbody">And more</article>
  130. <article class="legit">Valid!</article>
  131. </body>
  132. </html>`
  133. want := `<div><div><articleclass="legit">Valid!</article></div></div>`
  134. _, content, err := ExtractContent(strings.NewReader(html))
  135. if err != nil {
  136. t.Fatal(err)
  137. }
  138. content = strings.ReplaceAll(content, "\n", "")
  139. content = strings.ReplaceAll(content, " ", "")
  140. content = strings.ReplaceAll(content, "\t", "")
  141. if content != want {
  142. t.Errorf(`Invalid content, got %s instead of %s`, content, want)
  143. }
  144. }
  145. func TestNestedSpanInCodeBlock(t *testing.T) {
  146. html := `
  147. <html>
  148. <head>
  149. <title>Test</title>
  150. </head>
  151. <body>
  152. <article><p>Some content</p><pre><code class="hljs-built_in">Code block with <span class="hljs-built_in">nested span</span> <span class="hljs-comment"># exit 1</span></code></pre></article>
  153. </body>
  154. </html>`
  155. want := `<div><div><p>Some content</p><pre><code class="hljs-built_in">Code block with <span class="hljs-built_in">nested span</span> <span class="hljs-comment"># exit 1</span></code></pre></div></div>`
  156. _, result, err := ExtractContent(strings.NewReader(html))
  157. if err != nil {
  158. t.Fatal(err)
  159. }
  160. if result != want {
  161. t.Errorf(`Invalid content, got %s instead of %s`, result, want)
  162. }
  163. }
  164. func BenchmarkExtractContent(b *testing.B) {
  165. var testCases = map[string][]byte{
  166. "miniflux_github.html": {},
  167. "miniflux_wikipedia.html": {},
  168. }
  169. for filename := range testCases {
  170. data, err := os.ReadFile("testdata/" + filename)
  171. if err != nil {
  172. b.Fatalf(`Unable to read file %q: %v`, filename, err)
  173. }
  174. testCases[filename] = data
  175. }
  176. for range b.N {
  177. for _, v := range testCases {
  178. ExtractContent(bytes.NewReader(v))
  179. }
  180. }
  181. }
  182. func TestGetClassWeight(t *testing.T) {
  183. testCases := []struct {
  184. name string
  185. html string
  186. expected float32
  187. }{
  188. {
  189. name: "no class or id",
  190. html: `<div>content</div>`,
  191. expected: 0,
  192. },
  193. {
  194. name: "positive class only",
  195. html: `<div class="article">content</div>`,
  196. expected: 25,
  197. },
  198. {
  199. name: "negative class only",
  200. html: `<div class="comment">content</div>`,
  201. expected: -25,
  202. },
  203. {
  204. name: "positive id only",
  205. html: `<div id="main">content</div>`,
  206. expected: 25,
  207. },
  208. {
  209. name: "negative id only",
  210. html: `<div id="sidebar">content</div>`,
  211. expected: -25,
  212. },
  213. {
  214. name: "positive class and positive id",
  215. html: `<div class="content" id="main">content</div>`,
  216. expected: 50,
  217. },
  218. {
  219. name: "negative class and negative id",
  220. html: `<div class="comment" id="sidebar">content</div>`,
  221. expected: -50,
  222. },
  223. {
  224. name: "positive class and negative id",
  225. html: `<div class="article" id="comment">content</div>`,
  226. expected: 0,
  227. },
  228. {
  229. name: "negative class and positive id",
  230. html: `<div class="banner" id="content">content</div>`,
  231. expected: 0,
  232. },
  233. {
  234. name: "multiple positive classes",
  235. html: `<div class="article content">content</div>`,
  236. expected: 25,
  237. },
  238. {
  239. name: "multiple negative classes",
  240. html: `<div class="comment sidebar">content</div>`,
  241. expected: -25,
  242. },
  243. {
  244. name: "mixed positive and negative classes",
  245. html: `<div class="article comment">content</div>`,
  246. expected: -25, // negative takes precedence since it's checked first
  247. },
  248. {
  249. name: "case insensitive class",
  250. html: `<div class="ARTICLE">content</div>`,
  251. expected: 25,
  252. },
  253. {
  254. name: "case insensitive id",
  255. html: `<div id="MAIN">content</div>`,
  256. expected: 25,
  257. },
  258. {
  259. name: "non-matching class and id",
  260. html: `<div class="random" id="unknown">content</div>`,
  261. expected: 0,
  262. },
  263. {
  264. name: "empty class and id",
  265. html: `<div class="" id="">content</div>`,
  266. expected: 0,
  267. },
  268. {
  269. name: "class with special characters",
  270. html: `<div class="com-section">content</div>`,
  271. expected: -25, // matches com- in negative regex
  272. },
  273. {
  274. name: "id with special characters",
  275. html: `<div id="h-entry-123">content</div>`,
  276. expected: 25, // matches h-entry in positive regex
  277. },
  278. }
  279. for _, tc := range testCases {
  280. t.Run(tc.name, func(t *testing.T) {
  281. doc, err := goquery.NewDocumentFromReader(strings.NewReader(tc.html))
  282. if err != nil {
  283. t.Fatalf("Failed to parse HTML: %v", err)
  284. }
  285. selection := doc.Find("div").First()
  286. if selection.Length() == 0 {
  287. t.Fatal("No div element found in HTML")
  288. }
  289. result := getClassWeight(selection)
  290. if result != tc.expected {
  291. t.Errorf("Expected weight %f, got %f", tc.expected, result)
  292. }
  293. })
  294. }
  295. }
  296. func TestGetClassWeightRegexPatterns(t *testing.T) {
  297. // Test specific regex patterns used in getClassWeight
  298. positiveWords := []string{"article", "body", "content", "entry", "hentry", "h-entry", "main", "page", "pagination", "post", "text", "blog", "story"}
  299. negativeWords := []string{"hid", "banner", "combx", "comment", "com-", "contact", "foot", "masthead", "media", "meta", "modal", "outbrain", "promo", "related", "scroll", "share", "shoutbox", "sidebar", "skyscraper", "sponsor", "shopping", "tags", "tool", "widget", "byline", "author", "dateline", "writtenby"}
  300. for _, word := range positiveWords {
  301. t.Run("positive_"+word, func(t *testing.T) {
  302. html := `<div class="` + word + `">content</div>`
  303. doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
  304. if err != nil {
  305. t.Fatalf("Failed to parse HTML: %v", err)
  306. }
  307. selection := doc.Find("div").First()
  308. result := getClassWeight(selection)
  309. if result != 25 {
  310. t.Errorf("Expected positive weight 25 for word '%s', got %f", word, result)
  311. }
  312. })
  313. }
  314. for _, word := range negativeWords {
  315. t.Run("negative_"+word, func(t *testing.T) {
  316. html := `<div class="` + word + `">content</div>`
  317. doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
  318. if err != nil {
  319. t.Fatalf("Failed to parse HTML: %v", err)
  320. }
  321. selection := doc.Find("div").First()
  322. result := getClassWeight(selection)
  323. if result != -25 {
  324. t.Errorf("Expected negative weight -25 for word '%s', got %f", word, result)
  325. }
  326. })
  327. }
  328. }
  329. func TestRemoveUnlikelyCandidates(t *testing.T) {
  330. testCases := []struct {
  331. name string
  332. html string
  333. expected string
  334. }{
  335. {
  336. name: "removes elements with popupbody class",
  337. html: `<html><body><div class="popupbody">popup content</div><div class="content">good content</div></body></html>`,
  338. expected: `<html><head></head><body><div class="content">good content</div></body></html>`,
  339. },
  340. {
  341. name: "removes elements with -ad in class",
  342. html: `<html><body><div class="super-ad">ad content</div><div class="content">good content</div></body></html>`,
  343. expected: `<html><head></head><body><div class="content">good content</div></body></html>`,
  344. },
  345. {
  346. name: "removes elements with g-plus in class",
  347. html: `<html><body><div class="g-plus-share">social content</div><div class="content">good content</div></body></html>`,
  348. expected: `<html><head></head><body><div class="content">good content</div></body></html>`,
  349. },
  350. {
  351. name: "removes elements with unlikely candidates in class",
  352. html: `<html><body><div class="banner">banner</div><div class="sidebar">sidebar</div><div class="content">good content</div></body></html>`,
  353. expected: `<html><head></head><body><div class="content">good content</div></body></html>`,
  354. },
  355. {
  356. name: "preserves elements with unlikely candidates but also good candidates in class",
  357. html: `<html><body><div class="banner article">mixed content</div><div class="content">good content</div></body></html>`,
  358. expected: `<html><head></head><body><div class="banner article">mixed content</div><div class="content">good content</div></body></html>`,
  359. },
  360. {
  361. name: "removes elements with unlikely candidates in id",
  362. html: `<html><body><div id="banner">banner</div><div id="main-content">good content</div></body></html>`,
  363. expected: `<html><head></head><body><div id="main-content">good content</div></body></html>`,
  364. },
  365. {
  366. name: "preserves elements with unlikely candidates but also good candidates in id",
  367. html: `<html><body><div id="comment-article">mixed content</div><div id="main">good content</div></body></html>`,
  368. expected: `<html><head></head><body><div id="comment-article">mixed content</div><div id="main">good content</div></body></html>`,
  369. },
  370. {
  371. name: "preserves html and body tags",
  372. html: `<html class="banner"><body class="sidebar"><div class="banner">content</div></body></html>`,
  373. expected: `<html class="banner"><head></head><body class="sidebar"></body></html>`,
  374. },
  375. {
  376. name: "preserves elements within code blocks",
  377. html: `<html><body><pre><code><span class="banner">code content</span></code></pre><div class="banner">remove this</div></body></html>`,
  378. expected: `<html><head></head><body><pre><code><span class="banner">code content</span></code></pre></body></html>`,
  379. },
  380. {
  381. name: "preserves elements within pre tags",
  382. html: `<html><body><pre><div class="sidebar">preformatted content</div></pre><div class="sidebar">remove this</div></body></html>`,
  383. expected: `<html><head></head><body><pre><div class="sidebar">preformatted content</div></pre></body></html>`,
  384. },
  385. {
  386. name: "case insensitive matching",
  387. html: `<html><body><div class="BANNER">uppercase banner</div><div class="Banner">mixed case banner</div><div class="content">good content</div></body></html>`,
  388. expected: `<html><head></head><body><div class="content">good content</div></body></html>`,
  389. },
  390. {
  391. name: "multiple unlikely patterns in single class",
  392. html: `<html><body><div class="banner sidebar footer">multiple bad</div><div class="content">good content</div></body></html>`,
  393. expected: `<html><head></head><body><div class="content">good content</div></body></html>`,
  394. },
  395. {
  396. name: "elements without class or id are preserved",
  397. html: `<html><body><div>no attributes</div><p>paragraph</p></body></html>`,
  398. expected: `<html><head></head><body><div>no attributes</div><p>paragraph</p></body></html>`,
  399. },
  400. {
  401. name: "removes nested unlikely elements",
  402. html: `<html><body><div class="main"><div class="banner">nested banner</div><p>good content</p></div></body></html>`,
  403. expected: `<html><head></head><body><div class="main"><p>good content</p></div></body></html>`,
  404. },
  405. {
  406. name: "comprehensive unlikely candidates test",
  407. html: `<html><body><div class="breadcrumbs">breadcrumbs</div><div class="combx">combx</div><div class="comment">comment</div><div class="community">community</div><div class="cover-wrap">cover-wrap</div><div class="disqus">disqus</div><div class="extra">extra</div><div class="foot">foot</div><div class="header">header</div><div class="legends">legends</div><div class="menu">menu</div><div class="modal">modal</div><div class="related">related</div><div class="remark">remark</div><div class="replies">replies</div><div class="rss">rss</div><div class="shoutbox">shoutbox</div><div class="skyscraper">skyscraper</div><div class="social">social</div><div class="sponsor">sponsor</div><div class="supplemental">supplemental</div><div class="ad-break">ad-break</div><div class="agegate">agegate</div><div class="pagination">pagination</div><div class="pager">pager</div><div class="popup">popup</div><div class="yom-remote">yom-remote</div><div class="article">good content</div></body></html>`,
  408. expected: `<html><head></head><body><div class="article">good content</div></body></html>`,
  409. },
  410. {
  411. name: "preserves good candidates that contain unlikely words",
  412. html: `<html><body><div class="banner article">should be preserved</div><div class="comment main">should be preserved</div><div class="sidebar body">should be preserved</div><div class="footer column">should be preserved</div><div class="header and">should be preserved</div><div class="menu shadow">should be preserved</div><div class="pure-banner">should be removed</div></body></html>`,
  413. expected: `<html><head></head><body><div class="banner article">should be preserved</div><div class="comment main">should be preserved</div><div class="sidebar body">should be preserved</div><div class="footer column">should be preserved</div><div class="header and">should be preserved</div><div class="menu shadow">should be preserved</div></body></html>`,
  414. },
  415. }
  416. for _, tc := range testCases {
  417. t.Run(tc.name, func(t *testing.T) {
  418. doc, err := goquery.NewDocumentFromReader(strings.NewReader(tc.html))
  419. if err != nil {
  420. t.Fatalf("Failed to parse HTML: %v", err)
  421. }
  422. removeUnlikelyCandidates(doc)
  423. result, err := doc.Html()
  424. if err != nil {
  425. t.Fatalf("Failed to get HTML: %v", err)
  426. }
  427. // Normalize whitespace for comparison
  428. result = strings.TrimSpace(result)
  429. expected := strings.TrimSpace(tc.expected)
  430. if result != expected {
  431. t.Errorf("\nExpected:\n%s\n\nGot:\n%s", expected, result)
  432. }
  433. })
  434. }
  435. }
  436. func TestRemoveUnlikelyCandidatesShouldRemoveFunction(t *testing.T) {
  437. // Test the internal shouldRemove function behavior through the public interface
  438. testCases := []struct {
  439. name string
  440. attr string
  441. attrType string // "class" or "id"
  442. expected bool // true if should be removed
  443. }{
  444. // Special hardcoded cases
  445. {"popupbody in class", "popupbody", "class", true},
  446. {"contains popupbody in class", "main-popupbody-content", "class", true},
  447. {"ad suffix in class", "super-ad", "class", true},
  448. {"ad in middle of class", "pre-ad-post", "class", true},
  449. {"g-plus in class", "g-plus-share", "class", true},
  450. {"contains g-plus in class", "social-g-plus-button", "class", true},
  451. // Unlikely candidates regexp
  452. {"banner class", "banner", "class", true},
  453. {"breadcrumbs class", "breadcrumbs", "class", true},
  454. {"comment class", "comment", "class", true},
  455. {"sidebar class", "sidebar", "class", true},
  456. {"footer class", "footer", "class", true},
  457. // Unlikely candidates with good candidates (should not be removed)
  458. {"banner with article", "banner article", "class", false},
  459. {"comment with main", "comment main", "class", false},
  460. {"sidebar with body", "sidebar body", "class", false},
  461. {"footer with column", "footer column", "class", false},
  462. {"menu with shadow", "menu shadow", "class", false},
  463. // Case insensitive
  464. {"uppercase banner", "BANNER", "class", true},
  465. {"mixed case comment", "Comment", "class", true},
  466. {"uppercase with good", "BANNER ARTICLE", "class", false},
  467. // ID attributes
  468. {"banner id", "banner", "id", true},
  469. {"comment id", "comment", "id", true},
  470. {"banner with article id", "banner article", "id", false},
  471. // Good candidates only
  472. {"article class", "article", "class", false},
  473. {"main class", "main", "class", false},
  474. {"content class", "content", "class", false},
  475. {"body class", "body", "class", false},
  476. // No matches
  477. {"random class", "random-class", "class", false},
  478. {"normal content", "normal-content", "class", false},
  479. {"empty string", "", "class", false},
  480. }
  481. for _, tc := range testCases {
  482. t.Run(tc.name, func(t *testing.T) {
  483. var html string
  484. if tc.attrType == "class" {
  485. html = `<html><body><div class="` + tc.attr + `">content</div></body></html>`
  486. } else {
  487. html = `<html><body><div id="` + tc.attr + `">content</div></body></html>`
  488. }
  489. doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
  490. if err != nil {
  491. t.Fatalf("Failed to parse HTML: %v", err)
  492. }
  493. // Count elements before removal
  494. beforeCount := doc.Find("div").Length()
  495. removeUnlikelyCandidates(doc)
  496. // Count elements after removal
  497. afterCount := doc.Find("div").Length()
  498. wasRemoved := beforeCount > afterCount
  499. if wasRemoved != tc.expected {
  500. t.Errorf("Expected element to be removed: %v, but was removed: %v", tc.expected, wasRemoved)
  501. }
  502. })
  503. }
  504. }
  505. func TestRemoveUnlikelyCandidatesPreservation(t *testing.T) {
  506. testCases := []struct {
  507. name string
  508. html string
  509. description string
  510. }{
  511. {
  512. name: "preserves html tag",
  513. html: `<html class="banner sidebar footer"><body><div>content</div></body></html>`,
  514. description: "HTML tag should never be removed regardless of class",
  515. },
  516. {
  517. name: "preserves body tag",
  518. html: `<html><body class="banner sidebar footer"><div>content</div></body></html>`,
  519. description: "Body tag should never be removed regardless of class",
  520. },
  521. {
  522. name: "preserves elements in pre tags",
  523. html: `<html><body><pre><span class="banner">code</span></pre></body></html>`,
  524. description: "Elements within pre tags should be preserved",
  525. },
  526. {
  527. name: "preserves elements in code tags",
  528. html: `<html><body><code><span class="sidebar">code</span></code></body></html>`,
  529. description: "Elements within code tags should be preserved",
  530. },
  531. {
  532. name: "preserves nested elements in code blocks",
  533. html: `<html><body><pre><code><div class="comment"><span class="banner">nested</span></div></code></pre></body></html>`,
  534. description: "Deeply nested elements in code blocks should be preserved",
  535. },
  536. {
  537. name: "preserves elements in mixed code scenarios",
  538. html: `<html><body><div class="main"><pre><span class="sidebar">code</span></pre><code><div class="banner">more code</div></code></div></body></html>`,
  539. description: "Multiple code block scenarios should work correctly",
  540. },
  541. }
  542. for _, tc := range testCases {
  543. t.Run(tc.name, func(t *testing.T) {
  544. doc, err := goquery.NewDocumentFromReader(strings.NewReader(tc.html))
  545. if err != nil {
  546. t.Fatalf("Failed to parse HTML: %v", err)
  547. }
  548. // Count specific elements before removal
  549. beforeHtml := doc.Find("html").Length()
  550. beforeBody := doc.Find("body").Length()
  551. beforePre := doc.Find("pre").Length()
  552. beforeCode := doc.Find("code").Length()
  553. removeUnlikelyCandidates(doc)
  554. // Count specific elements after removal
  555. afterHtml := doc.Find("html").Length()
  556. afterBody := doc.Find("body").Length()
  557. afterPre := doc.Find("pre").Length()
  558. afterCode := doc.Find("code").Length()
  559. // These elements should always be preserved
  560. if beforeHtml != afterHtml {
  561. t.Errorf("HTML elements were removed: before=%d, after=%d", beforeHtml, afterHtml)
  562. }
  563. if beforeBody != afterBody {
  564. t.Errorf("Body elements were removed: before=%d, after=%d", beforeBody, afterBody)
  565. }
  566. if beforePre != afterPre {
  567. t.Errorf("Pre elements were removed: before=%d, after=%d", beforePre, afterPre)
  568. }
  569. if beforeCode != afterCode {
  570. t.Errorf("Code elements were removed: before=%d, after=%d", beforeCode, afterCode)
  571. }
  572. // Verify that elements within code blocks are preserved
  573. if tc.name == "preserves elements in pre tags" || tc.name == "preserves elements in code tags" || tc.name == "preserves nested elements in code blocks" {
  574. spanInCode := doc.Find("pre span, code span, pre div, code div").Length()
  575. if spanInCode == 0 {
  576. t.Error("Elements within code blocks were incorrectly removed")
  577. }
  578. }
  579. })
  580. }
  581. }