Преглед изворног кода

fix(json): escape plain-text content_text and summary

JSON Feed 1.1 defines content_html as HTML but content_text and summary
as plain text. The adapter stored whichever was present directly in
entry.Content, which is treated as HTML everywhere downstream, so any
markup-like characters in a content_text or summary value (for example
"<tag>") were dropped by the sanitizer.

Escape content_text and summary with html.EscapeString before storing
them, leaving content_html untouched. This mirrors how the Atom 0.3
reader already escapes plain-text constructs.
Saleh пре 2 дана
родитељ
комит
0abd9e7145
2 измењена фајла са 39 додато и 8 уклоњено
  1. 11 8
      internal/reader/json/adapter.go
  2. 28 0
      internal/reader/json/parser_test.go

+ 11 - 8
internal/reader/json/adapter.go

@@ -5,6 +5,7 @@ package json // import "miniflux.app/v2/internal/reader/json"
 
 import (
 	"cmp"
+	"html"
 	"log/slog"
 	"slices"
 	"strings"
@@ -110,14 +111,16 @@ func (j *JSONAdapter) BuildFeed(baseURL string) *model.Feed {
 			entry.Title = entry.URL
 		}
 
-		// Populate the entry content.
-		for _, value := range []string{item.ContentHTML, item.ContentText, item.Summary} {
-			if value = strings.TrimSpace(value); value == "" {
-				continue
-			}
-
-			entry.Content = value
-			break
+		// Populate the entry content. content_html is HTML, but content_text
+		// and summary are plain text (JSON Feed 1.1), so they must be escaped
+		// before being stored as HTML, otherwise the sanitizer drops any
+		// markup-like characters they contain.
+		if contentHTML := strings.TrimSpace(item.ContentHTML); contentHTML != "" {
+			entry.Content = contentHTML
+		} else if contentText := strings.TrimSpace(item.ContentText); contentText != "" {
+			entry.Content = html.EscapeString(contentText)
+		} else if summary := strings.TrimSpace(item.Summary); summary != "" {
+			entry.Content = html.EscapeString(summary)
 		}
 
 		// Populate the entry date.

+ 28 - 0
internal/reader/json/parser_test.go

@@ -1203,3 +1203,31 @@ func TestParseItemWithoutLanguageInheritsFeedLanguage(t *testing.T) {
 		t.Errorf("Expected entry to inherit feed language, got: %q", feed.Entries[0].Language)
 	}
 }
+
+func TestParseItemWithPlainTextContentContainingHTMLCharacters(t *testing.T) {
+	data := `{
+		"version": "https://jsonfeed.org/version/1.1",
+		"title": "Example",
+		"home_page_url": "https://example.org/",
+		"feed_url": "https://example.org/feed.json",
+		"items": [
+			{
+				"id": "1",
+				"url": "https://example.org/1",
+				"content_text": "Use <div> literally: a & b < c"
+			}
+		]
+	}`
+
+	feed, err := Parse("https://example.org/feed.json", bytes.NewBufferString(data))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// content_text is plain text (JSON Feed 1.1), so it must be HTML-escaped
+	// before being stored as HTML, otherwise the sanitizer drops the markup.
+	want := "Use &lt;div&gt; literally: a &amp; b &lt; c"
+	if feed.Entries[0].Content != want {
+		t.Errorf("Incorrect entry content, got: %q, want: %q", feed.Entries[0].Content, want)
+	}
+}