Просмотр исходного кода

Regression: ensure all HTML documents are encoded in UTF-8

Fixes #2196
Frédéric Guillot 2 года назад
Родитель
Commit
d0f99cee1a

+ 2 - 2
internal/reader/atom/parser.go

@@ -29,7 +29,7 @@ func Parse(baseURL string, r io.Reader) (*model.Feed, error) {
 		rawFeed = new(atom10Feed)
 	}
 
-	if err := xml_decoder.NewDecoder(&buf).Decode(rawFeed); err != nil {
+	if err := xml_decoder.NewXMLDecoder(&buf).Decode(rawFeed); err != nil {
 		return nil, fmt.Errorf("atom: unable to parse feed: %w", err)
 	}
 
@@ -37,7 +37,7 @@ func Parse(baseURL string, r io.Reader) (*model.Feed, error) {
 }
 
 func getAtomFeedVersion(data io.Reader) string {
-	decoder := xml_decoder.NewDecoder(data)
+	decoder := xml_decoder.NewXMLDecoder(data)
 	for {
 		token, _ := decoder.Token()
 		if token == nil {

+ 7 - 2
internal/reader/encoding/encoding.go

@@ -22,7 +22,7 @@ import (
 // - Feeds with encoding specified in both places
 // - Feeds with encoding specified only in XML document and not in HTTP header
 // - Feeds with wrong encoding defined and already in UTF-8
-func CharsetReader(label string, input io.Reader) (io.Reader, error) {
+func CharsetReader(charsetLabel string, input io.Reader) (io.Reader, error) {
 	buffer, _ := io.ReadAll(input)
 	r := bytes.NewReader(buffer)
 
@@ -33,5 +33,10 @@ func CharsetReader(label string, input io.Reader) (io.Reader, error) {
 	}
 
 	// Transform document to UTF-8 from the specified encoding in XML prolog.
-	return charset.NewReaderLabel(label, r)
+	return charset.NewReaderLabel(charsetLabel, r)
+}
+
+// CharsetReaderFromContentType is used when the encoding is not specified for the input document.
+func CharsetReaderFromContentType(contentType string, input io.Reader) (io.Reader, error) {
+	return charset.NewReader(input, contentType)
 }

+ 12 - 3
internal/reader/icon/finder.go

@@ -14,6 +14,7 @@ import (
 	"miniflux.app/v2/internal/config"
 	"miniflux.app/v2/internal/crypto"
 	"miniflux.app/v2/internal/model"
+	"miniflux.app/v2/internal/reader/encoding"
 	"miniflux.app/v2/internal/reader/fetcher"
 	"miniflux.app/v2/internal/urllib"
 
@@ -110,7 +111,10 @@ func (f *IconFinder) FetchIconsFromHTMLDocument() (*model.Icon, error) {
 		return nil, fmt.Errorf("icon: unable to download website index page: %w", localizedError.Error())
 	}
 
-	iconURLs, err := findIconURLsFromHTMLDocument(responseHandler.Body(config.Opts.HTTPClientMaxBodySize()))
+	iconURLs, err := findIconURLsFromHTMLDocument(
+		responseHandler.Body(config.Opts.HTTPClientMaxBodySize()),
+		responseHandler.ContentType(),
+	)
 	if err != nil {
 		return nil, err
 	}
@@ -178,7 +182,7 @@ func (f *IconFinder) DownloadIcon(iconURL string) (*model.Icon, error) {
 	return icon, nil
 }
 
-func findIconURLsFromHTMLDocument(body io.Reader) ([]string, error) {
+func findIconURLsFromHTMLDocument(body io.Reader, contentType string) ([]string, error) {
 	queries := []string{
 		"link[rel='shortcut icon']",
 		"link[rel='Shortcut Icon']",
@@ -186,7 +190,12 @@ func findIconURLsFromHTMLDocument(body io.Reader) ([]string, error) {
 		"link[rel='icon']",
 	}
 
-	doc, err := goquery.NewDocumentFromReader(body)
+	htmlDocumentReader, err := encoding.CharsetReaderFromContentType(contentType, body)
+	if err != nil {
+		return nil, fmt.Errorf("icon: unable to create charset reader: %w", err)
+	}
+
+	doc, err := goquery.NewDocumentFromReader(htmlDocumentReader)
 	if err != nil {
 		return nil, fmt.Errorf("icon: unable to read document: %v", err)
 	}

+ 1 - 1
internal/reader/icon/finder_test.go

@@ -112,7 +112,7 @@ func TestParseDocumentWithWhitespaceIconURL(t *testing.T) {
 		/static/img/favicon.ico
 	">`
 
-	iconURLs, err := findIconURLsFromHTMLDocument(strings.NewReader(html))
+	iconURLs, err := findIconURLsFromHTMLDocument(strings.NewReader(html), "text/html")
 	if err != nil {
 		t.Fatal(err)
 	}

+ 1 - 1
internal/reader/parser/format.go

@@ -30,7 +30,7 @@ func DetectFeedFormat(r io.ReadSeeker) string {
 	}
 
 	r.Seek(0, io.SeekStart)
-	decoder := rxml.NewDecoder(r)
+	decoder := rxml.NewXMLDecoder(r)
 
 	for {
 		token, _ := decoder.Token()

+ 1 - 1
internal/reader/rdf/parser.go

@@ -14,7 +14,7 @@ import (
 // Parse returns a normalized feed struct from a RDF feed.
 func Parse(baseURL string, data io.Reader) (*model.Feed, error) {
 	feed := new(rdfFeed)
-	if err := xml.NewDecoder(data).Decode(feed); err != nil {
+	if err := xml.NewXMLDecoder(data).Decode(feed); err != nil {
 		return nil, fmt.Errorf("rdf: unable to parse feed: %w", err)
 	}
 

+ 1 - 1
internal/reader/rss/parser.go

@@ -14,7 +14,7 @@ import (
 // Parse returns a normalized feed struct from a RSS feed.
 func Parse(baseURL string, data io.Reader) (*model.Feed, error) {
 	feed := new(rssFeed)
-	if err := xml.NewDecoder(data).Decode(feed); err != nil {
+	if err := xml.NewXMLDecoder(data).Decode(feed); err != nil {
 		return nil, fmt.Errorf("rss: unable to parse feed: %w", err)
 	}
 	return feed.Transform(baseURL), nil

+ 11 - 2
internal/reader/scraper/scraper.go

@@ -10,6 +10,7 @@ import (
 	"strings"
 
 	"miniflux.app/v2/internal/config"
+	"miniflux.app/v2/internal/reader/encoding"
 	"miniflux.app/v2/internal/reader/fetcher"
 	"miniflux.app/v2/internal/reader/readability"
 	"miniflux.app/v2/internal/urllib"
@@ -41,17 +42,25 @@ func ScrapeWebsite(requestBuilder *fetcher.RequestBuilder, websiteURL, rules str
 	var content string
 	var err error
 
+	htmlDocumentReader, err := encoding.CharsetReaderFromContentType(
+		responseHandler.ContentType(),
+		responseHandler.Body(config.Opts.HTTPClientMaxBodySize()),
+	)
+	if err != nil {
+		return "", fmt.Errorf("scraper: unable to read HTML document: %v", err)
+	}
+
 	if sameSite && rules != "" {
 		slog.Debug("Extracting content with custom rules",
 			"url", websiteURL,
 			"rules", rules,
 		)
-		content, err = findContentUsingCustomRules(responseHandler.Body(config.Opts.HTTPClientMaxBodySize()), rules)
+		content, err = findContentUsingCustomRules(htmlDocumentReader, rules)
 	} else {
 		slog.Debug("Extracting content with readability",
 			"url", websiteURL,
 		)
-		content, err = readability.ExtractContent(responseHandler.Body(config.Opts.HTTPClientMaxBodySize()))
+		content, err = readability.ExtractContent(htmlDocumentReader)
 	}
 
 	if err != nil {

+ 13 - 4
internal/reader/subscription/finder.go

@@ -14,6 +14,7 @@ import (
 	"miniflux.app/v2/internal/integration/rssbridge"
 	"miniflux.app/v2/internal/locale"
 	"miniflux.app/v2/internal/model"
+	"miniflux.app/v2/internal/reader/encoding"
 	"miniflux.app/v2/internal/reader/fetcher"
 	"miniflux.app/v2/internal/reader/parser"
 	"miniflux.app/v2/internal/urllib"
@@ -98,8 +99,11 @@ func (f *SubscriptionFinder) FindSubscriptions(websiteURL, rssBridgeURL string)
 	}
 
 	// Step 4) Parse web page to find feeds from HTML meta tags.
-	slog.Debug("Try to detect feeds from HTML meta tags", slog.String("website_url", websiteURL))
-	subscriptions, localizedError = f.FindSubscriptionsFromWebPage(websiteURL, bytes.NewReader(responseBody))
+	slog.Debug("Try to detect feeds from HTML meta tags",
+		slog.String("website_url", websiteURL),
+		slog.String("content_type", responseHandler.ContentType()),
+	)
+	subscriptions, localizedError = f.FindSubscriptionsFromWebPage(websiteURL, responseHandler.ContentType(), bytes.NewReader(responseBody))
 	if localizedError != nil {
 		return nil, localizedError
 	}
@@ -138,7 +142,7 @@ func (f *SubscriptionFinder) FindSubscriptions(websiteURL, rssBridgeURL string)
 	return nil, nil
 }
 
-func (f *SubscriptionFinder) FindSubscriptionsFromWebPage(websiteURL string, body io.Reader) (Subscriptions, *locale.LocalizedErrorWrapper) {
+func (f *SubscriptionFinder) FindSubscriptionsFromWebPage(websiteURL, contentType string, body io.Reader) (Subscriptions, *locale.LocalizedErrorWrapper) {
 	queries := map[string]string{
 		"link[type='application/rss+xml']":   parser.FormatRSS,
 		"link[type='application/atom+xml']":  parser.FormatAtom,
@@ -146,7 +150,12 @@ func (f *SubscriptionFinder) FindSubscriptionsFromWebPage(websiteURL string, bod
 		"link[type='application/feed+json']": parser.FormatJSON,
 	}
 
-	doc, err := goquery.NewDocumentFromReader(body)
+	htmlDocumentReader, err := encoding.CharsetReaderFromContentType(contentType, body)
+	if err != nil {
+		return nil, locale.NewLocalizedErrorWrapper(err, "error.unable_to_parse_html_document", err)
+	}
+
+	doc, err := goquery.NewDocumentFromReader(htmlDocumentReader)
 	if err != nil {
 		return nil, locale.NewLocalizedErrorWrapper(err, "error.unable_to_parse_html_document", err)
 	}

+ 10 - 10
internal/reader/subscription/finder_test.go

@@ -40,7 +40,7 @@ func TestParseWebPageWithRssFeed(t *testing.T) {
 		</body>
 	</html>`
 
-	subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", strings.NewReader(htmlPage))
+	subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", "text/html", strings.NewReader(htmlPage))
 	if err != nil {
 		t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
 	}
@@ -73,7 +73,7 @@ func TestParseWebPageWithAtomFeed(t *testing.T) {
 		</body>
 	</html>`
 
-	subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", strings.NewReader(htmlPage))
+	subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", "text/html", strings.NewReader(htmlPage))
 	if err != nil {
 		t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
 	}
@@ -106,7 +106,7 @@ func TestParseWebPageWithJSONFeed(t *testing.T) {
 		</body>
 	</html>`
 
-	subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", strings.NewReader(htmlPage))
+	subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", "text/html", strings.NewReader(htmlPage))
 	if err != nil {
 		t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
 	}
@@ -139,7 +139,7 @@ func TestParseWebPageWithOldJSONFeedMimeType(t *testing.T) {
 		</body>
 	</html>`
 
-	subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", strings.NewReader(htmlPage))
+	subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", "text/html", strings.NewReader(htmlPage))
 	if err != nil {
 		t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
 	}
@@ -172,7 +172,7 @@ func TestParseWebPageWithRelativeFeedURL(t *testing.T) {
 		</body>
 	</html>`
 
-	subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", strings.NewReader(htmlPage))
+	subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", "text/html", strings.NewReader(htmlPage))
 	if err != nil {
 		t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
 	}
@@ -205,7 +205,7 @@ func TestParseWebPageWithEmptyTitle(t *testing.T) {
 		</body>
 	</html>`
 
-	subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", strings.NewReader(htmlPage))
+	subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", "text/html", strings.NewReader(htmlPage))
 	if err != nil {
 		t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
 	}
@@ -239,7 +239,7 @@ func TestParseWebPageWithMultipleFeeds(t *testing.T) {
 		</body>
 	</html>`
 
-	subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", strings.NewReader(htmlPage))
+	subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", "text/html", strings.NewReader(htmlPage))
 	if err != nil {
 		t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
 	}
@@ -261,7 +261,7 @@ func TestParseWebPageWithDuplicatedFeeds(t *testing.T) {
 		</body>
 	</html>`
 
-	subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", strings.NewReader(htmlPage))
+	subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", "text/html", strings.NewReader(htmlPage))
 	if err != nil {
 		t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
 	}
@@ -294,7 +294,7 @@ func TestParseWebPageWithEmptyFeedURL(t *testing.T) {
 		</body>
 	</html>`
 
-	subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", strings.NewReader(htmlPage))
+	subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", "text/html", strings.NewReader(htmlPage))
 	if err != nil {
 		t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
 	}
@@ -315,7 +315,7 @@ func TestParseWebPageWithNoHref(t *testing.T) {
 		</body>
 	</html>`
 
-	subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", strings.NewReader(htmlPage))
+	subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", "text/html", strings.NewReader(htmlPage))
 	if err != nil {
 		t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
 	}

+ 3 - 3
internal/reader/xml/decoder.go

@@ -13,8 +13,8 @@ import (
 	"miniflux.app/v2/internal/reader/encoding"
 )
 
-// NewDecoder returns a XML decoder that filters illegal characters.
-func NewDecoder(data io.Reader) *xml.Decoder {
+// NewXMLDecoder returns a XML decoder that filters illegal characters.
+func NewXMLDecoder(data io.Reader) *xml.Decoder {
 	var decoder *xml.Decoder
 	buffer, _ := io.ReadAll(data)
 	enc := procInst("encoding", string(buffer))
@@ -36,7 +36,7 @@ func NewDecoder(data io.Reader) *xml.Decoder {
 		}
 		rawData, err := io.ReadAll(utf8Reader)
 		if err != nil {
-			return nil, fmt.Errorf("Unable to read data: %q", err)
+			return nil, fmt.Errorf("encoding: unable to read data: %w", err)
 		}
 		filteredBytes := bytes.Map(filterValidXMLChar, rawData)
 		return bytes.NewReader(filteredBytes), nil

+ 6 - 6
internal/reader/xml/decoder_test.go

@@ -10,7 +10,7 @@ import (
 	"testing"
 )
 
-func TestUTF8WithIllegalCharacters(t *testing.T) {
+func TestXMLDocumentWithIllegalUnicodeCharacters(t *testing.T) {
 	type myxml struct {
 		XMLName xml.Name `xml:"rss"`
 		Version string   `xml:"version,attr"`
@@ -23,7 +23,7 @@ func TestUTF8WithIllegalCharacters(t *testing.T) {
 
 	var x myxml
 
-	decoder := NewDecoder(reader)
+	decoder := NewXMLDecoder(reader)
 	err := decoder.Decode(&x)
 	if err != nil {
 		t.Error(err)
@@ -34,7 +34,7 @@ func TestUTF8WithIllegalCharacters(t *testing.T) {
 	}
 }
 
-func TestWindows251WithIllegalCharacters(t *testing.T) {
+func TestXMLDocumentWindows251EncodedWithIllegalCharacters(t *testing.T) {
 	type myxml struct {
 		XMLName xml.Name `xml:"rss"`
 		Version string   `xml:"version,attr"`
@@ -47,7 +47,7 @@ func TestWindows251WithIllegalCharacters(t *testing.T) {
 
 	var x myxml
 
-	decoder := NewDecoder(reader)
+	decoder := NewXMLDecoder(reader)
 	err := decoder.Decode(&x)
 	if err != nil {
 		t.Error(err)
@@ -58,7 +58,7 @@ func TestWindows251WithIllegalCharacters(t *testing.T) {
 	}
 }
 
-func TestIllegalEncodingField(t *testing.T) {
+func TestXMLDocumentWithIncorrectEncodingField(t *testing.T) {
 	type myxml struct {
 		XMLName xml.Name `xml:"rss"`
 		Version string   `xml:"version,attr"`
@@ -71,7 +71,7 @@ func TestIllegalEncodingField(t *testing.T) {
 
 	var x myxml
 
-	decoder := NewDecoder(reader)
+	decoder := NewXMLDecoder(reader)
 	err := decoder.Decode(&x)
 	if err != nil {
 		t.Error(err)