Przeglądaj źródła

Handle more encoding edge cases

- Feeds with charset specified only in Content-Type header and not in XML document
- Feeds with charset specified in both places
- Feeds with charset specified only in XML document and not in HTTP header
Frédéric Guillot 8 lat temu
rodzic
commit
713b38e34c

+ 2 - 1
http/client.go

@@ -95,11 +95,12 @@ func (c *Client) executeRequest(request *http.Request) (*Response, error) {
 		ContentLength: resp.ContentLength,
 	}
 
-	logger.Debug("[HttpClient:%s] OriginalURL=%s, StatusCode=%d, ContentLength=%d, ETag=%s, LastModified=%s, EffectiveURL=%s",
+	logger.Debug("[HttpClient:%s] OriginalURL=%s, StatusCode=%d, ContentLength=%d, ContentType=%s, ETag=%s, LastModified=%s, EffectiveURL=%s",
 		request.Method,
 		c.url,
 		response.StatusCode,
 		resp.ContentLength,
+		response.ContentType,
 		response.ETag,
 		response.LastModified,
 		response.EffectiveURL,

+ 17 - 2
http/response.go

@@ -6,8 +6,10 @@ package http
 
 import (
 	"io"
+	"mime"
 	"strings"
 
+	"github.com/miniflux/miniflux/logger"
 	"golang.org/x/net/html/charset"
 )
 
@@ -45,9 +47,22 @@ func (r *Response) IsModified(etag, lastModified string) bool {
 }
 
 // NormalizeBodyEncoding make sure the body is encoded in UTF-8.
+//
+// If a charset other than UTF-8 is detected, we convert the document to UTF-8.
+// This is used by the scraper and feed readers.
+//
+// Do not forget edge cases:
+// - Some non-utf8 feeds specify encoding only in Content-Type, not in XML document.
 func (r *Response) NormalizeBodyEncoding() (io.Reader, error) {
-	if strings.Contains(r.ContentType, "charset=") {
-		return charset.NewReader(r.Body, r.ContentType)
+	_, params, err := mime.ParseMediaType(r.ContentType)
+	if err == nil {
+		if enc, found := params["charset"]; found {
+			enc = strings.ToLower(enc)
+			if enc != "utf-8" && enc != "utf8" && enc != "" {
+				logger.Debug("[NormalizeBodyEncoding] Convert body to UTF-8 from %s", enc)
+				return charset.NewReader(r.Body, r.ContentType)
+			}
+		}
 	}
 	return r.Body, nil
 }

+ 2 - 3
reader/atom/parser.go

@@ -10,15 +10,14 @@ import (
 
 	"github.com/miniflux/miniflux/errors"
 	"github.com/miniflux/miniflux/model"
-
-	"golang.org/x/net/html/charset"
+	"github.com/miniflux/miniflux/reader/encoding"
 )
 
 // Parse returns a normalized feed struct from a Atom feed.
 func Parse(data io.Reader) (*model.Feed, error) {
 	atomFeed := new(atomFeed)
 	decoder := xml.NewDecoder(data)
-	decoder.CharsetReader = charset.NewReaderLabel
+	decoder.CharsetReader = encoding.CharsetReader
 
 	err := decoder.Decode(atomFeed)
 	if err != nil {

+ 10 - 0
reader/encoding/doc.go

@@ -0,0 +1,10 @@
+// Copyright 2018 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+/*
+
+Package encoding provides workarounds for encoding edge cases found in the wild.
+
+*/
+package encoding

+ 38 - 0
reader/encoding/encoding.go

@@ -0,0 +1,38 @@
+// Copyright 2018 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package encoding
+
+import (
+	"bytes"
+	"io"
+	"unicode/utf8"
+
+	"golang.org/x/net/html/charset"
+)
+
+// CharsetReader is used when the XML encoding is specified for the input document.
+//
+// The document is converted to UTF-8 only if a different encoding is specified
+// and the document is not already UTF-8.
+//
+// Several edge cases could exist:
+//
+// - Feeds with charset specified only in Content-Type header and not in XML document
+// - Feeds with charset specified in both places
+// - Feeds with charset specified only in XML document and not in HTTP header
+func CharsetReader(label string, input io.Reader) (io.Reader, error) {
+	var buf1, buf2 bytes.Buffer
+	w := io.MultiWriter(&buf1, &buf2)
+	io.Copy(w, input)
+	r := bytes.NewReader(buf2.Bytes())
+
+	if !utf8.Valid(buf1.Bytes()) {
+		// Transform document to UTF-8 from the specified XML encoding.
+		return charset.NewReaderLabel(label, r)
+	}
+
+	// The document is already UTF-8, do not do anything (avoid double-encoding)
+	return r, nil
+}

+ 6 - 7
reader/feed/parser.go

@@ -14,12 +14,11 @@ import (
 
 	"github.com/miniflux/miniflux/model"
 	"github.com/miniflux/miniflux/reader/atom"
+	"github.com/miniflux/miniflux/reader/encoding"
 	"github.com/miniflux/miniflux/reader/json"
 	"github.com/miniflux/miniflux/reader/rdf"
 	"github.com/miniflux/miniflux/reader/rss"
 	"github.com/miniflux/miniflux/timer"
-
-	"golang.org/x/net/html/charset"
 )
 
 // List of feed formats.
@@ -32,14 +31,14 @@ const (
 )
 
 // DetectFeedFormat detect feed format from input data.
-func DetectFeedFormat(data io.Reader) string {
+func DetectFeedFormat(r io.Reader) string {
 	defer timer.ExecutionTime(time.Now(), "[Feed:DetectFeedFormat]")
 
 	var buffer bytes.Buffer
-	tee := io.TeeReader(data, &buffer)
+	tee := io.TeeReader(r, &buffer)
 
 	decoder := xml.NewDecoder(tee)
-	decoder.CharsetReader = charset.NewReaderLabel
+	decoder.CharsetReader = encoding.CharsetReader
 
 	for {
 		token, _ := decoder.Token()
@@ -66,11 +65,11 @@ func DetectFeedFormat(data io.Reader) string {
 	return FormatUnknown
 }
 
-func parseFeed(data io.Reader) (*model.Feed, error) {
+func parseFeed(r io.Reader) (*model.Feed, error) {
 	defer timer.ExecutionTime(time.Now(), "[Feed:ParseFeed]")
 
 	var buffer bytes.Buffer
-	io.Copy(&buffer, data)
+	io.Copy(&buffer, r)
 
 	reader := bytes.NewReader(buffer.Bytes())
 	format := DetectFeedFormat(reader)

+ 2 - 2
reader/opml/parser.go

@@ -9,14 +9,14 @@ import (
 	"io"
 
 	"github.com/miniflux/miniflux/errors"
-	"golang.org/x/net/html/charset"
+	"github.com/miniflux/miniflux/reader/encoding"
 )
 
 // Parse reads an OPML file and returns a SubcriptionList.
 func Parse(data io.Reader) (SubcriptionList, error) {
 	feeds := new(opml)
 	decoder := xml.NewDecoder(data)
-	decoder.CharsetReader = charset.NewReaderLabel
+	decoder.CharsetReader = encoding.CharsetReader
 
 	err := decoder.Decode(feeds)
 	if err != nil {

+ 2 - 2
reader/rdf/parser.go

@@ -10,14 +10,14 @@ import (
 
 	"github.com/miniflux/miniflux/errors"
 	"github.com/miniflux/miniflux/model"
-	"golang.org/x/net/html/charset"
+	"github.com/miniflux/miniflux/reader/encoding"
 )
 
 // Parse returns a normalized feed struct from a RDF feed.
 func Parse(data io.Reader) (*model.Feed, error) {
 	feed := new(rdfFeed)
 	decoder := xml.NewDecoder(data)
-	decoder.CharsetReader = charset.NewReaderLabel
+	decoder.CharsetReader = encoding.CharsetReader
 
 	err := decoder.Decode(feed)
 	if err != nil {

+ 2 - 3
reader/rss/parser.go

@@ -10,15 +10,14 @@ import (
 
 	"github.com/miniflux/miniflux/errors"
 	"github.com/miniflux/miniflux/model"
-
-	"golang.org/x/net/html/charset"
+	"github.com/miniflux/miniflux/reader/encoding"
 )
 
 // Parse returns a normalized feed struct from a RSS feed.
 func Parse(data io.Reader) (*model.Feed, error) {
 	feed := new(rssFeed)
 	decoder := xml.NewDecoder(data)
-	decoder.CharsetReader = charset.NewReaderLabel
+	decoder.CharsetReader = encoding.CharsetReader
 
 	err := decoder.Decode(feed)
 	if err != nil {

+ 6 - 1
reader/subscription/finder.go

@@ -35,8 +35,13 @@ func FindSubscriptions(websiteURL string) (Subscriptions, error) {
 		return nil, errors.NewLocalizedError(errConnectionFailure, err)
 	}
 
+	body, err := response.NormalizeBodyEncoding()
+	if err != nil {
+		return nil, err
+	}
+
 	var buffer bytes.Buffer
-	io.Copy(&buffer, response.Body)
+	io.Copy(&buffer, body)
 	reader := bytes.NewReader(buffer.Bytes())
 
 	if format := feed.DetectFeedFormat(reader); format != feed.FormatUnknown {