Bladeren bron

Improve XML decoder to remove illegal characters

Tony Wang 6 jaren geleden
bovenliggende
commit
2eb2441f2b

+ 2 - 1
locale/translations.go

@@ -2624,6 +2624,7 @@ var translations = map[string]string{
     "Unable to parse Atom feed: %q": "无法解析Atom源: %q",
     "Unable to parse JSON feed: %q": "无法解析JSON源: %q",
     "Unable to parse RDF feed: %q": "无法解析RDF源: %q",
+    "Unable to read data: %q": "无法读取数据: %q",
     "Unable to normalize encoding: %q": "无法正则化编码: %q",
     "Category not found for this user": "未找到该用户的这一分类",
     "This feed is empty": "该源是空的",
@@ -2645,5 +2646,5 @@ var translationsChecksums = map[string]string{
 	"nl_NL": "a91e2195ac0731a3788405a51c4201e1a89dcce35ef792356e8c17adb57aee97",
 	"pl_PL": "097bc9beac12f33d3a5e5ee98ccba0875e4d1c1bf13e38251a66ac450834c5b3",
 	"ru_RU": "b253bf709a2f4bcac2f894bd1797247481fa7c6b70a0a0d8785d8680be83bac8",
-	"zh_CN": "cb974ad8c374278057db4ca58ff0e59314dc191e2ea59af0d1472a438a9ce3e0",
+	"zh_CN": "5004e07fa535ea56e7fbe1501bb8ff4191d1d214e51b4590110b660994c39f0d",
 }

+ 1 - 0
locale/translations/zh_CN.json

@@ -279,6 +279,7 @@
     "Unable to parse Atom feed: %q": "无法解析Atom源: %q",
     "Unable to parse JSON feed: %q": "无法解析JSON源: %q",
     "Unable to parse RDF feed: %q": "无法解析RDF源: %q",
+    "Unable to read data: %q": "无法读取数据: %q",
     "Unable to normalize encoding: %q": "无法正则化编码: %q",
     "Category not found for this user": "未找到该用户的这一分类",
     "This feed is empty": "该源是空的",

+ 1 - 6
reader/atom/parser.go

@@ -5,22 +5,17 @@
 package atom // import "miniflux.app/reader/atom"
 
 import (
-	"encoding/xml"
 	"io"
 
 	"miniflux.app/errors"
 	"miniflux.app/model"
-	"miniflux.app/reader/encoding"
+	"miniflux.app/reader/xml"
 )
 
 // Parse returns a normalized feed struct from an Atom feed.
 func Parse(data io.Reader) (*model.Feed, *errors.LocalizedError) {
 	atomFeed := new(atomFeed)
 	decoder := xml.NewDecoder(data)
-	decoder.Entity = xml.HTMLEntity
-	decoder.Strict = false
-	decoder.CharsetReader = encoding.CharsetReader
-
 	err := decoder.Decode(atomFeed)
 	if err != nil {
 		return nil, errors.NewLocalizedError("Unable to parse Atom feed: %q", err)

+ 1 - 6
reader/rdf/parser.go

@@ -5,22 +5,17 @@
 package rdf // import "miniflux.app/reader/rdf"
 
 import (
-	"encoding/xml"
 	"io"
 
 	"miniflux.app/errors"
 	"miniflux.app/model"
-	"miniflux.app/reader/encoding"
+	"miniflux.app/reader/xml"
 )
 
 // Parse returns a normalized feed struct from an RDF feed.
 func Parse(data io.Reader) (*model.Feed, *errors.LocalizedError) {
 	feed := new(rdfFeed)
 	decoder := xml.NewDecoder(data)
-	decoder.Entity = xml.HTMLEntity
-	decoder.Strict = false
-	decoder.CharsetReader = encoding.CharsetReader
-
 	err := decoder.Decode(feed)
 	if err != nil {
 		return nil, errors.NewLocalizedError("Unable to parse RDF feed: %q", err)

+ 1 - 6
reader/rss/parser.go

@@ -5,22 +5,17 @@
 package rss // import "miniflux.app/reader/rss"
 
 import (
-	"encoding/xml"
 	"io"
 
 	"miniflux.app/errors"
 	"miniflux.app/model"
-	"miniflux.app/reader/encoding"
+	"miniflux.app/reader/xml"
 )
 
 // Parse returns a normalized feed struct from an RSS feed.
 func Parse(data io.Reader) (*model.Feed, *errors.LocalizedError) {
 	feed := new(rssFeed)
 	decoder := xml.NewDecoder(data)
-	decoder.Entity = xml.HTMLEntity
-	decoder.Strict = false
-	decoder.CharsetReader = encoding.CharsetReader
-
 	err := decoder.Decode(feed)
 	if err != nil {
 		return nil, errors.NewLocalizedError("Unable to parse RSS feed: %q", err)

+ 50 - 0
reader/xml/decoder.go

@@ -0,0 +1,50 @@
+// Copyright 2019 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package xml // import "miniflux.app/reader/xml"
+
+import (
+	"bytes"
+	"encoding/xml"
+	"fmt"
+	"io"
+	"io/ioutil"
+
+	"miniflux.app/reader/encoding"
+)
+
+// NewDecoder returns an XML decoder reading from data whose charset hook strips characters rejected by filterValidXMLChar.
+func NewDecoder(data io.Reader) *xml.Decoder {
+	decoder := xml.NewDecoder(data)
+	decoder.Entity = xml.HTMLEntity // use the entity map shipped with encoding/xml
+	decoder.Strict = false          // turn off the decoder's strict mode
+	decoder.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) { // NOTE(review): filtering happens only inside this hook; confirm encoding/xml also invokes it for UTF-8 input
+		utf8Reader, err := encoding.CharsetReader(charset, input) // project helper; presumably converts input from charset to UTF-8 — verify
+		if err != nil {
+			return nil, err
+		}
+		rawData, err := ioutil.ReadAll(utf8Reader) // buffers the entire converted input in memory
+		if err != nil {
+			return nil, fmt.Errorf("Unable to read data: %q", err)
+		}
+		filteredBytes := bytes.Map(filterValidXMLChar, rawData) // bytes.Map drops every rune mapped to a negative value
+		return bytes.NewReader(filteredBytes), nil
+	}
+
+	return decoder
+}
+
+// filterValidXMLChar returns r unchanged when it lies in one of the ranges below, and -1 otherwise.
+// The ranges are copied from the encoding/xml package; passed to bytes.Map, the -1 result drops the rune.
+func filterValidXMLChar(r rune) rune {
+	if r == 0x09 || // horizontal tab
+		r == 0x0A || // line feed
+		r == 0x0D || // carriage return
+		r >= 0x20 && r <= 0xD7FF || // stops before 0xD800
+		r >= 0xE000 && r <= 0xFFFD || // resumes at 0xE000; excludes 0xFFFE and 0xFFFF
+		r >= 0x10000 && r <= 0x10FFFF {
+		return r
+	}
+	return -1 // negative value marks the rune as illegal
+}

+ 29 - 0
reader/xml/decoder_test.go

@@ -0,0 +1,29 @@
+// Copyright 2019 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package xml // import "miniflux.app/reader/xml"
+
+import (
+	"encoding/xml"
+	"fmt"
+	"strings"
+	"testing"
+)
+
+func TestIllegalCharacters(t *testing.T) { // checks that a document containing an illegal control character still decodes
+	type myxml struct { // minimal decode target matching the document built below
+		XMLName xml.Name `xml:"rss"`
+		Version string   `xml:"version,attr"`
+		Title   string   `xml:"title"`
+	}
+
+	data := fmt.Sprintf(`<?xml version="1.0" encoding="windows-1251"?><rss version="2.0"><title>%s</title></rss>`, "\x10") // 0x10 is outside every range accepted by filterValidXMLChar
+	var x myxml
+
+	decoder := NewDecoder(strings.NewReader(data))
+	err := decoder.Decode(&x)
+	if err != nil { // only the absence of a decode error is asserted; the decoded fields are not inspected
+		t.Error(err)
+	}
+}