Browse Source

Strip invalid XML characters to avoid parsing errors

Frédéric Guillot 8 years ago
parent
commit
7b0bfd9308
1 changed files with 26 additions and 1 deletions
  1. 26 1
      reader/feed/parser.go

+ 26 - 1
reader/feed/parser.go

@@ -12,6 +12,7 @@ import (
 	"strings"
 	"strings"
 	"time"
 	"time"
 
 
+	"github.com/miniflux/miniflux/logger"
 	"github.com/miniflux/miniflux/model"
 	"github.com/miniflux/miniflux/model"
 	"github.com/miniflux/miniflux/reader/atom"
 	"github.com/miniflux/miniflux/reader/atom"
 	"github.com/miniflux/miniflux/reader/encoding"
 	"github.com/miniflux/miniflux/reader/encoding"
@@ -74,7 +75,8 @@ func parseFeed(r io.Reader) (*model.Feed, error) {
 		return nil, errors.New("This feed is empty")
 		return nil, errors.New("This feed is empty")
 	}
 	}
 
 
-	reader := bytes.NewReader(buffer.Bytes())
+	str := stripInvalidXMLCharacters(buffer.String())
+	reader := strings.NewReader(str)
 	format := DetectFeedFormat(reader)
 	format := DetectFeedFormat(reader)
 	reader.Seek(0, io.SeekStart)
 	reader.Seek(0, io.SeekStart)
 
 
@@ -91,3 +93,26 @@ func parseFeed(r io.Reader) (*model.Feed, error) {
 		return nil, errors.New("Unsupported feed format")
 		return nil, errors.New("Unsupported feed format")
 	}
 	}
 }
 }
+
+func stripInvalidXMLCharacters(input string) string {
+	return strings.Map(func(r rune) rune {
+		if isInCharacterRange(r) {
+			return r
+		}
+
+		logger.Debug("Strip invalid XML characters: %U", r)
+		return -1
+	}, input)
+}
+
// isInCharacterRange reports whether r is a legal XML 1.0 character, per
// the Char production of http://www.xml.com/axml/testaxml.htm,
// Section 2.2 Characters:
//
//	Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
//
// Tab, line feed and carriage return are the only accepted code points
// below U+0020. The surrogate block U+D800..U+DFFF and the code points
// U+FFFE and U+FFFF are rejected, as is anything above U+10FFFF.
func isInCharacterRange(r rune) (inrange bool) {
	return r == 0x09 ||
		r == 0x0A ||
		r == 0x0D ||
		r >= 0x20 && r <= 0xD7FF || // stops just below the surrogate block (was mistyped as 0xDF77)
		r >= 0xE000 && r <= 0xFFFD ||
		r >= 0x10000 && r <= 0x10FFFF
}