Przeglądaj źródła

perf(xml): optimize xml filtering

Instead of using bytes.Map, which returns a copy of the provided []byte,
use a custom in-place implementation, as the bytes.Map call accounts for
around 25% of rss.Parse
jvoisin 10 miesięcy temu
rodzic
commit
d59990f1dd
1 zmienionych plików z 22 dodań i 2 usunięć
  1. 22 2
      internal/reader/xml/decoder.go

+ 22 - 2
internal/reader/xml/decoder.go

@@ -9,6 +9,7 @@ import (
 	"fmt"
 	"io"
 	"strings"
+	"unicode/utf8"
 
 	"miniflux.app/v2/internal/reader/encoding"
 )
@@ -24,7 +25,7 @@ func NewXMLDecoder(data io.ReadSeeker) *xml.Decoder {
 	enc := getEncoding(buffer.Bytes())
 	if enc == "" || strings.EqualFold(enc, "utf-8") {
 		// filter invalid chars now, since decoder.CharsetReader not called for utf-8 content
-		filteredBytes := bytes.Map(filterValidXMLChar, buffer.Bytes())
+		filteredBytes := filterValidXMLChars(buffer.Bytes())
 		decoder = xml.NewDecoder(bytes.NewReader(filteredBytes))
 	} else {
 		// filter invalid chars later within decoder.CharsetReader
@@ -43,13 +44,32 @@ func NewXMLDecoder(data io.ReadSeeker) *xml.Decoder {
 		if err != nil {
 			return nil, fmt.Errorf("encoding: unable to read data: %w", err)
 		}
-		filteredBytes := bytes.Map(filterValidXMLChar, rawData)
+		filteredBytes := filterValidXMLChars(rawData)
 		return bytes.NewReader(filteredBytes), nil
 	}
 
 	return decoder
 }
 
+// filterValidXMLChars filters inplace invalid XML characters.
+// This function is inspired from bytes.Map
+func filterValidXMLChars(s []byte) []byte {
+	j := 0
+	for i := 0; i < len(s); {
+		wid := 1
+		r := rune(s[i])
+		if r >= utf8.RuneSelf {
+			r, wid = utf8.DecodeRune(s[i:])
+		}
+		if r = filterValidXMLChar(r); r >= 0 {
+			utf8.EncodeRune(s[j:], r)
+			j += wid
+		}
+		i += wid
+	}
+	return s[:j]
+}
+
 // This function is copied from encoding/xml package,
 // and is used to check if all the characters are legal.
 func filterValidXMLChar(r rune) rune {