Ver código fonte

perf(xml): optimized NewXMLDecoder

io.ReadAll grows the underlying buffer progressively, whereas
io.Copy can allocate it in one go, which is significantly faster.
io.ReadAll currently accounts for around 10% of the CPU time of rss.Parse.
jvoisin 10 meses atrás
pai
commit
49085daefe
1 arquivos alterados com 7 adições e 3 exclusões
  1. 7 3
      internal/reader/xml/decoder.go

+ 7 - 3
internal/reader/xml/decoder.go

@@ -16,11 +16,15 @@ import (
 // NewXMLDecoder returns a XML decoder that filters illegal characters.
 func NewXMLDecoder(data io.ReadSeeker) *xml.Decoder {
 	var decoder *xml.Decoder
-	buffer, _ := io.ReadAll(data)
-	enc := getEncoding(buffer)
+
+	// This is way faster than io.ReadAll(data) as the buffer can be allocated in one go instead of dynamically grown.
+	buffer := &bytes.Buffer{}
+	io.Copy(buffer, data)
+
+	enc := getEncoding(buffer.Bytes())
 	if enc == "" || strings.EqualFold(enc, "utf-8") {
 		// filter invalid chars now, since decoder.CharsetReader not called for utf-8 content
-		filteredBytes := bytes.Map(filterValidXMLChar, buffer)
+		filteredBytes := bytes.Map(filterValidXMLChar, buffer.Bytes())
 		decoder = xml.NewDecoder(bytes.NewReader(filteredBytes))
 	} else {
 		// filter invalid chars later within decoder.CharsetReader