Browse Source

fix(reader): fix a crash introduced by d59990f1

Also add a fuzzer and a test case to validate that nothing breaks.
jvoisin 1 year ago
parent
commit
f40c1e7f63
2 changed files with 28 additions and 3 deletions
  1. 5 3
      internal/reader/xml/decoder.go
  2. 23 0
      internal/reader/xml/decoder_test.go

+ 5 - 3
internal/reader/xml/decoder.go

@@ -61,9 +61,11 @@ func filterValidXMLChars(s []byte) []byte {
 		if r >= utf8.RuneSelf {
 			r, wid = utf8.DecodeRune(s[i:])
 		}
-		if r = filterValidXMLChar(r); r >= 0 {
-			utf8.EncodeRune(s[j:], r)
-			j += wid
+		if r != utf8.RuneError {
+			if r = filterValidXMLChar(r); r >= 0 {
+				utf8.EncodeRune(s[j:], r)
+				j += wid
+			}
 		}
 		i += wid
 	}

+ 23 - 0
internal/reader/xml/decoder_test.go

@@ -8,6 +8,7 @@ import (
 	"fmt"
 	"strings"
 	"testing"
+	"unicode/utf8"
 )
 
 func TestXMLDocumentWithIllegalUnicodeCharacters(t *testing.T) {
@@ -81,3 +82,25 @@ func TestXMLDocumentWithIncorrectEncodingField(t *testing.T) {
 		t.Errorf("Incorrect entry title, expected: %s, got: %s", expected, x.Title)
 	}
 }
+
+func TestFilterValidXMLCharsWithInvalidUTF8Sequence(t *testing.T) {
+	// 0xC0 0xAF is an overlong (hence invalid) UTF-8 sequence placed between 'A' and 'B'.
+	input := []byte{0x41, 0xC0, 0xAF, 0x42}
+
+	filtered := filterValidXMLChars(input)
+
+	// The invalid bytes must be dropped, not re-encoded as U+FFFD, and the
+	// surrounding valid characters must survive untouched.
+	if !utf8.Valid(filtered) {
+		t.Fatalf("Filtered output is not valid UTF-8: %q", filtered)
+	}
+	if got, expected := string(filtered), "AB"; got != expected {
+		t.Errorf("Invalid UTF-8 was not properly filtered, expected: %q, got: %q", expected, got)
+	}
+}
+
+// FuzzFilterValidXMLChars checks that filterValidXMLChars does not panic on arbitrary bytes.
+func FuzzFilterValidXMLChars(f *testing.F) {
+	// Filter a copy: the function rewrites its argument in place, and fuzz inputs must not be modified.
+	f.Fuzz(func(t *testing.T, s []byte) { filterValidXMLChars(append([]byte(nil), s...)) })
+}