Просмотр исходного кода

fix(language): reject tags outside the BCP-47 alphabet in Normalize

Feed-declared language values were persisted and rendered with no
charset or length restriction. Normalize now rejects values longer
than 50 bytes or containing characters outside [a-z0-9-], keeping
control characters, NUL bytes (which Postgres rejects, failing the
feed refresh), and oversized values out of the database and the HTML
lang attribute. Invalid values are rejected rather than stripped,
since stripping could assemble a wrong tag ("fr, en" -> "fren").

Lower-casing is ASCII-only and done in the same pass as the charset
check: Unicode case folding maps some non-ASCII characters to ASCII
(e.g. the Kelvin sign U+212A to "k"), laundering input the filter
should reject into apparently valid tags.
Fred 1 день назад
Родитель
Коммит
65cd6cd25d
2 изменённых файла: 54 добавлено и 3 удалено
  1. 31 2
      internal/reader/language/language.go
  2. 23 1
      internal/reader/language/language_test.go

+ 31 - 2
internal/reader/language/language.go

@@ -5,13 +5,42 @@ package language // import "miniflux.app/v2/internal/reader/language"
 
 import "strings"
 
+// maxLength bounds the byte length of accepted language tags. RFC 5646
+// recommends supporting tags of at least 35 characters; anything much longer is garbage.
+const maxLength = 50
+
 // Normalize cleans up a language tag declared by a feed so it is
 // suitable for use as an HTML lang attribute. It trims surrounding
 // whitespace, lower-cases ASCII letters, and replaces underscores with
 // hyphens (e.g. "en_US" -> "en-us"). The structure of the tag is not
 // validated against BCP-47: many real feeds use loose values, and dropping
 // those yields worse downstream behaviour than passing them through.
+//
+// The value is feed-controlled and is persisted and rendered as-is, so
+// anything outside the BCP-47 tag alphabet ([a-z0-9-]) or longer than
+// maxLength bytes is rejected by returning "": such a value carries no usable
+// language information, and stripping bad characters could assemble a wrong tag.
 func Normalize(s string) string {
-	s = strings.ToLower(strings.TrimSpace(s))
-	return strings.ReplaceAll(s, "_", "-")
+	s = strings.TrimSpace(s)
+	if len(s) > maxLength { // len counts bytes, measured after trimming
+		return ""
+	}
+
+	// Lower-case ASCII letters only, in the same pass as the charset check.
+	// Unicode case folding (strings.ToLower) would map some non-ASCII
+	// characters to ASCII (e.g. the Kelvin sign U+212A to "k"), turning
+	// input the filter should reject into an apparently valid tag.
+	b := []byte(s)
+	for i, c := range b { // c is a single byte, not a rune
+		switch {
+		case c >= 'A' && c <= 'Z':
+			b[i] = c + 'a' - 'A'
+		case c == '_':
+			b[i] = '-'
+		case (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '-': // already in the tag alphabet; byte is kept unchanged
+		default: // any other byte, including every byte of a non-ASCII rune
+			return ""
+		}
+	}
+	return string(b)
 }

+ 23 - 1
internal/reader/language/language_test.go

@@ -3,7 +3,10 @@
 
 package language // import "miniflux.app/v2/internal/reader/language"
 
-import "testing"
+import (
+	"strings"
+	"testing"
+)
 
 func TestNormalize(t *testing.T) {
 	cases := []struct {
@@ -17,6 +20,25 @@ func TestNormalize(t *testing.T) {
 		{"EN-us", "en-us"},
 		{"pt-BR", "pt-br"},
 		{"  fr-FR  ", "fr-fr"},
+		{"zh-hant-cn-x-private1-private2", "zh-hant-cn-x-private1-private2"},
+
+		// Values outside the tag alphabet are rejected, not stripped.
+		{"en US", ""},
+		{"en-US, de-DE", ""},
+		{"en\x00us", ""},
+		{"en\u202eus", ""},
+		{"français", ""},
+		{`"><script>`, ""},
+
+		// Non-ASCII input must be rejected even when Unicode case
+		// folding would map it to ASCII (U+212A Kelvin sign -> "k",
+		// U+0130 dotted capital I -> "i").
+		{"\u212aO", ""},
+		{"İ-en", ""},
+
+		// Values longer than 50 characters are rejected.
+		{strings.Repeat("a", 51), ""},
+		{"en-" + strings.Repeat("a", 100), ""},
 	}
 	for _, c := range cases {
 		if got := Normalize(c.in); got != c.want {