language.go 1.7 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package language // import "miniflux.app/v2/internal/reader/language"
  4. import "strings"
  5. // maxLength bounds accepted language tags. RFC 5646 recommends supporting
  6. // tags of at least 35 characters; anything much longer is garbage.
  7. const maxLength = 50
  8. // Normalize cleans up a language tag declared by a feed so it is
  9. // suitable for use as an HTML lang attribute. It trims surrounding
  10. // whitespace, lower-cases the value, and replaces underscores with hyphens
  11. // (e.g. "en_US" -> "en-us"). No strict BCP-47 validation is performed:
  12. // many real feeds use loose values and silently dropping them yields worse
  13. // downstream behaviour than passing them through.
  14. //
  15. // The value is feed-controlled and is persisted and rendered as-is, so
  16. // anything outside the BCP-47 tag alphabet ([a-z0-9-]) or longer than
  17. // maxLength is rejected: such a value carries no usable language
  18. // information, and stripping bad characters could assemble a wrong tag.
  19. func Normalize(s string) string {
  20. s = strings.TrimSpace(s)
  21. if len(s) > maxLength {
  22. return ""
  23. }
  24. // Lower-case ASCII-only, in the same pass as the charset check.
  25. // Unicode case folding (strings.ToLower) would map some non-ASCII
  26. // characters to ASCII (e.g. the Kelvin sign U+212A to "k"), turning
  27. // input the filter should reject into an apparently valid tag.
  28. b := []byte(s)
  29. for i, c := range b {
  30. switch {
  31. case c >= 'A' && c <= 'Z':
  32. b[i] = c + 'a' - 'A'
  33. case c == '_':
  34. b[i] = '-'
  35. case (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '-':
  36. default:
  37. return ""
  38. }
  39. }
  40. return string(b)
  41. }