// LBP/miniflux_v2: mirror of https://github.com/miniflux/v2.git

// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

package language // import "miniflux.app/v2/internal/reader/language"

import "strings"

// maxLength is the maximum length, in bytes, of a trimmed language tag that
// Normalize accepts; longer values make it return "". RFC 5646 recommends
// supporting tags of at least 35 characters; anything much longer is garbage.
const maxLength = 50

// Normalize returns a cleaned-up form of a feed-declared language tag,
// intended for use as an HTML lang attribute. Surrounding whitespace is
// removed, ASCII upper-case letters are lower-cased, and underscores become
// hyphens, so "en_US" yields "en-us".
//
// Strict BCP-47 validation is deliberately not attempted: plenty of real
// feeds declare loose values, and silently dropping those gives worse
// downstream behaviour than passing them through.
//
// Because the value comes from the feed and is persisted and rendered as-is,
// the empty string is returned when the trimmed input is longer than
// maxLength bytes or contains any byte that does not normalize into the
// BCP-47 tag alphabet ([a-z0-9-]). Such input carries no usable language
// information, and stripping the offending characters could assemble a
// wrong tag.
func Normalize(s string) string {
	tag := strings.TrimSpace(s)
	if len(tag) > maxLength {
		return ""
	}

	// Case folding is restricted to ASCII and done alongside the alphabet
	// check. strings.ToLower applies Unicode case mapping, which turns some
	// non-ASCII characters into ASCII ones (the Kelvin sign U+212A becomes
	// "k"), so input that must be rejected would come out looking valid.
	out := make([]byte, 0, len(tag))
	for i := 0; i < len(tag); i++ {
		ch := tag[i]
		isLower := 'a' <= ch && ch <= 'z'
		isDigit := '0' <= ch && ch <= '9'
		if isLower || isDigit || ch == '-' {
			out = append(out, ch)
			continue
		}
		if 'A' <= ch && ch <= 'Z' {
			out = append(out, ch+('a'-'A'))
			continue
		}
		if ch != '_' {
			// Anything else is rejected, including every byte of a
			// multi-byte UTF-8 sequence.
			return ""
		}
		out = append(out, '-')
	}
	return string(out)
}