Explorar o código

HTTP Get allow UTF-8 even when charset is far from top (#6271)

* HTTP Get allow UTF-8 even when charset is far from top
fix https://github.com/FreshRSS/FreshRSS/issues/5586

The failing case was an HTML document with 15k of whitespace followed by 1.2k of scripts before the `<meta charset="utf-8">` (well beyond the 1024 bytes suggested by the spec, and too deep for DOMDocument to detect)

* Rewording

* Trim also vertical tab + comment
Alexandre Alapetite hai 2 anos
pai
achega
e3c86a164d
Modificouse 1 ficheiro con 13 adicións e 4 borrados
  1. 13 4
      lib/lib_rss.php

+ 13 - 4
lib/lib_rss.php

@@ -444,8 +444,14 @@ function stripHtmlMetaCharset(string $html): string {
 function enforceHttpEncoding(string $html, string $contentType = ''): string {
 	$httpCharset = preg_match('/\bcharset=([0-9a-z_-]{2,12})$/i', $contentType, $matches) === 1 ? $matches[1] : '';
 	if ($httpCharset == '') {
-		// No charset defined by HTTP, do nothing
-		return $html;
+		// No charset defined by HTTP
+		if (preg_match('/<meta\s[^>]*charset\s*=[\s\'"]*UTF-?8\b/i', substr($html, 0, 2048))) {
+			// Detect UTF-8 even if declared too deep in HTML for DOMDocument
+			$httpCharset = 'UTF-8';
+		} else {
+			// Do nothing
+			return $html;
+		}
 	}
 	$httpCharsetNormalized = SimplePie_Misc::encoding($httpCharset);
 	if (in_array($httpCharsetNormalized, ['windows-1252', 'US-ASCII'], true)) {
@@ -565,8 +571,11 @@ function httpGet(string $url, string $cachePath, string $type = 'html', array $a
 		// TODO: Implement HTTP 410 Gone
 	} elseif (!is_string($body) || strlen($body) === 0) {
 		$body = '';
-	} elseif ($type !== 'json') {
-		$body = enforceHttpEncoding($body, $c_content_type);
+	} else {
+		$body = trim($body, " \n\r\t\v");	// Do not trim \x00 to avoid breaking a BOM
+		if ($type !== 'json') {
+			$body = enforceHttpEncoding($body, $c_content_type);
+		}
 	}
 
 	if (file_put_contents($cachePath, $body) === false) {