Sfoglia il codice sorgente

Handle redirects when scraping feed from HTML (#7654)

* Handle redirects when scraping feed from HTML

* pass codesniffer

* pass PHPStan

* Optimize

* Another approach relying on HTML base
Standard way to save an HTML document with relative references

* Fix case of existing HTML base
which should not be overridden

---------

Co-authored-by: Alexandre Alapetite <alexandre@alapetite.fr>
Inverle 9 mesi fa
parent
commit
18b5c8ec6d

+ 1 - 1
app/Controllers/extensionController.php

@@ -48,7 +48,7 @@ class FreshRSS_extension_Controller extends FreshRSS_ActionController {
 		$cacheFile = CACHE_PATH . '/extension_list.json';
 		if (FreshRSS_Context::userConf()->retrieve_extension_list === true) {
 			if (!file_exists($cacheFile) || (time() - (filemtime($cacheFile) ?: 0) > 86400)) {
-				$json = httpGet($extensionListUrl, $cacheFile, 'json');
+				$json = httpGet($extensionListUrl, $cacheFile, 'json')['body'];
 			} else {
 				$json = @file_get_contents($cacheFile) ?: '';
 			}

+ 1 - 1
app/Models/Category.php

@@ -188,7 +188,7 @@ class FreshRSS_Category extends Minz_Model {
 		}
 		$ok = true;
 		$cachePath = $this->cacheFilename($url);
-		$opml = httpGet($url, $cachePath, 'opml', $this->attributes(), $this->curlOptions());
+		$opml = httpGet($url, $cachePath, 'opml', $this->attributes(), $this->curlOptions())['body'];
 		if ($opml == '') {
 			Minz_Log::warning('Error getting dynamic OPML for category ' . $this->id() . '! ' .
 				\SimplePie\Misc::url_remove_credentials($url));

+ 7 - 3
app/Models/Entry.php

@@ -845,7 +845,7 @@ HTML;
 	 * @param string $url Overridden URL. Will default to the entry URL.
 	 * @throws Minz_Exception
 	 */
-	public function getContentByParsing(string $url = '', int $maxRedirs = 3): string {
+	public function getContentByParsing(string $url = '', int $maxRedirs = 4): string {
 		$url = $url ?: htmlspecialchars_decode($this->link(), ENT_QUOTES);
 		$feed = $this->feed();
 		if ($url === '' || $feed === null || $feed->pathEntries() === '') {
@@ -872,12 +872,16 @@ HTML;
 		}
 
 		$cachePath = $feed->cacheFilename($url . '#' . $feed->pathEntries());
-		$html = httpGet($url, $cachePath, 'html', $feed->attributes(), $feed->curlOptions());
-		if (strlen($html) > 0) {
+		$response = httpGet($url, $cachePath, 'html', $feed->attributes(), $feed->curlOptions());
+		$html = $response['body'];
+		if ($html !== '') {
 			$doc = new DOMDocument();
 			$doc->loadHTML($html, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING);
 			$xpath = new DOMXPath($doc);
 
+			// Account for HTTP redirections
+			$url = $response['effective_url'] ?: $url;
+			$maxRedirs -= $response['redirect_count'];
 			if ($maxRedirs > 0) {
 				//Follow any HTML redirection
 				$metas = $xpath->query('//meta[@content]') ?: [];

+ 2 - 2
app/Models/Feed.php

@@ -788,7 +788,7 @@ class FreshRSS_Feed extends Minz_Model {
 		}
 
 		$httpAccept = $this->kind() === FreshRSS_Feed::KIND_HTML_XPATH_JSON_DOTNOTATION ? 'html' : 'json';
-		$content = httpGet($feedSourceUrl, $this->cacheFilename(), $httpAccept, $this->attributes(), $this->curlOptions());
+		$content = httpGet($feedSourceUrl, $this->cacheFilename(), $httpAccept, $this->attributes(), $this->curlOptions())['body'];
 		if (strlen($content) <= 0) {
 			return null;
 		}
@@ -846,7 +846,7 @@ class FreshRSS_Feed extends Minz_Model {
 		}
 
 		$httpAccept = $this->kind() === FreshRSS_Feed::KIND_XML_XPATH ? 'xml' : 'html';
-		$html = httpGet($feedSourceUrl, $this->cacheFilename(), $httpAccept, $this->attributes(), $this->curlOptions());
+		$html = httpGet($feedSourceUrl, $this->cacheFilename(), $httpAccept, $this->attributes(), $this->curlOptions())['body'];
 		if (strlen($html) <= 0) {
 			return null;
 		}

+ 44 - 5
lib/lib_rss.php

@@ -507,12 +507,47 @@ function enforceHttpEncoding(string $html, string $contentType = ''): string {
 	return $html;
 }
 
+/**
+ * Ensure that an HTML document declares a base URL, so that its relative references
+ * can still be resolved once the document is detached from the URL it was fetched from.
+ * An existing `<base href="…">` is kept untouched; otherwise a new `<base>` is inserted
+ * as the first child of `<head>`, or of the root element when there is no `<head>`.
+ * @param string $html the raw downloaded HTML content
+ * @param string $href the HTML base URL to declare when the document has none
+ * @return string the re-serialised HTML document (the input unchanged if it cannot be modified or serialised),
+ *  or an empty string when the input is empty or has no root element
+ */
+function enforceHtmlBase(string $html, string $href): string {
+	if ($html === '') {
+		// DOMDocument::loadHTML() rejects an empty source (a ValueError on PHP 8)
+		return '';
+	}
+	$doc = new DOMDocument();
+	$doc->loadHTML($html, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING);
+	$root = $doc->documentElement;
+	if ($root === null) {
+		return '';
+	}
+	$xpath = new DOMXPath($doc);
+	// Only a <base> carrying an href defines a base URL; <base target="…"> alone does not
+	$bases = $xpath->evaluate('//base[@href]');
+	if (!($bases instanceof DOMNodeList) || $bases->length === 0) {
+		$base = $doc->createElement('base');
+		if ($base === false) {
+			return $html;
+		}
+		$base->setAttribute('href', $href);
+		$head = null;
+		$heads = $xpath->evaluate('//head');
+		if ($heads instanceof DOMNodeList && $heads->length > 0) {
+			$head = $heads->item(0);
+		}
+		if ($head instanceof DOMElement) {
+			$head->insertBefore($base, $head->firstChild);
+		} else {
+			// The reference node must be a child of the node receiving the insertion,
+			// so insert into the root element rather than into the document itself.
+			$root->insertBefore($base, $root->firstChild);
+		}
+	}
+	return $doc->saveHTML() ?: $html;
+}
+
 /**
  * @param string $type {html,json,opml,xml}
  * @param array<string,mixed> $attributes
  * @param array<int,mixed> $curl_options
+ * @return array{body:string,effective_url:string,redirect_count:int,fail:bool}
  */
-function httpGet(string $url, string $cachePath, string $type = 'html', array $attributes = [], array $curl_options = []): string {
+function httpGet(string $url, string $cachePath, string $type = 'html', array $attributes = [], array $curl_options = []): array {
 	$limits = FreshRSS_Context::systemConf()->limits;
 	$feed_timeout = empty($attributes['timeout']) || !is_numeric($attributes['timeout']) ? 0 : intval($attributes['timeout']);
 
@@ -521,7 +556,7 @@ function httpGet(string $url, string $cachePath, string $type = 'html', array $a
 		$body = @file_get_contents($cachePath);
 		if ($body != false) {
 			syslog(LOG_DEBUG, 'FreshRSS uses cache for ' . \SimplePie\Misc::url_remove_credentials($url));
-			return $body;
+			return ['body' => $body, 'effective_url' => $url, 'redirect_count' => 0, 'fail' => false];
 		}
 	}
 
@@ -553,7 +588,7 @@ function httpGet(string $url, string $cachePath, string $type = 'html', array $a
 	// TODO: Implement HTTP 1.1 conditional GET If-Modified-Since
 	$ch = curl_init();
 	if ($ch === false) {
-		return '';
+		return ['body' => '', 'effective_url' => '', 'redirect_count' => 0, 'fail' => true];
 	}
 	curl_setopt_array($ch, [
 		CURLOPT_URL => $url,
@@ -598,10 +633,13 @@ function httpGet(string $url, string $cachePath, string $type = 'html', array $a
 	$body = curl_exec($ch);
 	$c_status = curl_getinfo($ch, CURLINFO_HTTP_CODE);
 	$c_content_type = '' . curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
+	$c_effective_url = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
+	$c_redirect_count = curl_getinfo($ch, CURLINFO_REDIRECT_COUNT);
 	$c_error = curl_error($ch);
 	curl_close($ch);
 
-	if ($c_status != 200 || $c_error != '' || $body === false) {
+	$fail = $c_status != 200 || $c_error != '' || $body === false;
+	if ($fail) {
 		Minz_Log::warning('Error fetching content: HTTP code ' . $c_status . ': ' . $c_error . ' ' . $url);
 		$body = '';
 		// TODO: Implement HTTP 410 Gone
@@ -611,6 +649,7 @@ function httpGet(string $url, string $cachePath, string $type = 'html', array $a
 		$body = trim($body, " \n\r\t\v");	// Do not trim \x00 to avoid breaking a BOM
 		if ($type !== 'json') {
 			$body = enforceHttpEncoding($body, $c_content_type);
+			$body = enforceHtmlBase($body, $c_effective_url);
 		}
 	}
 
@@ -618,7 +657,7 @@ function httpGet(string $url, string $cachePath, string $type = 'html', array $a
 		Minz_Log::warning("Error saving cache $cachePath for $url");
 	}
 
-	return $body;
+	return ['body' => $body, 'effective_url' => $c_effective_url, 'redirect_count' => $c_redirect_count, 'fail' => $fail];
 }
 
 /**