소스 검색

Implement support for HTTP 429 Too Many Requests (#7760)

* Implement support for HTTP 429 Too Many Requests
Will obey the corresponding HTTP `Retry-After` header at domain level.

* Implement 503 Service Unavailable

* Sanitize Retry-After

* Reduce default value when Retry-After is absent
And make configuration parameter

* Retry-After also for favicons
Alexandre Alapetite 8 달 전
부모
커밋
7a0c423357

+ 4 - 2
app/Controllers/feedController.php

@@ -83,7 +83,9 @@ class FreshRSS_feed_Controller extends FreshRSS_ActionController {
 		switch ($kind) {
 			case FreshRSS_Feed::KIND_RSS:
 			case FreshRSS_Feed::KIND_RSS_FORCED:
-				$feed->load(true);	//Throws FreshRSS_Feed_Exception, Minz_FileNotExistException
+				if ($feed->load(loadDetails: true) === null) {	// Throws FreshRSS_Feed_Exception, Minz_FileNotExistException
+					throw new FreshRSS_FeedNotAdded_Exception($url);
+				}
 				break;
 			case FreshRSS_Feed::KIND_HTML_XPATH:
 			case FreshRSS_Feed::KIND_XML_XPATH:
@@ -345,7 +347,7 @@ class FreshRSS_feed_Controller extends FreshRSS_ActionController {
 			$this->view->feed = new FreshRSS_Feed($url);
 			try {
 				// We try to get more information about the feed.
-				$this->view->feed->load(true);
+				$this->view->feed->load(loadDetails: true);
 				$this->view->load_ok = true;
 			} catch (Exception) {
 				$this->view->load_ok = false;

+ 20 - 9
app/Models/Feed.php

@@ -552,6 +552,10 @@ class FreshRSS_Feed extends Minz_Model {
 					Minz_Exception::ERROR
 				);
 			} else {
+				if (($retryAfter = FreshRSS_http_Util::getRetryAfter($this->url)) > 0) {
+					throw new FreshRSS_Feed_Exception('For that domain, will first retry after ' . date('c', $retryAfter) .
+						'. ' . $this->url(includeCredentials: false), code: 503);
+				}
 				$simplePie = customSimplePie($this->attributes(), $this->curlOptions());
 				$url = htmlspecialchars_decode($this->url, ENT_QUOTES);
 				if (str_ends_with($url, '#force_feed')) {
@@ -571,15 +575,21 @@ class FreshRSS_Feed extends Minz_Model {
 				Minz_ExtensionManager::callHook('simplepie_after_init', $simplePie, $this, $simplePieResult);
 
 				if ($simplePieResult === false || $simplePie->get_hash() === '' || !empty($simplePie->error())) {
-					$errorMessage = $simplePie->error();
-					if (empty($errorMessage)) {
-						$errorMessage = '';
-					} elseif (is_array($errorMessage)) {
-						$errorMessage = json_encode($errorMessage, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_LINE_TERMINATORS) ?: '';
+					if ($simplePie->status_code() === 429) {
+						$errorMessage = 'HTTP 429 Too Many Requests!';
+					} elseif ($simplePie->status_code() === 503) {
+						$errorMessage = 'HTTP 503 Service Unavailable!';
+					} else {
+						$errorMessage = $simplePie->error();
+						if (empty($errorMessage)) {
+							$errorMessage = '';
+						} elseif (is_array($errorMessage)) {
+							$errorMessage = json_encode($errorMessage, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_LINE_TERMINATORS) ?: '';
+						}
 					}
 					throw new FreshRSS_Feed_Exception(
 						($errorMessage == '' ? 'Unknown error for feed' : $errorMessage) .
-							' [' . \SimplePie\Misc::url_remove_credentials($this->url) . ']',
+							' [' . $this->url(includeCredentials: false) . ']',
 						$simplePie->status_code()
 					);
 				}
@@ -701,7 +711,7 @@ class FreshRSS_Feed extends Minz_Model {
 		}
 
 		if ($invalidGuids > 0) {
-			Minz_Log::warning("Feed has {$invalidGuids} invalid GUIDs: " . $this->url);
+			Minz_Log::warning("Feed has {$invalidGuids} invalid GUIDs: " . $this->url(includeCredentials: false));
 			if (!$this->attributeBoolean('unicityCriteriaForced') && $invalidGuids > round($invalidGuidsTolerance * count($items))) {
 				$unicityCriteria = $this->attributeString('unicityCriteria');
 				if ($this->attributeBoolean('hasBadGuids')) {	// Legacy
@@ -719,7 +729,8 @@ class FreshRSS_Feed extends Minz_Model {
 				if ($newUnicityCriteria !== $unicityCriteria) {
 					$this->_attribute('hasBadGuids', null);	// Remove legacy
 					$this->_attribute('unicityCriteria', $newUnicityCriteria);
-					Minz_Log::warning('Feed unicity policy degraded (' . ($unicityCriteria ?: 'id') . ' → ' . $newUnicityCriteria . '): ' . $this->url);
+					Minz_Log::warning('Feed unicity policy degraded (' . ($unicityCriteria ?: 'id') . ' → ' . $newUnicityCriteria . '): ' .
+						$this->url(includeCredentials: false));
 					return $this->loadGuids($simplePie, $invalidGuidsTolerance);
 				}
 			}
@@ -1167,7 +1178,7 @@ class FreshRSS_Feed extends Minz_Model {
 			$affected = $feedDAO->markAsReadNotSeen($this->id(), $minLastSeen);
 		}
 		if ($affected > 0) {
-			Minz_Log::debug(__METHOD__ . " $affected items" . ($upstreamIsEmpty ? ' (all)' : '') . ' [' . $this->url(false) . ']');
+			Minz_Log::debug(__METHOD__ . " $affected items" . ($upstreamIsEmpty ? ' (all)' : '') . ' [' . $this->url(includeCredentials: false) . ']');
 		}
 		return $affected;
 	}

+ 19 - 1
app/Models/SimplePieResponse.php

@@ -4,7 +4,25 @@ declare(strict_types=1);
 final class FreshRSS_SimplePieResponse extends \SimplePie\File
 {
 	#[\Override]
-	protected function on_http_response(): void {
+	protected function on_http_response(string|false $response = ''): void {
 		syslog(LOG_INFO, 'FreshRSS SimplePie GET ' . $this->get_status_code() . ' ' . \SimplePie\Misc::url_remove_credentials($this->get_final_requested_uri()));
+
+		if (in_array($this->get_status_code(), [429, 503], true)) {
+			$parser = new \SimplePie\HTTP\Parser(is_string($response) ? $response : '');
+			if ($parser->parse()) {
+				$headers = $parser->headers;
+			} else {
+				$headers = [];
+			}
+
+			$retryAfter = FreshRSS_http_Util::setRetryAfter($this->get_final_requested_uri(), $headers['retry-after'] ?? '');
+			if ($retryAfter > 0) {
+				$domain = parse_url($this->get_final_requested_uri(), PHP_URL_HOST);
+				if (is_string($domain) && $domain !== '') {
+					$errorMessage = 'Will retry after ' . date('c', $retryAfter) . ' for domain `' . $domain . '`';
+					Minz_Log::notice($errorMessage);
+				}
+			}
+		}
 	}
 }

+ 74 - 0
app/Utils/httpUtil.php

@@ -0,0 +1,74 @@
+<?php
+declare(strict_types=1);
+
+/**
+ * Per-host back-off registry for HTTP `429 Too Many Requests` and `503 Service Unavailable` responses.
+ *
+ * Each host gets one empty marker file in the Retry-After folder;
+ * the file's modification time is the timestamp before which the host must not be requested again.
+ */
+final class FreshRSS_http_Util {
+
+	private const RETRY_AFTER_PATH = DATA_PATH . '/Retry-After/';
+
+	/**
+	 * Build the marker file path for a host.
+	 * The host is lower-cased (host names are case-insensitive) and percent-encoded,
+	 * so that characters such as `\`, `:`, `[` or `]` can never act as path separators or yield invalid file names.
+	 * Ordinary lower-case host names are left unchanged by this transformation.
+	 */
+	private static function retryAfterFile(string $domain): string {
+		return self::RETRY_AFTER_PATH . rawurlencode(strtolower($domain)) . '.txt';
+	}
+
+	/**
+	 * Clean up old Retry-After files:
+	 * delete every `*.txt` marker whose modification time is in the past (or cannot be read).
+	 */
+	private static function cleanRetryAfters(): void {
+		if (!is_dir(self::RETRY_AFTER_PATH)) {
+			return;
+		}
+		$files = glob(self::RETRY_AFTER_PATH . '*.txt', GLOB_NOSORT);
+		if ($files === false) {
+			return;
+		}
+		$now = time();
+		foreach ($files as $file) {
+			// A failed filemtime() yields false, which also compares as lower than $now: such files are removed too
+			if (@filemtime($file) < $now) {
+				@unlink($file);
+			}
+		}
+	}
+
+	/**
+	 * Check whether the URL needs to wait for a Retry-After period.
+	 * An expired marker found for the host is deleted on the way.
+	 * @param string $url The URL about to be requested; only its host is considered.
+	 * @return int The timestamp of when the Retry-After expires, or 0 if not set.
+	 */
+	public static function getRetryAfter(string $url): int {
+		if (rand(0, 30) === 1) {	// Remove old files once in a while
+			self::cleanRetryAfters();
+		}
+		$domain = parse_url($url, PHP_URL_HOST);
+		if (!is_string($domain) || $domain === '') {
+			return 0;
+		}
+		$file = self::retryAfterFile($domain);
+		$retryAfter = @filemtime($file) ?: 0;
+		if ($retryAfter <= 0) {
+			return 0;
+		}
+		if ($retryAfter < time()) {
+			@unlink($file);
+			return 0;
+		}
+		return $retryAfter;
+	}
+
+	/**
+	 * Store the HTTP Retry-After header value of an HTTP `429 Too Many Requests` or `503 Service Unavailable` response.
+	 * @param string $url The URL that was requested; only its host is considered.
+	 * @param string $retryAfter The raw `Retry-After` header value: a number of seconds, a date, or an empty string when absent.
+	 * @return int The timestamp stored for the host, or 0 if the URL has no host or the marker file could not be written.
+	 */
+	public static function setRetryAfter(string $url, string $retryAfter): int {
+		$domain = parse_url($url, PHP_URL_HOST);
+		if (!is_string($domain) || $domain === '') {
+			return 0;
+		}
+
+		$limits = FreshRSS_Context::systemConf()->limits;
+		$now = time();
+		$maxDelay = max(3600, $limits['retry_after_max'] ?? 0);
+		if (ctype_digit($retryAfter)) {
+			// Delay in seconds: clamp before adding to the current time, so that very long digit strings cannot overflow
+			$expiry = $now + (int)min((float)$retryAfter, (float)$maxDelay);
+		} else {
+			// Date form; an absent or unparsable value falls back to the configured default delay (at least 600 seconds)
+			$expiry = \SimplePie\Misc::parse_date($retryAfter) ?:
+				($now + max(600, $limits['retry_after_default'] ?? 0));
+		}
+		// Never wait longer than the configured maximum (itself at least 3600 seconds)
+		$expiry = min($expiry, $now + $maxDelay);
+
+		if (!is_dir(self::RETRY_AFTER_PATH)) {
+			@mkdir(self::RETRY_AFTER_PATH);
+		}
+		if (!touch(self::retryAfterFile($domain), $expiry)) {
+			Minz_Log::error('Failed to set Retry-After for ' . $domain);
+			return 0;
+		}
+		return $expiry;
+	}
+}

+ 1 - 0
cli/prepare.php

@@ -12,6 +12,7 @@ $dirs = [
 	'/PubSubHubbub',
 	'/PubSubHubbub/feeds',
 	'/PubSubHubbub/keys',
+	'/Retry-After',
 	'/tokens',
 	'/users',
 	'/users/_',

+ 6 - 2
config.default.php

@@ -103,10 +103,14 @@ return [
 		# Especially important for multi-user setups.
 		# Might be overridden by HTTP response headers.
 		'cache_duration' => 800,
-		# Minimal cache duration (in seconds), overriding HTTP response headers `Cache-Control` and `Expires`,
+		# Minimal cache duration (in seconds), overriding HTTP response headers `Cache-Control` and `Expires`.
 		'cache_duration_min' => 60,
-		# Maximal cache duration (in seconds), overriding HTTP response headers `Cache-Control` and `Expires`,
+		# Maximal cache duration (in seconds), overriding HTTP response headers `Cache-Control` and `Expires`.
 		'cache_duration_max' => 86400,
+		# Default rate limit duration (in seconds), when HTTP response header `Retry-After` is absent.
+		'retry_after_default' => 1500,
+		# Maximal rate limit duration (in seconds), overriding HTTP response header `Retry-After`.
+		'retry_after_max' => 172800,
 
 		# SimplePie HTTP request timeout in seconds.
 		'timeout' => 20,

+ 1 - 0
data/Retry-After/.gitignore

@@ -0,0 +1 @@
+*.txt

+ 2 - 0
data/Retry-After/README.md

@@ -0,0 +1,2 @@
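+Folder to store domains for which there is a pending HTTP `429 Too Many Requests` or `503 Service Unavailable`.
+The time before which the domain must not be requested again (derived from the HTTP `Retry-After` header, or from a default delay when the header is absent) is stored in the modification time of the corresponding `*.txt` file.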

+ 13 - 0
data/Retry-After/index.html

@@ -0,0 +1,13 @@
+<!DOCTYPE html>
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-GB" lang="en-GB">
+<head>
+<meta charset="UTF-8" />
+<meta http-equiv="Refresh" content="0; url=/" />
+<title>Redirection</title>
+<meta name="robots" content="noindex" />
+</head>
+
+<body>
+<p><a href="/">Redirection</a></p>
+</body>
+</html>

+ 38 - 11
lib/favicons.php

@@ -24,15 +24,24 @@ function isImgMime(string $content): bool {
 
 /** @param array<int,int|bool|string> $curlOptions */
 function downloadHttp(string &$url, array $curlOptions = []): string {
+	if (($retryAfter = FreshRSS_http_Util::getRetryAfter($url)) > 0) {
+		Minz_Log::warning('For that domain, will first retry favicon after ' . date('c', $retryAfter) . '. ' . \SimplePie\Misc::url_remove_credentials($url));
+		return '';
+	}
+
 	syslog(LOG_INFO, 'FreshRSS Favicon GET ' . $url);
 	$url2 = checkUrl($url);
 	if ($url2 == false) {
 		return '';
 	}
 	$url = $url2;
-	/** @var CurlHandle $ch */
+
 	$ch = curl_init($url);
+	if ($ch === false) {
+		return '';
+	}
 	curl_setopt_array($ch, [
+			CURLOPT_HEADER => true,
 			CURLOPT_RETURNTRANSFER => true,
 			CURLOPT_TIMEOUT => 15,
 			CURLOPT_USERAGENT => FRESHRSS_USERAGENT,
@@ -50,18 +59,37 @@ function downloadHttp(string &$url, array $curlOptions = []): string {
 	curl_setopt_array($ch, $curlOptions);
 
 	$response = curl_exec($ch);
-	if (!is_string($response)) {
-		$response = '';
-	}
-	$info = curl_getinfo($ch);
+	$c_status = curl_getinfo($ch, CURLINFO_HTTP_CODE);
+	$c_effective_url = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
 	curl_close($ch);
-	if (!empty($info['url'])) {
-		$url2 = checkUrl($info['url']);
-		if ($url2 != false) {
-			$url = $url2;	//Possible redirect
+
+	$parser = new \SimplePie\HTTP\Parser(is_string($response) ? $response : '');
+	if ($parser->parse()) {
+		$headers = $parser->headers;
+		$body = $parser->body;
+	} else {
+		$headers = [];
+		$body = false;
+	}
+
+	if (in_array($c_status, [429, 503], true)) {
+		$retryAfter = FreshRSS_http_Util::setRetryAfter($url, $headers['retry-after'] ?? '');
+		if ($c_status === 429) {
+			$errorMessage = 'HTTP 429 Too Many Requests! Searching favicon [' . \SimplePie\Misc::url_remove_credentials($url) . ']';
+		} elseif ($c_status === 503) {
+			$errorMessage = 'HTTP 503 Service Unavailable! Searching favicon [' . \SimplePie\Misc::url_remove_credentials($url) . ']';
+		}
+		if ($retryAfter > 0) {
+			$errorMessage .= ' We may retry after ' . date('c', $retryAfter);
 		}
 	}
-	return is_array($info) && $info['http_code'] == 200 ? $response : '';
+
+	$url2 = checkUrl($c_effective_url);
+	if ($url2 != false) {
+		$url = $url2;	//Possible redirect
+	}
+
+	return $c_status === 200 && is_string($body) ? $body : '';
 }
 
 function searchFavicon(string &$url): string {
@@ -75,7 +103,6 @@ function searchFavicon(string &$url): string {
 	$xpath = new DOMXPath($dom);
 	$links = $xpath->query('//link[@href][translate(@rel, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz")="shortcut icon"'
 		. ' or translate(@rel, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz")="icon"]');
-
 	if (!($links instanceof DOMNodeList)) {
 		return '';
 	}

+ 28 - 2
lib/lib_rss.php

@@ -564,6 +564,11 @@ function httpGet(string $url, string $cachePath, string $type = 'html', array $a
 		cleanCache(CLEANCACHE_HOURS);
 	}
 
+	if (($retryAfter = FreshRSS_http_Util::getRetryAfter($url)) > 0) {
+		Minz_Log::warning('For that domain, will first retry after ' . date('c', $retryAfter) . '. ' . \SimplePie\Misc::url_remove_credentials($url));
+		return ['body' => '', 'effective_url' => $url, 'redirect_count' => 0, 'fail' => true];
+	}
+
 	if (FreshRSS_Context::systemConf()->simplepie_syslog_enabled) {
 		syslog(LOG_INFO, 'FreshRSS GET ' . $type . ' ' . \SimplePie\Misc::url_remove_credentials($url));
 	}
@@ -597,6 +602,7 @@ function httpGet(string $url, string $cachePath, string $type = 'html', array $a
 		CURLOPT_CONNECTTIMEOUT => $feed_timeout > 0 ? $feed_timeout : $limits['timeout'],
 		CURLOPT_TIMEOUT => $feed_timeout > 0 ? $feed_timeout : $limits['timeout'],
 		CURLOPT_MAXREDIRS => 4,
+		CURLOPT_HEADER => true,
 		CURLOPT_RETURNTRANSFER => true,
 		CURLOPT_FOLLOWLOCATION => true,
 		CURLOPT_ENCODING => '',	//Enable all encodings
@@ -630,7 +636,7 @@ function httpGet(string $url, string $cachePath, string $type = 'html', array $a
 
 	curl_setopt_array($ch, $curl_options);
 
-	$body = curl_exec($ch);
+	$response = curl_exec($ch);
 	$c_status = curl_getinfo($ch, CURLINFO_HTTP_CODE);
 	$c_content_type = '' . curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
 	$c_effective_url = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
@@ -638,10 +644,30 @@ function httpGet(string $url, string $cachePath, string $type = 'html', array $a
 	$c_error = curl_error($ch);
 	curl_close($ch);
 
+	$parser = new \SimplePie\HTTP\Parser(is_string($response) ? $response : '');
+	if ($parser->parse()) {
+		$headers = $parser->headers;
+		$body = $parser->body;
+	} else {
+		$headers = [];
+		$body = false;
+	}
+
 	$fail = $c_status != 200 || $c_error != '' || $body === false;
 	if ($fail) {
-		Minz_Log::warning('Error fetching content: HTTP code ' . $c_status . ': ' . $c_error . ' ' . $url);
 		$body = '';
+		Minz_Log::warning('Error fetching content: HTTP code ' . $c_status . ': ' . $c_error . ' ' . $url);
+		if (in_array($c_status, [429, 503], true)) {
+			$retryAfter = FreshRSS_http_Util::setRetryAfter($url, $headers['retry-after'] ?? '');
+			if ($c_status === 429) {
+				$errorMessage = 'HTTP 429 Too Many Requests! [' . \SimplePie\Misc::url_remove_credentials($url) . ']';
+			} elseif ($c_status === 503) {
+				$errorMessage = 'HTTP 503 Service Unavailable! [' . \SimplePie\Misc::url_remove_credentials($url) . ']';
+			}
+			if ($retryAfter > 0) {
+				$errorMessage .= ' We may retry after ' . date('c', $retryAfter);
+			}
+		}
 		// TODO: Implement HTTP 410 Gone
 	} elseif (!is_string($body) || strlen($body) === 0) {
 		$body = '';

+ 11 - 10
lib/simplepie/simplepie/src/File.php

@@ -127,7 +127,7 @@ class File implements Response
                 curl_setopt($fp, CURLOPT_URL, $url);
                 curl_setopt($fp, CURLOPT_HEADER, 1);
                 curl_setopt($fp, CURLOPT_RETURNTRANSFER, 1);
-                curl_setopt($fp, CURLOPT_FAILONERROR, 1);
+                // curl_setopt($fp, CURLOPT_FAILONERROR, 1); // FreshRSS removed to retrieve headers even on HTTP errors
                 curl_setopt($fp, CURLOPT_TIMEOUT, $timeout);
                 curl_setopt($fp, CURLOPT_CONNECTTIMEOUT, $timeout);
                 // curl_setopt($fp, CURLOPT_REFERER, \SimplePie\Misc::url_remove_credentials($url)); // FreshRSS removed
@@ -141,7 +141,7 @@ class File implements Response
                 if (curl_errno($fp) === 23 || curl_errno($fp) === 61) {
                     $this->error = 'cURL error ' . curl_errno($fp) . ': ' . curl_error($fp); // FreshRSS
                     $this->status_code = curl_getinfo($fp, CURLINFO_HTTP_CODE); // FreshRSS
-                    $this->on_http_response();
+                    $this->on_http_response($responseHeaders);
                     $this->error = null; // FreshRSS
                     curl_setopt($fp, CURLOPT_ENCODING, 'none');
                     $responseHeaders = curl_exec($fp);
@@ -150,9 +150,9 @@ class File implements Response
                 if (curl_errno($fp)) {
                     $this->error = 'cURL error ' . curl_errno($fp) . ': ' . curl_error($fp);
                     $this->success = false;
-                    $this->on_http_response();
+                    $this->on_http_response($responseHeaders);
                 } else {
-                    $this->on_http_response();
+                    $this->on_http_response($responseHeaders);
                     // Use the updated url provided by curl_getinfo after any redirects.
                     if ($info = curl_getinfo($fp)) {
                         $this->url = $info['url'];
@@ -188,7 +188,7 @@ class File implements Response
                 if (!$fp) {
                     $this->error = 'fsockopen error: ' . $errstr;
                     $this->success = false;
-                    $this->on_http_response();
+                    $this->on_http_response(false);
                 } else {
                     stream_set_timeout($fp, $timeout);
                     if (isset($url_parts['path'])) {
@@ -229,7 +229,7 @@ class File implements Response
                             $this->set_headers($parser->headers);
                             $this->body = $parser->body;
                             $this->status_code = $parser->status_code;
-                            $this->on_http_response();
+                            $this->on_http_response($responseHeaders);
                             if ((in_array($this->status_code, [300, 301, 302, 303, 307]) || $this->status_code > 307 && $this->status_code < 400) && ($locationHeader = $this->get_header_line('location')) !== '' && $this->redirects < $redirects) {
                                 $this->redirects++;
                                 $location = \SimplePie\Misc::absolutize_url($locationHeader, $url);
@@ -271,12 +271,12 @@ class File implements Response
                         } else {
                             $this->error = 'Could not parse'; // FreshRSS
                             $this->success = false; // FreshRSS
-                            $this->on_http_response();
+                            $this->on_http_response($responseHeaders);
                         }
                     } else {
                         $this->error = 'fsocket timed out';
                         $this->success = false;
-                        $this->on_http_response();
+                        $this->on_http_response($responseHeaders);
                     }
                     fclose($fp);
                 }
@@ -291,7 +291,7 @@ class File implements Response
                 $this->body = $filebody;
                 $this->status_code = 200;
             }
-            $this->on_http_response();
+            $this->on_http_response($filebody);
         }
         if ($this->success) {
             // (Leading) whitespace may cause XML parsing errors so we trim it,
@@ -303,9 +303,10 @@ class File implements Response
     /**
      * Event to allow inheriting classes to e.g. log the HTTP responses.
      * Triggered just after an HTTP response is received.
+     * @param string|false $response The raw HTTP response headers and body, or false in case of failure (as returned by curl_exec()).
      * FreshRSS.
      */
-    protected function on_http_response(): void
+    protected function on_http_response(string|false $response): void
     {
     }
 

+ 1 - 1
lib/simplepie/simplepie/src/Misc.php

@@ -1737,7 +1737,7 @@ class Misc
     }
 
     /**
-     * @return int|bool
+     * @return int|false
      */
     public static function parse_date(string $dt)
     {