httpUtil.php 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506
  1. <?php
  2. declare(strict_types=1);
  3. final class FreshRSS_http_Util {
  4. private const RETRY_AFTER_PATH = DATA_PATH . '/Retry-After/';
  /**
   * Build the path of the marker file that stores a Retry-After period for a URL.
   *
   * The file name starts with the URL-encoded host (plus `:port` when the URL has an explicit port).
   * When `Minz_Request::serverIsPublic()` does not report the host as public, a SHA-256 hash of the
   * full URL is appended; a non-empty proxy is appended URL-encoded as well.
   *
   * @param string $url the URL being fetched
   * @param string $proxy the proxy used for the request, or an empty string
   * @return string the marker file path, or an empty string when the URL has no host
   */
  private static function getRetryAfterFile(string $url, string $proxy): string {
    $host = parse_url($url, PHP_URL_HOST);
    if (!is_string($host) || $host === '') {
      return '';
    }

    $isPublic = Minz_Request::serverIsPublic($host);

    $authority = $host;
    $portNumber = parse_url($url, PHP_URL_PORT);
    if (is_int($portNumber)) {
      $authority .= ':' . $portNumber;
    }

    $fileName = urlencode($authority);
    if (!$isPublic) {
      // Per-URL marker instead of a per-host one
      $fileName .= '_' . hash('sha256', $url);
    }
    if (!empty($proxy)) {
      $fileName .= '_' . urlencode($proxy);
    }

    return self::RETRY_AFTER_PATH . $fileName . '.txt';
  }
  /**
   * Delete the Retry-After marker files (`*.txt` in the Retry-After directory)
   * whose modification time lies in the past.
   */
  private static function cleanRetryAfters(): void {
    $markerFiles = is_dir(self::RETRY_AFTER_PATH)
      ? glob(self::RETRY_AFTER_PATH . '*.txt', GLOB_NOSORT)
      : false;
    if ($markerFiles === false) {
      return;
    }

    foreach ($markerFiles as $markerFile) {
      // The modification time of a marker file is the moment its Retry-After period ends
      $expired = @filemtime($markerFile) < time();
      if ($expired) {
        @unlink($markerFile);
      }
    }
  }
  /**
   * Tell whether a Retry-After period is still running for the given URL and proxy.
   * An expired marker file is deleted on the way.
   *
   * @param string $url the URL about to be fetched
   * @param string $proxy the proxy used for the request, or an empty string
   * @return int The timestamp of when the Retry-After expires, or 0 if not set.
   */
  public static function getRetryAfter(string $url, string $proxy): int {
    // Occasionally purge all the expired marker files
    if (rand(0, 30) === 1) {
      self::cleanRetryAfters();
    }

    $markerFile = self::getRetryAfterFile($url, $proxy);
    if ($markerFile === '') {
      return 0;
    }

    $expiry = @filemtime($markerFile);
    if ($expiry === false || $expiry <= 0) {
      // No marker file (or no usable modification time)
      return 0;
    }
    if ($expiry >= time()) {
      return $expiry;
    }

    // The period is over: forget about it
    @unlink($markerFile);
    return 0;
  }
  /**
   * Store the HTTP Retry-After header value of an HTTP `429 Too Many Requests` or `503 Service Unavailable` response.
   *
   * The expiry is saved as the modification time of a marker file.
   * A value made only of digits is taken as a number of seconds from now; anything else is given to
   * `\SimplePie\Misc::parse_date()`, and when that yields nothing a default delay of at least 600 seconds
   * (`limits['retry_after_default']` if larger) is used. The result is capped to now plus at least
   * 3600 seconds (`limits['retry_after_max']` if larger).
   *
   * @param string $url the URL that was fetched
   * @param string $proxy the proxy used for the request, or an empty string
   * @param string $retryAfter the raw Retry-After header value, possibly empty
   * @return int the stored expiry timestamp, or 0 if nothing could be stored
   */
  public static function setRetryAfter(string $url, string $proxy, string $retryAfter): int {
    $txt = self::getRetryAfterFile($url, $proxy);
    if ($txt === '') {
      return 0;
    }

    $limits = FreshRSS_Context::systemConf()->limits;
    if (ctype_digit($retryAfter)) {
      $retryAfter = time() + (int)$retryAfter;
    } else {
      $retryAfter = \SimplePie\Misc::parse_date($retryAfter) ?:
        (time() + max(600, $limits['retry_after_default'] ?? 0));
    }
    $retryAfter = min($retryAfter, time() + max(3600, $limits['retry_after_max'] ?? 0));

    // Best effort: the directory usually exists already
    @mkdir(self::RETRY_AFTER_PATH);
    if (!touch($txt, $retryAfter)) {
      // Credentials embedded in the URL must not end up in the log
      Minz_Log::error('Failed to set Retry-After for ' . \SimplePie\Misc::url_remove_credentials($url));
      return 0;
    }
    return $retryAfter;
  }
  /**
   * Keep only the cURL options of an allow-list, preserving their original order.
   * The value of `CURLOPT_COOKIEFILE` is always replaced by an empty string.
   *
   * @param array<mixed> $curl_params cURL options indexed by their `CURLOPT_*` constant
   * @return array<mixed> the allowed subset of the given options
   */
  public static function sanitizeCurlParams(array $curl_params): array {
    $allowedOptions = [
      CURLOPT_COOKIE,
      CURLOPT_COOKIEFILE,
      CURLOPT_FOLLOWLOCATION,
      CURLOPT_HTTPHEADER,
      CURLOPT_MAXREDIRS,
      CURLOPT_POST,
      CURLOPT_POSTFIELDS,
      CURLOPT_PROXY,
      CURLOPT_PROXYTYPE,
      CURLOPT_USERAGENT,
    ];

    $sanitized = [];
    foreach ($curl_params as $option => $value) {
      if (!in_array($option, $allowedOptions, true)) {
        continue;
      }
      // Allow only an empty value just to enable the libcurl cookie engine
      $sanitized[$option] = $option === CURLOPT_COOKIEFILE ? '' : $value;
    }
    return $sanitized;
  }
  /**
   * Replace the host of a URL by the result of `idn_to_ascii()`.
   * The URL is returned unchanged when `idn_to_ascii()` is not available, when the URL has no host,
   * or when the conversion fails.
   *
   * @param string $url the URL to convert
   * @return string the URL with the first occurrence of its host converted
   */
  private static function idn_to_puny(string $url): string {
    if (!function_exists('idn_to_ascii')) {
      return $url;
    }

    $host = parse_url($url, PHP_URL_HOST);
    if (!is_string($host) || $host == '') {
      return $url;
    }

    $asciiHost = idn_to_ascii($host);
    $hostOffset = strpos($url, $host);
    if ($asciiHost == false || $hostOffset === false) {
      return $url;
    }
    return substr_replace($url, $asciiHost, $hostOffset, strlen($host));
  }
  /**
   * Normalise and validate a URL.
   *
   * The URL is trimmed, optionally prefixed with `https://` when it does not start with `http://` or
   * `https://`, and its host goes through `idn_to_puny()`. Validation uses `FILTER_VALIDATE_URL` on a
   * copy in which underscores are replaced, so that underscores do not make the URL invalid.
   *
   * @param string $url the URL to check
   * @param bool $fixScheme whether to prepend `https://` when no HTTP(S) scheme is present
   * @return string|false the normalised URL, an empty string for a blank input, or false if invalid
   */
  public static function checkUrl(string $url, bool $fixScheme = true): string|false {
    $url = trim($url);
    if ($url == '') {
      return '';
    }

    if ($fixScheme && preg_match('#^https?://#i', $url) !== 1) {
      $url = 'https://' . ltrim($url, '/');
    }

    // https://bugs.php.net/bug.php?id=53474
    $url = self::idn_to_puny($url);

    // PHP discussion #64948 Underscore
    $relaxedUrl = str_replace('_', 'z', $url);
    $isValid = is_string(filter_var($relaxedUrl, FILTER_VALIDATE_URL));

    return $isValid ? $url : false;
  }
  /**
   * Remove the charset meta information of an HTML document, e.g.:
   * `<meta charset="..." />`
   * `<meta http-equiv="Content-Type" content="text/html; charset=...">`
   *
   * Only the first matching `<meta>` element is removed.
   *
   * @param string $html the HTML document
   * @return string the document without that element, or the unchanged document if the regular expression fails
   */
  private static function stripHtmlMetaCharset(string $html): string {
    // preg_replace() returns null on a PCRE error: keep the document rather than discarding it
    return preg_replace('/<meta\s[^>]*charset\s*=\s*[^>]+>/i', '', $html, 1) ?? $html;
  }
  /**
   * Set an XML preamble to enforce the HTML content type charset received by HTTP.
   *
   * The document is returned untouched when no charset is known, when the charset normalises to
   * `windows-1252` or `US-ASCII`, when the document starts with a byte order mark or an XML declaration
   * carrying an encoding, or when a conversion to UTF-8 is needed but does not succeed.
   *
   * @param string $html the raw downloaded HTML content
   * @param string $contentType an HTTP Content-Type such as 'text/html; charset=utf-8'
   * @return string an HTML string with XML encoding information for DOMDocument::loadHTML()
   */
  private static function enforceHttpEncoding(string $html, string $contentType = ''): string {
    $charset = '';
    if (preg_match('/\bcharset=([0-9a-z_-]{2,12})$/i', $contentType, $found) === 1) {
      $charset = $found[1];
    }

    if ($charset == '') {
      // No charset defined by HTTP:
      // detect UTF-8 even if declared too deep in HTML for DOMDocument
      $declaresUtf8 = preg_match('/<meta\s[^>]*charset\s*=[\s\'"]*UTF-?8\b/i', substr($html, 0, 2048));
      if (!$declaresUtf8) {
        return $html;
      }
      $charset = 'UTF-8';
    }

    $encoding = \SimplePie\Misc::encoding($charset);
    if (in_array($encoding, ['windows-1252', 'US-ASCII'], true)) {
      // Default charset for HTTP, do nothing
      return $html;
    }

    $byteOrderMarks = [
      "\xEF\xBB\xBF", // UTF-8
      "\xFF\xFE", // UTF-16 Little Endian
      "\xFE\xFF", // UTF-16 Big Endian
      "\xFF\xFE\x00\x00", // UTF-32 Little Endian
      "\x00\x00\xFE\xFF", // UTF-32 Big Endian
    ];
    foreach ($byteOrderMarks as $bom) {
      if (str_starts_with($html, $bom)) {
        // Existing byte order mark, do nothing
        return $html;
      }
    }

    if (preg_match('/^<[?]xml[^>]+encoding\b/', substr($html, 0, 64))) {
      // Existing XML declaration, do nothing
      return $html;
    }

    if ($encoding !== 'UTF-8') {
      // Try to change encoding to UTF-8 using mbstring or iconv or intl
      $converted = \SimplePie\Misc::change_encoding($html, $encoding, 'UTF-8');
      if (is_string($converted)) {
        $html = self::stripHtmlMetaCharset($converted);
        $encoding = 'UTF-8';
      }
    }

    if ($encoding !== 'UTF-8') {
      // Give up
      return $html;
    }

    // Save encoding information as XML declaration
    return '<' . '?xml version="1.0" encoding="' . $encoding . '" ?' . ">\n" . $html;
  }
  /**
   * Set an HTML base URL to the HTML content if there is none.
   *
   * The document is parsed with DOMDocument and serialised again. When it contains no `<base>` element,
   * one is inserted as first child of the first `<head>`, or of the root element if there is no `<head>`.
   *
   * @param string $html the raw downloaded HTML content
   * @param string $href the HTML base URL
   * @return string an HTML string; empty if the input is empty or has no root element once parsed
   */
  private static function enforceHtmlBase(string $html, string $href): string {
    if ($html === '') {
      // DOMDocument::loadHTML() does not accept an empty source (ValueError as of PHP 8)
      return '';
    }

    $doc = new DOMDocument();
    $doc->loadHTML($html, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING);
    $root = $doc->documentElement;
    if ($root === null) {
      return '';
    }

    $xpath = new DOMXPath($doc);
    $bases = $xpath->evaluate('//base');
    if (!($bases instanceof DOMNodeList) || $bases->length === 0) {
      $base = $doc->createElement('base');
      if ($base === false) {
        return $html;
      }
      $base->setAttribute('href', $href);

      $heads = $xpath->evaluate('//head');
      $head = ($heads instanceof DOMNodeList && $heads->length > 0) ? $heads->item(0) : null;

      // Prefer the <head>; fall back to the root element
      $parent = $head instanceof DOMElement ? $head : $root;
      $parent->insertBefore($base, $parent->firstChild);
    }

    return $doc->saveHTML() ?: $html;
  }
  /**
   * Download a document with cURL, using a file cache and honouring stored Retry-After periods.
   *
   * - A non-empty cache file younger than `limits['cache_duration']` seconds is returned without any request.
   * - If a Retry-After period is still running for the URL, no request is made and a failure is returned.
   * - A response counts as a failure unless the HTTP status is 200, cURL reports no error and a body is returned.
   *   For the statuses 429 and 503, the Retry-After header is stored through `setRetryAfter()`.
   * - For a successful response, text types are trimmed, `html`/`xml`/`opml` go through `enforceHttpEncoding()`,
   *   and `html` goes through `enforceHtmlBase()`.
   * - The resulting body (empty on failure) is always written to `$cachePath`.
   *
   * cURL options are applied in this order, later ones overriding earlier ones: built-in defaults, sanitised
   * `$attributes['curl_params']`, the system `curl_options`, the `ssl_verify` attribute, then `$curl_options`.
   *
   * @param non-empty-string $url
   * @param string $cachePath path of the cache file read before and written after the request
   * @param string $type {html,ico,json,opml,xml}
   * @param array<string,mixed> $attributes the keys `timeout`, `curl_params` and `ssl_verify` are read
   * @param array<int,mixed> $curl_options additional cURL options, applied last
   * @return array{body:string,effective_url:string,redirect_count:int,fail:bool}
   */
  public static function httpGet(string $url, string $cachePath, string $type = 'html', array $attributes = [], array $curl_options = []): array {
    $limits = FreshRSS_Context::systemConf()->limits;
    $feed_timeout = empty($attributes['timeout']) || !is_numeric($attributes['timeout']) ? 0 : intval($attributes['timeout']);
    // Credentials embedded in the URL must never reach the logs
    $loggableUrl = \SimplePie\Misc::url_remove_credentials($url);

    $cacheMtime = @filemtime($cachePath);
    if ($cacheMtime !== false && $cacheMtime > time() - intval($limits['cache_duration'])) {
      $body = @file_get_contents($cachePath);
      if ($body != false) {
        syslog(LOG_DEBUG, 'FreshRSS uses cache for ' . $loggableUrl);
        return ['body' => $body, 'effective_url' => $url, 'redirect_count' => 0, 'fail' => false];
      }
    }

    if (rand(0, 30) === 1) { // Remove old cache once in a while
      cleanCache(CLEANCACHE_HOURS);
    }

    // Computed first, because it is needed while inspecting the custom headers below
    $accept = match ($type) {
      'json' => 'application/json,application/feed+json,application/javascript;q=0.9,text/javascript;q=0.8,*/*;q=0.7',
      'opml' => 'text/x-opml,text/xml;q=0.9,application/xml;q=0.9,*/*;q=0.8',
      'xml' => 'application/xml,application/xhtml+xml,text/xml;q=0.9,*/*;q=0.8',
      'ico' => 'image/x-icon,image/vnd.microsoft.icon,image/ico,image/png,image/svg+xml,image/*;q=0.8,*/*;q=0.1',
      default => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', // 'html' and anything else
    };

    $options = [];
    $systemProxy = FreshRSS_Context::systemConf()->curl_options[CURLOPT_PROXY] ?? null;
    $proxy = is_string($systemProxy) ? $systemProxy : '';
    if (is_array($attributes['curl_params'] ?? null)) {
      $options = self::sanitizeCurlParams($attributes['curl_params']);
      // NOTE(review): the system curl_options are applied after $options further down, so a system proxy
      // looks like it would still be the one in use here; confirm which proxy should key the Retry-After file.
      $proxy = is_string($options[CURLOPT_PROXY] ?? null) ? $options[CURLOPT_PROXY] : '';
      if (is_array($options[CURLOPT_HTTPHEADER] ?? null)) {
        // Remove headers problematic for security
        $options[CURLOPT_HTTPHEADER] = array_filter($options[CURLOPT_HTTPHEADER],
          fn($header) => is_string($header) && !preg_match('/^(Remote-User|X-WebAuth-User)\\s*:/i', $header));
        // Add Accept header if it is not set.
        // preg_grep() returns an empty array (not false) when nothing matches.
        if (empty(preg_grep('/^Accept\\s*:/i', $options[CURLOPT_HTTPHEADER]))) {
          $options[CURLOPT_HTTPHEADER][] = 'Accept: ' . $accept;
        }
      }
    }

    if (($retryAfter = self::getRetryAfter($url, $proxy)) > 0) {
      Minz_Log::warning('For that domain, will first retry after ' . date('c', $retryAfter) . '. ' . $loggableUrl);
      return ['body' => '', 'effective_url' => $url, 'redirect_count' => 0, 'fail' => true];
    }

    if (FreshRSS_Context::systemConf()->simplepie_syslog_enabled) {
      syslog(LOG_INFO, 'FreshRSS GET ' . $type . ' ' . $loggableUrl);
    }

    // TODO: Implement HTTP 1.1 conditional GET If-Modified-Since
    $ch = curl_init();
    if ($ch === false) {
      return ['body' => '', 'effective_url' => '', 'redirect_count' => 0, 'fail' => true];
    }
    $timeout = $feed_timeout > 0 ? $feed_timeout : $limits['timeout'];
    curl_setopt_array($ch, [
      CURLOPT_URL => $url,
      CURLOPT_HTTPHEADER => ['Accept: ' . $accept],
      CURLOPT_USERAGENT => FRESHRSS_USERAGENT,
      CURLOPT_CONNECTTIMEOUT => $timeout,
      CURLOPT_TIMEOUT => $timeout,
      CURLOPT_MAXREDIRS => 4,
      CURLOPT_RETURNTRANSFER => true,
      CURLOPT_FOLLOWLOCATION => true,
      CURLOPT_ENCODING => '', //Enable all encodings
      //CURLOPT_VERBOSE => 1, // To debug sent HTTP headers
    ]);

    curl_setopt_array($ch, $options);
    curl_setopt_array($ch, FreshRSS_Context::systemConf()->curl_options);

    // Collect the raw response headers (of every hop when redirected)
    $responseHeaders = '';
    curl_setopt($ch, CURLOPT_HEADERFUNCTION, function (\CurlHandle $ch, string $header) use (&$responseHeaders) {
      if (trim($header) !== '') { // Skip e.g. separation with trailer headers
        $responseHeaders .= $header;
      }
      return strlen($header);
    });

    if (isset($attributes['ssl_verify'])) {
      curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, empty($attributes['ssl_verify']) ? 0 : 2);
      curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, (bool)$attributes['ssl_verify']);
      if (empty($attributes['ssl_verify'])) {
        curl_setopt($ch, CURLOPT_SSL_CIPHER_LIST, 'DEFAULT@SECLEVEL=1');
      }
    }

    curl_setopt_array($ch, $curl_options);

    $body = curl_exec($ch);
    $c_status = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $c_content_type = '' . curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
    $c_effective_url = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
    $c_redirect_count = curl_getinfo($ch, CURLINFO_REDIRECT_COUNT);
    $c_error = curl_error($ch);

    $headers = [];
    if ($body !== false) {
      assert($c_redirect_count >= 0);
      $responseHeaders = \SimplePie\HTTP\Parser::prepareHeaders($responseHeaders, $c_redirect_count + 1);
      $parser = new \SimplePie\HTTP\Parser($responseHeaders);
      if ($parser->parse()) {
        $headers = $parser->headers;
      }
    }

    $fail = $c_status != 200 || $c_error != '' || $body === false;
    if ($fail) {
      $body = '';
      Minz_Log::warning('Error fetching content: HTTP code ' . $c_status . ': ' . $c_error . ' ' . $loggableUrl);
      if (in_array($c_status, [429, 503], true)) {
        $retryAfterHeader = $headers['retry-after'] ?? '';
        $retryAfter = self::setRetryAfter($url, $proxy, is_string($retryAfterHeader) ? $retryAfterHeader : '');
        $errorMessage = ($c_status === 429 ? 'HTTP 429 Too Many Requests!' : 'HTTP 503 Service Unavailable!') .
          ' [' . $loggableUrl . ']';
        if ($retryAfter > 0) {
          $errorMessage .= ' We may retry after ' . date('c', $retryAfter);
        }
        Minz_Log::warning($errorMessage);
      }
      // TODO: Implement HTTP 410 Gone
    } elseif (!is_string($body) || strlen($body) === 0) {
      $body = '';
    } else {
      if (in_array($type, ['html', 'json', 'opml', 'xml'], true)) {
        $body = trim($body, " \n\r\t\v"); // Do not trim \x00 to avoid breaking a BOM
      }
      if (in_array($type, ['html', 'xml', 'opml'], true)) {
        $body = self::enforceHttpEncoding($body, $c_content_type);
      }
      if (in_array($type, ['html'], true)) {
        $body = self::enforceHtmlBase($body, $c_effective_url);
      }
    }

    // The body is cached even when empty; an empty cache file is ignored by the cache check above
    if (file_put_contents($cachePath, $body) === false) {
      Minz_Log::warning("Error saving cache $cachePath for $loggableUrl");
    }

    return ['body' => $body, 'effective_url' => $c_effective_url, 'redirect_count' => $c_redirect_count, 'fail' => $fail];
  }
  /**
   * Converts an IP (v4 or v6) to a binary representation using inet_pton
   *
   * @param string $ip the IP to convert
   * @return string the address as a string of `0` and `1` characters (8 per byte of the packed address),
   * or an empty string if the IP cannot be parsed
   */
  private static function ipToBits(string $ip): string {
    $packed = inet_pton($ip);
    if ($packed === false || $packed === '') {
      // Before PHP 8.2, str_split('') returns [''] which would produce a bogus '00000000'
      return '';
    }

    $bits = '';
    foreach (str_split($packed) as $byte) {
      $bits .= sprintf('%08b', ord($byte));
    }
    return $bits;
  }
  /**
   * Check if an ip belongs to the provided range (in CIDR format)
   *
   * Both addresses must be valid and of the same family (IPv4 or IPv6), otherwise the result is false.
   * A missing, non-numeric, zero, negative or too large mask means that the whole address must be equal
   * (so `/0` is NOT interpreted as "any address").
   *
   * @param string $ip the IP that we want to verify (ex: 192.168.16.1)
   * @param string $range the range to check against (ex: 192.168.16.0/24)
   * @return bool true if the IP is in the range, otherwise false
   */
  private static function checkCIDR(string $ip, string $range): bool {
    $binary_ip = self::ipToBits($ip);

    $split = explode('/', $range);
    $subnet = $split[0];
    if ($subnet == '') {
      return false;
    }
    $binary_subnet = self::ipToBits($subnet);

    // Only compare two well-formed addresses of the same family (32 bits for IPv4, 128 for IPv6),
    // otherwise an IPv4 address could match an IPv6 range sharing the same leading bits
    $width = strlen($binary_ip);
    if (($width !== 32 && $width !== 128) || strlen($binary_subnet) !== $width) {
      return false;
    }

    $mask_bits = (int)($split[1] ?? '');
    if ($mask_bits <= 0 || $mask_bits > $width) {
      $mask_bits = $width;
    }

    return strncmp($binary_ip, $binary_subnet, $mask_bits) === 0;
  }
  /**
   * Check if the client (e.g. last proxy) is allowed to send unsafe headers.
   * The authorized ranges come from the `TRUSTED_PROXY` environment variable (whitespace-separated),
   * or from the `trusted_sources` configuration option when the variable yields no range.
   * The connection IP is the one given by `Minz_Request::connectionRemoteAddress()`.
   * @return bool true if the sender’s IP is in one of the ranges defined in the configuration, else false
   */
  public static function checkTrustedIP(): bool {
    if (!FreshRSS_Context::hasSystemConf()) {
      return false;
    }

    $clientIp = Minz_Request::connectionRemoteAddress();
    if ($clientIp === '') {
      return false;
    }

    $ranges = getenv('TRUSTED_PROXY');
    if ($ranges != 0 && is_string($ranges)) {
      $ranges = preg_split('/\s+/', $ranges, -1, PREG_SPLIT_NO_EMPTY);
    }
    $hasEnvRanges = is_array($ranges) && !empty($ranges);
    if (!$hasEnvRanges) {
      // Nothing usable in the environment: fall back to the configuration
      $ranges = FreshRSS_Context::systemConf()->trusted_sources;
    }

    foreach ($ranges as $range) {
      if (self::checkCIDR($clientIp, $range)) {
        return true;
      }
    }
    return false;
  }
  /**
   * Get the user name provided by HTTP authentication, read from `$_SERVER`.
   *
   * `REMOTE_USER` then `REDIRECT_REMOTE_USER` are used first. `HTTP_REMOTE_USER` then `HTTP_X_WEBAUTH_USER`
   * are only used when `$onlyTrusted` is false or when `checkTrustedIP()` accepts the client.
   * When these entries hold more than one distinct value, a warning is logged and nothing is returned.
   *
   * @param bool $onlyTrusted whether the `HTTP_*` entries require a trusted client IP
   * @return string the user name, or an empty string if there is none
   */
  public static function httpAuthUser(bool $onlyTrusted = true): string {
    $authKeys = ['REMOTE_USER' => '', 'REDIRECT_REMOTE_USER' => '', 'HTTP_REMOTE_USER' => '', 'HTTP_X_WEBAUTH_USER' => ''];
    $distinctValues = array_unique(array_intersect_key($_SERVER, $authKeys));
    if (count($distinctValues) > 1) {
      Minz_Log::warning('Multiple HTTP authentication headers!');
      return '';
    }

    // Set by the Web server itself
    foreach (['REMOTE_USER', 'REDIRECT_REMOTE_USER'] as $key) {
      if (!empty($_SERVER[$key]) && is_string($_SERVER[$key])) {
        return $_SERVER[$key];
      }
    }

    if ($onlyTrusted && !self::checkTrustedIP()) {
      return '';
    }

    // Coming from request headers
    foreach (['HTTP_REMOTE_USER', 'HTTP_X_WEBAUTH_USER'] as $key) {
      if (!empty($_SERVER[$key]) && is_string($_SERVER[$key])) {
        return $_SERVER[$key];
      }
    }
    return '';
  }
  465. }