httpUtil.php 28 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781
  1. <?php
  2. declare(strict_types=1);
  3. final class FreshRSS_http_Util {
  /** Directory in which the Retry-After marker files (`*.txt`) are created, looked up and cleaned. */
  4. private const RETRY_AFTER_PATH = DATA_PATH . '/Retry-After/';
  /**
   * CIDR ranges (IPv4 and IPv6) that getCurlResolveInfo() rejects for resolved IPs,
   * unless the IP or host was explicitly allowlisted beforehand.
   */
  5. private const PRIVATE_SUBNETS = [
  6. '127.0.0.0/8', // RFC1700 (Loopback)
  7. '10.0.0.0/8', // RFC1918
  8. '192.168.0.0/16', // RFC1918
  9. '172.16.0.0/12', // RFC1918
  10. '169.254.0.0/16', // RFC3927
  11. '0.0.0.0/8', // RFC5735
  12. '240.0.0.0/4', // RFC1112
  13. '::1/128', // Loopback
  14. 'fc00::/7', // Unique Local Address
  15. 'fe80::/10', // Link Local Address
  16. '::ffff:0:0/96', // IPv4 translations
  17. '::/128', // Unspecified address
  18. ];
  // In-memory cache filled by getCurlResolveInfo(): host name => every IP returned by its DNS lookup (allowed or not)
  19. /** @var array<string, string[]> $resolve_ok */
  20. private static array $resolve_ok = [];
  /**
   * Build the path of the marker file that stores the Retry-After deadline of a URL.
   *
   * The file name is made of the URL-encoded `host[:port]`, followed by a SHA-256 of the full URL
   * when `Minz_Request::serverIsPublic()` does not report the host as public, followed by the
   * URL-encoded proxy when one is given.
   *
   * @param string $url the URL being fetched
   * @param string $proxy the proxy in use, or an empty string
   * @return string the marker file path, or an empty string when the URL has no host
   */
  private static function getRetryAfterFile(string $url, string $proxy): string {
      $host = parse_url($url, PHP_URL_HOST);
      if (!is_string($host) || $host === '') {
          return '';
      }

      $isPublicHost = Minz_Request::serverIsPublic($host);
      $port = parse_url($url, PHP_URL_PORT);
      $authority = is_int($port) ? $host . ':' . $port : $host;

      $fileName = urlencode($authority);
      if (!$isPublicHost) {
          // Keep one marker per URL rather than one per host
          $fileName .= '_' . hash('sha256', $url);
      }
      if (!empty($proxy)) {
          $fileName .= '_' . urlencode($proxy);
      }
      return self::RETRY_AFTER_PATH . $fileName . '.txt';
  }
  /**
   * Delete the Retry-After marker files whose deadline (stored as file modification time) has passed.
   * Files whose modification time cannot be read are deleted as well.
   */
  private static function cleanRetryAfters(): void {
      $markers = is_dir(self::RETRY_AFTER_PATH)
          ? glob(self::RETRY_AFTER_PATH . '*.txt', GLOB_NOSORT)
          : false;

      foreach ($markers ?: [] as $marker) {
          $isExpired = @filemtime($marker) < time();
          if ($isExpired) {
              @unlink($marker);
          }
      }
  }
  /**
   * Tell whether requests to the URL are currently blocked by a stored Retry-After deadline.
   * An expired marker file is removed on the way.
   *
   * @param string $url the URL about to be fetched
   * @param string $proxy the proxy in use, or an empty string
   * @return int The timestamp of when the Retry-After expires, or 0 if not set.
   */
  public static function getRetryAfter(string $url, string $proxy): int {
      // Housekeeping: purge stale marker files on a random fraction of the calls
      if (rand(0, 30) === 1) {
          self::cleanRetryAfters();
      }

      $markerFile = self::getRetryAfterFile($url, $proxy);
      $deadline = $markerFile === '' ? 0 : (@filemtime($markerFile) ?: 0);
      if ($deadline <= 0) {
          return 0;
      }
      if ($deadline >= time()) {
          return $deadline;
      }

      // The deadline is over: the marker is not needed anymore
      @unlink($markerFile);
      return 0;
  }
  /**
   * Remember the HTTP Retry-After header value of an HTTP `429 Too Many Requests` or `503 Service Unavailable` response.
   * The deadline is saved as the modification time of a marker file.
   *
   * @param string $url the URL that was fetched
   * @param string $proxy the proxy in use, or an empty string
   * @param string $retryAfter the raw header value: a number of seconds, or a date;
   * anything else falls back to `limits['retry_after_default']` (at least 600 seconds)
   * @return int the stored deadline as a timestamp, capped to now + `limits['retry_after_max']` (at least 3600 seconds),
   * or 0 when nothing could be stored
   */
  public static function setRetryAfter(string $url, string $proxy, string $retryAfter): int {
      $markerFile = self::getRetryAfterFile($url, $proxy);
      if ($markerFile === '') {
          return 0;
      }

      $limits = FreshRSS_Context::systemConf()->limits;
      if (ctype_digit($retryAfter)) {
          // Delay expressed in seconds
          $deadline = time() + (int)$retryAfter;
      } else {
          $deadline = \SimplePie\Misc::parse_date($retryAfter);
          if (!$deadline) {
              $deadline = time() + max(600, $limits['retry_after_default'] ?? 0);
          }
      }
      $latestAllowed = time() + max(3600, $limits['retry_after_max'] ?? 0);
      $deadline = min($deadline, $latestAllowed);

      @mkdir(self::RETRY_AFTER_PATH);
      if (touch($markerFile, $deadline)) {
          return $deadline;
      }
      Minz_Log::error('Failed to set Retry-After for ' . $url);
      return 0;
  }
  /**
   * Keep only the user-defined cURL options that are considered safe, dropping every other key.
   * The value of `CURLOPT_COOKIEFILE` is always replaced by an empty string.
   *
   * @param array<mixed> $curl_params cURL options indexed by their `CURLOPT_*` constant
   * @return array<mixed> the allowed options, in their original order
   */
  public static function sanitizeCurlParams(array $curl_params): array {
      $allowedOptions = array_flip([
          CURLOPT_COOKIE,
          CURLOPT_COOKIEFILE,
          CURLOPT_FOLLOWLOCATION, // We filter this value later, only allowing `false`
          CURLOPT_HTTPHEADER,
          CURLOPT_MAXREDIRS,
          CURLOPT_POST,
          CURLOPT_POSTFIELDS,
          CURLOPT_PROXY,
          CURLOPT_PROXYTYPE,
          CURLOPT_USERAGENT,
      ]);

      $sanitized = array_intersect_key($curl_params, $allowedOptions);
      if (array_key_exists(CURLOPT_COOKIEFILE, $sanitized)) {
          // Allow only an empty value just to enable the libcurl cookie engine
          $sanitized[CURLOPT_COOKIEFILE] = '';
      }
      return $sanitized;
  }
  /**
   * Replace an internationalised host name of a URL by its ASCII (Punycode) form.
   * The URL is returned unchanged when `idn_to_ascii()` is unavailable, when the URL has no host,
   * or when the conversion fails.
   *
   * @param string $url the URL to convert
   * @return string the URL with the first occurrence of its host converted
   */
  private static function idn_to_puny(string $url): string {
      if (!function_exists('idn_to_ascii')) {
          return $url;
      }
      $host = parse_url($url, PHP_URL_HOST);
      if (!is_string($host) || $host === '') {
          return $url;
      }

      $asciiHost = idn_to_ascii($host);
      $hostOffset = strpos($url, $host);
      if (!$asciiHost || $hostOffset === false) {
          return $url;
      }
      return substr_replace($url, $asciiHost, $hostOffset, strlen($host));
  }
  /**
   * Normalise and validate a URL.
   *
   * @param string $url the URL to check; surrounding whitespace is ignored
   * @param bool $fixScheme when true, prefix `https://` to a URL that does not start with `http://` or `https://`
   * @return string|false the normalised URL (host converted to Punycode when possible), an empty string
   * for an empty input, or false when the URL is not valid
   */
  public static function checkUrl(string $url, bool $fixScheme = true): string|false {
      $candidate = trim($url);
      if ($candidate === '') {
          return '';
      }

      if ($fixScheme) {
          $hasHttpScheme = preg_match('#^https?://#i', $candidate) === 1;
          if (!$hasHttpScheme) {
              $candidate = 'https://' . ltrim($candidate, '/');
          }
      }

      $candidate = self::idn_to_puny($candidate); // https://bugs.php.net/bug.php?id=53474
      // Validate a copy without underscores, which FILTER_VALIDATE_URL would refuse (PHP discussion #64948)
      $relaxed = str_replace('_', 'z', $candidate);
      $isValid = filter_var($relaxed, FILTER_VALIDATE_URL) !== false;
      return $isValid ? $candidate : false;
  }
  /**
   * Drop the first `<meta>` tag carrying charset information from an HTML document, e.g.:
   * `<meta charset="..." />`
   * `<meta http-equiv="Content-Type" content="text/html; charset=...">`
   *
   * @param string $html the HTML document
   * @return string the HTML without that tag, or an empty string if the regular expression fails
   */
  private static function stripHtmlMetaCharset(string $html): string {
      $withoutMeta = preg_replace('/<meta\s[^>]*charset\s*=\s*[^>]+>/i', '', $html, 1);
      return is_string($withoutMeta) ? $withoutMeta : '';
  }
  /**
   * Make the charset announced by HTTP win over whatever the HTML document declares,
   * by converting the document to UTF-8 when needed and marking it with a UTF-8 byte order mark.
   * The document is returned untouched when no charset is known, when the charset is the HTTP default,
   * or when the document already starts with a BOM or an XML declaration with an encoding.
   *
   * @param string $html the raw downloaded HTML content
   * @param string $contentType an HTTP Content-Type such as 'text/html; charset=utf-8'
   * @return string an HTML string with encoding information for DOMDocument::loadHTML()
   */
  private static function enforceHttpEncoding(string $html, string $contentType = ''): string {
      $charset = '';
      if (preg_match('/\bcharset=([0-9a-z_-]{2,12})$/i', $contentType, $found) === 1) {
          $charset = $found[1];
      }

      if ($charset === '') {
          // HTTP did not announce a charset: only act on a UTF-8 declaration,
          // which might be too deep in the HTML for DOMDocument to notice
          $declaresUtf8 = preg_match('/<meta\s[^>]*charset\s*=[\s\'"]*UTF-?8\b/i', substr($html, 0, 2048));
          if (!$declaresUtf8) {
              return $html;
          }
          $charset = 'UTF-8';
      }

      $encoding = \SimplePie\Misc::encoding($charset);
      if (in_array($encoding, ['windows-1252', 'US-ASCII'], true)) {
          // Default charset for HTTP, nothing to enforce
          return $html;
      }

      $byteOrderMarks = [
          "\xEF\xBB\xBF", // UTF-8
          "\xFF\xFE", // UTF-16 Little Endian
          "\xFE\xFF", // UTF-16 Big Endian
          "\xFF\xFE\x00\x00", // UTF-32 Little Endian
          "\x00\x00\xFE\xFF", // UTF-32 Big Endian
      ];
      foreach ($byteOrderMarks as $bom) {
          if (str_starts_with($html, $bom)) {
              // The document already carries its own byte order mark
              return $html;
          }
      }

      if (preg_match('/^<[?]xml[^>]+encoding\b/', substr($html, 0, 64))) {
          // The document already carries an XML declaration with an encoding
          return $html;
      }

      if ($encoding !== 'UTF-8') {
          // Try to change encoding to UTF-8 using mbstring or iconv or intl
          $converted = \SimplePie\Misc::change_encoding($html, $encoding, 'UTF-8');
          if (is_string($converted)) {
              $html = self::stripHtmlMetaCharset($converted);
              $encoding = 'UTF-8';
          }
      }

      // Save the encoding information as Unicode BOM, or give up when the conversion was not possible
      return $encoding === 'UTF-8' ? "\xEF\xBB\xBF" . $html : $html;
  }
  /**
   * Set an HTML base URL to the HTML content if there is none.
   * The document is parsed and re-serialised by DOMDocument; a `<base href>` element is inserted
   * as first child of `<head>` (or of the root element when there is no `<head>`).
   *
   * @param string $html the raw downloaded HTML content
   * @param string $href the HTML base URL
   * @return string an HTML string, or an empty string when the input is empty or has no root element
   */
  private static function enforceHtmlBase(string $html, string $href): string {
      if ($html === '') {
          // DOMDocument::loadHTML() throws a ValueError on an empty source (PHP 8.0+)
          return '';
      }
      $doc = new DOMDocument();
      $doc->loadHTML($html, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING);
      if ($doc->documentElement === null) {
          return '';
      }
      $xpath = new DOMXPath($doc);
      $bases = $xpath->evaluate('//base');
      if (!($bases instanceof DOMNodeList) || $bases->length === 0) {
          $base = $doc->createElement('base');
          if ($base === false) {
              return $html;
          }
          $base->setAttribute('href', $href);
          $head = null;
          $heads = $xpath->evaluate('//head');
          if ($heads instanceof DOMNodeList && $heads->length > 0) {
              $head = $heads->item(0);
          }
          if ($head instanceof DOMElement) {
              $head->insertBefore($base, $head->firstChild);
          } else {
              $doc->documentElement->insertBefore($base, $doc->documentElement->firstChild);
          }
      }
      // Save the start of HTML because libxml2 saveHTML() risks scrambling it
      $htmlPos = stripos($html, '<html');
      $htmlStart = $htmlPos === false || $htmlPos > 512 ? '' : substr($html, 0, $htmlPos);
      $html = $doc->saveHTML() ?: $html;
      if ($htmlStart !== '' && !str_starts_with($html, $htmlStart)) {
          // libxml2 saveHTML() risks removing Unicode BOM and XML declaration,
          // which affects future detection of charset encoding, so manually restore it
          $htmlPos = stripos($html, '<html');
          $html = $htmlPos === false || $htmlPos > 512 ? $html : $htmlStart . substr($html, $htmlPos);
      }
      return $html;
  }
  /**
   * Tell whether two URLs share the same origin, i.e. the same scheme, host and port (case-insensitive).
   * A missing port defaults to 80 for `http`, 443 for `https`, and 0 otherwise.
   * Missing scheme or host are compared as empty strings.
   *
   * @param string $url1 first URL
   * @param string $url2 second URL
   * @return bool true if both origins are identical, false otherwise or when a URL cannot be parsed
   */
  public static function compareURLOrigins(string $url1, string $url2): bool {
      $origins = [];
      foreach ([$url1, $url2] as $rawUrl) {
          $parts = parse_url(strtolower($rawUrl));
          if ($parts === false) {
              return false;
          }
          // Read the scheme defensively: parse_url() omits the key for scheme-less URLs
          $scheme = $parts['scheme'] ?? '';
          $origins[] = [
              $scheme,
              $parts['host'] ?? '',
              $parts['port'] ?? match ($scheme) {
                  'http' => 80,
                  'https' => 443,
                  default => 0,
              },
          ];
      }
      return $origins[0] === $origins[1];
  }
  /**
   * Resolve the host of a URL and keep only the IP addresses that may be contacted (SSRF protection).
   *
   * An IP is accepted when it matches a CIDR entry of the internal host allowlist, when `ip:port` or
   * `host:port` is listed in that allowlist, or when it is neither in a private/reserved range
   * nor in one of `PRIVATE_SUBNETS`.
   * The allowlist comes from the `INTERNAL_HOST_ALLOWLIST` environment variable (whitespace-separated),
   * falling back to the `internal_host_allowlist` system configuration. An entry `*` disables all checks.
   *
   * Returns a value for CURLOPT_RESOLVE as an array, null if no allowed IPs were found, false if the domain failed to resolve.
   * The array contains a single `host:port:ip1,ip2` entry, or is empty when no override is needed
   * (checks disabled, or the URL host is itself an allowed IP address).
   *
   * @param string $url the URL about to be fetched
   * @return array<string>|null|false
   */
  public static function getCurlResolveInfo(string $url): array|null|false {
      $url = strtolower($url);
      $parsed = parse_url($url);
      if ($parsed === false) {
          return false;
      }
      $host = $parsed['host'] ?? null;
      $scheme = $parsed['scheme'] ?? null;
      if ($host === null || $scheme === null) {
          return false;
      }
      if (str_starts_with($host, '[') && str_ends_with($host, ']')) {
          // IPv6 literal: remove the surrounding brackets
          if (strlen($host) === 2) {
              return false;
          }
          $host = substr($host, 1, -1);
      }

      $internal_host_allowlist = getenv('INTERNAL_HOST_ALLOWLIST');
      if (is_string($internal_host_allowlist) && $internal_host_allowlist !== '') {
          $internal_host_allowlist = preg_split('/\s+/', $internal_host_allowlist, -1, PREG_SPLIT_NO_EMPTY);
      }
      if (!is_array($internal_host_allowlist) || empty($internal_host_allowlist)) {
          $internal_host_allowlist = FreshRSS_Context::systemConf()->internal_host_allowlist;
      }
      if (in_array('*', $internal_host_allowlist, true)) {
          return []; // Disables SSRF checks entirely (unsafe)
      }

      // Reuse the components parsed above instead of parsing the URL a second time
      $port = $parsed['port'] ?? match ($scheme) {
          'http' => 80,
          'https' => 443,
          default => 0,
      };
      $resolve_str = "$host:$port:";
      $ips_ok = [];
      $ips = [];
      $records = [];
      if (filter_var($host, FILTER_VALIDATE_IP) !== false) {
          $ips[] = $host;
      } elseif (isset(self::$resolve_ok[$host])) {
          $ips = self::$resolve_ok[$host];
      } else {
          $records = @dns_get_record($host, DNS_A + DNS_AAAA);
          if ($records === false) {
              return false;
          }
          foreach ($records as $record) {
              // A record holds either an `ip` (A) or an `ipv6` (AAAA) key; tolerate records with neither
              $ip = $record['ip'] ?? $record['ipv6'] ?? null;
              if (is_string($ip)) {
                  $ips[] = $ip;
              }
          }
          self::$resolve_ok[$host] = $ips;
      }

      // Only string entries can be CIDR ranges; skipping others avoids a TypeError under strict_types
      $cidr_allowlist = array_filter(
          $internal_host_allowlist,
          static fn (mixed $entry): bool => is_string($entry) && str_contains($entry, '/')
      );
      foreach ($ips as $ip) {
          $allowlist_str = "$ip:$port";
          $add_ip = $ip;
          if (filter_var($ip, FILTER_VALIDATE_IP, FILTER_FLAG_IPV6) !== false) {
              $allowlist_str = "[$ip]:$port";
              $add_ip = "[$ip]";
          }
          foreach ($cidr_allowlist as $cidr) {
              if (self::checkCIDR($ip, $cidr)) {
                  $ips_ok[] = $add_ip;
                  continue 2;
              }
          }
          if (in_array($allowlist_str, $internal_host_allowlist, true) ||
              in_array("$host:$port", $internal_host_allowlist, true)) {
              $ips_ok[] = $add_ip;
              continue;
          }
          if (filter_var($ip, FILTER_VALIDATE_IP, FILTER_FLAG_NO_PRIV_RANGE | FILTER_FLAG_NO_RES_RANGE) === false) {
              continue;
          }
          // Extra check because the above one might not be enough: https://github.com/php/php-src/issues/16944
          // Workaround is available by using `FILTER_FLAG_GLOBAL_RANGE` instead, but that was only added in PHP 8.2, and we need to support PHP 8.1+
          foreach (self::PRIVATE_SUBNETS as $cidr) {
              if (self::checkCIDR($ip, $cidr)) {
                  continue 2;
              }
          }
          $ips_ok[] = $add_ip;
      }

      if (count($ips_ok) > 0) {
          if (count($records) > 0 || isset(self::$resolve_ok[$host])) {
              $resolve_str .= implode(',', $ips_ok);
              return [$resolve_str];
          }
          if (filter_var($host, FILTER_VALIDATE_IP) !== false) {
              // No resolve overrides since the URL only contained an IP, not a domain
              return [];
          }
      }
      if (count($ips) === 0) {
          return false;
      }
      return null;
  }
  /**
   * Fetch a URL with cURL, with an optional file cache, Retry-After handling, SSRF protection
   * (pinned DNS resolution when no proxy is used) and manually followed HTTP redirections.
   * Credentials (cookies, `Authorization`) are dropped when a redirection leaves the origin,
   * and a POST becomes a GET after a 301, 302 or 303 redirection.
   * The cache file, when given, is rewritten with the resulting body after each actual fetch.
   *
   * @param non-empty-string $url
   * @param string|null $cachePath path to cache file, or `null` to disable caching
   * @param string $type {html,ico,json,opml,xml}
   * @param array<string,mixed> $attributes May contain user-defined cURL options in `$attributes['curl_params']`,
   * as well as `timeout` and `ssl_verify`
   * @param array<int,mixed> $curl_options Internal overrides of cURL options
   * @return array{body:string,effective_url:string,redirect_count:int,fail:bool,status:int,error:string}
   * `status` is the HTTP response code (e.g. 200, 404), or a custom negative value:
   * * `-200` served from local cache;
   * * `-429` blocked by active `Retry-After` period;
   * * `-500` `curl_init()` failure, host not allowed, or host that could not be resolved.
   */
  public static function httpGet(string $url, ?string $cachePath = null, string $type = 'html', array $attributes = [], array $curl_options = []): array {
      $limits = FreshRSS_Context::systemConf()->limits;
      $feed_timeout = empty($attributes['timeout']) || !is_numeric($attributes['timeout']) ? 0 : intval($attributes['timeout']);

      if ($cachePath !== null) {
          $cacheMtime = @filemtime($cachePath);
          if ($cacheMtime !== false && $cacheMtime > time() - intval($limits['cache_duration'])) {
              $body = @file_get_contents($cachePath);
              if ($body != false) {
                  syslog(LOG_DEBUG, 'FreshRSS uses cache for ' . \SimplePie\Misc::url_remove_credentials($url));
                  return ['body' => $body, 'effective_url' => $url, 'redirect_count' => 0, 'fail' => false, 'status' => -200, 'error' => ''];
              }
          }
      }

      if (rand(0, 30) === 1) { // Remove old cache once in a while
          cleanCache(CLEANCACHE_HOURS);
      }

      // Decided before processing the user-defined headers, which may need this value
      $accept = match ($type) {
          'json' => 'application/json,application/feed+json,application/javascript;q=0.9,text/javascript;q=0.8,*/*;q=0.7',
          'opml' => 'text/x-opml,text/xml;q=0.9,application/xml;q=0.9,*/*;q=0.8',
          'xml' => 'application/xml,application/xhtml+xml,text/xml;q=0.9,*/*;q=0.8',
          'ico' => 'image/x-icon,image/vnd.microsoft.icon,image/ico,image/png,image/svg+xml,image/*;q=0.8,*/*;q=0.1',
          default => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      };

      $proxy = is_string(FreshRSS_Context::systemConf()->curl_options[CURLOPT_PROXY] ?? null) ? FreshRSS_Context::systemConf()->curl_options[CURLOPT_PROXY] : '';
      $options = []; // User-defined cURL options
      if (is_array($attributes['curl_params'] ?? null)) {
          $options = self::sanitizeCurlParams($attributes['curl_params']);
          $proxy = is_string($options[CURLOPT_PROXY] ?? null) ? $options[CURLOPT_PROXY] : $proxy;
          if (is_array($options[CURLOPT_HTTPHEADER] ?? null)) {
              // Remove headers problematic for security
              $options[CURLOPT_HTTPHEADER] = array_filter($options[CURLOPT_HTTPHEADER],
                  fn($header) => is_string($header) && !preg_match('/^(Remote-User|X-WebAuth-User)\\s*:/i', $header));
              // Add Accept header if it is not set.
              // preg_grep() returns an empty array (not false) when nothing matches
              if (empty(preg_grep('/^Accept\\s*:/i', $options[CURLOPT_HTTPHEADER]))) {
                  $options[CURLOPT_HTTPHEADER][] = 'Accept: ' . $accept;
              }
          }
      }
      $proxy = is_string($curl_options[CURLOPT_PROXY] ?? null) ? $curl_options[CURLOPT_PROXY] : $proxy;

      if (($retryAfter = self::getRetryAfter($url, $proxy)) > 0) {
          Minz_Log::warning('For that domain, will first retry after ' . date('c', $retryAfter) . '. ' . \SimplePie\Misc::url_remove_credentials($url));
          return ['body' => '', 'effective_url' => $url, 'redirect_count' => 0, 'fail' => true, 'status' => -429, 'error' => ''];
      }

      if (FreshRSS_Context::systemConf()->simplepie_syslog_enabled) {
          syslog(LOG_INFO, 'FreshRSS GET ' . $type . ' ' . \SimplePie\Misc::url_remove_credentials($url));
      }

      $original_url = $url;
      $fail = false;
      $redirs = 0;
      $max_redirs = $curl_options[CURLOPT_MAXREDIRS] ?? $options[CURLOPT_MAXREDIRS] ?? FreshRSS_Context::systemConf()->curl_options[CURLOPT_MAXREDIRS] ?? null;
      if (!is_int($max_redirs)) {
          $max_redirs = 4;
      }

      // One iteration per HTTP request: the loop is only repeated to follow a redirection
      while (true) {
          $url = is_string($url) ? $url : '';
          $resolve = [];
          if ($proxy === '') {
              $resolve = self::getCurlResolveInfo($url);
              if ($resolve === null) {
                  Minz_Log::warning('Fetching this URL is not allowed, because the host’s IP is not in the allowlist [' .
                      \SimplePie\Misc::url_remove_credentials($url) . ']');
                  return ['body' => '', 'effective_url' => '', 'redirect_count' => 0, 'fail' => true, 'status' => -500, 'error' => ''];
              } elseif ($resolve === false) {
                  return ['body' => '', 'effective_url' => '', 'redirect_count' => 0, 'fail' => true, 'status' => -500, 'error' => ''];
              }
              if (!empty($resolve)) {
                  $curl_options[CURLOPT_RESOLVE] = $resolve; // Prevent DNS rebinding
              }
          }

          // TODO: Implement HTTP 1.1 conditional GET If-Modified-Since
          $ch = curl_init();
          if ($ch === false || $url === '') {
              return ['body' => '', 'effective_url' => '', 'redirect_count' => 0, 'fail' => true, 'status' => -500, 'error' => ''];
          }
          // Options are applied from the least to the most authoritative:
          // defaults, user-defined, system configuration, then internal overrides
          curl_setopt_array($ch, [
              CURLOPT_URL => $url,
              CURLOPT_HTTPHEADER => ['Accept: ' . $accept],
              CURLOPT_USERAGENT => FRESHRSS_USERAGENT,
              CURLOPT_CONNECTTIMEOUT => $feed_timeout > 0 ? $feed_timeout : $limits['timeout'],
              CURLOPT_TIMEOUT => $feed_timeout > 0 ? $feed_timeout : $limits['timeout'],
              CURLOPT_RETURNTRANSFER => true,
              CURLOPT_ACCEPT_ENCODING => '', //Enable all encodings
              //CURLOPT_VERBOSE => 1, // To debug sent HTTP headers
          ]);
          curl_setopt_array($ch, $options);
          curl_setopt_array($ch, FreshRSS_Context::systemConf()->curl_options);

          $responseHeaders = '';
          curl_setopt($ch, CURLOPT_HEADERFUNCTION, function (\CurlHandle $ch, string $header) use (&$responseHeaders) {
              if (trim($header) !== '') { // Skip e.g. separation with trailer headers
                  $responseHeaders .= $header;
              }
              return strlen($header);
          });

          if (isset($attributes['ssl_verify'])) {
              curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, empty($attributes['ssl_verify']) ? 0 : 2);
              curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, (bool)$attributes['ssl_verify']);
              if (empty($attributes['ssl_verify'])) {
                  curl_setopt($ch, CURLOPT_SSL_CIPHER_LIST, 'DEFAULT@SECLEVEL=1');
              }
          }

          // Restrict the protocols to HTTP and HTTPS
          if (defined('CURLOPT_PROTOCOLS_STR') && is_int(CURLOPT_PROTOCOLS_STR)) {
              $curl_options[CURLOPT_PROTOCOLS_STR] = 'http,https';
              if (defined('CURLOPT_REDIR_PROTOCOLS_STR') && is_int(CURLOPT_REDIR_PROTOCOLS_STR)) {
                  $curl_options[CURLOPT_REDIR_PROTOCOLS_STR] = 'http,https';
              }
          } elseif (defined('CURLPROTO_HTTP') && defined('CURLPROTO_HTTPS')) {
              // Legacy PHP 8.2-
              if (defined('CURLOPT_PROTOCOLS')) {
                  $curl_options[CURLOPT_PROTOCOLS] = CURLPROTO_HTTP | CURLPROTO_HTTPS;
              }
              if (defined('CURLOPT_REDIR_PROTOCOLS')) {
                  $curl_options[CURLOPT_REDIR_PROTOCOLS] = CURLPROTO_HTTP | CURLPROTO_HTTPS;
              }
          }

          curl_setopt_array($ch, $curl_options);
          curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false); // We handle HTTP redirections manually for security

          $body = curl_exec($ch);
          $c_status = curl_getinfo($ch, CURLINFO_HTTP_CODE);
          $c_content_type = '' . curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
          $c_effective_url = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
          $c_error = curl_error($ch);

          $headers = [];
          if ($body !== false) {
              $responseHeaders .= "\r\n";
              $responseHeaders = \SimplePie\HTTP\Parser::prepareHeaders($responseHeaders);
              $parser = new \SimplePie\HTTP\Parser($responseHeaders);
              if ($parser->parse()) {
                  $headers = $parser->headers;
              }
          }

          if (in_array($c_status, [301, 302, 303, 307, 308], true)) {
              // Handle the redirect by making another request
              $location = \SimplePie\Misc::absolutize_url($headers['location'] ?? $url, $url);
              if ($location === false) {
                  $location = $url;
              }
              if (!self::compareURLOrigins($url, $location)) {
                  // Leaving the origin: do not leak credentials to the new one
                  unset($curl_options[CURLOPT_COOKIE]);
                  unset($curl_options[CURLOPT_USERPWD]);
                  unset($options[CURLOPT_COOKIE]);
                  unset($options[CURLOPT_USERPWD]);
                  if (is_array($options[CURLOPT_HTTPHEADER] ?? null)) {
                      $options[CURLOPT_HTTPHEADER] = array_filter($options[CURLOPT_HTTPHEADER], fn(mixed $header): bool =>
                          is_string($header) && !preg_match('/^(Cookie|Authorization)\\s*:/i', $header));
                  }
                  if (is_array($curl_options[CURLOPT_HTTPHEADER] ?? null)) {
                      $curl_options[CURLOPT_HTTPHEADER] = array_filter($curl_options[CURLOPT_HTTPHEADER], fn(mixed $header): bool =>
                          is_string($header) && !preg_match('/^(Cookie|Authorization)\\s*:/i', $header));
                  }
              }
              if ($max_redirs >= 0) {
                  $redirs++;
              }
              if ($redirs > $max_redirs) {
                  Minz_Log::warning('Error fetching content: Too many redirects were hit [' . \SimplePie\Misc::url_remove_credentials($original_url) . ']');
                  // Running out of redirections is a failure: do not pass the redirect response off as content
                  $fail = true;
                  $body = '';
                  break;
              }
              if ((isset($options[CURLOPT_POST]) || isset($curl_options[CURLOPT_POST])) &&
                  in_array($c_status, [301, 302, 303], true)) { // Not for 307 and 308, which must not change the HTTP method
                  unset($curl_options[CURLOPT_POST]);
                  unset($curl_options[CURLOPT_POSTFIELDS]);
                  unset($options[CURLOPT_POST]);
                  unset($options[CURLOPT_POSTFIELDS]);
                  if (is_array($options[CURLOPT_HTTPHEADER] ?? null)) {
                      $options[CURLOPT_HTTPHEADER] = array_filter($options[CURLOPT_HTTPHEADER], fn(mixed $header): bool =>
                          is_string($header) && !str_starts_with(strtolower(trim($header)), 'content-type:'));
                  }
                  if (is_array($curl_options[CURLOPT_HTTPHEADER] ?? null)) {
                      $curl_options[CURLOPT_HTTPHEADER] = array_filter($curl_options[CURLOPT_HTTPHEADER], fn(mixed $header): bool =>
                          is_string($header) && !str_starts_with(strtolower(trim($header)), 'content-type:'));
                  }
              }
              $url = $location;
              continue;
          }

          $fail = $c_status != 200 || $c_error != '' || $body === false;
          if ($fail) {
              $body = '';
              // Never write the credentials possibly embedded in the URL to the logs
              $failureMessage = 'Error fetching content: HTTP code ' . $c_status . ': ' . $c_error . ' ' .
                  \SimplePie\Misc::url_remove_credentials($url);
              if (in_array($c_status, [429, 503], true)) {
                  $retryAfter = self::setRetryAfter($url, $proxy, $headers['retry-after'] ?? '');
                  if ($retryAfter > 0) {
                      $failureMessage .= ' We may retry after ' . date('c', $retryAfter);
                  }
              }
              Minz_Log::warning($failureMessage);
          } elseif (!is_string($body) || strlen($body) === 0) { // TODO: Implement HTTP 410 Gone
              $body = '';
          } else {
              if (in_array($type, ['html', 'json', 'opml', 'xml'], true)) {
                  $body = trim($body, " \n\r\t\v"); // Do not trim \x00 to avoid breaking a BOM
              }
              if (in_array($type, ['html', 'xml', 'opml'], true)) {
                  $body = self::enforceHttpEncoding($body, $c_content_type);
              }
              if (in_array($type, ['html'], true)) {
                  if (stripos($c_content_type, 'text/plain') !== false) {
                      // Plain text to be displayed as preformatted text. Prefixed with UTF-8 BOM
                      $body = "\xEF\xBB\xBF" . '<pre class="text-plain">' . htmlspecialchars($body, ENT_NOQUOTES, 'UTF-8') . '</pre>';
                  } else {
                      $body = self::enforceHtmlBase($body, $c_effective_url);
                  }
              }
          }
          break;
      }

      if ($cachePath !== null && file_put_contents($cachePath, $body) === false) {
          Minz_Log::warning("Error saving cache $cachePath for " . \SimplePie\Misc::url_remove_credentials($url));
      }

      return ['body' => is_string($body) ? $body : '', 'effective_url' => $c_effective_url, 'redirect_count' => $redirs,
          'fail' => $fail, 'status' => $c_status, 'error' => $c_error];
  }
  /**
   * Converts an IP (v4 or v6) to a string of `0` and `1` characters, using inet_pton:
   * 32 characters for IPv4, 128 characters for IPv6.
   *
   * @param string $ip the IP to convert
   * @return string a binary representation of the specified IP, or an empty string if the IP is invalid
   */
  private static function ipToBits(string $ip): string {
      $packed = @inet_pton($ip);
      if ($packed === false || $packed === '') {
          // Return explicitly: before PHP 8.2, str_split('') yields [''] and would produce '00000000'
          return '';
      }
      $bits = '';
      foreach (str_split($packed) as $byte) {
          $bits .= sprintf('%08b', ord($byte));
      }
      return $bits;
  }
  /**
   * Tell whether an IP address lies inside a network given in CIDR notation.
   * An IPv4 address never matches an IPv6 range, and vice versa.
   *
   * @param string $ip the IP that we want to verify (ex: 192.168.16.1)
   * @param string $range the range to check against (ex: 192.168.16.0/24)
   * @return bool true if the IP is in the range, otherwise false (including for an invalid IP or range)
   */
  private static function checkCIDR(string $ip, string $range): bool {
      $ipBits = self::ipToBits($ip);
      if ($ipBits === '') {
          return false;
      }

      $rangeParts = explode('/', $range);
      $network = $rangeParts[0] ?? '';
      if ($network == '') {
          return false;
      }
      $networkBits = self::ipToBits($network);
      if ($networkBits === '' || strlen($ipBits) !== strlen($networkBits)) {
          return false; // Invalid network, or mix of IPv4 and IPv6
      }

      $prefix = $rangeParts[1] ?? '';
      if (!ctype_digit($prefix)) {
          return false;
      }
      $prefixLength = (int)$prefix;
      $longestPrefix = str_contains($ip, ':') ? 128 : 32;
      if ($prefixLength < 0 || $prefixLength > $longestPrefix) {
          return false; // Reject invalid mask bits lengths
      }

      // A prefix of zero matches everything; otherwise the leading bits must be identical
      return $prefixLength === 0 || strncmp($ipBits, $networkBits, $prefixLength) === 0;
  }
  /**
   * Check if the client (e.g. last proxy) is allowed to send unsafe headers.
   * The authorized ranges are read from the `TRUSTED_PROXY` environment variable (whitespace-separated CIDR ranges),
   * falling back to the `trusted_sources` configuration option when the variable is unset, empty, or equal to zero.
   * The connection IP is provided by `Minz_Request::connectionRemoteAddress()`.
   *
   * @return bool true if the sender’s IP is in one of the ranges defined in the configuration, else false
   * (also false when there is no system configuration or no connection IP)
   */
  public static function checkTrustedIP(): bool {
      if (!FreshRSS_Context::hasSystemConf()) {
          return false;
      }
      $clientIp = Minz_Request::connectionRemoteAddress();
      if ($clientIp === '') {
          return false;
      }

      $ranges = getenv('TRUSTED_PROXY');
      if (is_string($ranges) && $ranges != 0) {
          $ranges = preg_split('/\s+/', $ranges, -1, PREG_SPLIT_NO_EMPTY);
      }
      $hasEnvironmentRanges = is_array($ranges) && $ranges !== [];
      if (!$hasEnvironmentRanges) {
          $ranges = FreshRSS_Context::systemConf()->trusted_sources;
      }

      foreach ($ranges as $range) {
          if (self::checkCIDR($clientIp, $range)) {
              return true;
          }
      }
      return false;
  }
  /**
   * Get the user name provided by HTTP authentication, if any.
   * `REMOTE_USER` then `REDIRECT_REMOTE_USER` are always considered, whereas the
   * `Remote-User` and `X-WebAuth-User` request headers are only considered when allowed.
   * Nothing is returned when those sources provide more than one distinct value.
   *
   * @param bool $onlyTrusted when true, the request headers are only used if checkTrustedIP() accepts the client
   * @return string the user name, or an empty string if there is none
   */
  public static function httpAuthUser(bool $onlyTrusted = true): string {
      $distinctValues = [];
      foreach (['REMOTE_USER', 'REDIRECT_REMOTE_USER', 'HTTP_REMOTE_USER', 'HTTP_X_WEBAUTH_USER'] as $name) {
          $value = $_SERVER[$name] ?? null;
          if (is_string($value) && $value !== '') {
              $distinctValues[$value] = true;
          }
      }
      if (count($distinctValues) > 1) {
          Minz_Log::warning('Multiple HTTP authentication headers!');
          return '';
      }

      // Value of a server variable when it is a non-empty string (as per empty()), else an empty string
      $readUser = static function (string $name): string {
          $value = $_SERVER[$name] ?? '';
          return is_string($value) && !empty($value) ? $value : '';
      };

      foreach (['REMOTE_USER', 'REDIRECT_REMOTE_USER'] as $name) {
          $user = $readUser($name);
          if ($user !== '') {
              return $user;
          }
      }
      if (!$onlyTrusted || self::checkTrustedIP()) {
          foreach (['HTTP_REMOTE_USER', 'HTTP_X_WEBAUTH_USER'] as $name) {
              $user = $readUser($name);
              if ($user !== '') {
                  return $user;
              }
          }
      }
      return '';
  }
  724. }