|
@@ -507,12 +507,47 @@ function enforceHttpEncoding(string $html, string $contentType = ''): string {
|
|
|
return $html;
|
|
return $html;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+/**
|
|
|
|
|
+ * Set an HTML base URL to the HTML content if there is none.
|
|
|
|
|
+ * @param string $html the raw downloaded HTML content
|
|
|
|
|
+ * @param string $href the HTML base URL
|
|
|
|
|
+ * @return string an HTML string
|
|
|
|
|
+ */
|
|
|
|
|
+function enforceHtmlBase(string $html, string $href): string {
|
|
|
|
|
+ $doc = new DOMDocument();
|
|
|
|
|
+ $doc->loadHTML($html, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING);
|
|
|
|
|
+ if ($doc->documentElement === null) {
|
|
|
|
|
+ return '';
|
|
|
|
|
+ }
|
|
|
|
|
+ $xpath = new DOMXPath($doc);
|
|
|
|
|
+ $bases = $xpath->evaluate('//base');
|
|
|
|
|
+ if (!($bases instanceof DOMNodeList) || $bases->length === 0) {
|
|
|
|
|
+ $base = $doc->createElement('base');
|
|
|
|
|
+ if ($base === false) {
|
|
|
|
|
+ return $html;
|
|
|
|
|
+ }
|
|
|
|
|
+ $base->setAttribute('href', $href);
|
|
|
|
|
+ $head = null;
|
|
|
|
|
+ $heads = $xpath->evaluate('//head');
|
|
|
|
|
+ if ($heads instanceof DOMNodeList && $heads->length > 0) {
|
|
|
|
|
+ $head = $heads->item(0);
|
|
|
|
|
+ }
|
|
|
|
|
+ if ($head instanceof DOMElement) {
|
|
|
|
|
+ $head->insertBefore($base, $head->firstChild);
|
|
|
|
|
+ } else {
|
|
|
|
|
+ $doc->insertBefore($base, $doc->documentElement->firstChild);
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ return $doc->saveHTML() ?: $html;
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
/**
|
|
/**
|
|
|
* @param string $type {html,json,opml,xml}
|
|
* @param string $type {html,json,opml,xml}
|
|
|
* @param array<string,mixed> $attributes
|
|
* @param array<string,mixed> $attributes
|
|
|
* @param array<int,mixed> $curl_options
|
|
* @param array<int,mixed> $curl_options
|
|
|
|
|
+ * @return array{body:string,effective_url:string,redirect_count:int,fail:bool}
|
|
|
*/
|
|
*/
|
|
|
-function httpGet(string $url, string $cachePath, string $type = 'html', array $attributes = [], array $curl_options = []): string {
|
|
|
|
|
|
|
+function httpGet(string $url, string $cachePath, string $type = 'html', array $attributes = [], array $curl_options = []): array {
|
|
|
$limits = FreshRSS_Context::systemConf()->limits;
|
|
$limits = FreshRSS_Context::systemConf()->limits;
|
|
|
$feed_timeout = empty($attributes['timeout']) || !is_numeric($attributes['timeout']) ? 0 : intval($attributes['timeout']);
|
|
$feed_timeout = empty($attributes['timeout']) || !is_numeric($attributes['timeout']) ? 0 : intval($attributes['timeout']);
|
|
|
|
|
|
|
@@ -521,7 +556,7 @@ function httpGet(string $url, string $cachePath, string $type = 'html', array $a
|
|
|
$body = @file_get_contents($cachePath);
|
|
$body = @file_get_contents($cachePath);
|
|
|
if ($body != false) {
|
|
if ($body != false) {
|
|
|
syslog(LOG_DEBUG, 'FreshRSS uses cache for ' . \SimplePie\Misc::url_remove_credentials($url));
|
|
syslog(LOG_DEBUG, 'FreshRSS uses cache for ' . \SimplePie\Misc::url_remove_credentials($url));
|
|
|
- return $body;
|
|
|
|
|
|
|
+ return ['body' => $body, 'effective_url' => $url, 'redirect_count' => 0, 'fail' => false];
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
@@ -553,7 +588,7 @@ function httpGet(string $url, string $cachePath, string $type = 'html', array $a
|
|
|
// TODO: Implement HTTP 1.1 conditional GET If-Modified-Since
|
|
// TODO: Implement HTTP 1.1 conditional GET If-Modified-Since
|
|
|
$ch = curl_init();
|
|
$ch = curl_init();
|
|
|
if ($ch === false) {
|
|
if ($ch === false) {
|
|
|
- return '';
|
|
|
|
|
|
|
+ return ['body' => '', 'effective_url' => '', 'redirect_count' => 0, 'fail' => true];
|
|
|
}
|
|
}
|
|
|
curl_setopt_array($ch, [
|
|
curl_setopt_array($ch, [
|
|
|
CURLOPT_URL => $url,
|
|
CURLOPT_URL => $url,
|
|
@@ -598,10 +633,13 @@ function httpGet(string $url, string $cachePath, string $type = 'html', array $a
|
|
|
$body = curl_exec($ch);
|
|
$body = curl_exec($ch);
|
|
|
$c_status = curl_getinfo($ch, CURLINFO_HTTP_CODE);
|
|
$c_status = curl_getinfo($ch, CURLINFO_HTTP_CODE);
|
|
|
$c_content_type = '' . curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
|
|
$c_content_type = '' . curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
|
|
|
|
|
+ $c_effective_url = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
|
|
|
|
|
+ $c_redirect_count = curl_getinfo($ch, CURLINFO_REDIRECT_COUNT);
|
|
|
$c_error = curl_error($ch);
|
|
$c_error = curl_error($ch);
|
|
|
curl_close($ch);
|
|
curl_close($ch);
|
|
|
|
|
|
|
|
- if ($c_status != 200 || $c_error != '' || $body === false) {
|
|
|
|
|
|
|
+ $fail = $c_status != 200 || $c_error != '' || $body === false;
|
|
|
|
|
+ if ($fail) {
|
|
|
Minz_Log::warning('Error fetching content: HTTP code ' . $c_status . ': ' . $c_error . ' ' . $url);
|
|
Minz_Log::warning('Error fetching content: HTTP code ' . $c_status . ': ' . $c_error . ' ' . $url);
|
|
|
$body = '';
|
|
$body = '';
|
|
|
// TODO: Implement HTTP 410 Gone
|
|
// TODO: Implement HTTP 410 Gone
|
|
@@ -611,6 +649,7 @@ function httpGet(string $url, string $cachePath, string $type = 'html', array $a
|
|
|
$body = trim($body, " \n\r\t\v"); // Do not trim \x00 to avoid breaking a BOM
|
|
$body = trim($body, " \n\r\t\v"); // Do not trim \x00 to avoid breaking a BOM
|
|
|
if ($type !== 'json') {
|
|
if ($type !== 'json') {
|
|
|
$body = enforceHttpEncoding($body, $c_content_type);
|
|
$body = enforceHttpEncoding($body, $c_content_type);
|
|
|
|
|
+ $body = enforceHtmlBase($body, $c_effective_url);
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
@@ -618,7 +657,7 @@ function httpGet(string $url, string $cachePath, string $type = 'html', array $a
|
|
|
Minz_Log::warning("Error saving cache $cachePath for $url");
|
|
Minz_Log::warning("Error saving cache $cachePath for $url");
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- return $body;
|
|
|
|
|
|
|
+ return ['body' => $body, 'effective_url' => $c_effective_url, 'redirect_count' => $c_redirect_count, 'fail' => $fail];
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
/**
|