Przeglądaj źródła

XML+XPath (#5076)

* XML+XPath
#fix https://github.com/FreshRSS/FreshRSS/issues/5075
Implementation that takes an XML document as input and parses it with an XML parser (instead of the HTML parser used for HTML+XPath)

* Remove noise from another PR

* Better MIME for XML

* And add glob *.xml for cache cleaning

* Minor syntax

* Add glob json for clean cache
Alexandre Alapetite 3 lat temu
rodzic
commit
05ae1b0d26

+ 10 - 4
app/Controllers/feedController.php

@@ -81,6 +81,7 @@ class FreshRSS_feed_Controller extends FreshRSS_ActionController {
 				$feed->load(true);	//Throws FreshRSS_Feed_Exception, Minz_FileNotExistException
 				break;
 			case FreshRSS_Feed::KIND_HTML_XPATH:
+			case FreshRSS_Feed::KIND_XML_XPATH:
 				$feed->_website($url);
 				break;
 		}
@@ -201,8 +202,8 @@ class FreshRSS_feed_Controller extends FreshRSS_ActionController {
 			$timeout = intval(Minz_Request::param('timeout', 0));
 			$attributes['timeout'] = $timeout > 0 ? $timeout : null;
 
-			$feed_kind = Minz_Request::param('feed_kind', FreshRSS_Feed::KIND_RSS);
-			if ($feed_kind == FreshRSS_Feed::KIND_HTML_XPATH) {
+			$feed_kind = (int)Minz_Request::param('feed_kind', FreshRSS_Feed::KIND_RSS);
+			if ($feed_kind === FreshRSS_Feed::KIND_HTML_XPATH || $feed_kind === FreshRSS_Feed::KIND_XML_XPATH) {
 				$xPathSettings = [];
 				if (Minz_Request::param('xPathFeedTitle', '') != '') $xPathSettings['feedTitle'] = Minz_Request::param('xPathFeedTitle', '', true);
 				if (Minz_Request::param('xPathItem', '') != '') $xPathSettings['item'] = Minz_Request::param('xPathItem', '', true);
@@ -385,10 +386,15 @@ class FreshRSS_feed_Controller extends FreshRSS_ActionController {
 				if ($simplePiePush) {
 					$simplePie = $simplePiePush;	//Used by WebSub
 				} elseif ($feed->kind() === FreshRSS_Feed::KIND_HTML_XPATH) {
-					$simplePie = $feed->loadHtmlXpath(false, $isNewFeed);
-					if ($simplePie == null) {
+					$simplePie = $feed->loadHtmlXpath();
+					if ($simplePie === null) {
 						throw new FreshRSS_Feed_Exception('HTML+XPath Web scraping failed for [' . $feed->url(false) . ']');
 					}
+				} elseif ($feed->kind() === FreshRSS_Feed::KIND_XML_XPATH) {
+					$simplePie = $feed->loadHtmlXpath();
+					if ($simplePie === null) {
+						throw new FreshRSS_Feed_Exception('XML+XPath parsing failed for [' . $feed->url(false) . ']');
+					}
 				} else {
 					$simplePie = $feed->load(false, $isNewFeed);
 				}

+ 1 - 1
app/Controllers/subscriptionController.php

@@ -203,7 +203,7 @@ class FreshRSS_subscription_Controller extends FreshRSS_ActionController {
 			$feed->_filtersAction('read', preg_split('/[\n\r]+/', Minz_Request::param('filteractions_read', '')));
 
 			$feed->_kind(intval(Minz_Request::param('feed_kind', FreshRSS_Feed::KIND_RSS)));
-			if ($feed->kind() == FreshRSS_Feed::KIND_HTML_XPATH) {
+			if ($feed->kind() === FreshRSS_Feed::KIND_HTML_XPATH || $feed->kind() === FreshRSS_Feed::KIND_XML_XPATH) {
 				$xPathSettings = [];
 				if (Minz_Request::param('xPathItem', '') != '') $xPathSettings['item'] = Minz_Request::param('xPathItem', '', true);
 				if (Minz_Request::param('xPathItemTitle', '') != '') $xPathSettings['itemTitle'] = Minz_Request::param('xPathItemTitle', '', true);

+ 24 - 5
app/Models/Feed.php

@@ -17,6 +17,11 @@ class FreshRSS_Feed extends Minz_Model {
 	 * @var int
 	 */
 	const KIND_HTML_XPATH = 10;
+	/**
+	 * Normal XML with XPath scraping
+	 * @var int
+	 */
+	const KIND_XML_XPATH = 15;
 	/**
 	 * Normal JSON with XPath scraping
 	 * @var int
@@ -586,7 +591,7 @@ class FreshRSS_Feed extends Minz_Model {
 	/**
 	 * @return SimplePie|null
 	 */
-	public function loadHtmlXpath(bool $loadDetails = false, bool $noCache = false) {
+	public function loadHtmlXpath() {
 		if ($this->url == '') {
 			return null;
 		}
@@ -614,8 +619,9 @@ class FreshRSS_Feed extends Minz_Model {
 			return null;
 		}
 
-		$cachePath = FreshRSS_Feed::cacheFilename($feedSourceUrl, $this->attributes(), FreshRSS_Feed::KIND_HTML_XPATH);
-		$html = httpGet($feedSourceUrl, $cachePath, 'html', $this->attributes());
+		$cachePath = FreshRSS_Feed::cacheFilename($feedSourceUrl, $this->attributes(), $this->kind());
+		$html = httpGet($feedSourceUrl, $cachePath,
+			$this->kind() === FreshRSS_Feed::KIND_XML_XPATH ? 'xml' : 'html', $this->attributes());
 		if (strlen($html) <= 0) {
 			return null;
 		}
@@ -630,7 +636,18 @@ class FreshRSS_Feed extends Minz_Model {
 			$doc = new DOMDocument();
 			$doc->recover = true;
 			$doc->strictErrorChecking = false;
-			$doc->loadHTML($html, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING);
+
+			switch ($this->kind()) {
+				case FreshRSS_Feed::KIND_HTML_XPATH:
+					$doc->loadHTML($html, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING);
+					break;
+				case FreshRSS_Feed::KIND_XML_XPATH:
+					$doc->loadXML($html, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING);
+					break;
+				default:
+					return null;
+			}
+
 			$xpath = new DOMXPath($doc);
 			$view->rss_title = $xPathFeedTitle == '' ? $this->name() :
 				htmlspecialchars(@$xpath->evaluate('normalize-space(' . $xPathFeedTitle . ')'), ENT_COMPAT, 'UTF-8');
@@ -776,8 +793,10 @@ class FreshRSS_Feed extends Minz_Model {
 	public static function cacheFilename(string $url, array $attributes, int $kind = FreshRSS_Feed::KIND_RSS): string {
 		$simplePie = customSimplePie($attributes);
 		$filename = $simplePie->get_cache_filename($url);
-		if ($kind == FreshRSS_Feed::KIND_HTML_XPATH) {
+		if ($kind === FreshRSS_Feed::KIND_HTML_XPATH) {
 			return CACHE_PATH . '/' . $filename . '.html';
+		} elseif ($kind === FreshRSS_Feed::KIND_XML_XPATH) {
+			return CACHE_PATH . '/' . $filename . '.xml';
 		} else {
 			return CACHE_PATH . '/' . $filename . '.spc';
 		}

+ 1 - 0
app/Services/ExportService.php

@@ -21,6 +21,7 @@ class FreshRSS_Export_Service {
 
 	const FRSS_NAMESPACE = 'https://freshrss.org/opml';
 	const TYPE_HTML_XPATH = 'HTML+XPath';
+	const TYPE_XML_XPATH = 'XML+XPath';
 	const TYPE_RSS_ATOM = 'rss';
 
 	/**

+ 4 - 1
app/Services/ImportService.php

@@ -160,10 +160,13 @@ class FreshRSS_Import_Service {
 			$feed->_website($website);
 			$feed->_description($description);
 
-			switch ($feed_elt['type'] ?? '') {
+			switch (strtolower($feed_elt['type'] ?? '')) {
 				case strtolower(FreshRSS_Export_Service::TYPE_HTML_XPATH):
 					$feed->_kind(FreshRSS_Feed::KIND_HTML_XPATH);
 					break;
+				case strtolower(FreshRSS_Export_Service::TYPE_XML_XPATH):
+					$feed->_kind(FreshRSS_Feed::KIND_XML_XPATH);
+					break;
 				case strtolower(FreshRSS_Export_Service::TYPE_RSS_ATOM):
 				default:
 					$feed->_kind(FreshRSS_Feed::KIND_RSS);

+ 1 - 0
app/i18n/cz/sub.php

@@ -122,6 +122,7 @@ return array(
 				'xpath' => 'XPath pro:',
 			),
 			'rss' => 'RSS / Atom (výchozí)',
+			'xml_xpath' => 'XML + XPath',	// TODO
 		),
 		'maintenance' => array(
 			'clear_cache' => 'Vymazat mezipaměť',

+ 1 - 0
app/i18n/de/sub.php

@@ -122,6 +122,7 @@ return array(
 				'xpath' => 'XPath für:',
 			),
 			'rss' => 'RSS / Atom (Standard)',
+			'xml_xpath' => 'XML + XPath',	// TODO
 		),
 		'maintenance' => array(
 			'clear_cache' => 'Zwischenspeicher leeren',

+ 1 - 0
app/i18n/el/sub.php

@@ -122,6 +122,7 @@ return array(
 				'xpath' => 'XPath for:',	// TODO
 			),
 			'rss' => 'RSS / Atom (default)',	// TODO
+			'xml_xpath' => 'XML + XPath',	// TODO
 		),
 		'maintenance' => array(
 			'clear_cache' => 'Clear cache',	// TODO

+ 1 - 0
app/i18n/en-us/sub.php

@@ -122,6 +122,7 @@ return array(
 				'xpath' => 'XPath for:',	// IGNORE
 			),
 			'rss' => 'RSS / Atom (default)',	// IGNORE
+			'xml_xpath' => 'XML + XPath',	// IGNORE
 		),
 		'maintenance' => array(
 			'clear_cache' => 'Clear cache',	// IGNORE

+ 1 - 0
app/i18n/en/sub.php

@@ -122,6 +122,7 @@ return array(
 				'xpath' => 'XPath for:',
 			),
 			'rss' => 'RSS / Atom (default)',
+			'xml_xpath' => 'XML + XPath',	// TODO
 		),
 		'maintenance' => array(
 			'clear_cache' => 'Clear cache',

+ 1 - 0
app/i18n/es/sub.php

@@ -122,6 +122,7 @@ return array(
 				'xpath' => 'XPath para:',
 			),
 			'rss' => 'RSS / Atom (por defecto)',
+			'xml_xpath' => 'XML + XPath',	// TODO
 		),
 		'maintenance' => array(
 			'clear_cache' => 'Borrar caché',

+ 1 - 0
app/i18n/fr/sub.php

@@ -122,6 +122,7 @@ return array(
 				'xpath' => 'XPath pour :',
 			),
 			'rss' => 'RSS / Atom (par défaut)',
+			'xml_xpath' => 'XML + XPath',	// IGNORE
 		),
 		'maintenance' => array(
 			'clear_cache' => 'Vider le cache',

+ 1 - 0
app/i18n/he/sub.php

@@ -122,6 +122,7 @@ return array(
 				'xpath' => 'XPath for:',	// TODO
 			),
 			'rss' => 'RSS / Atom (default)',	// TODO
+			'xml_xpath' => 'XML + XPath',	// TODO
 		),
 		'maintenance' => array(
 			'clear_cache' => 'Clear cache',	// TODO

+ 1 - 0
app/i18n/id/sub.php

@@ -122,6 +122,7 @@ return array(
 				'xpath' => 'XPath for:',	// TODO
 			),
 			'rss' => 'RSS / Atom (default)',	// TODO
+			'xml_xpath' => 'XML + XPath',	// TODO
 		),
 		'maintenance' => array(
 			'clear_cache' => 'Clear cache',	// TODO

+ 1 - 0
app/i18n/it/sub.php

@@ -122,6 +122,7 @@ return array(
 				'xpath' => 'XPath per:',
 			),
 			'rss' => 'RSS / Atom (predefinito)',
+			'xml_xpath' => 'XML + XPath',	// TODO
 		),
 		'maintenance' => array(
 			'clear_cache' => 'Svuota cache',

+ 1 - 0
app/i18n/ja/sub.php

@@ -122,6 +122,7 @@ return array(
 				'xpath' => 'XPathは:',
 			),
 			'rss' => 'RSS / Atom (標準)',
+			'xml_xpath' => 'XML + XPath',	// TODO
 		),
 		'maintenance' => array(
 			'clear_cache' => 'キャッシュのクリア',

+ 1 - 0
app/i18n/ko/sub.php

@@ -122,6 +122,7 @@ return array(
 				'xpath' => '다음의 XPath:',
 			),
 			'rss' => 'RSS / Atom (기본값)',
+			'xml_xpath' => 'XML + XPath',	// TODO
 		),
 		'maintenance' => array(
 			'clear_cache' => '캐쉬 지우기',

+ 1 - 0
app/i18n/nl/sub.php

@@ -122,6 +122,7 @@ return array(
 				'xpath' => 'XPath voor:',
 			),
 			'rss' => 'RSS / Atom (standaard)',
+			'xml_xpath' => 'XML + XPath',	// TODO
 		),
 		'maintenance' => array(
 			'clear_cache' => 'Cache leegmaken',

+ 1 - 0
app/i18n/oc/sub.php

@@ -122,6 +122,7 @@ return array(
 				'xpath' => 'XPath per :',
 			),
 			'rss' => 'RSS / Atom (defaut)',
+			'xml_xpath' => 'XML + XPath',	// TODO
 		),
 		'maintenance' => array(
 			'clear_cache' => 'Escafar lo cache',

+ 1 - 0
app/i18n/pl/sub.php

@@ -122,6 +122,7 @@ return array(
 				'xpath' => 'XPath dla:',
 			),
 			'rss' => 'RSS / Atom (domyślne)',
+			'xml_xpath' => 'XML + XPath',	// TODO
 		),
 		'maintenance' => array(
 			'clear_cache' => 'Wyczyść pamięć podręczną',

+ 1 - 0
app/i18n/pt-br/sub.php

@@ -122,6 +122,7 @@ return array(
 				'xpath' => 'XPath para:',
 			),
 			'rss' => 'RSS / Atom (padrão)',
+			'xml_xpath' => 'XML + XPath',	// TODO
 		),
 		'maintenance' => array(
 			'clear_cache' => 'Limpar o cache',

+ 1 - 0
app/i18n/ru/sub.php

@@ -122,6 +122,7 @@ return array(
 				'xpath' => 'XPath для:',
 			),
 			'rss' => 'RSS / Atom (по умолчанию)',
+			'xml_xpath' => 'XML + XPath',	// TODO
 		),
 		'maintenance' => array(
 			'clear_cache' => 'Очистить кэш',

+ 1 - 0
app/i18n/sk/sub.php

@@ -122,6 +122,7 @@ return array(
 				'xpath' => 'XPath pre:',
 			),
 			'rss' => 'RSS / Atom (prednastavené)',
+			'xml_xpath' => 'XML + XPath',	// TODO
 		),
 		'maintenance' => array(
 			'clear_cache' => 'Vymazať vyrovnáciu pamäť',

+ 1 - 0
app/i18n/tr/sub.php

@@ -122,6 +122,7 @@ return array(
 				'xpath' => 'XPath:',
 			),
 			'rss' => 'RSS / Atom (varsayılan)',
+			'xml_xpath' => 'XML + XPath',	// TODO
 		),
 		'maintenance' => array(
 			'clear_cache' => 'Önbelleği temizle',

+ 1 - 0
app/i18n/zh-cn/sub.php

@@ -122,6 +122,7 @@ return array(
 				'xpath' => 'XPath 定位:',
 			),
 			'rss' => 'RSS / Atom (默认)',
+			'xml_xpath' => 'XML + XPath',	// TODO
 		),
 		'maintenance' => array(
 			'clear_cache' => '清理缓存',

+ 1 - 0
app/i18n/zh-tw/sub.php

@@ -122,6 +122,7 @@ return array(
 				'xpath' => 'XPath 定位:',
 			),
 			'rss' => 'RSS / Atom (默認)',
+			'xml_xpath' => 'XML + XPath',	// TODO
 		),
 		'maintenance' => array(
 			'clear_cache' => '清理暫存',

+ 9 - 2
app/views/helpers/export/opml.phtml

@@ -18,8 +18,15 @@ function feedsToOutlines($feeds, $excludeMutedFeeds = false): array {
 			'description' => htmlspecialchars_decode($feed->description(), ENT_QUOTES),
 		];
 
-		if ($feed->kind() === FreshRSS_Feed::KIND_HTML_XPATH) {
-			$outline['type'] = FreshRSS_Export_Service::TYPE_HTML_XPATH;
+		if ($feed->kind() === FreshRSS_Feed::KIND_HTML_XPATH || $feed->kind() === FreshRSS_Feed::KIND_XML_XPATH) {
+			switch ($feed->kind()) {
+				case FreshRSS_Feed::KIND_HTML_XPATH:
+					$outline['type'] = FreshRSS_Export_Service::TYPE_HTML_XPATH;
+					break;
+				case FreshRSS_Feed::KIND_XML_XPATH:
+					$outline['type'] = FreshRSS_Export_Service::TYPE_XML_XPATH;
+					break;
+			}
 			/** @var array<string,string> */
 			$xPathSettings = $feed->attributes('xpath');
 			$outline['frss:xPathItem'] = $xPathSettings['item'] ?? null;

+ 3 - 2
app/views/helpers/feed/update.phtml

@@ -391,8 +391,9 @@
 			<label class="group-name" for="feed_kind"><?= _t('sub.feed.kind') ?></label>
 			<div class="group-controls">
 				<select name="feed_kind" id="feed_kind" class="select-show w100">
-					<option value="<?= FreshRSS_Feed::KIND_RSS ?>" <?= $this->feed->kind() == FreshRSS_Feed::KIND_RSS ? 'selected="selected"' : '' ?>><?= _t('sub.feed.kind.rss') ?></option>
-					<option value="<?= FreshRSS_Feed::KIND_HTML_XPATH ?>" <?= $this->feed->kind() == FreshRSS_Feed::KIND_HTML_XPATH ? 'selected="selected"' : '' ?> data-show="html_xpath"><?= _t('sub.feed.kind.html_xpath') ?></option>
+					<option value="<?= FreshRSS_Feed::KIND_RSS ?>" <?= $this->feed->kind() === FreshRSS_Feed::KIND_RSS ? 'selected="selected"' : '' ?>><?= _t('sub.feed.kind.rss') ?></option>
+					<option value="<?= FreshRSS_Feed::KIND_HTML_XPATH ?>" <?= $this->feed->kind() === FreshRSS_Feed::KIND_HTML_XPATH ? 'selected="selected"' : '' ?> data-show="html_xpath"><?= _t('sub.feed.kind.html_xpath') ?></option>
+					<option value="<?= FreshRSS_Feed::KIND_XML_XPATH ?>" <?= $this->feed->kind() === FreshRSS_Feed::KIND_XML_XPATH ? 'selected="selected"' : '' ?> data-show="html_xpath"><?= _t('sub.feed.kind.xml_xpath') ?></option>
 				</select>
 			</div>
 		</div>

+ 1 - 0
app/views/subscription/add.phtml

@@ -70,6 +70,7 @@
 					<select name="feed_kind" id="feed_kind" class="select-show">
 						<option value="<?= FreshRSS_Feed::KIND_RSS ?>" selected="selected"><?= _t('sub.feed.kind.rss') ?></option>
 						<option value="<?= FreshRSS_Feed::KIND_HTML_XPATH ?>" data-show="html_xpath"><?= _t('sub.feed.kind.html_xpath') ?></option>
+						<option value="<?= FreshRSS_Feed::KIND_XML_XPATH ?>" data-show="html_xpath"><?= _t('sub.feed.kind.xml_xpath') ?></option>
 					</select>
 				</div>
 			</div>

+ 3 - 1
docs/en/developers/OPML.md

@@ -17,12 +17,14 @@ FreshRSS uses the XML namespace <https://freshrss.org/opml> to export/import ext
 
 The list of the custom FreshRSS attributes can be seen in [the source code](https://github.com/FreshRSS/FreshRSS/blob/edge/app/views/helpers/export/opml.phtml), and here is an overview:
 
-### HTML+XPath
+### HTML+XPath or XML+XPath
 
 * `<outline type="HTML+XPath" ...`: Additional type of source, which is not RSS/Atom, but HTML Web Scraping using [XPath](https://www.w3.org/TR/xpath-10/) 1.0.
 
 > ℹ️ [XPath 1.0](https://en.wikipedia.org/wiki/XPath) is a standard query language, which FreshRSS supports to enable [Web scraping](https://en.wikipedia.org/wiki/Web_scraping).
 
+* `<outline type="XML+XPath" ...`: Same as `HTML+XPath`, but using an XML parser.
+
 The following attributes use naming conventions similar to those of [RSS-Bridge](https://rss-bridge.github.io/rss-bridge/Bridge_API/XPathAbstract.html).
 
 * `frss:xPathItem`: XPath expression for extracting the feed items from the source page.

+ 12 - 2
lib/lib_rss.php

@@ -365,7 +365,11 @@ function sanitizeHTML($data, string $base = '', ?int $maxLength = null): string
 
 function cleanCache(int $hours = 720): void {
 	// N.B.: GLOB_BRACE is not available on all platforms
-	$files = array_merge(glob(CACHE_PATH . '/*.html', GLOB_NOSORT) ?: [], glob(CACHE_PATH . '/*.spc', GLOB_NOSORT) ?: []);
+	$files = array_merge(
+		glob(CACHE_PATH . '/*.html', GLOB_NOSORT) ?: [],
+		glob(CACHE_PATH . '/*.json', GLOB_NOSORT) ?: [],
+		glob(CACHE_PATH . '/*.spc', GLOB_NOSORT) ?: [],
+		glob(CACHE_PATH . '/*.xml', GLOB_NOSORT) ?: []);
 	foreach ($files as $file) {
 		if (substr($file, -10) === 'index.html') {
 			continue;
@@ -410,7 +414,7 @@ function enforceHttpEncoding(string $html, string $contentType = ''): string {
 }
 
 /**
- * @param string $type {html,opml}
+ * @param string $type {html,json,opml,xml}
  * @param array<string,mixed> $attributes
  */
 function httpGet(string $url, string $cachePath, string $type = 'html', array $attributes = []): string {
@@ -439,9 +443,15 @@ function httpGet(string $url, string $cachePath, string $type = 'html', array $a
 
 	$accept = '*/*;q=0.8';
 	switch ($type) {
+		case 'json':
+			$accept = 'application/json,application/javascript;q=0.9,text/javascript;q=0.8,*/*;q=0.7';
+			break;
 		case 'opml':
 			$accept = 'text/x-opml,text/xml;q=0.9,application/xml;q=0.9,*/*;q=0.8';
 			break;
+		case 'xml':
+			$accept = 'application/xml,application/xhtml+xml,text/xml;q=0.9,*/*;q=0.8';
+			break;
 		case 'html':
 		default:
 			$accept = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';

+ 9 - 2
p/scripts/feed.js

@@ -88,10 +88,17 @@ function init_disable_elements_on_update(parent) {
 function init_select_show(parent) {
 	const listener = (select) => {
 		const options = select.querySelectorAll('option[data-show]');
+		const shows = {};	// To allow multiple options to show the same element
 		for (const option of options) {
-			const elem = document.getElementById(option.dataset.show);
+			if (!shows[option.dataset.show]) {
+				shows[option.dataset.show] = option.selected;
+			}
+		}
+
+		for (const show in shows) {
+			const elem = document.getElementById(show);
 			if (elem) {
-				elem.style.display = option.selected ? 'block' : 'none';
+				elem.style.display = shows[show] ? 'block' : 'none';
 			}
 		}
 	};