Преглед изворни кода

OPML export/import of some proprietary FreshRSS attributes (#4342)

* OPML export/import of some proprietary FreshRSS attributes
#fix https://github.com/FreshRSS/FreshRSS/issues/4077
And one of the TODOs of https://github.com/FreshRSS/FreshRSS/pull/4220
XPath options, CSS Selector, and action filters

* Bump library patch version

* OPML namespace + documentation

* Add example
Alexandre Alapetite пре 3 година
родитељ
комит
4a87206f28

+ 3 - 2
app/Models/BooleanSearch.php

@@ -5,6 +5,7 @@
  */
 class FreshRSS_BooleanSearch {
 
+	/** @var string */
 	private $raw_input = '';
 	private $searches = array();
 
@@ -54,11 +55,11 @@ class FreshRSS_BooleanSearch {
 		return null;
 	}
 
-	public function __toString() {
+	public function __toString(): string {
 		return $this->getRawInput();
 	}
 
-	public function getRawInput() {
+	public function getRawInput(): string {
 		return $this->raw_input;
 	}
 }

+ 8 - 4
app/Models/Feed.php

@@ -245,7 +245,7 @@ class FreshRSS_Feed extends Minz_Model {
 		}
 		$this->url = $value;
 	}
-	public function _kind($value) {
+	public function _kind(int $value) {
 		$this->kind = $value;
 	}
 	public function _category($value) {
@@ -567,8 +567,8 @@ class FreshRSS_Feed extends Minz_Model {
 			$feedSourceUrl = preg_replace('#((.+)://)(.+)#', '${1}' . $this->httpAuth . '@${3}', $feedSourceUrl);
 		}
 
-		// Same naming conventions than https://github.com/RSS-Bridge/rss-bridge/wiki/XPathAbstract
-		// https://github.com/RSS-Bridge/rss-bridge/wiki/The-collectData-function
+		// Same naming conventions as https://rss-bridge.github.io/rss-bridge/Bridge_API/XPathAbstract.html
+		// https://rss-bridge.github.io/rss-bridge/Bridge_API/BridgeAbstract.html#collectdata
 		/** @var array<string,string> */
 		$xPathSettings = $this->attributes('xpath');
 		$xPathFeedTitle = $xPathSettings['feedTitle'] ?? '';
@@ -758,7 +758,8 @@ class FreshRSS_Feed extends Minz_Model {
 		}
 	}
 
-	public function filtersAction(string $action) {
+	/** @return array<FreshRSS_BooleanSearch> */
+	public function filtersAction(string $action): array {
 		$action = trim($action);
 		if ($action == '') {
 			return array();
@@ -775,6 +776,9 @@ class FreshRSS_Feed extends Minz_Model {
 		return $filters;
 	}
 
+	/**
+	 * @param array<string> $filters
+	 */
 	public function _filtersAction(string $action, $filters) {
 		$action = trim($action);
 		if ($action == '' || !is_array($filters)) {

+ 4 - 0
app/Models/FeedDAO.php

@@ -104,6 +104,7 @@ class FreshRSS_FeedDAO extends Minz_ModelPdo implements FreshRSS_Searchable {
 				'website' => $feed->website(),
 				'description' => $feed->description(),
 				'lastUpdate' => 0,
+				'pathEntries' => $feed->pathEntries(),
 				'httpAuth' => $feed->httpAuth(),
 				'attributes' => $feed->attributes(),
 			);
@@ -384,6 +385,9 @@ SQL;
 		return false;
 	}
 
+	/**
+	 * @return array<FreshRSS_Feed>
+	 */
 	public function listByCategory(int $cat): array {
 		$sql = 'SELECT * FROM `_feed` WHERE category=?';
 		$stm = $this->pdo->prepare($sql);

+ 2 - 1
app/Models/FilterAction.php

@@ -2,6 +2,7 @@
 
 class FreshRSS_FilterAction {
 
+	/** @var FreshRSS_BooleanSearch */
 	private $booleanSearch = null;
 	private $actions = null;
 
@@ -33,7 +34,7 @@ class FreshRSS_FilterAction {
 					'actions' => $this->actions,
 				);
 		}
-		return '';
+		return [];
 	}
 
 	public static function fromJSON($json) {

+ 4 - 0
app/Services/ExportService.php

@@ -19,6 +19,10 @@ class FreshRSS_Export_Service {
 	/** @var FreshRSS_TagDAO */
 	private $tag_dao;
 
+	const FRSS_NAMESPACE = 'https://freshrss.org/opml';
+	const TYPE_HTML_XPATH = 'HTML+XPath';
+	const TYPE_RSS_ATOM = 'rss';
+
 	/**
 	 * Initialize the service for the given user.
 	 *

+ 31 - 0
app/Services/ImportService.php

@@ -148,6 +148,37 @@ class FreshRSS_Import_Service {
 			$feed->_website($website);
 			$feed->_description($description);
 
+			switch ($feed_elt['type'] ?? '') {
+				case FreshRSS_Export_Service::TYPE_HTML_XPATH:
+					$feed->_kind(FreshRSS_Feed::KIND_HTML_XPATH);
+					break;
+				case FreshRSS_Export_Service::TYPE_RSS_ATOM:
+				default:
+					$feed->_kind(FreshRSS_Feed::KIND_RSS);
+					break;
+			}
+
+			$xPathSettings = [];
+			foreach ($feed_elt as $key => $value) {
+				if (is_array($value) && !empty($value['value']) && ($value['namespace'] ?? '') === FreshRSS_Export_Service::FRSS_NAMESPACE) {
+					switch ($key) {
+						case 'cssFullContent': $feed->_pathEntries($value['value']); break;
+						case 'filtersActionRead': $feed->_filtersAction('read', preg_split('/[\n\r]+/', $value['value'])); break;
+						case 'xPathItem': $xPathSettings['item'] = $value['value']; break;
+						case 'xPathItemTitle': $xPathSettings['itemTitle'] = $value['value']; break;
+						case 'xPathItemContent': $xPathSettings['itemContent'] = $value['value']; break;
+						case 'xPathItemUri': $xPathSettings['itemUri'] = $value['value']; break;
+						case 'xPathItemAuthor': $xPathSettings['itemAuthor'] = $value['value']; break;
+						case 'xPathItemTimestamp': $xPathSettings['itemTimestamp'] = $value['value']; break;
+						case 'xPathItemThumbnail': $xPathSettings['itemThumbnail'] = $value['value']; break;
+						case 'xPathItemCategories': $xPathSettings['itemCategories'] = $value['value']; break;
+					}
+				}
+			}
+			if (!empty($xPathSettings)) {
+				$feed->_attributes('xpath', $xPathSettings);
+			}
+
 			// Call the extension hook
 			$feed = Minz_ExtensionManager::callHook('feed_before_insert', $feed);
 			if ($feed != null) {

+ 29 - 3
app/views/helpers/export/opml.phtml

@@ -15,14 +15,40 @@ foreach ($this->categories as $key => $cat) {
 		'@outlines' => array()
 	);
 
+	/** @var FreshRSS_Feed $feed */
 	foreach ($cat['feeds'] as $feed) {
-		$opml_array['body'][$key]['@outlines'][] = array(
+		$outline = [
 			'text' => htmlspecialchars_decode($feed->name(), ENT_QUOTES),
-			'type' => 'rss',
+			'type' => FreshRSS_Export_Service::TYPE_RSS_ATOM,
 			'xmlUrl' => htmlspecialchars_decode($feed->url(), ENT_QUOTES),
 			'htmlUrl' => htmlspecialchars_decode($feed->website(), ENT_QUOTES),
 			'description' => htmlspecialchars_decode($feed->description(), ENT_QUOTES),
-		);
+		];
+		if ($feed->kind() === FreshRSS_Feed::KIND_HTML_XPATH) {
+			$outline['type'] = FreshRSS_Export_Service::TYPE_HTML_XPATH;
+			/** @var array<string,string> */
+			$xPathSettings = $feed->attributes('xpath');
+			$outline['frss:xPathItem'] = ['namespace' => FreshRSS_Export_Service::FRSS_NAMESPACE, 'value' => $xPathSettings['item'] ?? null];
+			$outline['frss:xPathItemTitle'] = ['namespace' => FreshRSS_Export_Service::FRSS_NAMESPACE, 'value' => $xPathSettings['itemTitle'] ?? null];
+			$outline['frss:xPathItemContent'] = ['namespace' => FreshRSS_Export_Service::FRSS_NAMESPACE, 'value' => $xPathSettings['itemContent'] ?? null];
+			$outline['frss:xPathItemUri'] = ['namespace' => FreshRSS_Export_Service::FRSS_NAMESPACE, 'value' => $xPathSettings['itemUri'] ?? null];
+			$outline['frss:xPathItemAuthor'] = ['namespace' => FreshRSS_Export_Service::FRSS_NAMESPACE, 'value' => $xPathSettings['itemAuthor'] ?? null];
+			$outline['frss:xPathItemTimestamp'] = ['namespace' => FreshRSS_Export_Service::FRSS_NAMESPACE, 'value' => $xPathSettings['itemTimestamp'] ?? null];
+			$outline['frss:xPathItemThumbnail'] = ['namespace' => FreshRSS_Export_Service::FRSS_NAMESPACE, 'value' => $xPathSettings['itemThumbnail'] ?? null];
+			$outline['frss:xPathItemCategories'] = ['namespace' => FreshRSS_Export_Service::FRSS_NAMESPACE, 'value' => $xPathSettings['itemCategories'] ?? null];
+		}
+		if (!empty($feed->filtersAction('read'))) {
+			$filters = '';
+			foreach ($feed->filtersAction('read') as $filterRead) {
+				$filters .= $filterRead->getRawInput() . "\n";
+			}
+			$filters = trim($filters);
+			$outline['frss:filtersActionRead'] = ['namespace' => FreshRSS_Export_Service::FRSS_NAMESPACE, 'value' => $filters];
+		}
+		if ($feed->pathEntries() != '') {
+			$outline['frss:cssFullContent'] = ['namespace' => FreshRSS_Export_Service::FRSS_NAMESPACE, 'value' => $feed->pathEntries()];
+		}
+		$opml_array['body'][$key]['@outlines'][] = $outline;
 	}
 }
 

+ 74 - 0
docs/en/developers/OPML.md

@@ -0,0 +1,74 @@
+# OPML in FreshRSS
+
+FreshRSS supports the [OPML](https://en.wikipedia.org/wiki/OPML) format to export and import lists of RSS/Atom feeds in a standard way, compatible with several other RSS aggregators.
+
+However, FreshRSS also supports several additional features not covered by the basic OPML specification.
+Luckily, the [OPML specification](http://opml.org/spec2.opml) allows extensions:
+
+> *An OPML file may contain elements and attributes not described on this page, only if those elements are defined in a namespace.*
+
+and:
+
+> *OPML can also be extended by the addition of new values for the type attribute.*
+
+## FreshRSS OPML extension
+
+FreshRSS uses the XML namespace <https://freshrss.org/opml> to export/import extended information not covered by the basic OPML specification.
+
+The list of the custom FreshRSS attributes can be seen in [the source code](https://github.com/FreshRSS/FreshRSS/blob/edge/app/views/helpers/export/opml.phtml), and here is an overview:
+
+### HTML+XPath
+
+* `<outline type="HTML+XPath" ...`: Additional type of source, which is not RSS/Atom, but HTML Web Scraping using [XPath](https://www.w3.org/TR/xpath-10/) 1.0.
+
+> ℹ️ [XPath 1.0](https://en.wikipedia.org/wiki/XPath) is a standard query language, which FreshRSS supports to enable [Web scraping](https://en.wikipedia.org/wiki/Web_scraping).
+
+The following attributes use naming conventions similar to those of [RSS-Bridge](https://rss-bridge.github.io/rss-bridge/Bridge_API/XPathAbstract.html).
+
+* `frss:xPathItem`: XPath expression for extracting the feed items from the source page.
+	* Example: `//div[@class="news-item"]`
+* `frss:xPathItemTitle`: XPath expression for extracting an item’s title from the item context.
+	* Example: `descendant::h2`
+* `frss:xPathItemContent`: XPath expression for extracting an item’s content from the item context.
+	* Example: `.`
+* `frss:xPathItemUri`: XPath expression for extracting an item link from the item context.
+	* Example: `descendant::a/@href`
+* `frss:xPathItemAuthor`: XPath expression for extracting an item author from the item context.
+	* Example: `"Anonymous"`
+* `frss:xPathItemTimestamp`: XPath expression for extracting an item timestamp from the item context. The result will be parsed by [`strtotime()`](https://php.net/strtotime).
+* `frss:xPathItemThumbnail`: XPath expression for extracting an item’s thumbnail (image) URL from the item context.
+	* Example: `descendant::img/@src`
+* `frss:xPathItemCategories`: XPath expression for extracting a list of categories (tags) from the item context.
+
+### Miscellaneous
+
+* `frss:cssFullContent`: [CSS Selector](https://developer.mozilla.org/en-US/docs/Web/CSS/CSS_Selectors) to enable the download and extraction of the matching HTML section of each article’s Web address.
+	* Example: `div.main`
+* `frss:filtersActionRead`: List (separated by a new line) of search queries to automatically mark a new article as read.
+
+### Example
+
+```xml
+<?xml version="1.0" encoding="UTF-8"?>
+<opml version="2.0">
+	<head>
+		<title>FreshRSS OPML extension example</title>
+	</head>
+	<body>
+		<outline xmlns:frss="https://freshrss.org/opml"
+			text="Example"
+			type="HTML+XPath"
+			xmlUrl="https://www.example.net/page.html"
+			htmlUrl="https://www.example.net/page.html"
+			description="Example of Web scraping"
+			frss:xPathItem="//a[contains(@href, '/interesting/')]/ancestor::article"
+			frss:xPathItemTitle="descendant::h2"
+			frss:xPathItemContent="."
+			frss:xPathItemUri="descendant::a[string-length(@href)&gt;0]/@href"
+			frss:xPathItemThumbnail="descendant::img/@src"
+			frss:cssFullContent="article"
+			frss:filtersActionRead="intitle:⚡️ OR intitle:🔥&#10;something"
+		/>
+	</body>
+</opml>
+```

+ 24 - 7
lib/lib_opml.php

@@ -12,7 +12,7 @@
  *
  * @author   Marien Fressinaud <dev@marienfressinaud.fr>
  * @link     https://github.com/marienfressinaud/lib_opml
- * @version  0.2-FreshRSS~1.5.1
+ * @version  0.2-FreshRSS~1.20.0
  * @license  public domain
  *
  * Usages:
@@ -91,8 +91,20 @@ function libopml_parse_outline($outline_xml, $strict = true) {
 	// An outline may contain any kind of attributes but "text" attribute is
 	// required !
 	$text_is_present = false;
-	foreach ($outline_xml->attributes() as $key => $value) {
-		$outline[$key] = (string)$value;
+
+	$elem = dom_import_simplexml($outline_xml);
+	/** @var DOMAttr $attr */
+	foreach ($elem->attributes as $attr) {
+		$key = $attr->localName;
+
+		if ($attr->namespaceURI == '') {
+			$outline[$key] = $attr->value;
+		} else {
+			$outline[$key] = [
+				'namespace' => $attr->namespaceURI,
+				'value' => $attr->value,
+			];
+		}
 
 		if ($key === 'text') {
 			$text_is_present = true;
@@ -257,17 +269,22 @@ function libopml_render_outline($parent_elt, $outline, $strict) {
 			foreach ($value as $outline_child) {
 				libopml_render_outline($outline_elt, $outline_child, $strict);
 			}
-		} elseif (is_array($value)) {
+		} elseif (is_array($value) && !isset($value['namespace'])) {
 			throw new LibOPML_Exception(
-				'Type of outline elements cannot be array: ' . $key
+				'Type of outline elements cannot be array (except for providing a namespace): ' . $key
 			);
 		} else {
 			// Detect text attribute is present, that's good :)
 			if ($key === 'text') {
 				$text_is_present = true;
 			}
-
-			$outline_elt->addAttribute($key, $value);
+			if (is_array($value)) {
+				if (!empty($value['namespace']) && !empty($value['value'])) {
+					$outline_elt->addAttribute($key, $value['value'], $value['namespace']);
+				}
+			} else {
+				$outline_elt->addAttribute($key, $value);
+			}
 		}
 	}