Просмотр исходного кода

Fix parentheses for complex `OR` boolean search expressions (#6672)

* Fix OR parentheses

* Pass all tests

* Forgotten comment

* Minor whitespace

* Fix several cases envolving negation

* Line length

* Fix `OR NOT`
Alexandre Alapetite 1 год назад
Родитель
Сommit
58de38bd73
4 измененных файлов с 245 добавлено и 20 удалено
  1. 118 11
      app/Models/BooleanSearch.php
  2. 14 6
      app/Models/Entry.php
  3. 1 1
      app/Models/EntryDAO.php
  4. 112 2
      tests/app/Models/SearchTest.php

+ 118 - 11
app/Models/BooleanSearch.php

@@ -11,11 +11,11 @@ class FreshRSS_BooleanSearch {
 	private array $searches = [];
 
 	/**
-	 * @phpstan-var 'AND'|'OR'|'AND NOT'
+	 * @phpstan-var 'AND'|'OR'|'AND NOT'|'OR NOT'
 	 */
 	private string $operator;
 
-	/** @param 'AND'|'OR'|'AND NOT' $operator */
+	/** @param 'AND'|'OR'|'AND NOT'|'OR NOT' $operator */
 	public function __construct(string $input, int $level = 0, string $operator = 'AND', bool $allowUserQueries = true) {
 		$this->operator = $operator;
 		$input = trim($input);
@@ -38,6 +38,8 @@ class FreshRSS_BooleanSearch {
 		}
 		$this->raw_input = $input;
 
+		$input = self::consistentOrParentheses($input);
+
 		// Either parse everything as a series of BooleanSearch’s combined by implicit AND
 		// or parse everything as a series of Search’s combined by explicit OR
 		$this->parseParentheses($input, $level) || $this->parseOrSegments($input);
@@ -130,6 +132,107 @@ class FreshRSS_BooleanSearch {
 		return $input;
 	}
 
+	/**
+	 * Example: 'ab cd OR ef OR "gh ij"' becomes '(ab cd) OR (ef) OR ("gh ij")'
+	 */
+	public static function addOrParentheses(string $input): string {
+		$input = trim($input);
+		if ($input === '') {
+			return '';
+		}
+		$splits = preg_split('/\b(OR)\b/i', $input, -1, PREG_SPLIT_DELIM_CAPTURE) ?: [];
+		$ns = count($splits);
+		if ($ns <= 1) {
+			return $input;
+		}
+		$result = '';
+		$segment = '';
+		for ($i = 0; $i < $ns; $i++) {
+			$segment .= $splits[$i];
+			if (trim($segment) === '') {
+				$segment = '';
+			} elseif (strcasecmp($segment, 'OR') === 0) {
+				$result .= $segment . ' ';
+				$segment = '';
+			} else {
+				$quotes = substr_count($segment, '"') + substr_count($segment, '&quot;');
+				if ($quotes % 2 === 0) {
+					$segment = trim($segment);
+					if (in_array($segment, ['!', '-'], true)) {
+						$result .= $segment;
+					} else {
+						$result .= '(' . $segment . ') ';
+					}
+					$segment = '';
+				}
+			}
+		}
+		$segment = trim($segment);
+		if (in_array($segment, ['!', '-'], true)) {
+			$result .= $segment;
+		} elseif ($segment !== '') {
+			$result .= '(' . $segment . ')';
+		}
+		return trim($result);
+	}
+
+	/**
+	 * If the query contains a mix of `OR` expressions with and without parentheses,
+	 * then add parentheses to make the query consistent.
+	 * Example: '(ab (cd OR ef)) OR gh OR ij OR (kl)' becomes '(ab ((cd) OR (ef))) OR (gh) OR (ij) OR (kl)'
+	 */
+	public static function consistentOrParentheses(string $input): string {
+		if (!preg_match('/(?<!\\\\)\\(/', $input)) {
+			// No unescaped parentheses in the input
+			return trim($input);
+		}
+		$parenthesesCount = 0;
+		$result = '';
+		$segment = '';
+		$length = strlen($input);
+
+		for ($i = 0; $i < $length; $i++) {
+			$c = $input[$i];
+			$backslashed = $i >= 1 ? $input[$i - 1] === '\\' : false;
+			if (!$backslashed) {
+				if ($c === '(') {
+					if ($parenthesesCount === 0) {
+						if ($segment !== '') {
+							$result = rtrim($result) . ' ' . self::addOrParentheses($segment);
+							$negation = preg_match('/[!-]$/', $result);
+							if (!$negation) {
+								$result .= ' ';
+							}
+							$segment = '';
+						}
+						$c = '';
+					}
+					$parenthesesCount++;
+				} elseif ($c === ')') {
+					$parenthesesCount--;
+					if ($parenthesesCount === 0) {
+						$segment = self::consistentOrParentheses($segment);
+						if ($segment !== '') {
+							$result .= '(' . $segment . ')';
+							$segment = '';
+						}
+						$c = '';
+					}
+				}
+			}
+			$segment .= $c;
+		}
+		if (trim($segment) !== '') {
+			$result = rtrim($result);
+			$negation = preg_match('/[!-]$/', $segment);
+			if (!$negation) {
+				$result .= ' ';
+			}
+			$result .= self::addOrParentheses($segment);
+		}
+		return trim($result);
+	}
+
 	/** @return bool True if some parenthesis logic took over, false otherwise */
 	private function parseParentheses(string $input, int $level): bool {
 		$input = trim($input);
@@ -146,9 +249,14 @@ class FreshRSS_BooleanSearch {
 				$hasParenthesis = true;
 
 				$before = trim($before);
-				if (preg_match('/[!-]$/i', $before)) {
+				if (preg_match('/[!-]$/', $before)) {
 					// Trim trailing negation
-					$before = substr($before, 0, -1);
+					$before = rtrim($before, ' !-');
+					$isOr = preg_match('/\bOR$/i', $before);
+					if ($isOr) {
+						// Trim trailing OR
+						$before = substr($before, 0, -2);
+					}
 
 					// The text prior to the negation is a BooleanSearch
 					$searchBefore = new FreshRSS_BooleanSearch($before, $level + 1, $nextOperator);
@@ -157,8 +265,8 @@ class FreshRSS_BooleanSearch {
 					}
 					$before = '';
 
-					// The next BooleanSearch will have to be combined with AND NOT instead of default AND
-					$nextOperator = 'AND NOT';
+					// The next BooleanSearch will have to be combined with AND NOT or OR NOT instead of default AND
+					$nextOperator = $isOr ? 'OR NOT' : 'AND NOT';
 				} elseif (preg_match('/\bOR$/i', $before)) {
 					// Trim trailing OR
 					$before = substr($before, 0, -2);
@@ -212,7 +320,7 @@ class FreshRSS_BooleanSearch {
 					$i++;
 				}
 				// $sub = trim($sub);
-				// if ($sub != '') {
+				// if ($sub !== '') {
 				// 	// TODO: Consider throwing an error or warning in case of non-matching parenthesis
 				// }
 			// } elseif ($c === ')') {
@@ -249,12 +357,11 @@ class FreshRSS_BooleanSearch {
 			return;
 		}
 		$splits = preg_split('/\b(OR)\b/i', $input, -1, PREG_SPLIT_DELIM_CAPTURE) ?: [];
-
 		$segment = '';
 		$ns = count($splits);
 		for ($i = 0; $i < $ns; $i++) {
 			$segment = $segment . $splits[$i];
-			if (trim($segment) == '' || strcasecmp($segment, 'OR') === 0) {
+			if (trim($segment) === '' || strcasecmp($segment, 'OR') === 0) {
 				$segment = '';
 			} else {
 				$quotes = substr_count($segment, '"') + substr_count($segment, '&quot;');
@@ -266,7 +373,7 @@ class FreshRSS_BooleanSearch {
 			}
 		}
 		$segment = trim($segment);
-		if ($segment != '') {
+		if ($segment !== '') {
 			$this->searches[] = new FreshRSS_Search($segment);
 		}
 	}
@@ -280,7 +387,7 @@ class FreshRSS_BooleanSearch {
 		return $this->searches;
 	}
 
-	/** @return 'AND'|'OR'|'AND NOT' depending on how this BooleanSearch should be combined */
+	/** @return 'AND'|'OR'|'AND NOT'|'OR NOT' depending on how this BooleanSearch should be combined */
 	public function operator(): string {
 		return $this->operator;
 	}

+ 14 - 6
app/Models/Entry.php

@@ -577,12 +577,20 @@ HTML;
 		foreach ($booleanSearch->searches() as $filter) {
 			if ($filter instanceof FreshRSS_BooleanSearch) {
 				// BooleanSearches are combined by AND (default) or OR or AND NOT (special cases) operators and are recursive
-				if ($filter->operator() === 'OR') {
-					$ok |= $this->matches($filter);
-				} elseif ($filter->operator() === 'AND NOT') {
-					$ok &= !$this->matches($filter);
-				} else {	// AND
-					$ok &= $this->matches($filter);
+				switch ($filter->operator()) {
+					case 'OR':
+						$ok |= $this->matches($filter);
+						break;
+					case 'OR NOT':
+						$ok |= !$this->matches($filter);
+						break;
+					case 'AND NOT':
+						$ok &= !$this->matches($filter);
+						break;
+					case 'AND':
+					default:
+						$ok &= $this->matches($filter);
+						break;
 				}
 			} elseif ($filter instanceof FreshRSS_Search) {
 				// Searches are combined by OR and are not recursive

+ 1 - 1
app/Models/EntryDAO.php

@@ -762,7 +762,7 @@ SQL;
 				if ($filterSearch !== '') {
 					if ($search !== '') {
 						$search .= $filter->operator();
-					} elseif ($filter->operator() === 'AND NOT') {
+					} elseif (in_array($filter->operator(), ['AND NOT', 'OR NOT'], true)) {
 						// Special case if we start with a negation (there is already the default AND before)
 						$search .= ' NOT';
 					}

+ 112 - 2
tests/app/Models/SearchTest.php

@@ -284,13 +284,60 @@ class SearchTest extends PHPUnit\Framework\TestCase {
 		);
 	}
 
+	/**
+	 * @dataProvider provideAddOrParentheses
+	 */
+	public function test__addOrParentheses(string $input, string $output): void {
+		self::assertEquals($output, FreshRSS_BooleanSearch::addOrParentheses($input));
+	}
+
+	/** @return array<array{string,string}> */
+	public function provideAddOrParentheses(): array {
+		return [
+			['ab', 'ab'],
+			['ab cd', 'ab cd'],
+			['!ab -cd', '!ab -cd'],
+			['ab OR cd', '(ab) OR (cd)'],
+			['!ab OR -cd', '(!ab) OR (-cd)'],
+			['ab cd OR ef OR "gh ij"', '(ab cd) OR (ef) OR ("gh ij")'],
+			['ab (!cd)', 'ab (!cd)'],
+		];
+	}
+
+	/**
+	 * @dataProvider provideconsistentOrParentheses
+	 */
+	public function test__consistentOrParentheses(string $input, string $output): void {
+		self::assertEquals($output, FreshRSS_BooleanSearch::consistentOrParentheses($input));
+	}
+
+	/** @return array<array{string,string}> */
+	public function provideconsistentOrParentheses(): array {
+		return [
+			['ab cd ef', 'ab cd ef'],
+			['(ab cd ef)', '(ab cd ef)'],
+			['("ab cd" ef)', '("ab cd" ef)'],
+			['"ab cd" (ef gh) "ij kl"', '"ab cd" (ef gh) "ij kl"'],
+			['ab (!cd)', 'ab (!cd)'],
+			['ab !(cd)', 'ab !(cd)'],
+			['(ab) -(cd)', '(ab) -(cd)'],
+			['ab cd OR ef OR "gh ij"', 'ab cd OR ef OR "gh ij"'],
+			['"plain or text" OR (cd)', '("plain or text") OR (cd)'],
+			['(ab) OR cd OR ef OR (gh)', '(ab) OR (cd) OR (ef) OR (gh)'],
+			['(ab (cd OR ef)) OR gh OR ij OR (kl)', '(ab (cd OR ef)) OR (gh) OR (ij) OR (kl)'],
+			['(ab (cd OR ef OR (gh))) OR ij', '(ab ((cd) OR (ef) OR (gh))) OR (ij)'],
+			['(ab (!cd OR ef OR (gh))) OR ij', '(ab ((!cd) OR (ef) OR (gh))) OR (ij)'],
+			['(ab !(cd OR ef OR !(gh))) OR ij', '(ab !((cd) OR (ef) OR !(gh))) OR (ij)'],
+		];
+	}
+
 	/**
 	 * @dataProvider provideParentheses
 	 * @param array<string> $values
 	 */
-	public function test__construct_parentheses(string $input, string $sql, array $values): void {
+	public function test__parentheses(string $input, string $sql, array $values): void {
 		[$filterValues, $filterSearch] = FreshRSS_EntryDAOPGSQL::sqlBooleanSearch('e.', new FreshRSS_BooleanSearch($input));
-		self::assertEquals($sql, $filterSearch);
+		self::assertEquals(trim($sql), trim($filterSearch));
 		self::assertEquals($values, $filterValues);
 	}
 
@@ -337,6 +384,69 @@ class SearchTest extends PHPUnit\Framework\TestCase {
 				'(e.title LIKE ? )',
 				['%"hello world"%'],
 			],
+			[
+				'(ab) OR (cd) OR (ef)',
+				'(((e.title LIKE ? OR e.content LIKE ?) )) OR (((e.title LIKE ? OR e.content LIKE ?) )) OR (((e.title LIKE ? OR e.content LIKE ?) ))',
+				['%ab%', '%ab%', '%cd%', '%cd%', '%ef%', '%ef%'],
+			],
+			[
+				'("plain or text") OR (cd)',
+				'(((e.title LIKE ? OR e.content LIKE ?) )) OR (((e.title LIKE ? OR e.content LIKE ?) ))',
+				['%plain or text%', '%plain or text%', '%cd%', '%cd%'],
+			],
+			[
+				'"plain or text" OR cd',
+				'((e.title LIKE ? OR e.content LIKE ?) ) OR ((e.title LIKE ? OR e.content LIKE ?) )',
+				['%plain or text%', '%plain or text%', '%cd%', '%cd%'],
+			],
+			[
+				'"plain OR text" OR cd',
+				'((e.title LIKE ? OR e.content LIKE ?) ) OR ((e.title LIKE ? OR e.content LIKE ?) ) ',
+				['%plain OR text%', '%plain OR text%', '%cd%', '%cd%'],
+			],
+			[
+				'ab OR cd OR (ef)',
+				'(((e.title LIKE ? OR e.content LIKE ?) )) OR (((e.title LIKE ? OR e.content LIKE ?) )) OR (((e.title LIKE ? OR e.content LIKE ?) )) ',
+				['%ab%', '%ab%', '%cd%', '%cd%', '%ef%', '%ef%'],
+			],
+			[
+				'ab OR cd OR ef',
+				'((e.title LIKE ? OR e.content LIKE ?) ) OR ((e.title LIKE ? OR e.content LIKE ?) ) OR ((e.title LIKE ? OR e.content LIKE ?) )',
+				['%ab%', '%ab%', '%cd%', '%cd%', '%ef%', '%ef%'],
+			],
+			[
+				'(ab) cd OR ef OR (gh)',
+				'(((e.title LIKE ? OR e.content LIKE ?) )) AND (((e.title LIKE ? OR e.content LIKE ?) )) ' .
+				'OR (((e.title LIKE ? OR e.content LIKE ?) )) OR (((e.title LIKE ? OR e.content LIKE ?) ))',
+				['%ab%', '%ab%', '%cd%', '%cd%', '%ef%', '%ef%', '%gh%', '%gh%'],
+			],
+			[
+				'(ab) OR cd OR ef OR (gh)',
+				'(((e.title LIKE ? OR e.content LIKE ?) )) OR (((e.title LIKE ? OR e.content LIKE ?) )) ' .
+				'OR (((e.title LIKE ? OR e.content LIKE ?) )) OR (((e.title LIKE ? OR e.content LIKE ?) ))',
+				['%ab%', '%ab%', '%cd%', '%cd%', '%ef%', '%ef%', '%gh%', '%gh%'],
+			],
+			[
+				'ab OR (!(cd OR ef))',
+				'(((e.title LIKE ? OR e.content LIKE ?) )) OR (NOT (((e.title LIKE ? OR e.content LIKE ?) ) OR ((e.title LIKE ? OR e.content LIKE ?) )))',
+				['%ab%', '%ab%', '%cd%', '%cd%', '%ef%', '%ef%'],
+			],
+			[
+				'ab !(cd OR ef)',
+				'(((e.title LIKE ? OR e.content LIKE ?) )) AND NOT (((e.title LIKE ? OR e.content LIKE ?) ) OR ((e.title LIKE ? OR e.content LIKE ?) ))',
+				['%ab%', '%ab%', '%cd%', '%cd%', '%ef%', '%ef%'],
+			],
+			[
+				'ab OR !(cd OR ef)',
+				'(((e.title LIKE ? OR e.content LIKE ?) )) OR NOT (((e.title LIKE ? OR e.content LIKE ?) ) OR ((e.title LIKE ? OR e.content LIKE ?) ))',
+				['%ab%', '%ab%', '%cd%', '%cd%', '%ef%', '%ef%'],
+			],
+			[
+				'(ab (!cd OR ef OR (gh))) OR !(ij OR kl)',
+				'((((e.title LIKE ? OR e.content LIKE ?) )) AND (((e.title NOT LIKE ? AND e.content NOT LIKE ? )) OR (((e.title LIKE ? OR e.content LIKE ?) )) ' .
+				'OR (((e.title LIKE ? OR e.content LIKE ?) )))) OR NOT (((e.title LIKE ? OR e.content LIKE ?) ) OR ((e.title LIKE ? OR e.content LIKE ?) ))',
+				['%ab%', '%ab%', '%cd%', '%cd%', '%ef%', '%ef%', '%gh%', '%gh%', '%ij%', '%ij%', '%kl%', '%kl%'],
+			],
 		];
 	}
 }