4
0

BooleanSearch.php 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594
  1. <?php
  2. declare(strict_types=1);
  3. /**
  4. * Contains Boolean search from the search form.
  5. */
  6. class FreshRSS_BooleanSearch implements \Stringable {
  7. private string $raw_input = '';
  8. /** @var list<FreshRSS_BooleanSearch|FreshRSS_Search> */
  9. private array $searches = [];
  10. /**
  11. * @param string $input
  12. * @param int $level
  13. * @param 'AND'|'OR'|'AND NOT'|'OR NOT' $operator
  14. * @param bool $allowUserQueries
  15. */
  16. public function __construct(
  17. string $input,
  18. int $level = 0,
  19. private readonly string $operator = 'AND',
  20. bool $allowUserQueries = true,
  21. bool $expandUserQueries = true
  22. ) {
  23. $input = trim($input);
  24. $input = ltrim($input, ' )');
  25. $input = rtrim($input, ' (\\');
  26. if ($input === '') {
  27. return;
  28. }
  29. $this->raw_input = $input;
  30. if ($level === 0) {
  31. $input = self::escapeLiterals($input);
  32. if ($expandUserQueries || !$allowUserQueries) {
  33. $input = $this->parseUserQueryNames($input, $allowUserQueries);
  34. $input = $this->parseUserQueryIds($input, $allowUserQueries);
  35. }
  36. $input = trim($input);
  37. }
  38. $input = self::consistentOrParentheses($input);
  39. // Either parse everything as a series of BooleanSearch’s combined by implicit AND
  40. // or parse everything as a series of Search’s combined by explicit OR
  41. $this->parseParentheses($input, $level) || $this->parseOrSegments($input);
  42. }
  43. public function __clone() {
  44. foreach ($this->searches as $key => $search) {
  45. $this->searches[$key] = clone $search;
  46. }
  47. $this->expanded = null;
  48. $this->notExpanded = null;
  49. }
  50. /**
  51. * Parse the user queries (saved searches) by name and expand them in the input string.
  52. */
  53. private function parseUserQueryNames(string $input, bool $allowUserQueries = true): string {
  54. $all_matches = [];
  55. if (preg_match_all('/\bsearch:(?P<delim>[\'"])(?P<search>.*)(?P=delim)/U', $input, $matchesFound)) {
  56. $all_matches[] = $matchesFound;
  57. }
  58. if (preg_match_all('/\bsearch:(?P<search>[^\s"\']*)/', $input, $matchesFound)) {
  59. $all_matches[] = $matchesFound;
  60. }
  61. if (!empty($all_matches)) {
  62. $queries = [];
  63. foreach (FreshRSS_Context::userConf()->queries as $raw_query) {
  64. if (($raw_query['name'] ?? '') !== '' && ($raw_query['search'] ?? '') !== '') {
  65. $queries[$raw_query['name']] = trim($raw_query['search']);
  66. }
  67. }
  68. $fromS = [];
  69. $toS = [];
  70. foreach ($all_matches as $matches) {
  71. if (empty($matches['search'])) {
  72. continue;
  73. }
  74. for ($i = count($matches['search']) - 1; $i >= 0; $i--) {
  75. $name = trim($matches['search'][$i]);
  76. $name = self::unescapeLiterals($name);
  77. $fromS[] = $matches[0][$i];
  78. if ($allowUserQueries && !empty($queries[$name])) {
  79. $toS[] = '(' . self::escapeLiterals($queries[$name]) . ')';
  80. } else {
  81. $toS[] = '';
  82. }
  83. }
  84. }
  85. $input = str_replace($fromS, $toS, $input);
  86. }
  87. return $input;
  88. }
  89. /**
  90. * Parse the user queries (saved searches) by ID and expand them in the input string.
  91. */
  92. private function parseUserQueryIds(string $input, bool $allowUserQueries = true): string {
  93. $all_matches = [];
  94. if (preg_match_all('/\bS:(?P<search>[0-9,]+)/', $input, $matchesFound)) {
  95. $all_matches[] = $matchesFound;
  96. }
  97. if (!empty($all_matches)) {
  98. $queries = [];
  99. foreach (FreshRSS_Context::userConf()->queries as $raw_query) {
  100. $queries[] = trim($raw_query['search'] ?? '');
  101. }
  102. $fromS = [];
  103. $toS = [];
  104. foreach ($all_matches as $matches) {
  105. if (empty($matches['search'])) {
  106. continue;
  107. }
  108. for ($i = count($matches['search']) - 1; $i >= 0; $i--) {
  109. $ids = explode(',', $matches['search'][$i]);
  110. $ids = array_map('intval', $ids);
  111. $matchedQueries = [];
  112. foreach ($ids as $id) {
  113. if (!empty($queries[$id])) {
  114. $matchedQueries[] = $queries[$id];
  115. }
  116. }
  117. $fromS[] = $matches[0][$i];
  118. if ($allowUserQueries && !empty($matchedQueries)) {
  119. $escapedQueries = array_map(fn(string $query): string => self::escapeLiterals($query), $matchedQueries);
  120. $toS[] = '((' . implode(') OR (', $escapedQueries) . '))';
  121. } else {
  122. $toS[] = '';
  123. }
  124. }
  125. }
  126. $input = str_replace($fromS, $toS, $input);
  127. }
  128. return $input;
  129. }
  130. /**
  131. * Temporarily escape parentheses and 'OR' used in regex expressions or inside "quoted strings".
  132. */
  133. public static function escapeLiterals(string $input): string {
  134. return preg_replace_callback('%(?<=[\\s(:#!-]|^)(?<![\\\\])(?P<delim>[\'"/]).+?(?<!\\\\)(?P=delim)[im]*%',
  135. function (array $matches): string {
  136. $match = $matches[0];
  137. $match = str_replace(['(', ')'], ['\\u0028', '\\u0029'], $match);
  138. $match = preg_replace_callback('/\bOR\b/i', fn(array $ms): string =>
  139. str_replace(['O', 'o', 'R', 'r'], ['\\u004f', '\\u006f', '\\u0052', '\\u0072'], $ms[0]),
  140. $match
  141. ) ?? '';
  142. return $match;
  143. },
  144. $input
  145. ) ?? '';
  146. }
  147. public static function unescapeLiterals(string $input): string {
  148. return str_replace(
  149. ['\\u0028', '\\u0029', '\\u004f', '\\u006f', '\\u0052', '\\u0072'],
  150. ['(', ')', 'O', 'o', 'R', 'r'],
  151. $input
  152. );
  153. }
  154. /**
  155. * Example: 'ab cd OR ef OR "gh ij"' becomes '(ab cd) OR (ef) OR ("gh ij")'
  156. */
  157. public static function addOrParentheses(string $input): string {
  158. $input = trim($input);
  159. if ($input === '') {
  160. return '';
  161. }
  162. $splits = preg_split('/\b(OR)\b/i', $input, -1, PREG_SPLIT_DELIM_CAPTURE) ?: [];
  163. $ns = count($splits);
  164. if ($ns <= 1) {
  165. return $input;
  166. }
  167. $result = '';
  168. $segment = '';
  169. for ($i = 0; $i < $ns; $i++) {
  170. $segment .= $splits[$i];
  171. if (trim($segment) === '') {
  172. $segment = '';
  173. } elseif (strcasecmp($segment, 'OR') === 0) {
  174. $result .= $segment . ' ';
  175. $segment = '';
  176. } else {
  177. $quotes = substr_count($segment, '"') + substr_count($segment, '&quot;');
  178. if ($quotes % 2 === 0) {
  179. $segment = trim($segment);
  180. if (in_array($segment, ['!', '-'], true)) {
  181. $result .= $segment;
  182. } else {
  183. $result .= '(' . $segment . ') ';
  184. }
  185. $segment = '';
  186. }
  187. }
  188. }
  189. $segment = trim($segment);
  190. if (in_array($segment, ['!', '-'], true)) {
  191. $result .= $segment;
  192. } elseif ($segment !== '') {
  193. $result .= '(' . $segment . ')';
  194. }
  195. return trim($result);
  196. }
  197. /**
  198. * If the query contains a mix of `OR` expressions with and without parentheses,
  199. * then add parentheses to make the query consistent.
  200. * Example: '(ab (cd OR ef)) OR gh OR ij OR (kl)' becomes '(ab ((cd) OR (ef))) OR (gh) OR (ij) OR (kl)'
  201. */
  202. public static function consistentOrParentheses(string $input): string {
  203. if (!preg_match('/(?<!\\\\)\\(/', $input)) {
  204. // No unescaped parentheses in the input
  205. return trim($input);
  206. }
  207. $parenthesesCount = 0;
  208. $result = '';
  209. $segment = '';
  210. $length = strlen($input);
  211. for ($i = 0; $i < $length; $i++) {
  212. $c = $input[$i];
  213. $backslashed = $i >= 1 ? $input[$i - 1] === '\\' : false;
  214. if (!$backslashed) {
  215. if ($c === '(') {
  216. if ($parenthesesCount === 0) {
  217. if ($segment !== '') {
  218. $result = rtrim($result) . ' ' . self::addOrParentheses($segment);
  219. $negation = preg_match('/[!-]$/', $result);
  220. if (!$negation) {
  221. $result .= ' ';
  222. }
  223. $segment = '';
  224. }
  225. $c = '';
  226. }
  227. $parenthesesCount++;
  228. } elseif ($c === ')') {
  229. $parenthesesCount--;
  230. if ($parenthesesCount === 0) {
  231. $segment = self::consistentOrParentheses($segment);
  232. if ($segment !== '') {
  233. $result .= '(' . $segment . ')';
  234. $segment = '';
  235. }
  236. $c = '';
  237. }
  238. }
  239. }
  240. $segment .= $c;
  241. }
  242. if (trim($segment) !== '') {
  243. $result = rtrim($result);
  244. $negation = preg_match('/[!-]$/', $segment);
  245. if (!$negation) {
  246. $result .= ' ';
  247. }
  248. $result .= self::addOrParentheses($segment);
  249. }
  250. return trim($result);
  251. }
  252. /** @return bool True if some parenthesis logic took over, false otherwise */
  253. private function parseParentheses(string $input, int $level): bool {
  254. $input = trim($input);
  255. $length = strlen($input);
  256. $i = 0;
  257. $before = '';
  258. $hasParenthesis = false;
  259. $nextOperator = 'AND';
  260. while ($i < $length) {
  261. $c = $input[$i];
  262. $backslashed = $i >= 1 ? $input[$i - 1] === '\\' : false;
  263. if ($c === '(' && !$backslashed) {
  264. $hasParenthesis = true;
  265. $before = trim($before);
  266. if (preg_match('/[!-]$/', $before)) {
  267. // Trim trailing negation
  268. $before = rtrim($before, ' !-');
  269. $isOr = preg_match('/\bOR$/i', $before);
  270. if ($isOr) {
  271. // Trim trailing OR
  272. $before = substr($before, 0, -2);
  273. }
  274. // The text prior to the negation is a BooleanSearch
  275. $searchBefore = new FreshRSS_BooleanSearch($before, $level + 1, $nextOperator);
  276. if (count($searchBefore->searches()) > 0) {
  277. $this->searches[] = $searchBefore;
  278. }
  279. $before = '';
  280. // The next BooleanSearch will have to be combined with AND NOT or OR NOT instead of default AND
  281. $nextOperator = $isOr ? 'OR NOT' : 'AND NOT';
  282. } elseif (preg_match('/\bOR$/i', $before)) {
  283. // Trim trailing OR
  284. $before = substr($before, 0, -2);
  285. // The text prior to the OR is a BooleanSearch
  286. $searchBefore = new FreshRSS_BooleanSearch($before, $level + 1, $nextOperator);
  287. if (count($searchBefore->searches()) > 0) {
  288. $this->searches[] = $searchBefore;
  289. }
  290. $before = '';
  291. // The next BooleanSearch will have to be combined with OR instead of default AND
  292. $nextOperator = 'OR';
  293. } elseif ($before !== '') {
  294. // The text prior to the opening parenthesis is a BooleanSearch
  295. $searchBefore = new FreshRSS_BooleanSearch($before, $level + 1, $nextOperator);
  296. if (count($searchBefore->searches()) > 0) {
  297. $this->searches[] = $searchBefore;
  298. }
  299. $before = '';
  300. }
  301. // Search the matching closing parenthesis
  302. $parentheses = 1;
  303. $sub = '';
  304. $i++;
  305. while ($i < $length) {
  306. $c = $input[$i];
  307. $backslashed = $input[$i - 1] === '\\';
  308. if ($c === '(' && !$backslashed) {
  309. // One nested level deeper
  310. $parentheses++;
  311. $sub .= $c;
  312. } elseif ($c === ')' && !$backslashed) {
  313. $parentheses--;
  314. if ($parentheses === 0) {
  315. // Found the matching closing parenthesis
  316. $searchSub = new FreshRSS_BooleanSearch($sub, $level + 1, $nextOperator);
  317. $nextOperator = 'AND';
  318. if (count($searchSub->searches()) > 0) {
  319. $this->searches[] = $searchSub;
  320. }
  321. $sub = '';
  322. break;
  323. } else {
  324. $sub .= $c;
  325. }
  326. } else {
  327. $sub .= $c;
  328. }
  329. $i++;
  330. }
  331. // $sub = trim($sub);
  332. // if ($sub !== '') {
  333. // // TODO: Consider throwing an error or warning in case of non-matching parenthesis
  334. // }
  335. // } elseif ($c === ')') {
  336. // // TODO: Consider throwing an error or warning in case of non-matching parenthesis
  337. } else {
  338. $before .= $c;
  339. }
  340. $i++;
  341. }
  342. if ($hasParenthesis) {
  343. $before = trim($before);
  344. if (preg_match('/^OR\b/i', $before)) {
  345. // The next BooleanSearch will have to be combined with OR instead of default AND
  346. $nextOperator = 'OR';
  347. // Trim leading OR
  348. $before = substr($before, 2);
  349. }
  350. // The remaining text after the last parenthesis is a BooleanSearch
  351. $searchBefore = new FreshRSS_BooleanSearch($before, $level + 1, $nextOperator);
  352. $nextOperator = 'AND';
  353. if (count($searchBefore->searches()) > 0) {
  354. $this->searches[] = $searchBefore;
  355. }
  356. return true;
  357. }
  358. // There was no parenthesis logic to apply
  359. return false;
  360. }
  361. private function parseOrSegments(string $input): void {
  362. $input = trim($input);
  363. if ($input === '') {
  364. return;
  365. }
  366. $splits = preg_split('/\b(OR)\b/i', $input, -1, PREG_SPLIT_DELIM_CAPTURE) ?: [];
  367. $segment = '';
  368. $ns = count($splits);
  369. for ($i = 0; $i < $ns; $i++) {
  370. $segment = $segment . $splits[$i];
  371. if (trim($segment) === '' || strcasecmp($segment, 'OR') === 0) {
  372. $segment = '';
  373. } else {
  374. $quotes = substr_count($segment, '"') + substr_count($segment, '&quot;');
  375. if ($quotes % 2 === 0) {
  376. $segment = trim($segment);
  377. $this->searches[] = new FreshRSS_Search($segment);
  378. $segment = '';
  379. }
  380. }
  381. }
  382. $segment = trim($segment);
  383. if ($segment !== '') {
  384. $this->searches[] = new FreshRSS_Search($segment);
  385. }
  386. }
  387. /**
  388. * Either a list of FreshRSS_BooleanSearch combined by implicit AND
  389. * or a series of FreshRSS_Search combined by explicit OR
  390. * @return list<FreshRSS_BooleanSearch|FreshRSS_Search>
  391. */
  392. public function searches(): array {
  393. return $this->searches;
  394. }
  395. /** @return 'AND'|'OR'|'AND NOT'|'OR NOT' depending on how this BooleanSearch should be combined */
  396. public function operator(): string {
  397. return $this->operator;
  398. }
  399. /** @param FreshRSS_BooleanSearch|FreshRSS_Search $search */
  400. public function prepend(FreshRSS_BooleanSearch|FreshRSS_Search $search): void {
  401. array_unshift($this->searches, $search);
  402. }
  403. /** @param FreshRSS_BooleanSearch|FreshRSS_Search $search */
  404. public function add(FreshRSS_BooleanSearch|FreshRSS_Search $search): void {
  405. $this->searches[] = $search;
  406. }
  407. /**
  408. * Modify the first compatible search of the Boolean expression, or add it at the beginning.
  409. * Useful to modify some search parameters.
  410. * @return FreshRSS_BooleanSearch a new instance, modified.
  411. */
  412. public function enforce(FreshRSS_Search $search): self {
  413. $result = clone $this;
  414. $result->raw_input = '';
  415. $result->expanded = null;
  416. $result->notExpanded = null;
  417. if (count($result->searches) === 1 && $result->searches[0] instanceof FreshRSS_Search) {
  418. $result->searches[0] = $result->searches[0]->enforce($search);
  419. return $result;
  420. }
  421. if (count($result->searches) === 2) {
  422. foreach ($result->searches as $booleanSearch) {
  423. if (!($booleanSearch instanceof FreshRSS_BooleanSearch)) {
  424. break;
  425. }
  426. if ($booleanSearch->operator() === 'AND') {
  427. if (count($booleanSearch->searches) === 1 && $booleanSearch->searches[0] instanceof FreshRSS_Search &&
  428. $booleanSearch->searches[0]->hasSameOperators($search)) {
  429. $booleanSearch->searches[0] = $search;
  430. return $result;
  431. }
  432. }
  433. }
  434. }
  435. if (count($result->searches) > 1 || (count($result->searches) > 0 && $result->searches[0] instanceof FreshRSS_Search)) {
  436. // Wrap the existing searches in a new BooleanSearch if needed
  437. $wrap = new FreshRSS_BooleanSearch('');
  438. foreach ($result->searches as $existingSearch) {
  439. $wrap->add($existingSearch);
  440. }
  441. if (count($wrap->searches) > 0) {
  442. $result->searches = [$wrap];
  443. }
  444. }
  445. array_unshift($result->searches, $search);
  446. return $result;
  447. }
  448. /**
  449. * Remove the first compatible search of the Boolean expression, if any.
  450. * Useful to modify some search parameters.
  451. * @return FreshRSS_BooleanSearch a new instance, modified.
  452. */
  453. public function remove(FreshRSS_Search $search): self {
  454. $result = clone $this;
  455. $result->raw_input = '';
  456. $result->expanded = null;
  457. $result->notExpanded = null;
  458. if (count($result->searches) === 1 && $result->searches[0] instanceof FreshRSS_Search) {
  459. $result->searches[0] = $result->searches[0]->remove($search);
  460. return $result;
  461. }
  462. if (count($result->searches) === 2) {
  463. foreach ($result->searches as $booleanSearch) {
  464. if (!($booleanSearch instanceof FreshRSS_BooleanSearch)) {
  465. break;
  466. }
  467. if ($booleanSearch->operator() === 'AND') {
  468. if (count($booleanSearch->searches) === 1 && $booleanSearch->searches[0] instanceof FreshRSS_Search &&
  469. $booleanSearch->searches[0]->hasSameOperators($search)) {
  470. array_shift($booleanSearch->searches);
  471. return $result;
  472. }
  473. }
  474. }
  475. }
  476. return $result;
  477. }
  478. /**
  479. * Return the minimum visibility (priority) level needed for this Boolean search, or null if it does not require any specific visibility level.
  480. * For instance, if the search includes some feed IDs then it will return PRIORITY_HIDDEN,
  481. * and if it includes some category IDs then it will return PRIORITY_CATEGORY.
  482. */
  483. public function needVisibility(): ?int {
  484. $minVisibility = FreshRSS_Feed::PRIORITY_IMPORTANT + 1;
  485. foreach ($this->searches as $search) {
  486. if ($search instanceof FreshRSS_BooleanSearch) {
  487. $visibility = $search->needVisibility();
  488. if ($visibility !== null) {
  489. $minVisibility = min($minVisibility, $visibility);
  490. }
  491. } elseif ($search instanceof FreshRSS_Search) {
  492. $visibility = $search->needVisibility();
  493. if ($visibility !== null) {
  494. $minVisibility = min($minVisibility, $visibility);
  495. }
  496. }
  497. }
  498. return $minVisibility < FreshRSS_Feed::PRIORITY_IMPORTANT ? $minVisibility : null;
  499. }
  500. private ?string $expanded = null;
  501. #[\Override]
  502. public function __toString(): string {
  503. if ($this->expanded === null) {
  504. $result = '';
  505. foreach ($this->searches as $search) {
  506. $part = $search->__toString();
  507. if ($part === '') {
  508. continue;
  509. }
  510. $operator = $search instanceof FreshRSS_BooleanSearch ? $search->operator : 'OR';
  511. if ((str_contains($part, ' ') || str_starts_with($part, '-')) && (count($this->searches) > 1 || in_array($operator, ['OR NOT', 'AND NOT'], true))) {
  512. $part = '(' . $part . ')';
  513. }
  514. $result .= match ($operator) {
  515. 'OR' => $result === '' ? '' : ' OR ',
  516. 'OR NOT' => $result === '' ? '-' : ' OR -',
  517. 'AND NOT' => $result === '' ? '-' : ' -',
  518. 'AND' => $result === '' ? '' : ' ',
  519. default => throw new InvalidArgumentException('Invalid operator: ' . $operator),
  520. } . $part;
  521. }
  522. $this->expanded = trim($result);
  523. }
  524. return $this->expanded;
  525. }
  526. private ?string $notExpanded = null;
  527. /**
  528. * @param bool $expandUserQueries Whether to expand user queries (saved searches) or not
  529. */
  530. public function toString(bool $expandUserQueries = true): string {
  531. if ($expandUserQueries) {
  532. return $this->__toString();
  533. }
  534. if ($this->notExpanded === null) {
  535. $this->notExpanded = (new FreshRSS_BooleanSearch($this->raw_input, expandUserQueries: false))->__toString();
  536. }
  537. return $this->notExpanded;
  538. }
  539. /** @return string Plain text search query. Must be XML-encoded or URL-encoded depending on the situation */
  540. #[Deprecated('Use __toString(expanded: false) instead')]
  541. public function getRawInput(): string {
  542. return $this->raw_input;
  543. }
  544. }