4
0

BooleanSearch.php 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592
  1. <?php
  2. declare(strict_types=1);
  3. /**
  4. * Contains Boolean search from the search form.
  5. */
  6. class FreshRSS_BooleanSearch implements \Stringable {
  7. private string $raw_input = '';
  8. /** @var list<FreshRSS_BooleanSearch|FreshRSS_Search> */
  9. private array $searches = [];
  10. /**
  11. * @param string $input
  12. * @param int $level
  13. * @param 'AND'|'OR'|'AND NOT'|'OR NOT' $operator
  14. * @param bool $allowUserQueries
  15. */
  16. public function __construct(
  17. string $input,
  18. int $level = 0,
  19. private readonly string $operator = 'AND',
  20. bool $allowUserQueries = true,
  21. bool $expandUserQueries = true
  22. ) {
  23. $input = trim($input);
  24. if ($input === '') {
  25. return;
  26. }
  27. $this->raw_input = $input;
  28. if ($level === 0) {
  29. $input = self::escapeLiterals($input);
  30. if ($expandUserQueries || !$allowUserQueries) {
  31. $input = $this->parseUserQueryNames($input, $allowUserQueries);
  32. $input = $this->parseUserQueryIds($input, $allowUserQueries);
  33. }
  34. $input = trim($input);
  35. }
  36. $input = self::consistentOrParentheses($input);
  37. // Either parse everything as a series of BooleanSearch’s combined by implicit AND
  38. // or parse everything as a series of Search’s combined by explicit OR
  39. $this->parseParentheses($input, $level) || $this->parseOrSegments($input);
  40. }
  41. public function __clone() {
  42. foreach ($this->searches as $key => $search) {
  43. $this->searches[$key] = clone $search;
  44. }
  45. $this->expanded = null;
  46. $this->notExpanded = null;
  47. }
  48. /**
  49. * Parse the user queries (saved searches) by name and expand them in the input string.
  50. */
  51. private function parseUserQueryNames(string $input, bool $allowUserQueries = true): string {
  52. $all_matches = [];
  53. if (preg_match_all('/\bsearch:(?P<delim>[\'"])(?P<search>.*)(?P=delim)/U', $input, $matchesFound)) {
  54. $all_matches[] = $matchesFound;
  55. }
  56. if (preg_match_all('/\bsearch:(?P<search>[^\s"\']*)/', $input, $matchesFound)) {
  57. $all_matches[] = $matchesFound;
  58. }
  59. if (!empty($all_matches)) {
  60. $queries = [];
  61. foreach (FreshRSS_Context::userConf()->queries as $raw_query) {
  62. if (($raw_query['name'] ?? '') !== '' && ($raw_query['search'] ?? '') !== '') {
  63. $queries[$raw_query['name']] = trim($raw_query['search']);
  64. }
  65. }
  66. $fromS = [];
  67. $toS = [];
  68. foreach ($all_matches as $matches) {
  69. if (empty($matches['search'])) {
  70. continue;
  71. }
  72. for ($i = count($matches['search']) - 1; $i >= 0; $i--) {
  73. $name = trim($matches['search'][$i]);
  74. $name = self::unescapeLiterals($name);
  75. $fromS[] = $matches[0][$i];
  76. if ($allowUserQueries && !empty($queries[$name])) {
  77. $toS[] = '(' . self::escapeLiterals($queries[$name]) . ')';
  78. } else {
  79. $toS[] = '';
  80. }
  81. }
  82. }
  83. $input = str_replace($fromS, $toS, $input);
  84. }
  85. return $input;
  86. }
  87. /**
  88. * Parse the user queries (saved searches) by ID and expand them in the input string.
  89. */
  90. private function parseUserQueryIds(string $input, bool $allowUserQueries = true): string {
  91. $all_matches = [];
  92. if (preg_match_all('/\bS:(?P<search>[0-9,]+)/', $input, $matchesFound)) {
  93. $all_matches[] = $matchesFound;
  94. }
  95. if (!empty($all_matches)) {
  96. $queries = [];
  97. foreach (FreshRSS_Context::userConf()->queries as $raw_query) {
  98. $queries[] = trim($raw_query['search'] ?? '');
  99. }
  100. $fromS = [];
  101. $toS = [];
  102. foreach ($all_matches as $matches) {
  103. if (empty($matches['search'])) {
  104. continue;
  105. }
  106. for ($i = count($matches['search']) - 1; $i >= 0; $i--) {
  107. $ids = explode(',', $matches['search'][$i]);
  108. $ids = array_map('intval', $ids);
  109. $matchedQueries = [];
  110. foreach ($ids as $id) {
  111. if (!empty($queries[$id])) {
  112. $matchedQueries[] = $queries[$id];
  113. }
  114. }
  115. $fromS[] = $matches[0][$i];
  116. if ($allowUserQueries && !empty($matchedQueries)) {
  117. $escapedQueries = array_map(fn(string $query): string => self::escapeLiterals($query), $matchedQueries);
  118. $toS[] = '((' . implode(') OR (', $escapedQueries) . '))';
  119. } else {
  120. $toS[] = '';
  121. }
  122. }
  123. }
  124. $input = str_replace($fromS, $toS, $input);
  125. }
  126. return $input;
  127. }
  128. /**
  129. * Temporarily escape parentheses and 'OR' used in regex expressions or inside "quoted strings".
  130. */
  131. public static function escapeLiterals(string $input): string {
  132. return preg_replace_callback('%(?<=[\\s(:#!-]|^)(?<![\\\\])(?P<delim>[\'"/]).+?(?<!\\\\)(?P=delim)[im]*%',
  133. function (array $matches): string {
  134. $match = $matches[0];
  135. $match = str_replace(['(', ')'], ['\\u0028', '\\u0029'], $match);
  136. $match = preg_replace_callback('/\bOR\b/i', fn(array $ms): string =>
  137. str_replace(['O', 'o', 'R', 'r'], ['\\u004f', '\\u006f', '\\u0052', '\\u0072'], $ms[0]),
  138. $match
  139. ) ?? '';
  140. return $match;
  141. },
  142. $input
  143. ) ?? '';
  144. }
  145. public static function unescapeLiterals(string $input): string {
  146. return str_replace(
  147. ['\\u0028', '\\u0029', '\\u004f', '\\u006f', '\\u0052', '\\u0072'],
  148. ['(', ')', 'O', 'o', 'R', 'r'],
  149. $input
  150. );
  151. }
  152. /**
  153. * Example: 'ab cd OR ef OR "gh ij"' becomes '(ab cd) OR (ef) OR ("gh ij")'
  154. */
  155. public static function addOrParentheses(string $input): string {
  156. $input = trim($input);
  157. if ($input === '') {
  158. return '';
  159. }
  160. $splits = preg_split('/\b(OR)\b/i', $input, -1, PREG_SPLIT_DELIM_CAPTURE) ?: [];
  161. $ns = count($splits);
  162. if ($ns <= 1) {
  163. return $input;
  164. }
  165. $result = '';
  166. $segment = '';
  167. for ($i = 0; $i < $ns; $i++) {
  168. $segment .= $splits[$i];
  169. if (trim($segment) === '') {
  170. $segment = '';
  171. } elseif (strcasecmp($segment, 'OR') === 0) {
  172. $result .= $segment . ' ';
  173. $segment = '';
  174. } else {
  175. $quotes = substr_count($segment, '"') + substr_count($segment, '&quot;');
  176. if ($quotes % 2 === 0) {
  177. $segment = trim($segment);
  178. if (in_array($segment, ['!', '-'], true)) {
  179. $result .= $segment;
  180. } else {
  181. $result .= '(' . $segment . ') ';
  182. }
  183. $segment = '';
  184. }
  185. }
  186. }
  187. $segment = trim($segment);
  188. if (in_array($segment, ['!', '-'], true)) {
  189. $result .= $segment;
  190. } elseif ($segment !== '') {
  191. $result .= '(' . $segment . ')';
  192. }
  193. return trim($result);
  194. }
  195. /**
  196. * If the query contains a mix of `OR` expressions with and without parentheses,
  197. * then add parentheses to make the query consistent.
  198. * Example: '(ab (cd OR ef)) OR gh OR ij OR (kl)' becomes '(ab ((cd) OR (ef))) OR (gh) OR (ij) OR (kl)'
  199. */
  200. public static function consistentOrParentheses(string $input): string {
  201. if (!preg_match('/(?<!\\\\)\\(/', $input)) {
  202. // No unescaped parentheses in the input
  203. return trim($input);
  204. }
  205. $parenthesesCount = 0;
  206. $result = '';
  207. $segment = '';
  208. $length = strlen($input);
  209. for ($i = 0; $i < $length; $i++) {
  210. $c = $input[$i];
  211. $backslashed = $i >= 1 ? $input[$i - 1] === '\\' : false;
  212. if (!$backslashed) {
  213. if ($c === '(') {
  214. if ($parenthesesCount === 0) {
  215. if ($segment !== '') {
  216. $result = rtrim($result) . ' ' . self::addOrParentheses($segment);
  217. $negation = preg_match('/[!-]$/', $result);
  218. if (!$negation) {
  219. $result .= ' ';
  220. }
  221. $segment = '';
  222. }
  223. $c = '';
  224. }
  225. $parenthesesCount++;
  226. } elseif ($c === ')') {
  227. $parenthesesCount--;
  228. if ($parenthesesCount === 0) {
  229. $segment = self::consistentOrParentheses($segment);
  230. if ($segment !== '') {
  231. $result .= '(' . $segment . ')';
  232. $segment = '';
  233. }
  234. $c = '';
  235. }
  236. }
  237. }
  238. $segment .= $c;
  239. }
  240. if (trim($segment) !== '') {
  241. $result = rtrim($result);
  242. $negation = preg_match('/[!-]$/', $segment);
  243. if (!$negation) {
  244. $result .= ' ';
  245. }
  246. $result .= self::addOrParentheses($segment);
  247. }
  248. return trim($result);
  249. }
  250. /** @return bool True if some parenthesis logic took over, false otherwise */
  251. private function parseParentheses(string $input, int $level): bool {
  252. $input = trim($input);
  253. $length = strlen($input);
  254. $i = 0;
  255. $before = '';
  256. $hasParenthesis = false;
  257. $nextOperator = 'AND';
  258. while ($i < $length) {
  259. $c = $input[$i];
  260. $backslashed = $i >= 1 ? $input[$i - 1] === '\\' : false;
  261. if ($c === '(' && !$backslashed) {
  262. $hasParenthesis = true;
  263. $before = trim($before);
  264. if (preg_match('/[!-]$/', $before)) {
  265. // Trim trailing negation
  266. $before = rtrim($before, ' !-');
  267. $isOr = preg_match('/\bOR$/i', $before);
  268. if ($isOr) {
  269. // Trim trailing OR
  270. $before = substr($before, 0, -2);
  271. }
  272. // The text prior to the negation is a BooleanSearch
  273. $searchBefore = new FreshRSS_BooleanSearch($before, $level + 1, $nextOperator);
  274. if (count($searchBefore->searches()) > 0) {
  275. $this->searches[] = $searchBefore;
  276. }
  277. $before = '';
  278. // The next BooleanSearch will have to be combined with AND NOT or OR NOT instead of default AND
  279. $nextOperator = $isOr ? 'OR NOT' : 'AND NOT';
  280. } elseif (preg_match('/\bOR$/i', $before)) {
  281. // Trim trailing OR
  282. $before = substr($before, 0, -2);
  283. // The text prior to the OR is a BooleanSearch
  284. $searchBefore = new FreshRSS_BooleanSearch($before, $level + 1, $nextOperator);
  285. if (count($searchBefore->searches()) > 0) {
  286. $this->searches[] = $searchBefore;
  287. }
  288. $before = '';
  289. // The next BooleanSearch will have to be combined with OR instead of default AND
  290. $nextOperator = 'OR';
  291. } elseif ($before !== '') {
  292. // The text prior to the opening parenthesis is a BooleanSearch
  293. $searchBefore = new FreshRSS_BooleanSearch($before, $level + 1, $nextOperator);
  294. if (count($searchBefore->searches()) > 0) {
  295. $this->searches[] = $searchBefore;
  296. }
  297. $before = '';
  298. }
  299. // Search the matching closing parenthesis
  300. $parentheses = 1;
  301. $sub = '';
  302. $i++;
  303. while ($i < $length) {
  304. $c = $input[$i];
  305. $backslashed = $input[$i - 1] === '\\';
  306. if ($c === '(' && !$backslashed) {
  307. // One nested level deeper
  308. $parentheses++;
  309. $sub .= $c;
  310. } elseif ($c === ')' && !$backslashed) {
  311. $parentheses--;
  312. if ($parentheses === 0) {
  313. // Found the matching closing parenthesis
  314. $searchSub = new FreshRSS_BooleanSearch($sub, $level + 1, $nextOperator);
  315. $nextOperator = 'AND';
  316. if (count($searchSub->searches()) > 0) {
  317. $this->searches[] = $searchSub;
  318. }
  319. $sub = '';
  320. break;
  321. } else {
  322. $sub .= $c;
  323. }
  324. } else {
  325. $sub .= $c;
  326. }
  327. $i++;
  328. }
  329. // $sub = trim($sub);
  330. // if ($sub !== '') {
  331. // // TODO: Consider throwing an error or warning in case of non-matching parenthesis
  332. // }
  333. // } elseif ($c === ')') {
  334. // // TODO: Consider throwing an error or warning in case of non-matching parenthesis
  335. } else {
  336. $before .= $c;
  337. }
  338. $i++;
  339. }
  340. if ($hasParenthesis) {
  341. $before = trim($before);
  342. if (preg_match('/^OR\b/i', $before)) {
  343. // The next BooleanSearch will have to be combined with OR instead of default AND
  344. $nextOperator = 'OR';
  345. // Trim leading OR
  346. $before = substr($before, 2);
  347. }
  348. // The remaining text after the last parenthesis is a BooleanSearch
  349. $searchBefore = new FreshRSS_BooleanSearch($before, $level + 1, $nextOperator);
  350. $nextOperator = 'AND';
  351. if (count($searchBefore->searches()) > 0) {
  352. $this->searches[] = $searchBefore;
  353. }
  354. return true;
  355. }
  356. // There was no parenthesis logic to apply
  357. return false;
  358. }
  359. private function parseOrSegments(string $input): void {
  360. $input = trim($input);
  361. if ($input === '') {
  362. return;
  363. }
  364. $splits = preg_split('/\b(OR)\b/i', $input, -1, PREG_SPLIT_DELIM_CAPTURE) ?: [];
  365. $segment = '';
  366. $ns = count($splits);
  367. for ($i = 0; $i < $ns; $i++) {
  368. $segment = $segment . $splits[$i];
  369. if (trim($segment) === '' || strcasecmp($segment, 'OR') === 0) {
  370. $segment = '';
  371. } else {
  372. $quotes = substr_count($segment, '"') + substr_count($segment, '&quot;');
  373. if ($quotes % 2 === 0) {
  374. $segment = trim($segment);
  375. $this->searches[] = new FreshRSS_Search($segment);
  376. $segment = '';
  377. }
  378. }
  379. }
  380. $segment = trim($segment);
  381. if ($segment !== '') {
  382. $this->searches[] = new FreshRSS_Search($segment);
  383. }
  384. }
  385. /**
  386. * Either a list of FreshRSS_BooleanSearch combined by implicit AND
  387. * or a series of FreshRSS_Search combined by explicit OR
  388. * @return list<FreshRSS_BooleanSearch|FreshRSS_Search>
  389. */
  390. public function searches(): array {
  391. return $this->searches;
  392. }
  393. /** @return 'AND'|'OR'|'AND NOT'|'OR NOT' depending on how this BooleanSearch should be combined */
  394. public function operator(): string {
  395. return $this->operator;
  396. }
  397. /** @param FreshRSS_BooleanSearch|FreshRSS_Search $search */
  398. public function prepend(FreshRSS_BooleanSearch|FreshRSS_Search $search): void {
  399. array_unshift($this->searches, $search);
  400. }
  401. /** @param FreshRSS_BooleanSearch|FreshRSS_Search $search */
  402. public function add(FreshRSS_BooleanSearch|FreshRSS_Search $search): void {
  403. $this->searches[] = $search;
  404. }
  405. /**
  406. * Modify the first compatible search of the Boolean expression, or add it at the beginning.
  407. * Useful to modify some search parameters.
  408. * @return FreshRSS_BooleanSearch a new instance, modified.
  409. */
  410. public function enforce(FreshRSS_Search $search): self {
  411. $result = clone $this;
  412. $result->raw_input = '';
  413. $result->expanded = null;
  414. $result->notExpanded = null;
  415. if (count($result->searches) === 1 && $result->searches[0] instanceof FreshRSS_Search) {
  416. $result->searches[0] = $result->searches[0]->enforce($search);
  417. return $result;
  418. }
  419. if (count($result->searches) === 2) {
  420. foreach ($result->searches as $booleanSearch) {
  421. if (!($booleanSearch instanceof FreshRSS_BooleanSearch)) {
  422. break;
  423. }
  424. if ($booleanSearch->operator() === 'AND') {
  425. if (count($booleanSearch->searches) === 1 && $booleanSearch->searches[0] instanceof FreshRSS_Search &&
  426. $booleanSearch->searches[0]->hasSameOperators($search)) {
  427. $booleanSearch->searches[0] = $search;
  428. return $result;
  429. }
  430. }
  431. }
  432. }
  433. if (count($result->searches) > 1 || (count($result->searches) > 0 && $result->searches[0] instanceof FreshRSS_Search)) {
  434. // Wrap the existing searches in a new BooleanSearch if needed
  435. $wrap = new FreshRSS_BooleanSearch('');
  436. foreach ($result->searches as $existingSearch) {
  437. $wrap->add($existingSearch);
  438. }
  439. if (count($wrap->searches) > 0) {
  440. $result->searches = [$wrap];
  441. }
  442. }
  443. array_unshift($result->searches, $search);
  444. return $result;
  445. }
  446. /**
  447. * Remove the first compatible search of the Boolean expression, if any.
  448. * Useful to modify some search parameters.
  449. * @return FreshRSS_BooleanSearch a new instance, modified.
  450. */
  451. public function remove(FreshRSS_Search $search): self {
  452. $result = clone $this;
  453. $result->raw_input = '';
  454. $result->expanded = null;
  455. $result->notExpanded = null;
  456. if (count($result->searches) === 1 && $result->searches[0] instanceof FreshRSS_Search) {
  457. $result->searches[0] = $result->searches[0]->remove($search);
  458. return $result;
  459. }
  460. if (count($result->searches) === 2) {
  461. foreach ($result->searches as $booleanSearch) {
  462. if (!($booleanSearch instanceof FreshRSS_BooleanSearch)) {
  463. break;
  464. }
  465. if ($booleanSearch->operator() === 'AND') {
  466. if (count($booleanSearch->searches) === 1 && $booleanSearch->searches[0] instanceof FreshRSS_Search &&
  467. $booleanSearch->searches[0]->hasSameOperators($search)) {
  468. array_shift($booleanSearch->searches);
  469. return $result;
  470. }
  471. }
  472. }
  473. }
  474. return $result;
  475. }
  476. /**
  477. * Return the minimum visibility (priority) level needed for this Boolean search, or null if it does not require any specific visibility level.
  478. * For instance, if the search includes some feed IDs then it will return PRIORITY_HIDDEN,
  479. * and if it includes some category IDs then it will return PRIORITY_CATEGORY.
  480. */
  481. public function needVisibility(): ?int {
  482. $minVisibility = FreshRSS_Feed::PRIORITY_IMPORTANT + 1;
  483. foreach ($this->searches as $search) {
  484. if ($search instanceof FreshRSS_BooleanSearch) {
  485. $visibility = $search->needVisibility();
  486. if ($visibility !== null) {
  487. $minVisibility = min($minVisibility, $visibility);
  488. }
  489. } elseif ($search instanceof FreshRSS_Search) {
  490. $visibility = $search->needVisibility();
  491. if ($visibility !== null) {
  492. $minVisibility = min($minVisibility, $visibility);
  493. }
  494. }
  495. }
  496. return $minVisibility < FreshRSS_Feed::PRIORITY_IMPORTANT ? $minVisibility : null;
  497. }
  498. private ?string $expanded = null;
  499. #[\Override]
  500. public function __toString(): string {
  501. if ($this->expanded === null) {
  502. $result = '';
  503. foreach ($this->searches as $search) {
  504. $part = $search->__toString();
  505. if ($part === '') {
  506. continue;
  507. }
  508. $operator = $search instanceof FreshRSS_BooleanSearch ? $search->operator : 'OR';
  509. if ((str_contains($part, ' ') || str_starts_with($part, '-')) && (count($this->searches) > 1 || in_array($operator, ['OR NOT', 'AND NOT'], true))) {
  510. $part = '(' . $part . ')';
  511. }
  512. $result .= match ($operator) {
  513. 'OR' => $result === '' ? '' : ' OR ',
  514. 'OR NOT' => $result === '' ? '-' : ' OR -',
  515. 'AND NOT' => $result === '' ? '-' : ' -',
  516. 'AND' => $result === '' ? '' : ' ',
  517. default => throw new InvalidArgumentException('Invalid operator: ' . $operator),
  518. } . $part;
  519. }
  520. $this->expanded = trim($result);
  521. }
  522. return $this->expanded;
  523. }
  524. private ?string $notExpanded = null;
  525. /**
  526. * @param bool $expandUserQueries Whether to expand user queries (saved searches) or not
  527. */
  528. public function toString(bool $expandUserQueries = true): string {
  529. if ($expandUserQueries) {
  530. return $this->__toString();
  531. }
  532. if ($this->notExpanded === null) {
  533. $this->notExpanded = (new FreshRSS_BooleanSearch($this->raw_input, expandUserQueries: false))->__toString();
  534. }
  535. return $this->notExpanded;
  536. }
  537. /** @return string Plain text search query. Must be XML-encoded or URL-encoded depending on the situation */
  538. #[Deprecated('Use __toString(expanded: false) instead')]
  539. public function getRawInput(): string {
  540. return $this->raw_input;
  541. }
  542. }