4
0

BooleanSearch.php 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568
  1. <?php
  2. declare(strict_types=1);
  3. /**
  4. * Contains Boolean search from the search form.
  5. */
  6. class FreshRSS_BooleanSearch implements \Stringable {
  7. private string $raw_input = '';
  8. /** @var list<FreshRSS_BooleanSearch|FreshRSS_Search> */
  9. private array $searches = [];
  /**
   * Parses a search expression into a tree of nested searches.
   *
   * @param string $input The search expression; surrounding whitespace is ignored and an empty input yields an empty search
   * @param int $level Nesting depth; literal escaping and saved-query handling are only applied at level 0
   * @param 'AND'|'OR'|'AND NOT'|'OR NOT' $operator How this search is combined with the preceding one
   * @param bool $allowUserQueries When false, references to saved queries (`search:name`, `S:1,2`) are stripped from the input
   * @param bool $expandUserQueries When false (and saved queries are allowed), references to saved queries are left untouched
   */
  public function __construct(
      string $input,
      int $level = 0,
      private readonly string $operator = 'AND',
      bool $allowUserQueries = true,
      bool $expandUserQueries = true
  ) {
      $query = trim($input);
      if ($query === '') {
          return;
      }
      $this->raw_input = $query;

      if ($level === 0) {
          $query = self::escapeLiterals($query);
          // Saved queries are rewritten either to expand them, or to strip them when they are not allowed
          $rewriteUserQueries = $expandUserQueries || !$allowUserQueries;
          if ($rewriteUserQueries) {
              $query = $this->parseUserQueryNames($query, $allowUserQueries);
              $query = $this->parseUserQueryIds($query, $allowUserQueries);
          }
          $query = trim($query);
      }

      $query = self::consistentOrParentheses($query);

      // Either parse everything as a series of BooleanSearch’s combined by implicit AND
      // or parse everything as a series of Search’s combined by explicit OR
      if (!$this->parseParentheses($query, $level)) {
          $this->parseOrSegments($query);
      }
  }
  /**
   * Clones every nested search so that the copy does not share them with the original,
   * and resets the cached string representations.
   */
  public function __clone() {
      $this->searches = array_map(
          static fn(FreshRSS_BooleanSearch|FreshRSS_Search $item): FreshRSS_BooleanSearch|FreshRSS_Search => clone $item,
          $this->searches
      );
      $this->expanded = null;
      $this->notExpanded = null;
  }
  /**
   * Replace references to saved searches by name (`search:name`, `search:"some name"`, `search:'some name'`)
   * with the parenthesised text of the matching saved search.
   *
   * Each reference is substituted exactly once, in a single pass over the input:
   * a short reference such as `search:ab` can no longer corrupt a longer one such as `search:abc`,
   * and text coming from an expanded saved search is not scanned again.
   *
   * @param string $input Search expression, already processed by escapeLiterals()
   * @param bool $allowUserQueries When false, every reference is removed instead of expanded
   * @return string The input with all references expanded, or removed when unknown or not allowed
   */
  private function parseUserQueryNames(string $input, bool $allowUserQueries = true): string {
      if (preg_match('/\bsearch:/', $input) !== 1) {
          // Nothing to substitute: avoid loading the user configuration
          return $input;
      }

      /** @var array<string,string> $queries Saved searches indexed by name */
      $queries = [];
      if ($allowUserQueries) {
          foreach (FreshRSS_Context::userConf()->queries as $raw_query) {
              if (($raw_query['name'] ?? '') !== '' && ($raw_query['search'] ?? '') !== '') {
                  $queries[$raw_query['name']] = trim($raw_query['search']);
              }
          }
      }

      // The quoted form is tried first; the unquoted form stops at whitespace or at a double quote
      $pattern = '/\bsearch:(?:(?P<delim>[\'"])(?P<quoted>.*?)(?P=delim)|(?P<plain>[^\s"]*))/';

      return preg_replace_callback(
          $pattern,
          static function (array $m) use ($queries): string {
              // Only one of the two groups can be non-empty for a given match
              $name = trim(($m['quoted'] ?? '') . ($m['plain'] ?? ''));
              if (empty($queries[$name])) {
                  return '';
              }
              return '(' . self::escapeLiterals($queries[$name]) . ')';
          },
          $input
      ) ?? $input;
  }
  /**
   * Replace references to saved searches by position (`S:1` or `S:1,2,3`)
   * with the parenthesised text of the matching saved searches, combined with OR.
   *
   * Each reference is substituted exactly once, in a single pass over the input:
   * a short reference such as `S:1` can no longer corrupt a longer one such as `S:12`,
   * and text coming from an expanded saved search is not scanned again.
   *
   * @param string $input Search expression, already processed by escapeLiterals()
   * @param bool $allowUserQueries When false, every reference is removed instead of expanded
   * @return string The input with all references expanded, or removed when unknown or not allowed
   */
  private function parseUserQueryIds(string $input, bool $allowUserQueries = true): string {
      $pattern = '/\bS:(?P<search>[0-9,]+)/';
      if (preg_match($pattern, $input) !== 1) {
          // Nothing to substitute: avoid loading the user configuration
          return $input;
      }

      /** @var list<string> $queries Saved searches indexed by their position in the user configuration */
      $queries = [];
      if ($allowUserQueries) {
          foreach (FreshRSS_Context::userConf()->queries as $raw_query) {
              $queries[] = trim($raw_query['search'] ?? '');
          }
      }

      return preg_replace_callback(
          $pattern,
          static function (array $m) use ($queries): string {
              $expansions = [];
              foreach (explode(',', $m['search']) as $token) {
                  // Same conversion as before: an empty token is cast to 0
                  $id = intval($token);
                  if (!empty($queries[$id])) {
                      $expansions[] = self::escapeLiterals($queries[$id]);
                  }
              }
              return $expansions === [] ? '' : '(' . implode(') OR (', $expansions) . ')';
          },
          $input
      ) ?? $input;
  }
  /**
   * Masks the parentheses and the `OR` keyword found inside literals, so that the parser does not
   * mistake them for operators. A literal is a span delimited by `'`, `"` or `/` (optionally followed
   * by the flags `i` / `m`) which starts the input or follows whitespace, `(`, `:`, `#`, `!` or `-`.
   * The masked characters are written as `\uXXXX` sequences; see unescapeLiterals() for the reverse operation.
   *
   * @return string The input with its literals masked, or an empty string if the regular expression engine fails
   */
  public static function escapeLiterals(string $input): string {
      $literalPattern = '%(?<=[\\s(:#!-]|^)(?<![\\\\])(?P<delim>[\'"/]).+?(?<!\\\\)(?P=delim)[im]*%';

      // Rewrites one `OR` keyword (any letter case) letter by letter
      $maskOr = static fn(array $ms): string =>
          str_replace(['O', 'o', 'R', 'r'], ['\\u004f', '\\u006f', '\\u0052', '\\u0072'], $ms[0]);

      // Rewrites one whole literal: parentheses first, then the `OR` keywords
      $maskLiteral = static function (array $matches) use ($maskOr): string {
          $literal = str_replace(['(', ')'], ['\\u0028', '\\u0029'], $matches[0]);
          return preg_replace_callback('/\bOR\b/i', $maskOr, $literal) ?? '';
      };

      $masked = preg_replace_callback($literalPattern, $maskLiteral, $input);
      return $masked ?? '';
  }
  /**
   * Reverses escapeLiterals(): turns the `\uXXXX` sequences standing for `(`, `)`, `O`, `o`, `R` and `r`
   * back into the original characters.
   */
  public static function unescapeLiterals(string $input): string {
      return strtr($input, [
          '\\u0028' => '(',
          '\\u0029' => ')',
          '\\u004f' => 'O',
          '\\u006f' => 'o',
          '\\u0052' => 'R',
          '\\u0072' => 'r',
      ]);
  }
  /**
   * Wraps every operand of an `OR` expression in its own parentheses.
   * An `OR` located inside a "double-quoted string" is not treated as an operator,
   * and a lone negation sign (`!` or `-`) is kept as is, without parentheses.
   * An input without any `OR` is returned trimmed but otherwise unchanged.
   * Example: 'ab cd OR ef OR "gh ij"' becomes '(ab cd) OR (ef) OR ("gh ij")'
   */
  public static function addOrParentheses(string $input): string {
      $input = trim($input);
      if ($input === '') {
          return '';
      }
      $tokens = preg_split('/\b(OR)\b/i', $input, -1, PREG_SPLIT_DELIM_CAPTURE) ?: [];
      if (count($tokens) <= 1) {
          return $input;
      }

      $output = '';
      $pending = '';
      foreach ($tokens as $token) {
          $pending .= $token;
          if (trim($pending) === '') {
              $pending = '';
              continue;
          }
          if (strcasecmp($pending, 'OR') === 0) {
              $output .= $pending . ' ';
              $pending = '';
              continue;
          }
          $quoteCount = substr_count($pending, '"') + substr_count($pending, '&quot;');
          if ($quoteCount % 2 !== 0) {
              // Unbalanced quotes: this OR belongs to a quoted string, keep accumulating
              continue;
          }
          $operand = trim($pending);
          $output .= in_array($operand, ['!', '-'], true) ? $operand : '(' . $operand . ') ';
          $pending = '';
      }

      // Whatever is left (e.g. a string whose quotes were never closed)
      $operand = trim($pending);
      if (in_array($operand, ['!', '-'], true)) {
          $output .= $operand;
      } elseif ($operand !== '') {
          $output .= '(' . $operand . ')';
      }
      return trim($output);
  }
  /**
   * If the query contains unescaped parentheses, then the text located outside of the top-level groups
   * gets parentheses around each of its `OR` operands (see addOrParentheses()),
   * and the content of each top-level group is processed recursively in the same way.
   * A query without any unescaped opening parenthesis is returned trimmed but otherwise unchanged.
   * Example: '(ab (cd OR ef)) OR gh OR ij OR (kl)' becomes '(ab (cd OR ef)) OR (gh) OR (ij) OR (kl)'
   */
  public static function consistentOrParentheses(string $input): string {
      if (preg_match('/(?<!\\\\)\\(/', $input) !== 1) {
          // No unescaped parentheses in the input
          return trim($input);
      }

      $depth = 0;
      $output = '';
      $buffer = '';
      foreach (str_split($input) as $position => $char) {
          $escaped = $position >= 1 && $input[$position - 1] === '\\';
          if (!$escaped && $char === '(') {
              if ($depth === 0) {
                  if ($buffer !== '') {
                      // Flush the text preceding this top-level group
                      $output = rtrim($output) . ' ' . self::addOrParentheses($buffer);
                      if (preg_match('/[!-]$/', $output) !== 1) {
                          // Not a negation: separate from the group that follows
                          $output .= ' ';
                      }
                      $buffer = '';
                  }
                  // The top-level parenthesis itself is re-added when the group is closed
                  $char = '';
              }
              $depth++;
          } elseif (!$escaped && $char === ')') {
              $depth--;
              if ($depth === 0) {
                  $buffer = self::consistentOrParentheses($buffer);
                  if ($buffer !== '') {
                      $output .= '(' . $buffer . ')';
                      $buffer = '';
                  }
                  $char = '';
              }
          }
          $buffer .= $char;
      }

      if (trim($buffer) !== '') {
          // Text following the last top-level group
          $output = rtrim($output);
          if (preg_match('/[!-]$/', $buffer) !== 1) {
              $output .= ' ';
          }
          $output .= self::addOrParentheses($buffer);
      }
      return trim($output);
  }
  /**
   * Splits the input around its top-level unescaped parentheses and stores every non-empty piece
   * (text before a group, content of a group, text after the last group) as a nested BooleanSearch.
   *
   * The text immediately preceding a group decides how that group is combined:
   * a trailing `!` or `-` gives `AND NOT` (or `OR NOT` when itself preceded by `OR`),
   * a trailing `OR` gives `OR`, anything else gives `AND`.
   * The text following the last group is combined with `OR` when it starts with `OR`.
   * An opening parenthesis without a matching closing one is silently dropped together with what follows it.
   *
   * @return bool True if some parenthesis logic took over, false otherwise
   */
  private function parseParentheses(string $input, int $level): bool {
      $input = trim($input);
      $length = strlen($input);

      // Stores the given text as a nested BooleanSearch, unless it does not contain any search
      $appendNested = function (string $text, string $operator) use ($level): void {
          $nested = new FreshRSS_BooleanSearch($text, $level + 1, $operator);
          if (count($nested->searches()) > 0) {
              $this->searches[] = $nested;
          }
      };

      $pending = '';
      $foundParenthesis = false;
      $nextOperator = 'AND';

      for ($i = 0; $i < $length; $i++) {
          $c = $input[$i];
          $escaped = $i >= 1 && $input[$i - 1] === '\\';
          if ($escaped || $c !== '(') {
              $pending .= $c;
              continue;
          }

          $foundParenthesis = true;

          // Inspect and strip the operators ending the text located before the opening parenthesis
          $pending = trim($pending);
          $negated = preg_match('/[!-]$/', $pending) === 1;
          if ($negated) {
              // Trim trailing negation
              $pending = rtrim($pending, ' !-');
          }
          $ored = preg_match('/\bOR$/i', $pending) === 1;
          if ($ored) {
              // Trim trailing OR
              $pending = substr($pending, 0, -2);
          }

          // The text prior to the opening parenthesis is a BooleanSearch
          $appendNested($pending, $nextOperator);
          $pending = '';

          // The group itself is combined according to the operators just stripped
          if ($negated) {
              $nextOperator = $ored ? 'OR NOT' : 'AND NOT';
          } elseif ($ored) {
              $nextOperator = 'OR';
          }

          // Search the matching closing parenthesis
          $depth = 1;
          $sub = '';
          for ($i++; $i < $length; $i++) {
              $c = $input[$i];
              $escaped = $input[$i - 1] === '\\';
              if (!$escaped && $c === ')') {
                  $depth--;
                  if ($depth === 0) {
                      // Found the matching closing parenthesis
                      $appendNested($sub, $nextOperator);
                      $nextOperator = 'AND';
                      break;
                  }
              } elseif (!$escaped && $c === '(') {
                  // One nested level deeper
                  $depth++;
              }
              $sub .= $c;
          }
          // TODO: Consider throwing an error or warning in case of non-matching parenthesis
      }

      if (!$foundParenthesis) {
          // There was no parenthesis logic to apply
          return false;
      }

      $pending = trim($pending);
      if (preg_match('/^OR\b/i', $pending) === 1) {
          // The remaining text will have to be combined with OR instead of default AND
          $nextOperator = 'OR';
          // Trim leading OR
          $pending = substr($pending, 2);
      }
      // The remaining text after the last parenthesis is a BooleanSearch
      $appendNested($pending, $nextOperator);
      return true;
  }
  /**
   * Splits the input on the `OR` keyword and stores each operand as a FreshRSS_Search.
   * An `OR` located inside a "double-quoted string" is not treated as an operator.
   */
  private function parseOrSegments(string $input): void {
      $input = trim($input);
      if ($input === '') {
          return;
      }
      $tokens = preg_split('/\b(OR)\b/i', $input, -1, PREG_SPLIT_DELIM_CAPTURE) ?: [];
      $pending = '';
      foreach ($tokens as $token) {
          $pending .= $token;
          if (trim($pending) === '' || strcasecmp($pending, 'OR') === 0) {
              // Blank piece or the operator itself: nothing to store
              $pending = '';
              continue;
          }
          $quoteCount = substr_count($pending, '"') + substr_count($pending, '&quot;');
          if ($quoteCount % 2 === 0) {
              // Balanced quotes: the operand is complete
              $this->searches[] = new FreshRSS_Search(trim($pending));
              $pending = '';
          }
      }
      // Whatever is left (e.g. a string whose quotes were never closed)
      $pending = trim($pending);
      if ($pending !== '') {
          $this->searches[] = new FreshRSS_Search($pending);
      }
  }
  /**
   * The parsed content of this Boolean search:
   * either nested FreshRSS_BooleanSearch combined by their own operator (implicit AND by default),
   * or a series of FreshRSS_Search combined by explicit OR.
   * @return list<FreshRSS_BooleanSearch|FreshRSS_Search>
   */
  public function searches(): array {
      return $this->searches;
  }
  /**
   * The operator given at construction time.
   * @return 'AND'|'OR'|'AND NOT'|'OR NOT' depending on how this BooleanSearch should be combined
   */
  public function operator(): string {
      return $this->operator;
  }
  /**
   * Inserts a search at the beginning of the list of searches.
   * The cached string representations are not reset by this method.
   * @param FreshRSS_BooleanSearch|FreshRSS_Search $search
   */
  public function prepend(FreshRSS_BooleanSearch|FreshRSS_Search $search): void {
      $this->searches = [$search, ...$this->searches];
  }
  /**
   * Appends a search at the end of the list of searches.
   * The cached string representations are not reset by this method.
   * @param FreshRSS_BooleanSearch|FreshRSS_Search $search
   */
  public function add(FreshRSS_BooleanSearch|FreshRSS_Search $search): void {
      array_push($this->searches, $search);
  }
  /**
   * Modify the first compatible search of the Boolean expression, or add it at the beginning.
   * Useful to modify some search parameters.
   * The current instance is left untouched; the returned copy has no raw input anymore.
   * @return FreshRSS_BooleanSearch a new instance, modified.
   */
  public function enforce(FreshRSS_Search $search): self {
      $copy = clone $this;
      $copy->raw_input = '';
      $copy->expanded = null;
      $copy->notExpanded = null;
      $total = count($copy->searches);

      // A single plain search: let it merge the enforced criteria itself
      if ($total === 1 && $copy->searches[0] instanceof FreshRSS_Search) {
          $copy->searches[0] = $copy->searches[0]->enforce($search);
          return $copy;
      }

      // Two nested searches: replace the first AND-combined one holding a single search with the same operators
      if ($total === 2) {
          foreach ($copy->searches as $candidate) {
              if (!($candidate instanceof FreshRSS_BooleanSearch)) {
                  break;
              }
              if ($candidate->operator() !== 'AND' || count($candidate->searches) !== 1) {
                  continue;
              }
              $inner = $candidate->searches[0];
              if ($inner instanceof FreshRSS_Search && $inner->hasSameOperators($search)) {
                  $candidate->searches[0] = $search;
                  return $copy;
              }
          }
      }

      if ($total > 1 || ($total > 0 && $copy->searches[0] instanceof FreshRSS_Search)) {
          // Wrap the existing searches in a new BooleanSearch if needed
          $wrapper = new FreshRSS_BooleanSearch('');
          foreach ($copy->searches as $existing) {
              $wrapper->add($existing);
          }
          if (count($wrapper->searches) > 0) {
              $copy->searches = [$wrapper];
          }
      }

      // No compatible search found: the enforced search goes first
      array_unshift($copy->searches, $search);
      return $copy;
  }
  /**
   * Remove the first compatible search of the Boolean expression, if any.
   * The current instance is left untouched; the returned copy has no raw input anymore.
   * @return FreshRSS_BooleanSearch a new instance, modified.
   */
  public function remove(FreshRSS_Search $search): self {
      $copy = clone $this;
      $copy->raw_input = '';
      $copy->expanded = null;
      $copy->notExpanded = null;
      $total = count($copy->searches);

      // A single plain search: let it drop the given criteria itself
      if ($total === 1 && $copy->searches[0] instanceof FreshRSS_Search) {
          $copy->searches[0] = $copy->searches[0]->remove($search);
          return $copy;
      }

      if ($total !== 2) {
          return $copy;
      }

      // Two nested searches: empty the first AND-combined one holding a single search with the same operators
      foreach ($copy->searches as $candidate) {
          if (!($candidate instanceof FreshRSS_BooleanSearch)) {
              break;
          }
          if ($candidate->operator() !== 'AND' || count($candidate->searches) !== 1) {
              continue;
          }
          $inner = $candidate->searches[0];
          if ($inner instanceof FreshRSS_Search && $inner->hasSameOperators($search)) {
              array_shift($candidate->searches);
              return $copy;
          }
      }
      return $copy;
  }
  475. private ?string $expanded = null;
  /**
   * Builds the textual form of this Boolean search from its parsed content, and caches it.
   * Empty parts are skipped. A part containing a space or starting with `-` is wrapped in parentheses
   * when there are several searches or when the part is negated.
   * @throws InvalidArgumentException if a nested BooleanSearch carries an unknown operator
   */
  #[\Override]
  public function __toString(): string {
      if ($this->expanded !== null) {
          return $this->expanded;
      }

      $hasSeveralSearches = count($this->searches) > 1;
      $text = '';
      foreach ($this->searches as $item) {
          $piece = $item->__toString();
          if ($piece === '') {
              continue;
          }
          // Plain searches are always combined by OR
          $op = $item instanceof FreshRSS_BooleanSearch ? $item->operator : 'OR';
          $isNegated = in_array($op, ['OR NOT', 'AND NOT'], true);
          $isCompound = str_contains($piece, ' ') || str_starts_with($piece, '-');
          if ($isCompound && ($hasSeveralSearches || $isNegated)) {
              $piece = '(' . $piece . ')';
          }
          $isFirst = $text === '';
          $glue = match ($op) {
              'OR' => $isFirst ? '' : ' OR ',
              'OR NOT' => $isFirst ? '-' : ' OR -',
              'AND NOT' => $isFirst ? '-' : ' -',
              'AND' => $isFirst ? '' : ' ',
              default => throw new InvalidArgumentException('Invalid operator: ' . $op),
          };
          $text .= $glue . $piece;
      }

      $this->expanded = trim($text);
      return $this->expanded;
  }
  501. private ?string $notExpanded = null;
  /**
   * Textual form of this Boolean search, with or without expansion of the user queries.
   * The non-expanded form is obtained by parsing the raw input again without expansion, and is cached.
   * @param bool $expandUserQueries Whether to expand user queries (saved searches) or not
   */
  public function toString(bool $expandUserQueries = true): string {
      if ($expandUserQueries) {
          return $this->__toString();
      }
      $this->notExpanded ??= (new FreshRSS_BooleanSearch($this->raw_input, expandUserQueries: false))->__toString();
      return $this->notExpanded;
  }
  /**
   * The trimmed search expression as given to the constructor.
   * It is an empty string for an empty search, and for the instances returned by enforce() and remove().
   * @return string Plain text search query. Must be XML-encoded or URL-encoded depending on the situation
   */
  #[Deprecated('Use toString(expandUserQueries: false) instead')]
  public function getRawInput(): string {
      return $this->raw_input;
  }
  519. }