BooleanSearch.php 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541
  1. <?php
  2. declare(strict_types=1);
  3. /**
  4. * Contains Boolean search from the search form.
  5. */
  6. class FreshRSS_BooleanSearch implements \Stringable {
  7. private string $raw_input = '';
  8. /** @var list<FreshRSS_BooleanSearch|FreshRSS_Search> */
  9. private array $searches = [];
  10. /**
  11. * @param string $input
  12. * @param int $level
  13. * @param 'AND'|'OR'|'AND NOT'|'OR NOT' $operator
  14. * @param bool $allowUserQueries
  15. */
  16. public function __construct(
  17. string $input,
  18. int $level = 0,
  19. private readonly string $operator = 'AND',
  20. bool $allowUserQueries = true
  21. ) {
  22. $input = trim($input);
  23. if ($input === '') {
  24. return;
  25. }
  26. if ($level === 0) {
  27. $input = preg_replace('/:&quot;(.*?)&quot;/', ':"\1"', $input);
  28. if (!is_string($input)) {
  29. return;
  30. }
  31. $input = preg_replace('/(?<=[\s(!-]|^)&quot;(.*?)&quot;/', '"\1"', $input);
  32. if (!is_string($input)) {
  33. return;
  34. }
  35. }
  36. $this->raw_input = $input;
  37. if ($level === 0) {
  38. $input = self::escapeLiteralParentheses($input);
  39. $input = $this->parseUserQueryNames($input, $allowUserQueries);
  40. $input = $this->parseUserQueryIds($input, $allowUserQueries);
  41. $input = trim($input);
  42. }
  43. $input = self::consistentOrParentheses($input);
  44. // Either parse everything as a series of BooleanSearch’s combined by implicit AND
  45. // or parse everything as a series of Search’s combined by explicit OR
  46. $this->parseParentheses($input, $level) || $this->parseOrSegments($input);
  47. }
  48. public function __clone() {
  49. foreach ($this->searches as $key => $search) {
  50. $this->searches[$key] = clone $search;
  51. }
  52. }
  53. /**
  54. * Parse the user queries (saved searches) by name and expand them in the input string.
  55. */
  56. private function parseUserQueryNames(string $input, bool $allowUserQueries = true): string {
  57. $all_matches = [];
  58. if (preg_match_all('/\bsearch:(?P<delim>[\'"])(?P<search>.*)(?P=delim)/U', $input, $matchesFound)) {
  59. $all_matches[] = $matchesFound;
  60. }
  61. if (preg_match_all('/\bsearch:(?P<search>[^\s"]*)/', $input, $matchesFound)) {
  62. $all_matches[] = $matchesFound;
  63. }
  64. if (!empty($all_matches)) {
  65. $queries = [];
  66. foreach (FreshRSS_Context::userConf()->queries as $raw_query) {
  67. if (($raw_query['name'] ?? '') !== '' && ($raw_query['search'] ?? '') !== '') {
  68. $queries[$raw_query['name']] = trim($raw_query['search']);
  69. }
  70. }
  71. $fromS = [];
  72. $toS = [];
  73. foreach ($all_matches as $matches) {
  74. if (empty($matches['search'])) {
  75. continue;
  76. }
  77. for ($i = count($matches['search']) - 1; $i >= 0; $i--) {
  78. $name = trim($matches['search'][$i]);
  79. if (!empty($queries[$name])) {
  80. $fromS[] = $matches[0][$i];
  81. if ($allowUserQueries) {
  82. $toS[] = '(' . self::escapeLiteralParentheses($queries[$name]) . ')';
  83. } else {
  84. $toS[] = '';
  85. }
  86. }
  87. }
  88. }
  89. $input = str_replace($fromS, $toS, $input);
  90. }
  91. return $input;
  92. }
  93. /**
  94. * Parse the user queries (saved searches) by ID and expand them in the input string.
  95. */
  96. private function parseUserQueryIds(string $input, bool $allowUserQueries = true): string {
  97. $all_matches = [];
  98. if (preg_match_all('/\bS:(?P<search>[0-9,]+)/', $input, $matchesFound)) {
  99. $all_matches[] = $matchesFound;
  100. }
  101. if (!empty($all_matches)) {
  102. $queries = [];
  103. foreach (FreshRSS_Context::userConf()->queries as $raw_query) {
  104. $queries[] = trim($raw_query['search'] ?? '');
  105. }
  106. $fromS = [];
  107. $toS = [];
  108. foreach ($all_matches as $matches) {
  109. if (empty($matches['search'])) {
  110. continue;
  111. }
  112. for ($i = count($matches['search']) - 1; $i >= 0; $i--) {
  113. $ids = explode(',', $matches['search'][$i]);
  114. $ids = array_map('intval', $ids);
  115. $matchedQueries = [];
  116. foreach ($ids as $id) {
  117. if (!empty($queries[$id])) {
  118. $matchedQueries[] = $queries[$id];
  119. }
  120. }
  121. if (empty($matchedQueries)) {
  122. continue;
  123. }
  124. $fromS[] = $matches[0][$i];
  125. if ($allowUserQueries) {
  126. $escapedQueries = array_map(fn(string $query): string => self::escapeLiteralParentheses($query), $matchedQueries);
  127. $toS[] = '(' . implode(') OR (', $escapedQueries) . ')';
  128. } else {
  129. $toS[] = '';
  130. }
  131. }
  132. }
  133. $input = str_replace($fromS, $toS, $input);
  134. }
  135. return $input;
  136. }
  137. /**
  138. * Temporarily escape parentheses used in regex expressions or inside quoted strings.
  139. */
  140. public static function escapeLiteralParentheses(string $input): string {
  141. return preg_replace_callback('%(?<=[\\s(:#!-]|^)(?<![\\\\])(?P<delim>[\'"/]).+?(?<!\\\\)(?P=delim)[im]*%',
  142. fn(array $matches): string => str_replace(['(', ')'], ['\\u0028', '\\u0029'], $matches[0]),
  143. $input
  144. ) ?? '';
  145. }
  146. public static function unescapeLiteralParentheses(string $input): string {
  147. return str_replace(['\\u0028', '\\u0029'], ['(', ')'], $input);
  148. }
  149. /**
  150. * Example: 'ab cd OR ef OR "gh ij"' becomes '(ab cd) OR (ef) OR ("gh ij")'
  151. */
  152. public static function addOrParentheses(string $input): string {
  153. $input = trim($input);
  154. if ($input === '') {
  155. return '';
  156. }
  157. $splits = preg_split('/\b(OR)\b/i', $input, -1, PREG_SPLIT_DELIM_CAPTURE) ?: [];
  158. $ns = count($splits);
  159. if ($ns <= 1) {
  160. return $input;
  161. }
  162. $result = '';
  163. $segment = '';
  164. for ($i = 0; $i < $ns; $i++) {
  165. $segment .= $splits[$i];
  166. if (trim($segment) === '') {
  167. $segment = '';
  168. } elseif (strcasecmp($segment, 'OR') === 0) {
  169. $result .= $segment . ' ';
  170. $segment = '';
  171. } else {
  172. $quotes = substr_count($segment, '"') + substr_count($segment, '&quot;');
  173. if ($quotes % 2 === 0) {
  174. $segment = trim($segment);
  175. if (in_array($segment, ['!', '-'], true)) {
  176. $result .= $segment;
  177. } else {
  178. $result .= '(' . $segment . ') ';
  179. }
  180. $segment = '';
  181. }
  182. }
  183. }
  184. $segment = trim($segment);
  185. if (in_array($segment, ['!', '-'], true)) {
  186. $result .= $segment;
  187. } elseif ($segment !== '') {
  188. $result .= '(' . $segment . ')';
  189. }
  190. return trim($result);
  191. }
  192. /**
  193. * If the query contains a mix of `OR` expressions with and without parentheses,
  194. * then add parentheses to make the query consistent.
  195. * Example: '(ab (cd OR ef)) OR gh OR ij OR (kl)' becomes '(ab ((cd) OR (ef))) OR (gh) OR (ij) OR (kl)'
  196. */
  197. public static function consistentOrParentheses(string $input): string {
  198. if (!preg_match('/(?<!\\\\)\\(/', $input)) {
  199. // No unescaped parentheses in the input
  200. return trim($input);
  201. }
  202. $parenthesesCount = 0;
  203. $result = '';
  204. $segment = '';
  205. $length = strlen($input);
  206. for ($i = 0; $i < $length; $i++) {
  207. $c = $input[$i];
  208. $backslashed = $i >= 1 ? $input[$i - 1] === '\\' : false;
  209. if (!$backslashed) {
  210. if ($c === '(') {
  211. if ($parenthesesCount === 0) {
  212. if ($segment !== '') {
  213. $result = rtrim($result) . ' ' . self::addOrParentheses($segment);
  214. $negation = preg_match('/[!-]$/', $result);
  215. if (!$negation) {
  216. $result .= ' ';
  217. }
  218. $segment = '';
  219. }
  220. $c = '';
  221. }
  222. $parenthesesCount++;
  223. } elseif ($c === ')') {
  224. $parenthesesCount--;
  225. if ($parenthesesCount === 0) {
  226. $segment = self::consistentOrParentheses($segment);
  227. if ($segment !== '') {
  228. $result .= '(' . $segment . ')';
  229. $segment = '';
  230. }
  231. $c = '';
  232. }
  233. }
  234. }
  235. $segment .= $c;
  236. }
  237. if (trim($segment) !== '') {
  238. $result = rtrim($result);
  239. $negation = preg_match('/[!-]$/', $segment);
  240. if (!$negation) {
  241. $result .= ' ';
  242. }
  243. $result .= self::addOrParentheses($segment);
  244. }
  245. return trim($result);
  246. }
  247. /** @return bool True if some parenthesis logic took over, false otherwise */
  248. private function parseParentheses(string $input, int $level): bool {
  249. $input = trim($input);
  250. $length = strlen($input);
  251. $i = 0;
  252. $before = '';
  253. $hasParenthesis = false;
  254. $nextOperator = 'AND';
  255. while ($i < $length) {
  256. $c = $input[$i];
  257. $backslashed = $i >= 1 ? $input[$i - 1] === '\\' : false;
  258. if ($c === '(' && !$backslashed) {
  259. $hasParenthesis = true;
  260. $before = trim($before);
  261. if (preg_match('/[!-]$/', $before)) {
  262. // Trim trailing negation
  263. $before = rtrim($before, ' !-');
  264. $isOr = preg_match('/\bOR$/i', $before);
  265. if ($isOr) {
  266. // Trim trailing OR
  267. $before = substr($before, 0, -2);
  268. }
  269. // The text prior to the negation is a BooleanSearch
  270. $searchBefore = new FreshRSS_BooleanSearch($before, $level + 1, $nextOperator);
  271. if (count($searchBefore->searches()) > 0) {
  272. $this->searches[] = $searchBefore;
  273. }
  274. $before = '';
  275. // The next BooleanSearch will have to be combined with AND NOT or OR NOT instead of default AND
  276. $nextOperator = $isOr ? 'OR NOT' : 'AND NOT';
  277. } elseif (preg_match('/\bOR$/i', $before)) {
  278. // Trim trailing OR
  279. $before = substr($before, 0, -2);
  280. // The text prior to the OR is a BooleanSearch
  281. $searchBefore = new FreshRSS_BooleanSearch($before, $level + 1, $nextOperator);
  282. if (count($searchBefore->searches()) > 0) {
  283. $this->searches[] = $searchBefore;
  284. }
  285. $before = '';
  286. // The next BooleanSearch will have to be combined with OR instead of default AND
  287. $nextOperator = 'OR';
  288. } elseif ($before !== '') {
  289. // The text prior to the opening parenthesis is a BooleanSearch
  290. $searchBefore = new FreshRSS_BooleanSearch($before, $level + 1, $nextOperator);
  291. if (count($searchBefore->searches()) > 0) {
  292. $this->searches[] = $searchBefore;
  293. }
  294. $before = '';
  295. }
  296. // Search the matching closing parenthesis
  297. $parentheses = 1;
  298. $sub = '';
  299. $i++;
  300. while ($i < $length) {
  301. $c = $input[$i];
  302. $backslashed = $input[$i - 1] === '\\';
  303. if ($c === '(' && !$backslashed) {
  304. // One nested level deeper
  305. $parentheses++;
  306. $sub .= $c;
  307. } elseif ($c === ')' && !$backslashed) {
  308. $parentheses--;
  309. if ($parentheses === 0) {
  310. // Found the matching closing parenthesis
  311. $searchSub = new FreshRSS_BooleanSearch($sub, $level + 1, $nextOperator);
  312. $nextOperator = 'AND';
  313. if (count($searchSub->searches()) > 0) {
  314. $this->searches[] = $searchSub;
  315. }
  316. $sub = '';
  317. break;
  318. } else {
  319. $sub .= $c;
  320. }
  321. } else {
  322. $sub .= $c;
  323. }
  324. $i++;
  325. }
  326. // $sub = trim($sub);
  327. // if ($sub !== '') {
  328. // // TODO: Consider throwing an error or warning in case of non-matching parenthesis
  329. // }
  330. // } elseif ($c === ')') {
  331. // // TODO: Consider throwing an error or warning in case of non-matching parenthesis
  332. } else {
  333. $before .= $c;
  334. }
  335. $i++;
  336. }
  337. if ($hasParenthesis) {
  338. $before = trim($before);
  339. if (preg_match('/^OR\b/i', $before)) {
  340. // The next BooleanSearch will have to be combined with OR instead of default AND
  341. $nextOperator = 'OR';
  342. // Trim leading OR
  343. $before = substr($before, 2);
  344. }
  345. // The remaining text after the last parenthesis is a BooleanSearch
  346. $searchBefore = new FreshRSS_BooleanSearch($before, $level + 1, $nextOperator);
  347. $nextOperator = 'AND';
  348. if (count($searchBefore->searches()) > 0) {
  349. $this->searches[] = $searchBefore;
  350. }
  351. return true;
  352. }
  353. // There was no parenthesis logic to apply
  354. return false;
  355. }
  356. private function parseOrSegments(string $input): void {
  357. $input = trim($input);
  358. if ($input === '') {
  359. return;
  360. }
  361. $splits = preg_split('/\b(OR)\b/i', $input, -1, PREG_SPLIT_DELIM_CAPTURE) ?: [];
  362. $segment = '';
  363. $ns = count($splits);
  364. for ($i = 0; $i < $ns; $i++) {
  365. $segment = $segment . $splits[$i];
  366. if (trim($segment) === '' || strcasecmp($segment, 'OR') === 0) {
  367. $segment = '';
  368. } else {
  369. $quotes = substr_count($segment, '"') + substr_count($segment, '&quot;');
  370. if ($quotes % 2 === 0) {
  371. $segment = trim($segment);
  372. $this->searches[] = new FreshRSS_Search($segment);
  373. $segment = '';
  374. }
  375. }
  376. }
  377. $segment = trim($segment);
  378. if ($segment !== '') {
  379. $this->searches[] = new FreshRSS_Search($segment);
  380. }
  381. }
  382. /**
  383. * Either a list of FreshRSS_BooleanSearch combined by implicit AND
  384. * or a series of FreshRSS_Search combined by explicit OR
  385. * @return list<FreshRSS_BooleanSearch|FreshRSS_Search>
  386. */
  387. public function searches(): array {
  388. return $this->searches;
  389. }
  390. /** @return 'AND'|'OR'|'AND NOT'|'OR NOT' depending on how this BooleanSearch should be combined */
  391. public function operator(): string {
  392. return $this->operator;
  393. }
  394. /** @param FreshRSS_BooleanSearch|FreshRSS_Search $search */
  395. public function prepend(FreshRSS_BooleanSearch|FreshRSS_Search $search): void {
  396. array_unshift($this->searches, $search);
  397. }
  398. /** @param FreshRSS_BooleanSearch|FreshRSS_Search $search */
  399. public function add(FreshRSS_BooleanSearch|FreshRSS_Search $search): void {
  400. $this->searches[] = $search;
  401. }
  402. /**
  403. * Modify the first compatible search of the Boolean expression, or add it at the beginning.
  404. * Useful to modify some search parameters.
  405. * @return FreshRSS_BooleanSearch a new instance, modified.
  406. */
  407. public function enforce(FreshRSS_Search $search): self {
  408. $result = clone $this;
  409. $result->raw_input = '';
  410. if (count($result->searches) === 1 && $result->searches[0] instanceof FreshRSS_Search) {
  411. $result->searches[0] = $result->searches[0]->enforce($search);
  412. return $result;
  413. }
  414. if (count($result->searches) === 2) {
  415. foreach ($result->searches as $booleanSearch) {
  416. if (!($booleanSearch instanceof FreshRSS_BooleanSearch)) {
  417. break;
  418. }
  419. if ($booleanSearch->operator() === 'AND') {
  420. if (count($booleanSearch->searches) === 1 && $booleanSearch->searches[0] instanceof FreshRSS_Search &&
  421. $booleanSearch->searches[0]->hasSameOperators($search)) {
  422. $booleanSearch->searches[0] = $search;
  423. return $result;
  424. }
  425. }
  426. }
  427. }
  428. if (count($result->searches) > 1 || (count($result->searches) > 0 && $result->searches[0] instanceof FreshRSS_Search)) {
  429. // Wrap the existing searches in a new BooleanSearch if needed
  430. $wrap = new FreshRSS_BooleanSearch('');
  431. foreach ($result->searches as $existingSearch) {
  432. $wrap->add($existingSearch);
  433. }
  434. if (count($wrap->searches) > 0) {
  435. $result->searches = [$wrap];
  436. }
  437. }
  438. array_unshift($result->searches, $search);
  439. return $result;
  440. }
  441. /**
  442. * Remove the first compatible search of the Boolean expression, if any.
  443. * Useful to modify some search parameters.
  444. * @return FreshRSS_BooleanSearch a new instance, modified.
  445. */
  446. public function remove(FreshRSS_Search $search): self {
  447. $result = clone $this;
  448. $result->raw_input = '';
  449. if (count($result->searches) === 1 && $result->searches[0] instanceof FreshRSS_Search) {
  450. $result->searches[0] = $result->searches[0]->remove($search);
  451. return $result;
  452. }
  453. if (count($result->searches) === 2) {
  454. foreach ($result->searches as $booleanSearch) {
  455. if (!($booleanSearch instanceof FreshRSS_BooleanSearch)) {
  456. break;
  457. }
  458. if ($booleanSearch->operator() === 'AND') {
  459. if (count($booleanSearch->searches) === 1 && $booleanSearch->searches[0] instanceof FreshRSS_Search &&
  460. $booleanSearch->searches[0]->hasSameOperators($search)) {
  461. array_shift($booleanSearch->searches);
  462. return $result;
  463. }
  464. }
  465. }
  466. }
  467. return $result;
  468. }
  469. #[\Override]
  470. public function __toString(): string {
  471. $result = '';
  472. foreach ($this->searches as $search) {
  473. $part = $search->__toString();
  474. if ($part === '') {
  475. continue;
  476. }
  477. $operator = $search instanceof FreshRSS_BooleanSearch ? $search->operator() : 'OR';
  478. if ((str_contains($part, ' ') || str_starts_with($part, '-')) && (count($this->searches) > 1 || in_array($operator, ['OR NOT', 'AND NOT'], true))) {
  479. $part = '(' . $part . ')';
  480. }
  481. $result .= match ($operator) {
  482. 'OR' => $result === '' ? '' : ' OR ',
  483. 'OR NOT' => $result === '' ? '-' : ' OR -',
  484. 'AND NOT' => $result === '' ? '-' : ' -',
  485. 'AND' => $result === '' ? '' : ' ',
  486. default => throw new InvalidArgumentException('Invalid operator: ' . $operator),
  487. } . $part;
  488. }
  489. return trim($result);
  490. }
  491. /** @return string Plain text search query. Must be XML-encoded or URL-encoded depending on the situation */
  492. public function getRawInput(): string {
  493. return $this->raw_input;
  494. }
  495. }