BooleanSearch.php 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438
  1. <?php
  2. declare(strict_types=1);
  3. /**
  4. * Contains Boolean search from the search form.
  5. */
  6. class FreshRSS_BooleanSearch implements \Stringable {
  7. private string $raw_input = '';
  8. /** @var list<FreshRSS_BooleanSearch|FreshRSS_Search> */
  9. private array $searches = [];
  10. /**
  11. * @param string $input
  12. * @param int $level
  13. * @param 'AND'|'OR'|'AND NOT'|'OR NOT' $operator
  14. * @param bool $allowUserQueries
  15. */
  16. public function __construct(
  17. string $input,
  18. int $level = 0,
  19. private readonly string $operator = 'AND',
  20. bool $allowUserQueries = true
  21. ) {
  22. $input = trim($input);
  23. if ($input === '') {
  24. return;
  25. }
  26. if ($level === 0) {
  27. $input = preg_replace('/:&quot;(.*?)&quot;/', ':"\1"', $input);
  28. if (!is_string($input)) {
  29. return;
  30. }
  31. $input = preg_replace('/(?<=[\s(!-]|^)&quot;(.*?)&quot;/', '"\1"', $input);
  32. if (!is_string($input)) {
  33. return;
  34. }
  35. }
  36. $this->raw_input = $input;
  37. if ($level === 0) {
  38. $input = self::escapeLiteralParentheses($input);
  39. $input = $this->parseUserQueryNames($input, $allowUserQueries);
  40. $input = $this->parseUserQueryIds($input, $allowUserQueries);
  41. $input = trim($input);
  42. }
  43. $input = self::consistentOrParentheses($input);
  44. // Either parse everything as a series of BooleanSearch’s combined by implicit AND
  45. // or parse everything as a series of Search’s combined by explicit OR
  46. $this->parseParentheses($input, $level) || $this->parseOrSegments($input);
  47. }
  48. /**
  49. * Parse the user queries (saved searches) by name and expand them in the input string.
  50. */
  51. private function parseUserQueryNames(string $input, bool $allowUserQueries = true): string {
  52. $all_matches = [];
  53. if (preg_match_all('/\bsearch:(?P<delim>[\'"])(?P<search>.*)(?P=delim)/U', $input, $matchesFound)) {
  54. $all_matches[] = $matchesFound;
  55. }
  56. if (preg_match_all('/\bsearch:(?P<search>[^\s"]*)/', $input, $matchesFound)) {
  57. $all_matches[] = $matchesFound;
  58. }
  59. if (!empty($all_matches)) {
  60. $queries = [];
  61. foreach (FreshRSS_Context::userConf()->queries as $raw_query) {
  62. if (($raw_query['name'] ?? '') !== '' && ($raw_query['search'] ?? '') !== '') {
  63. $queries[$raw_query['name']] = trim($raw_query['search']);
  64. }
  65. }
  66. $fromS = [];
  67. $toS = [];
  68. foreach ($all_matches as $matches) {
  69. if (empty($matches['search'])) {
  70. continue;
  71. }
  72. for ($i = count($matches['search']) - 1; $i >= 0; $i--) {
  73. $name = trim($matches['search'][$i]);
  74. if (!empty($queries[$name])) {
  75. $fromS[] = $matches[0][$i];
  76. if ($allowUserQueries) {
  77. $toS[] = '(' . self::escapeLiteralParentheses($queries[$name]) . ')';
  78. } else {
  79. $toS[] = '';
  80. }
  81. }
  82. }
  83. }
  84. $input = str_replace($fromS, $toS, $input);
  85. }
  86. return $input;
  87. }
  88. /**
  89. * Parse the user queries (saved searches) by ID and expand them in the input string.
  90. */
  91. private function parseUserQueryIds(string $input, bool $allowUserQueries = true): string {
  92. $all_matches = [];
  93. if (preg_match_all('/\bS:(?P<search>[0-9,]+)/', $input, $matchesFound)) {
  94. $all_matches[] = $matchesFound;
  95. }
  96. if (!empty($all_matches)) {
  97. $queries = [];
  98. foreach (FreshRSS_Context::userConf()->queries as $raw_query) {
  99. $queries[] = trim($raw_query['search'] ?? '');
  100. }
  101. $fromS = [];
  102. $toS = [];
  103. foreach ($all_matches as $matches) {
  104. if (empty($matches['search'])) {
  105. continue;
  106. }
  107. for ($i = count($matches['search']) - 1; $i >= 0; $i--) {
  108. $ids = explode(',', $matches['search'][$i]);
  109. $ids = array_map('intval', $ids);
  110. $matchedQueries = [];
  111. foreach ($ids as $id) {
  112. if (!empty($queries[$id])) {
  113. $matchedQueries[] = $queries[$id];
  114. }
  115. }
  116. if (empty($matchedQueries)) {
  117. continue;
  118. }
  119. $fromS[] = $matches[0][$i];
  120. if ($allowUserQueries) {
  121. $escapedQueries = array_map(fn(string $query): string => self::escapeLiteralParentheses($query), $matchedQueries);
  122. $toS[] = '(' . implode(') OR (', $escapedQueries) . ')';
  123. } else {
  124. $toS[] = '';
  125. }
  126. }
  127. }
  128. $input = str_replace($fromS, $toS, $input);
  129. }
  130. return $input;
  131. }
  132. /**
  133. * Temporarily escape parentheses used in regex expressions or inside quoted strings.
  134. */
  135. public static function escapeLiteralParentheses(string $input): string {
  136. return preg_replace_callback('%(?<=[\\s(:#!-]|^)(?<![\\\\])(?P<delim>[\'"/]).+?(?<!\\\\)(?P=delim)[im]*%',
  137. fn(array $matches): string => str_replace(['(', ')'], ['\\u0028', '\\u0029'], $matches[0]),
  138. $input
  139. ) ?? '';
  140. }
  141. public static function unescapeLiteralParentheses(string $input): string {
  142. return str_replace(['\\u0028', '\\u0029'], ['(', ')'], $input);
  143. }
  144. /**
  145. * Example: 'ab cd OR ef OR "gh ij"' becomes '(ab cd) OR (ef) OR ("gh ij")'
  146. */
  147. public static function addOrParentheses(string $input): string {
  148. $input = trim($input);
  149. if ($input === '') {
  150. return '';
  151. }
  152. $splits = preg_split('/\b(OR)\b/i', $input, -1, PREG_SPLIT_DELIM_CAPTURE) ?: [];
  153. $ns = count($splits);
  154. if ($ns <= 1) {
  155. return $input;
  156. }
  157. $result = '';
  158. $segment = '';
  159. for ($i = 0; $i < $ns; $i++) {
  160. $segment .= $splits[$i];
  161. if (trim($segment) === '') {
  162. $segment = '';
  163. } elseif (strcasecmp($segment, 'OR') === 0) {
  164. $result .= $segment . ' ';
  165. $segment = '';
  166. } else {
  167. $quotes = substr_count($segment, '"') + substr_count($segment, '&quot;');
  168. if ($quotes % 2 === 0) {
  169. $segment = trim($segment);
  170. if (in_array($segment, ['!', '-'], true)) {
  171. $result .= $segment;
  172. } else {
  173. $result .= '(' . $segment . ') ';
  174. }
  175. $segment = '';
  176. }
  177. }
  178. }
  179. $segment = trim($segment);
  180. if (in_array($segment, ['!', '-'], true)) {
  181. $result .= $segment;
  182. } elseif ($segment !== '') {
  183. $result .= '(' . $segment . ')';
  184. }
  185. return trim($result);
  186. }
  187. /**
  188. * If the query contains a mix of `OR` expressions with and without parentheses,
  189. * then add parentheses to make the query consistent.
  190. * Example: '(ab (cd OR ef)) OR gh OR ij OR (kl)' becomes '(ab ((cd) OR (ef))) OR (gh) OR (ij) OR (kl)'
  191. */
  192. public static function consistentOrParentheses(string $input): string {
  193. if (!preg_match('/(?<!\\\\)\\(/', $input)) {
  194. // No unescaped parentheses in the input
  195. return trim($input);
  196. }
  197. $parenthesesCount = 0;
  198. $result = '';
  199. $segment = '';
  200. $length = strlen($input);
  201. for ($i = 0; $i < $length; $i++) {
  202. $c = $input[$i];
  203. $backslashed = $i >= 1 ? $input[$i - 1] === '\\' : false;
  204. if (!$backslashed) {
  205. if ($c === '(') {
  206. if ($parenthesesCount === 0) {
  207. if ($segment !== '') {
  208. $result = rtrim($result) . ' ' . self::addOrParentheses($segment);
  209. $negation = preg_match('/[!-]$/', $result);
  210. if (!$negation) {
  211. $result .= ' ';
  212. }
  213. $segment = '';
  214. }
  215. $c = '';
  216. }
  217. $parenthesesCount++;
  218. } elseif ($c === ')') {
  219. $parenthesesCount--;
  220. if ($parenthesesCount === 0) {
  221. $segment = self::consistentOrParentheses($segment);
  222. if ($segment !== '') {
  223. $result .= '(' . $segment . ')';
  224. $segment = '';
  225. }
  226. $c = '';
  227. }
  228. }
  229. }
  230. $segment .= $c;
  231. }
  232. if (trim($segment) !== '') {
  233. $result = rtrim($result);
  234. $negation = preg_match('/[!-]$/', $segment);
  235. if (!$negation) {
  236. $result .= ' ';
  237. }
  238. $result .= self::addOrParentheses($segment);
  239. }
  240. return trim($result);
  241. }
  242. /** @return bool True if some parenthesis logic took over, false otherwise */
  243. private function parseParentheses(string $input, int $level): bool {
  244. $input = trim($input);
  245. $length = strlen($input);
  246. $i = 0;
  247. $before = '';
  248. $hasParenthesis = false;
  249. $nextOperator = 'AND';
  250. while ($i < $length) {
  251. $c = $input[$i];
  252. $backslashed = $i >= 1 ? $input[$i - 1] === '\\' : false;
  253. if ($c === '(' && !$backslashed) {
  254. $hasParenthesis = true;
  255. $before = trim($before);
  256. if (preg_match('/[!-]$/', $before)) {
  257. // Trim trailing negation
  258. $before = rtrim($before, ' !-');
  259. $isOr = preg_match('/\bOR$/i', $before);
  260. if ($isOr) {
  261. // Trim trailing OR
  262. $before = substr($before, 0, -2);
  263. }
  264. // The text prior to the negation is a BooleanSearch
  265. $searchBefore = new FreshRSS_BooleanSearch($before, $level + 1, $nextOperator);
  266. if (count($searchBefore->searches()) > 0) {
  267. $this->searches[] = $searchBefore;
  268. }
  269. $before = '';
  270. // The next BooleanSearch will have to be combined with AND NOT or OR NOT instead of default AND
  271. $nextOperator = $isOr ? 'OR NOT' : 'AND NOT';
  272. } elseif (preg_match('/\bOR$/i', $before)) {
  273. // Trim trailing OR
  274. $before = substr($before, 0, -2);
  275. // The text prior to the OR is a BooleanSearch
  276. $searchBefore = new FreshRSS_BooleanSearch($before, $level + 1, $nextOperator);
  277. if (count($searchBefore->searches()) > 0) {
  278. $this->searches[] = $searchBefore;
  279. }
  280. $before = '';
  281. // The next BooleanSearch will have to be combined with OR instead of default AND
  282. $nextOperator = 'OR';
  283. } elseif ($before !== '') {
  284. // The text prior to the opening parenthesis is a BooleanSearch
  285. $searchBefore = new FreshRSS_BooleanSearch($before, $level + 1, $nextOperator);
  286. if (count($searchBefore->searches()) > 0) {
  287. $this->searches[] = $searchBefore;
  288. }
  289. $before = '';
  290. }
  291. // Search the matching closing parenthesis
  292. $parentheses = 1;
  293. $sub = '';
  294. $i++;
  295. while ($i < $length) {
  296. $c = $input[$i];
  297. $backslashed = $input[$i - 1] === '\\';
  298. if ($c === '(' && !$backslashed) {
  299. // One nested level deeper
  300. $parentheses++;
  301. $sub .= $c;
  302. } elseif ($c === ')' && !$backslashed) {
  303. $parentheses--;
  304. if ($parentheses === 0) {
  305. // Found the matching closing parenthesis
  306. $searchSub = new FreshRSS_BooleanSearch($sub, $level + 1, $nextOperator);
  307. $nextOperator = 'AND';
  308. if (count($searchSub->searches()) > 0) {
  309. $this->searches[] = $searchSub;
  310. }
  311. $sub = '';
  312. break;
  313. } else {
  314. $sub .= $c;
  315. }
  316. } else {
  317. $sub .= $c;
  318. }
  319. $i++;
  320. }
  321. // $sub = trim($sub);
  322. // if ($sub !== '') {
  323. // // TODO: Consider throwing an error or warning in case of non-matching parenthesis
  324. // }
  325. // } elseif ($c === ')') {
  326. // // TODO: Consider throwing an error or warning in case of non-matching parenthesis
  327. } else {
  328. $before .= $c;
  329. }
  330. $i++;
  331. }
  332. if ($hasParenthesis) {
  333. $before = trim($before);
  334. if (preg_match('/^OR\b/i', $before)) {
  335. // The next BooleanSearch will have to be combined with OR instead of default AND
  336. $nextOperator = 'OR';
  337. // Trim leading OR
  338. $before = substr($before, 2);
  339. }
  340. // The remaining text after the last parenthesis is a BooleanSearch
  341. $searchBefore = new FreshRSS_BooleanSearch($before, $level + 1, $nextOperator);
  342. $nextOperator = 'AND';
  343. if (count($searchBefore->searches()) > 0) {
  344. $this->searches[] = $searchBefore;
  345. }
  346. return true;
  347. }
  348. // There was no parenthesis logic to apply
  349. return false;
  350. }
  351. private function parseOrSegments(string $input): void {
  352. $input = trim($input);
  353. if ($input === '') {
  354. return;
  355. }
  356. $splits = preg_split('/\b(OR)\b/i', $input, -1, PREG_SPLIT_DELIM_CAPTURE) ?: [];
  357. $segment = '';
  358. $ns = count($splits);
  359. for ($i = 0; $i < $ns; $i++) {
  360. $segment = $segment . $splits[$i];
  361. if (trim($segment) === '' || strcasecmp($segment, 'OR') === 0) {
  362. $segment = '';
  363. } else {
  364. $quotes = substr_count($segment, '"') + substr_count($segment, '&quot;');
  365. if ($quotes % 2 === 0) {
  366. $segment = trim($segment);
  367. $this->searches[] = new FreshRSS_Search($segment);
  368. $segment = '';
  369. }
  370. }
  371. }
  372. $segment = trim($segment);
  373. if ($segment !== '') {
  374. $this->searches[] = new FreshRSS_Search($segment);
  375. }
  376. }
  377. /**
  378. * Either a list of FreshRSS_BooleanSearch combined by implicit AND
  379. * or a series of FreshRSS_Search combined by explicit OR
  380. * @return list<FreshRSS_BooleanSearch|FreshRSS_Search>
  381. */
  382. public function searches(): array {
  383. return $this->searches;
  384. }
  385. /** @return 'AND'|'OR'|'AND NOT'|'OR NOT' depending on how this BooleanSearch should be combined */
  386. public function operator(): string {
  387. return $this->operator;
  388. }
  389. /** @param FreshRSS_BooleanSearch|FreshRSS_Search $search */
  390. public function add(FreshRSS_BooleanSearch|FreshRSS_Search $search): void {
  391. $this->searches[] = $search;
  392. }
  393. #[\Override]
  394. public function __toString(): string {
  395. return $this->getRawInput();
  396. }
  397. /** @return string Plain text search query. Must be XML-encoded or URL-encoded depending on the situation */
  398. public function getRawInput(): string {
  399. return $this->raw_input;
  400. }
  401. }