4
0

BooleanSearch.php 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428
  1. <?php
  2. declare(strict_types=1);
  3. /**
  4. * Contains Boolean search from the search form.
  5. */
  6. class FreshRSS_BooleanSearch implements \Stringable {
  7. private string $raw_input = '';
  8. /** @var array<FreshRSS_BooleanSearch|FreshRSS_Search> */
  9. private array $searches = [];
  10. /**
  11. * @param string $input
  12. * @param int $level
  13. * @param 'AND'|'OR'|'AND NOT'|'OR NOT' $operator
  14. * @param bool $allowUserQueries
  15. */
  16. public function __construct(
  17. string $input,
  18. int $level = 0,
  19. private readonly string $operator = 'AND',
  20. bool $allowUserQueries = true
  21. ) {
  22. $input = trim($input);
  23. if ($input === '') {
  24. return;
  25. }
  26. if ($level === 0) {
  27. $input = preg_replace('/:&quot;(.*?)&quot;/', ':"\1"', $input);
  28. if (!is_string($input)) {
  29. return;
  30. }
  31. $input = preg_replace('/(?<=[\s(!-]|^)&quot;(.*?)&quot;/', '"\1"', $input);
  32. if (!is_string($input)) {
  33. return;
  34. }
  35. }
  36. $this->raw_input = $input;
  37. if ($level === 0) {
  38. $input = self::escapeLiteralParentheses($input);
  39. $input = $this->parseUserQueryNames($input, $allowUserQueries);
  40. $input = $this->parseUserQueryIds($input, $allowUserQueries);
  41. $input = trim($input);
  42. }
  43. $input = self::consistentOrParentheses($input);
  44. // Either parse everything as a series of BooleanSearch’s combined by implicit AND
  45. // or parse everything as a series of Search’s combined by explicit OR
  46. $this->parseParentheses($input, $level) || $this->parseOrSegments($input);
  47. }
  48. /**
  49. * Parse the user queries (saved searches) by name and expand them in the input string.
  50. */
  51. private function parseUserQueryNames(string $input, bool $allowUserQueries = true): string {
  52. $all_matches = [];
  53. if (preg_match_all('/\bsearch:(?P<delim>[\'"])(?P<search>.*)(?P=delim)/U', $input, $matchesFound)) {
  54. $all_matches[] = $matchesFound;
  55. }
  56. if (preg_match_all('/\bsearch:(?P<search>[^\s"]*)/', $input, $matchesFound)) {
  57. $all_matches[] = $matchesFound;
  58. }
  59. if (!empty($all_matches)) {
  60. $queries = [];
  61. foreach (FreshRSS_Context::userConf()->queries as $raw_query) {
  62. if (($raw_query['name'] ?? '') !== '' && ($raw_query['search'] ?? '') !== '') {
  63. $queries[$raw_query['name']] = trim($raw_query['search']);
  64. }
  65. }
  66. $fromS = [];
  67. $toS = [];
  68. foreach ($all_matches as $matches) {
  69. if (empty($matches['search'])) {
  70. continue;
  71. }
  72. for ($i = count($matches['search']) - 1; $i >= 0; $i--) {
  73. $name = trim($matches['search'][$i]);
  74. if (!empty($queries[$name])) {
  75. $fromS[] = $matches[0][$i];
  76. if ($allowUserQueries) {
  77. $toS[] = '(' . self::escapeLiteralParentheses($queries[$name]) . ')';
  78. } else {
  79. $toS[] = '';
  80. }
  81. }
  82. }
  83. }
  84. $input = str_replace($fromS, $toS, $input);
  85. }
  86. return $input;
  87. }
  88. /**
  89. * Parse the user queries (saved searches) by ID and expand them in the input string.
  90. */
  91. private function parseUserQueryIds(string $input, bool $allowUserQueries = true): string {
  92. $all_matches = [];
  93. if (preg_match_all('/\bS:(?P<search>\d+)/', $input, $matchesFound)) {
  94. $all_matches[] = $matchesFound;
  95. }
  96. if (!empty($all_matches)) {
  97. $queries = [];
  98. foreach (FreshRSS_Context::userConf()->queries as $raw_query) {
  99. $queries[] = trim($raw_query['search'] ?? '');
  100. }
  101. $fromS = [];
  102. $toS = [];
  103. foreach ($all_matches as $matches) {
  104. if (empty($matches['search'])) {
  105. continue;
  106. }
  107. for ($i = count($matches['search']) - 1; $i >= 0; $i--) {
  108. // Index starting from 1
  109. $id = (int)(trim($matches['search'][$i])) - 1;
  110. if (!empty($queries[$id])) {
  111. $fromS[] = $matches[0][$i];
  112. if ($allowUserQueries) {
  113. $toS[] = '(' . self::escapeLiteralParentheses($queries[$id]) . ')';
  114. } else {
  115. $toS[] = '';
  116. }
  117. }
  118. }
  119. }
  120. $input = str_replace($fromS, $toS, $input);
  121. }
  122. return $input;
  123. }
  124. /**
  125. * Temporarily escape parentheses used in regex expressions or inside quoted strings.
  126. */
  127. public static function escapeLiteralParentheses(string $input): string {
  128. return preg_replace_callback('%(?<=[\\s(:#!-]|^)(?<![\\\\])(?P<delim>[\'"/]).+?(?<!\\\\)(?P=delim)[im]*%',
  129. fn(array $matches): string => str_replace(['(', ')'], ['\\u0028', '\\u0029'], $matches[0]),
  130. $input
  131. ) ?? '';
  132. }
  133. public static function unescapeLiteralParentheses(string $input): string {
  134. return str_replace(['\\u0028', '\\u0029'], ['(', ')'], $input);
  135. }
  136. /**
  137. * Example: 'ab cd OR ef OR "gh ij"' becomes '(ab cd) OR (ef) OR ("gh ij")'
  138. */
  139. public static function addOrParentheses(string $input): string {
  140. $input = trim($input);
  141. if ($input === '') {
  142. return '';
  143. }
  144. $splits = preg_split('/\b(OR)\b/i', $input, -1, PREG_SPLIT_DELIM_CAPTURE) ?: [];
  145. $ns = count($splits);
  146. if ($ns <= 1) {
  147. return $input;
  148. }
  149. $result = '';
  150. $segment = '';
  151. for ($i = 0; $i < $ns; $i++) {
  152. $segment .= $splits[$i];
  153. if (trim($segment) === '') {
  154. $segment = '';
  155. } elseif (strcasecmp($segment, 'OR') === 0) {
  156. $result .= $segment . ' ';
  157. $segment = '';
  158. } else {
  159. $quotes = substr_count($segment, '"') + substr_count($segment, '&quot;');
  160. if ($quotes % 2 === 0) {
  161. $segment = trim($segment);
  162. if (in_array($segment, ['!', '-'], true)) {
  163. $result .= $segment;
  164. } else {
  165. $result .= '(' . $segment . ') ';
  166. }
  167. $segment = '';
  168. }
  169. }
  170. }
  171. $segment = trim($segment);
  172. if (in_array($segment, ['!', '-'], true)) {
  173. $result .= $segment;
  174. } elseif ($segment !== '') {
  175. $result .= '(' . $segment . ')';
  176. }
  177. return trim($result);
  178. }
  179. /**
  180. * If the query contains a mix of `OR` expressions with and without parentheses,
  181. * then add parentheses to make the query consistent.
  182. * Example: '(ab (cd OR ef)) OR gh OR ij OR (kl)' becomes '(ab ((cd) OR (ef))) OR (gh) OR (ij) OR (kl)'
  183. */
  184. public static function consistentOrParentheses(string $input): string {
  185. if (!preg_match('/(?<!\\\\)\\(/', $input)) {
  186. // No unescaped parentheses in the input
  187. return trim($input);
  188. }
  189. $parenthesesCount = 0;
  190. $result = '';
  191. $segment = '';
  192. $length = strlen($input);
  193. for ($i = 0; $i < $length; $i++) {
  194. $c = $input[$i];
  195. $backslashed = $i >= 1 ? $input[$i - 1] === '\\' : false;
  196. if (!$backslashed) {
  197. if ($c === '(') {
  198. if ($parenthesesCount === 0) {
  199. if ($segment !== '') {
  200. $result = rtrim($result) . ' ' . self::addOrParentheses($segment);
  201. $negation = preg_match('/[!-]$/', $result);
  202. if (!$negation) {
  203. $result .= ' ';
  204. }
  205. $segment = '';
  206. }
  207. $c = '';
  208. }
  209. $parenthesesCount++;
  210. } elseif ($c === ')') {
  211. $parenthesesCount--;
  212. if ($parenthesesCount === 0) {
  213. $segment = self::consistentOrParentheses($segment);
  214. if ($segment !== '') {
  215. $result .= '(' . $segment . ')';
  216. $segment = '';
  217. }
  218. $c = '';
  219. }
  220. }
  221. }
  222. $segment .= $c;
  223. }
  224. if (trim($segment) !== '') {
  225. $result = rtrim($result);
  226. $negation = preg_match('/[!-]$/', $segment);
  227. if (!$negation) {
  228. $result .= ' ';
  229. }
  230. $result .= self::addOrParentheses($segment);
  231. }
  232. return trim($result);
  233. }
  234. /** @return bool True if some parenthesis logic took over, false otherwise */
  235. private function parseParentheses(string $input, int $level): bool {
  236. $input = trim($input);
  237. $length = strlen($input);
  238. $i = 0;
  239. $before = '';
  240. $hasParenthesis = false;
  241. $nextOperator = 'AND';
  242. while ($i < $length) {
  243. $c = $input[$i];
  244. $backslashed = $i >= 1 ? $input[$i - 1] === '\\' : false;
  245. if ($c === '(' && !$backslashed) {
  246. $hasParenthesis = true;
  247. $before = trim($before);
  248. if (preg_match('/[!-]$/', $before)) {
  249. // Trim trailing negation
  250. $before = rtrim($before, ' !-');
  251. $isOr = preg_match('/\bOR$/i', $before);
  252. if ($isOr) {
  253. // Trim trailing OR
  254. $before = substr($before, 0, -2);
  255. }
  256. // The text prior to the negation is a BooleanSearch
  257. $searchBefore = new FreshRSS_BooleanSearch($before, $level + 1, $nextOperator);
  258. if (count($searchBefore->searches()) > 0) {
  259. $this->searches[] = $searchBefore;
  260. }
  261. $before = '';
  262. // The next BooleanSearch will have to be combined with AND NOT or OR NOT instead of default AND
  263. $nextOperator = $isOr ? 'OR NOT' : 'AND NOT';
  264. } elseif (preg_match('/\bOR$/i', $before)) {
  265. // Trim trailing OR
  266. $before = substr($before, 0, -2);
  267. // The text prior to the OR is a BooleanSearch
  268. $searchBefore = new FreshRSS_BooleanSearch($before, $level + 1, $nextOperator);
  269. if (count($searchBefore->searches()) > 0) {
  270. $this->searches[] = $searchBefore;
  271. }
  272. $before = '';
  273. // The next BooleanSearch will have to be combined with OR instead of default AND
  274. $nextOperator = 'OR';
  275. } elseif ($before !== '') {
  276. // The text prior to the opening parenthesis is a BooleanSearch
  277. $searchBefore = new FreshRSS_BooleanSearch($before, $level + 1, $nextOperator);
  278. if (count($searchBefore->searches()) > 0) {
  279. $this->searches[] = $searchBefore;
  280. }
  281. $before = '';
  282. }
  283. // Search the matching closing parenthesis
  284. $parentheses = 1;
  285. $sub = '';
  286. $i++;
  287. while ($i < $length) {
  288. $c = $input[$i];
  289. $backslashed = $input[$i - 1] === '\\';
  290. if ($c === '(' && !$backslashed) {
  291. // One nested level deeper
  292. $parentheses++;
  293. $sub .= $c;
  294. } elseif ($c === ')' && !$backslashed) {
  295. $parentheses--;
  296. if ($parentheses === 0) {
  297. // Found the matching closing parenthesis
  298. $searchSub = new FreshRSS_BooleanSearch($sub, $level + 1, $nextOperator);
  299. $nextOperator = 'AND';
  300. if (count($searchSub->searches()) > 0) {
  301. $this->searches[] = $searchSub;
  302. }
  303. $sub = '';
  304. break;
  305. } else {
  306. $sub .= $c;
  307. }
  308. } else {
  309. $sub .= $c;
  310. }
  311. $i++;
  312. }
  313. // $sub = trim($sub);
  314. // if ($sub !== '') {
  315. // // TODO: Consider throwing an error or warning in case of non-matching parenthesis
  316. // }
  317. // } elseif ($c === ')') {
  318. // // TODO: Consider throwing an error or warning in case of non-matching parenthesis
  319. } else {
  320. $before .= $c;
  321. }
  322. $i++;
  323. }
  324. if ($hasParenthesis) {
  325. $before = trim($before);
  326. if (preg_match('/^OR\b/i', $before)) {
  327. // The next BooleanSearch will have to be combined with OR instead of default AND
  328. $nextOperator = 'OR';
  329. // Trim leading OR
  330. $before = substr($before, 2);
  331. }
  332. // The remaining text after the last parenthesis is a BooleanSearch
  333. $searchBefore = new FreshRSS_BooleanSearch($before, $level + 1, $nextOperator);
  334. $nextOperator = 'AND';
  335. if (count($searchBefore->searches()) > 0) {
  336. $this->searches[] = $searchBefore;
  337. }
  338. return true;
  339. }
  340. // There was no parenthesis logic to apply
  341. return false;
  342. }
  343. private function parseOrSegments(string $input): void {
  344. $input = trim($input);
  345. if ($input === '') {
  346. return;
  347. }
  348. $splits = preg_split('/\b(OR)\b/i', $input, -1, PREG_SPLIT_DELIM_CAPTURE) ?: [];
  349. $segment = '';
  350. $ns = count($splits);
  351. for ($i = 0; $i < $ns; $i++) {
  352. $segment = $segment . $splits[$i];
  353. if (trim($segment) === '' || strcasecmp($segment, 'OR') === 0) {
  354. $segment = '';
  355. } else {
  356. $quotes = substr_count($segment, '"') + substr_count($segment, '&quot;');
  357. if ($quotes % 2 === 0) {
  358. $segment = trim($segment);
  359. $this->searches[] = new FreshRSS_Search($segment);
  360. $segment = '';
  361. }
  362. }
  363. }
  364. $segment = trim($segment);
  365. if ($segment !== '') {
  366. $this->searches[] = new FreshRSS_Search($segment);
  367. }
  368. }
  369. /**
  370. * Either a list of FreshRSS_BooleanSearch combined by implicit AND
  371. * or a series of FreshRSS_Search combined by explicit OR
  372. * @return array<FreshRSS_BooleanSearch|FreshRSS_Search>
  373. */
  374. public function searches(): array {
  375. return $this->searches;
  376. }
  377. /** @return 'AND'|'OR'|'AND NOT'|'OR NOT' depending on how this BooleanSearch should be combined */
  378. public function operator(): string {
  379. return $this->operator;
  380. }
  381. /** @param FreshRSS_BooleanSearch|FreshRSS_Search $search */
  382. public function add($search): void {
  383. $this->searches[] = $search;
  384. }
  385. #[\Override]
  386. public function __toString(): string {
  387. return $this->getRawInput();
  388. }
  389. /** @return string Plain text search query. Must be XML-encoded or URL-encoded depending on the situation */
  390. public function getRawInput(): string {
  391. return $this->raw_input;
  392. }
  393. }