BooleanSearch.php 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424
  1. <?php
  2. declare(strict_types=1);
  3. /**
  4. * Contains Boolean search from the search form.
  5. */
  6. class FreshRSS_BooleanSearch {
  7. private string $raw_input = '';
  8. /** @var array<FreshRSS_BooleanSearch|FreshRSS_Search> */
  9. private array $searches = [];
  10. /**
  11. * @phpstan-var 'AND'|'OR'|'AND NOT'|'OR NOT'
  12. */
  13. private string $operator;
  14. /** @param 'AND'|'OR'|'AND NOT'|'OR NOT' $operator */
  15. public function __construct(string $input, int $level = 0, string $operator = 'AND', bool $allowUserQueries = true) {
  16. $this->operator = $operator;
  17. $input = trim($input);
  18. if ($input === '') {
  19. return;
  20. }
  21. if ($level === 0) {
  22. $input = preg_replace('/:&quot;(.*?)&quot;/', ':"\1"', $input);
  23. if (!is_string($input)) {
  24. return;
  25. }
  26. $input = preg_replace('/(?<=[\s(!-]|^)&quot;(.*?)&quot;/', '"\1"', $input);
  27. if (!is_string($input)) {
  28. return;
  29. }
  30. }
  31. $this->raw_input = $input;
  32. if ($level === 0) {
  33. $input = self::escapeRegexParentheses($input);
  34. $input = $this->parseUserQueryNames($input, $allowUserQueries);
  35. $input = $this->parseUserQueryIds($input, $allowUserQueries);
  36. $input = trim($input);
  37. }
  38. $input = self::consistentOrParentheses($input);
  39. // Either parse everything as a series of BooleanSearch’s combined by implicit AND
  40. // or parse everything as a series of Search’s combined by explicit OR
  41. $this->parseParentheses($input, $level) || $this->parseOrSegments($input);
  42. }
  43. /**
  44. * Parse the user queries (saved searches) by name and expand them in the input string.
  45. */
  46. private function parseUserQueryNames(string $input, bool $allowUserQueries = true): string {
  47. $all_matches = [];
  48. if (preg_match_all('/\bsearch:(?P<delim>[\'"])(?P<search>.*)(?P=delim)/U', $input, $matchesFound)) {
  49. $all_matches[] = $matchesFound;
  50. }
  51. if (preg_match_all('/\bsearch:(?P<search>[^\s"]*)/', $input, $matchesFound)) {
  52. $all_matches[] = $matchesFound;
  53. }
  54. if (!empty($all_matches)) {
  55. $queries = [];
  56. foreach (FreshRSS_Context::userConf()->queries as $raw_query) {
  57. if (($raw_query['name'] ?? '') !== '' && ($raw_query['search'] ?? '') !== '') {
  58. $queries[$raw_query['name']] = trim($raw_query['search']);
  59. }
  60. }
  61. $fromS = [];
  62. $toS = [];
  63. foreach ($all_matches as $matches) {
  64. if (empty($matches['search'])) {
  65. continue;
  66. }
  67. for ($i = count($matches['search']) - 1; $i >= 0; $i--) {
  68. $name = trim($matches['search'][$i]);
  69. if (!empty($queries[$name])) {
  70. $fromS[] = $matches[0][$i];
  71. if ($allowUserQueries) {
  72. $toS[] = '(' . self::escapeRegexParentheses($queries[$name]) . ')';
  73. } else {
  74. $toS[] = '';
  75. }
  76. }
  77. }
  78. }
  79. $input = str_replace($fromS, $toS, $input);
  80. }
  81. return $input;
  82. }
  83. /**
  84. * Parse the user queries (saved searches) by ID and expand them in the input string.
  85. */
  86. private function parseUserQueryIds(string $input, bool $allowUserQueries = true): string {
  87. $all_matches = [];
  88. if (preg_match_all('/\bS:(?P<search>\d+)/', $input, $matchesFound)) {
  89. $all_matches[] = $matchesFound;
  90. }
  91. if (!empty($all_matches)) {
  92. $queries = [];
  93. foreach (FreshRSS_Context::userConf()->queries as $raw_query) {
  94. $queries[] = trim($raw_query['search'] ?? '');
  95. }
  96. $fromS = [];
  97. $toS = [];
  98. foreach ($all_matches as $matches) {
  99. if (empty($matches['search'])) {
  100. continue;
  101. }
  102. for ($i = count($matches['search']) - 1; $i >= 0; $i--) {
  103. // Index starting from 1
  104. $id = (int)(trim($matches['search'][$i])) - 1;
  105. if (!empty($queries[$id])) {
  106. $fromS[] = $matches[0][$i];
  107. if ($allowUserQueries) {
  108. $toS[] = '(' . self::escapeRegexParentheses($queries[$id]) . ')';
  109. } else {
  110. $toS[] = '';
  111. }
  112. }
  113. }
  114. }
  115. $input = str_replace($fromS, $toS, $input);
  116. }
  117. return $input;
  118. }
  119. /**
  120. * Temporarily escape parentheses used in regex expressions.
  121. */
  122. public static function escapeRegexParentheses(string $input): string {
  123. return preg_replace_callback('#(?<=[\\s(:!-]|^)(?<![\\\\])/.+?(?<!\\\\)/[im]*#',
  124. fn(array $matches): string => str_replace(['(', ')'], ['\\u0028', '\\u0029'], $matches[0]),
  125. $input
  126. ) ?? '';
  127. }
  128. public static function unescapeRegexParentheses(string $input): string {
  129. return str_replace(['\\u0028', '\\u0029'], ['(', ')'], $input);
  130. }
  131. /**
  132. * Example: 'ab cd OR ef OR "gh ij"' becomes '(ab cd) OR (ef) OR ("gh ij")'
  133. */
  134. public static function addOrParentheses(string $input): string {
  135. $input = trim($input);
  136. if ($input === '') {
  137. return '';
  138. }
  139. $splits = preg_split('/\b(OR)\b/i', $input, -1, PREG_SPLIT_DELIM_CAPTURE) ?: [];
  140. $ns = count($splits);
  141. if ($ns <= 1) {
  142. return $input;
  143. }
  144. $result = '';
  145. $segment = '';
  146. for ($i = 0; $i < $ns; $i++) {
  147. $segment .= $splits[$i];
  148. if (trim($segment) === '') {
  149. $segment = '';
  150. } elseif (strcasecmp($segment, 'OR') === 0) {
  151. $result .= $segment . ' ';
  152. $segment = '';
  153. } else {
  154. $quotes = substr_count($segment, '"') + substr_count($segment, '&quot;');
  155. if ($quotes % 2 === 0) {
  156. $segment = trim($segment);
  157. if (in_array($segment, ['!', '-'], true)) {
  158. $result .= $segment;
  159. } else {
  160. $result .= '(' . $segment . ') ';
  161. }
  162. $segment = '';
  163. }
  164. }
  165. }
  166. $segment = trim($segment);
  167. if (in_array($segment, ['!', '-'], true)) {
  168. $result .= $segment;
  169. } elseif ($segment !== '') {
  170. $result .= '(' . $segment . ')';
  171. }
  172. return trim($result);
  173. }
  174. /**
  175. * If the query contains a mix of `OR` expressions with and without parentheses,
  176. * then add parentheses to make the query consistent.
  177. * Example: '(ab (cd OR ef)) OR gh OR ij OR (kl)' becomes '(ab ((cd) OR (ef))) OR (gh) OR (ij) OR (kl)'
  178. */
  179. public static function consistentOrParentheses(string $input): string {
  180. if (!preg_match('/(?<!\\\\)\\(/', $input)) {
  181. // No unescaped parentheses in the input
  182. return trim($input);
  183. }
  184. $parenthesesCount = 0;
  185. $result = '';
  186. $segment = '';
  187. $length = strlen($input);
  188. for ($i = 0; $i < $length; $i++) {
  189. $c = $input[$i];
  190. $backslashed = $i >= 1 ? $input[$i - 1] === '\\' : false;
  191. if (!$backslashed) {
  192. if ($c === '(') {
  193. if ($parenthesesCount === 0) {
  194. if ($segment !== '') {
  195. $result = rtrim($result) . ' ' . self::addOrParentheses($segment);
  196. $negation = preg_match('/[!-]$/', $result);
  197. if (!$negation) {
  198. $result .= ' ';
  199. }
  200. $segment = '';
  201. }
  202. $c = '';
  203. }
  204. $parenthesesCount++;
  205. } elseif ($c === ')') {
  206. $parenthesesCount--;
  207. if ($parenthesesCount === 0) {
  208. $segment = self::consistentOrParentheses($segment);
  209. if ($segment !== '') {
  210. $result .= '(' . $segment . ')';
  211. $segment = '';
  212. }
  213. $c = '';
  214. }
  215. }
  216. }
  217. $segment .= $c;
  218. }
  219. if (trim($segment) !== '') {
  220. $result = rtrim($result);
  221. $negation = preg_match('/[!-]$/', $segment);
  222. if (!$negation) {
  223. $result .= ' ';
  224. }
  225. $result .= self::addOrParentheses($segment);
  226. }
  227. return trim($result);
  228. }
  229. /** @return bool True if some parenthesis logic took over, false otherwise */
  230. private function parseParentheses(string $input, int $level): bool {
  231. $input = trim($input);
  232. $length = strlen($input);
  233. $i = 0;
  234. $before = '';
  235. $hasParenthesis = false;
  236. $nextOperator = 'AND';
  237. while ($i < $length) {
  238. $c = $input[$i];
  239. $backslashed = $i >= 1 ? $input[$i - 1] === '\\' : false;
  240. if ($c === '(' && !$backslashed) {
  241. $hasParenthesis = true;
  242. $before = trim($before);
  243. if (preg_match('/[!-]$/', $before)) {
  244. // Trim trailing negation
  245. $before = rtrim($before, ' !-');
  246. $isOr = preg_match('/\bOR$/i', $before);
  247. if ($isOr) {
  248. // Trim trailing OR
  249. $before = substr($before, 0, -2);
  250. }
  251. // The text prior to the negation is a BooleanSearch
  252. $searchBefore = new FreshRSS_BooleanSearch($before, $level + 1, $nextOperator);
  253. if (count($searchBefore->searches()) > 0) {
  254. $this->searches[] = $searchBefore;
  255. }
  256. $before = '';
  257. // The next BooleanSearch will have to be combined with AND NOT or OR NOT instead of default AND
  258. $nextOperator = $isOr ? 'OR NOT' : 'AND NOT';
  259. } elseif (preg_match('/\bOR$/i', $before)) {
  260. // Trim trailing OR
  261. $before = substr($before, 0, -2);
  262. // The text prior to the OR is a BooleanSearch
  263. $searchBefore = new FreshRSS_BooleanSearch($before, $level + 1, $nextOperator);
  264. if (count($searchBefore->searches()) > 0) {
  265. $this->searches[] = $searchBefore;
  266. }
  267. $before = '';
  268. // The next BooleanSearch will have to be combined with OR instead of default AND
  269. $nextOperator = 'OR';
  270. } elseif ($before !== '') {
  271. // The text prior to the opening parenthesis is a BooleanSearch
  272. $searchBefore = new FreshRSS_BooleanSearch($before, $level + 1, $nextOperator);
  273. if (count($searchBefore->searches()) > 0) {
  274. $this->searches[] = $searchBefore;
  275. }
  276. $before = '';
  277. }
  278. // Search the matching closing parenthesis
  279. $parentheses = 1;
  280. $sub = '';
  281. $i++;
  282. while ($i < $length) {
  283. $c = $input[$i];
  284. $backslashed = $input[$i - 1] === '\\';
  285. if ($c === '(' && !$backslashed) {
  286. // One nested level deeper
  287. $parentheses++;
  288. $sub .= $c;
  289. } elseif ($c === ')' && !$backslashed) {
  290. $parentheses--;
  291. if ($parentheses === 0) {
  292. // Found the matching closing parenthesis
  293. $searchSub = new FreshRSS_BooleanSearch($sub, $level + 1, $nextOperator);
  294. $nextOperator = 'AND';
  295. if (count($searchSub->searches()) > 0) {
  296. $this->searches[] = $searchSub;
  297. }
  298. $sub = '';
  299. break;
  300. } else {
  301. $sub .= $c;
  302. }
  303. } else {
  304. $sub .= $c;
  305. }
  306. $i++;
  307. }
  308. // $sub = trim($sub);
  309. // if ($sub !== '') {
  310. // // TODO: Consider throwing an error or warning in case of non-matching parenthesis
  311. // }
  312. // } elseif ($c === ')') {
  313. // // TODO: Consider throwing an error or warning in case of non-matching parenthesis
  314. } else {
  315. $before .= $c;
  316. }
  317. $i++;
  318. }
  319. if ($hasParenthesis) {
  320. $before = trim($before);
  321. if (preg_match('/^OR\b/i', $before)) {
  322. // The next BooleanSearch will have to be combined with OR instead of default AND
  323. $nextOperator = 'OR';
  324. // Trim leading OR
  325. $before = substr($before, 2);
  326. }
  327. // The remaining text after the last parenthesis is a BooleanSearch
  328. $searchBefore = new FreshRSS_BooleanSearch($before, $level + 1, $nextOperator);
  329. $nextOperator = 'AND';
  330. if (count($searchBefore->searches()) > 0) {
  331. $this->searches[] = $searchBefore;
  332. }
  333. return true;
  334. }
  335. // There was no parenthesis logic to apply
  336. return false;
  337. }
  338. private function parseOrSegments(string $input): void {
  339. $input = trim($input);
  340. if ($input === '') {
  341. return;
  342. }
  343. $splits = preg_split('/\b(OR)\b/i', $input, -1, PREG_SPLIT_DELIM_CAPTURE) ?: [];
  344. $segment = '';
  345. $ns = count($splits);
  346. for ($i = 0; $i < $ns; $i++) {
  347. $segment = $segment . $splits[$i];
  348. if (trim($segment) === '' || strcasecmp($segment, 'OR') === 0) {
  349. $segment = '';
  350. } else {
  351. $quotes = substr_count($segment, '"') + substr_count($segment, '&quot;');
  352. if ($quotes % 2 === 0) {
  353. $segment = trim($segment);
  354. $this->searches[] = new FreshRSS_Search($segment);
  355. $segment = '';
  356. }
  357. }
  358. }
  359. $segment = trim($segment);
  360. if ($segment !== '') {
  361. $this->searches[] = new FreshRSS_Search($segment);
  362. }
  363. }
  364. /**
  365. * Either a list of FreshRSS_BooleanSearch combined by implicit AND
  366. * or a series of FreshRSS_Search combined by explicit OR
  367. * @return array<FreshRSS_BooleanSearch|FreshRSS_Search>
  368. */
  369. public function searches(): array {
  370. return $this->searches;
  371. }
  372. /** @return 'AND'|'OR'|'AND NOT'|'OR NOT' depending on how this BooleanSearch should be combined */
  373. public function operator(): string {
  374. return $this->operator;
  375. }
  376. /** @param FreshRSS_BooleanSearch|FreshRSS_Search $search */
  377. public function add($search): void {
  378. $this->searches[] = $search;
  379. }
  380. #[\Override]
  381. public function __toString(): string {
  382. return $this->getRawInput();
  383. }
  384. /** @return string Plain text search query. Must be XML-encoded or URL-encoded depending on the situation */
  385. public function getRawInput(): string {
  386. return $this->raw_input;
  387. }
  388. }