BooleanSearch.php 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409
  1. <?php
  2. declare(strict_types=1);
  3. /**
  4. * Contains Boolean search from the search form.
  5. */
  6. class FreshRSS_BooleanSearch {
  7. private string $raw_input = '';
  8. /** @var array<FreshRSS_BooleanSearch|FreshRSS_Search> */
  9. private array $searches = [];
  10. /**
  11. * @phpstan-var 'AND'|'OR'|'AND NOT'|'OR NOT'
  12. */
  13. private string $operator;
  14. /** @param 'AND'|'OR'|'AND NOT'|'OR NOT' $operator */
  15. public function __construct(string $input, int $level = 0, string $operator = 'AND', bool $allowUserQueries = true) {
  16. $this->operator = $operator;
  17. $input = trim($input);
  18. if ($input === '') {
  19. return;
  20. }
  21. if ($level === 0) {
  22. $input = preg_replace('/:&quot;(.*?)&quot;/', ':"\1"', $input);
  23. if (!is_string($input)) {
  24. return;
  25. }
  26. $input = preg_replace('/(?<=[\s!-]|^)&quot;(.*?)&quot;/', '"\1"', $input);
  27. if (!is_string($input)) {
  28. return;
  29. }
  30. $input = $this->parseUserQueryNames($input, $allowUserQueries);
  31. $input = $this->parseUserQueryIds($input, $allowUserQueries);
  32. $input = trim($input);
  33. }
  34. $this->raw_input = $input;
  35. $input = self::consistentOrParentheses($input);
  36. // Either parse everything as a series of BooleanSearch’s combined by implicit AND
  37. // or parse everything as a series of Search’s combined by explicit OR
  38. $this->parseParentheses($input, $level) || $this->parseOrSegments($input);
  39. }
  40. /**
  41. * Parse the user queries (saved searches) by name and expand them in the input string.
  42. */
  43. private function parseUserQueryNames(string $input, bool $allowUserQueries = true): string {
  44. $all_matches = [];
  45. if (preg_match_all('/\bsearch:(?P<delim>[\'"])(?P<search>.*)(?P=delim)/U', $input, $matchesFound)) {
  46. $all_matches[] = $matchesFound;
  47. }
  48. if (preg_match_all('/\bsearch:(?P<search>[^\s"]*)/', $input, $matchesFound)) {
  49. $all_matches[] = $matchesFound;
  50. }
  51. if (!empty($all_matches)) {
  52. /** @var array<string,FreshRSS_UserQuery> */
  53. $queries = [];
  54. foreach (FreshRSS_Context::userConf()->queries as $raw_query) {
  55. $query = new FreshRSS_UserQuery($raw_query, FreshRSS_Context::categories(), FreshRSS_Context::labels());
  56. $queries[$query->getName()] = $query;
  57. }
  58. $fromS = [];
  59. $toS = [];
  60. foreach ($all_matches as $matches) {
  61. if (empty($matches['search'])) {
  62. continue;
  63. }
  64. for ($i = count($matches['search']) - 1; $i >= 0; $i--) {
  65. $name = trim($matches['search'][$i]);
  66. if (!empty($queries[$name])) {
  67. $fromS[] = $matches[0][$i];
  68. if ($allowUserQueries) {
  69. $toS[] = '(' . trim($queries[$name]->getSearch()->getRawInput()) . ')';
  70. } else {
  71. $toS[] = '';
  72. }
  73. }
  74. }
  75. }
  76. $input = str_replace($fromS, $toS, $input);
  77. }
  78. return $input;
  79. }
  80. /**
  81. * Parse the user queries (saved searches) by ID and expand them in the input string.
  82. */
  83. private function parseUserQueryIds(string $input, bool $allowUserQueries = true): string {
  84. $all_matches = [];
  85. if (preg_match_all('/\bS:(?P<search>\d+)/', $input, $matchesFound)) {
  86. $all_matches[] = $matchesFound;
  87. }
  88. if (!empty($all_matches)) {
  89. /** @var array<string,FreshRSS_UserQuery> */
  90. $queries = [];
  91. foreach (FreshRSS_Context::userConf()->queries as $raw_query) {
  92. $query = new FreshRSS_UserQuery($raw_query, FreshRSS_Context::categories(), FreshRSS_Context::labels());
  93. $queries[] = $query;
  94. }
  95. $fromS = [];
  96. $toS = [];
  97. foreach ($all_matches as $matches) {
  98. if (empty($matches['search'])) {
  99. continue;
  100. }
  101. for ($i = count($matches['search']) - 1; $i >= 0; $i--) {
  102. // Index starting from 1
  103. $id = (int)(trim($matches['search'][$i])) - 1;
  104. if (!empty($queries[$id])) {
  105. $fromS[] = $matches[0][$i];
  106. if ($allowUserQueries) {
  107. $toS[] = '(' . trim($queries[$id]->getSearch()->getRawInput()) . ')';
  108. } else {
  109. $toS[] = '';
  110. }
  111. }
  112. }
  113. }
  114. $input = str_replace($fromS, $toS, $input);
  115. }
  116. return $input;
  117. }
  118. /**
  119. * Example: 'ab cd OR ef OR "gh ij"' becomes '(ab cd) OR (ef) OR ("gh ij")'
  120. */
  121. public static function addOrParentheses(string $input): string {
  122. $input = trim($input);
  123. if ($input === '') {
  124. return '';
  125. }
  126. $splits = preg_split('/\b(OR)\b/i', $input, -1, PREG_SPLIT_DELIM_CAPTURE) ?: [];
  127. $ns = count($splits);
  128. if ($ns <= 1) {
  129. return $input;
  130. }
  131. $result = '';
  132. $segment = '';
  133. for ($i = 0; $i < $ns; $i++) {
  134. $segment .= $splits[$i];
  135. if (trim($segment) === '') {
  136. $segment = '';
  137. } elseif (strcasecmp($segment, 'OR') === 0) {
  138. $result .= $segment . ' ';
  139. $segment = '';
  140. } else {
  141. $quotes = substr_count($segment, '"') + substr_count($segment, '&quot;');
  142. if ($quotes % 2 === 0) {
  143. $segment = trim($segment);
  144. if (in_array($segment, ['!', '-'], true)) {
  145. $result .= $segment;
  146. } else {
  147. $result .= '(' . $segment . ') ';
  148. }
  149. $segment = '';
  150. }
  151. }
  152. }
  153. $segment = trim($segment);
  154. if (in_array($segment, ['!', '-'], true)) {
  155. $result .= $segment;
  156. } elseif ($segment !== '') {
  157. $result .= '(' . $segment . ')';
  158. }
  159. return trim($result);
  160. }
  161. /**
  162. * If the query contains a mix of `OR` expressions with and without parentheses,
  163. * then add parentheses to make the query consistent.
  164. * Example: '(ab (cd OR ef)) OR gh OR ij OR (kl)' becomes '(ab ((cd) OR (ef))) OR (gh) OR (ij) OR (kl)'
  165. */
  166. public static function consistentOrParentheses(string $input): string {
  167. if (!preg_match('/(?<!\\\\)\\(/', $input)) {
  168. // No unescaped parentheses in the input
  169. return trim($input);
  170. }
  171. $parenthesesCount = 0;
  172. $result = '';
  173. $segment = '';
  174. $length = strlen($input);
  175. for ($i = 0; $i < $length; $i++) {
  176. $c = $input[$i];
  177. $backslashed = $i >= 1 ? $input[$i - 1] === '\\' : false;
  178. if (!$backslashed) {
  179. if ($c === '(') {
  180. if ($parenthesesCount === 0) {
  181. if ($segment !== '') {
  182. $result = rtrim($result) . ' ' . self::addOrParentheses($segment);
  183. $negation = preg_match('/[!-]$/', $result);
  184. if (!$negation) {
  185. $result .= ' ';
  186. }
  187. $segment = '';
  188. }
  189. $c = '';
  190. }
  191. $parenthesesCount++;
  192. } elseif ($c === ')') {
  193. $parenthesesCount--;
  194. if ($parenthesesCount === 0) {
  195. $segment = self::consistentOrParentheses($segment);
  196. if ($segment !== '') {
  197. $result .= '(' . $segment . ')';
  198. $segment = '';
  199. }
  200. $c = '';
  201. }
  202. }
  203. }
  204. $segment .= $c;
  205. }
  206. if (trim($segment) !== '') {
  207. $result = rtrim($result);
  208. $negation = preg_match('/[!-]$/', $segment);
  209. if (!$negation) {
  210. $result .= ' ';
  211. }
  212. $result .= self::addOrParentheses($segment);
  213. }
  214. return trim($result);
  215. }
  216. /** @return bool True if some parenthesis logic took over, false otherwise */
  217. private function parseParentheses(string $input, int $level): bool {
  218. $input = trim($input);
  219. $length = strlen($input);
  220. $i = 0;
  221. $before = '';
  222. $hasParenthesis = false;
  223. $nextOperator = 'AND';
  224. while ($i < $length) {
  225. $c = $input[$i];
  226. $backslashed = $i >= 1 ? $input[$i - 1] === '\\' : false;
  227. if ($c === '(' && !$backslashed) {
  228. $hasParenthesis = true;
  229. $before = trim($before);
  230. if (preg_match('/[!-]$/', $before)) {
  231. // Trim trailing negation
  232. $before = rtrim($before, ' !-');
  233. $isOr = preg_match('/\bOR$/i', $before);
  234. if ($isOr) {
  235. // Trim trailing OR
  236. $before = substr($before, 0, -2);
  237. }
  238. // The text prior to the negation is a BooleanSearch
  239. $searchBefore = new FreshRSS_BooleanSearch($before, $level + 1, $nextOperator);
  240. if (count($searchBefore->searches()) > 0) {
  241. $this->searches[] = $searchBefore;
  242. }
  243. $before = '';
  244. // The next BooleanSearch will have to be combined with AND NOT or OR NOT instead of default AND
  245. $nextOperator = $isOr ? 'OR NOT' : 'AND NOT';
  246. } elseif (preg_match('/\bOR$/i', $before)) {
  247. // Trim trailing OR
  248. $before = substr($before, 0, -2);
  249. // The text prior to the OR is a BooleanSearch
  250. $searchBefore = new FreshRSS_BooleanSearch($before, $level + 1, $nextOperator);
  251. if (count($searchBefore->searches()) > 0) {
  252. $this->searches[] = $searchBefore;
  253. }
  254. $before = '';
  255. // The next BooleanSearch will have to be combined with OR instead of default AND
  256. $nextOperator = 'OR';
  257. } elseif ($before !== '') {
  258. // The text prior to the opening parenthesis is a BooleanSearch
  259. $searchBefore = new FreshRSS_BooleanSearch($before, $level + 1, $nextOperator);
  260. if (count($searchBefore->searches()) > 0) {
  261. $this->searches[] = $searchBefore;
  262. }
  263. $before = '';
  264. }
  265. // Search the matching closing parenthesis
  266. $parentheses = 1;
  267. $sub = '';
  268. $i++;
  269. while ($i < $length) {
  270. $c = $input[$i];
  271. $backslashed = $input[$i - 1] === '\\';
  272. if ($c === '(' && !$backslashed) {
  273. // One nested level deeper
  274. $parentheses++;
  275. $sub .= $c;
  276. } elseif ($c === ')' && !$backslashed) {
  277. $parentheses--;
  278. if ($parentheses === 0) {
  279. // Found the matching closing parenthesis
  280. $searchSub = new FreshRSS_BooleanSearch($sub, $level + 1, $nextOperator);
  281. $nextOperator = 'AND';
  282. if (count($searchSub->searches()) > 0) {
  283. $this->searches[] = $searchSub;
  284. }
  285. $sub = '';
  286. break;
  287. } else {
  288. $sub .= $c;
  289. }
  290. } else {
  291. $sub .= $c;
  292. }
  293. $i++;
  294. }
  295. // $sub = trim($sub);
  296. // if ($sub !== '') {
  297. // // TODO: Consider throwing an error or warning in case of non-matching parenthesis
  298. // }
  299. // } elseif ($c === ')') {
  300. // // TODO: Consider throwing an error or warning in case of non-matching parenthesis
  301. } else {
  302. $before .= $c;
  303. }
  304. $i++;
  305. }
  306. if ($hasParenthesis) {
  307. $before = trim($before);
  308. if (preg_match('/^OR\b/i', $before)) {
  309. // The next BooleanSearch will have to be combined with OR instead of default AND
  310. $nextOperator = 'OR';
  311. // Trim leading OR
  312. $before = substr($before, 2);
  313. }
  314. // The remaining text after the last parenthesis is a BooleanSearch
  315. $searchBefore = new FreshRSS_BooleanSearch($before, $level + 1, $nextOperator);
  316. $nextOperator = 'AND';
  317. if (count($searchBefore->searches()) > 0) {
  318. $this->searches[] = $searchBefore;
  319. }
  320. return true;
  321. }
  322. // There was no parenthesis logic to apply
  323. return false;
  324. }
  325. private function parseOrSegments(string $input): void {
  326. $input = trim($input);
  327. if ($input === '') {
  328. return;
  329. }
  330. $splits = preg_split('/\b(OR)\b/i', $input, -1, PREG_SPLIT_DELIM_CAPTURE) ?: [];
  331. $segment = '';
  332. $ns = count($splits);
  333. for ($i = 0; $i < $ns; $i++) {
  334. $segment = $segment . $splits[$i];
  335. if (trim($segment) === '' || strcasecmp($segment, 'OR') === 0) {
  336. $segment = '';
  337. } else {
  338. $quotes = substr_count($segment, '"') + substr_count($segment, '&quot;');
  339. if ($quotes % 2 === 0) {
  340. $segment = trim($segment);
  341. $this->searches[] = new FreshRSS_Search($segment);
  342. $segment = '';
  343. }
  344. }
  345. }
  346. $segment = trim($segment);
  347. if ($segment !== '') {
  348. $this->searches[] = new FreshRSS_Search($segment);
  349. }
  350. }
  351. /**
  352. * Either a list of FreshRSS_BooleanSearch combined by implicit AND
  353. * or a series of FreshRSS_Search combined by explicit OR
  354. * @return array<FreshRSS_BooleanSearch|FreshRSS_Search>
  355. */
  356. public function searches(): array {
  357. return $this->searches;
  358. }
  359. /** @return 'AND'|'OR'|'AND NOT'|'OR NOT' depending on how this BooleanSearch should be combined */
  360. public function operator(): string {
  361. return $this->operator;
  362. }
  363. /** @param FreshRSS_BooleanSearch|FreshRSS_Search $search */
  364. public function add($search): void {
  365. $this->searches[] = $search;
  366. }
  367. #[\Override]
  368. public function __toString(): string {
  369. return $this->getRawInput();
  370. }
  371. /** @return string Plain text search query. Must be XML-encoded or URL-encoded depending on the situation */
  372. public function getRawInput(): string {
  373. return $this->raw_input;
  374. }
  375. }