CssSelectorLexer.php 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371
  1. <?php
  2. namespace Gt\CssXPath;
  /**
   * Byte-oriented lexer that turns one CSS selector string into a flat list
   * of tokens.
   *
   * Without a transform callback every token built by this class has the form
   * ["type" => string, "content" => string]. The types produced directly here
   * are: star, child, sibling, subsequentsibling, descendant, id, class,
   * element, pseudo, pseudo-element and pseudospecifier. Attribute selectors
   * ("[...]") are handed to CssAttributeTokenBuilder, which decides the shape
   * of the token it returns.
   *
   * Characters that start no known token (for example ",") are skipped
   * without producing a token.
   */
  class CssSelectorLexer {
      /** Single-character tokens, mapped to the token type they produce. */
      private const SIMPLE_TOKEN_TYPES = [
          "*" => "star",
          ">" => "child",
          "+" => "sibling",
          "~" => "subsequentsibling",
      ];

      /** Token types after which whitespace never yields a descendant token. */
      private const COMBINATOR_TYPES = [
          "child",
          "sibling",
          "subsequentsibling",
          "descendant",
      ];

      /** Characters before which whitespace never yields a descendant token. */
      private const NON_DESCENDANT_FOLLOWERS = [">", "+", "~", ",", ")"];

      private CssAttributeTokenBuilder $attributeTokenBuilder;

      /**
       * @param ?CssAttributeTokenBuilder $attributeTokenBuilder Builder used
       * for "[...]" selectors; a default instance is created when null.
       */
      public function __construct(
          ?CssAttributeTokenBuilder $attributeTokenBuilder = null
      ) {
          $this->attributeTokenBuilder = $attributeTokenBuilder
              ?? new CssAttributeTokenBuilder();
      }

      /**
       * Tokenises $selector from left to right.
       *
       * @param string $selector CSS selector to tokenise.
       * @param ?callable $transform Optional callback invoked as
       * $transform(string $type, string $content). Its return value is stored
       * as the token instead of the default type/content pair. It is also
       * forwarded to the attribute token builder.
       * @return array<int, array<string, mixed>>
       */
      public function lex(string $selector, ?callable $transform = null):array {
          $tokens = [];
          $length = strlen($selector);
          $index = 0;

          // Every consume* method returns an index strictly greater than the
          // one it was given, so this loop always terminates.
          while($index < $length) {
              $char = $selector[$index];
              $index = ctype_space($char)
                  ? $this->consumeWhitespace($selector, $index, $tokens, $transform)
                  : $this->consumeToken($selector, $index, $char, $tokens, $transform);
          }

          return $tokens;
      }

      /**
       * Dispatches on the current character, appends the resulting token(s)
       * to $tokens and returns the index at which lexing continues.
       *
       * @param array<int, array<string, mixed>> $tokens
       */
      private function consumeToken(
          string $selector,
          int $index,
          string $char,
          array &$tokens,
          ?callable $transform
      ):int {
          $simpleType = self::SIMPLE_TOKEN_TYPES[$char] ?? null;
          if($simpleType !== null) {
              return $this->consumeSimpleToken(
                  $simpleType,
                  $char,
                  $index,
                  $tokens,
                  $transform
              );
          }

          return match($char) {
              // The identifier starts after the "#" or "." marker.
              "#" => $this->consumeIdentifierToken(
                  "id",
                  $selector,
                  $index + 1,
                  $tokens,
                  $transform
              ),
              "." => $this->consumeIdentifierToken(
                  "class",
                  $selector,
                  $index + 1,
                  $tokens,
                  $transform
              ),
              ":" => $this->consumePseudoToken($selector, $index, $tokens, $transform),
              "[" => $this->consumeAttributeToken($selector, $index, $tokens, $transform),
              default => $this->consumeDefaultToken(
                  $selector,
                  $index,
                  $char,
                  $tokens,
                  $transform
              ),
          };
      }

      /**
       * Appends a one-character token and steps past it.
       *
       * @param array<int, array<string, mixed>> $tokens
       */
      private function consumeSimpleToken(
          string $type,
          string $content,
          int $index,
          array &$tokens,
          ?callable $transform
      ):int {
          $tokens[] = $this->buildMatchPayload($type, $content, $transform);
          return $index + 1;
      }

      /**
       * Reads an identifier starting at $index and appends it as a token of
       * the given type. The identifier may be empty (e.g. a lone "#"), in
       * which case a token with empty content is still appended.
       *
       * @param array<int, array<string, mixed>> $tokens
       */
      private function consumeIdentifierToken(
          string $type,
          string $selector,
          int $index,
          array &$tokens,
          ?callable $transform
      ):int {
          [$identifier, $nextIndex] = $this->readIdentifier($selector, $index);
          $tokens[] = $this->buildMatchPayload($type, $identifier, $transform);
          return $nextIndex;
      }

      /**
       * Appends the token(s) for a pseudo-class or pseudo-element that
       * starts at $index.
       *
       * @param array<int, array<string, mixed>> $tokens
       */
      private function consumePseudoToken(
          string $selector,
          int $index,
          array &$tokens,
          ?callable $transform
      ):int {
          [$pseudoTokens, $nextIndex] = $this->readPseudo(
              $selector,
              $index,
              $transform
          );
          foreach($pseudoTokens as $pseudoToken) {
              $tokens[] = $pseudoToken;
          }
          return $nextIndex;
      }

      /**
       * Appends the token for the "[...]" attribute selector that starts at
       * $index.
       *
       * @param array<int, array<string, mixed>> $tokens
       */
      private function consumeAttributeToken(
          string $selector,
          int $index,
          array &$tokens,
          ?callable $transform
      ):int {
          [$attributeToken, $nextIndex] = $this->readAttribute(
              $selector,
              $index,
              $transform
          );
          $tokens[] = $attributeToken;
          return $nextIndex;
      }

      /**
       * Handles a character with no dedicated branch: identifier characters
       * start an "element" token, anything else is skipped without a token.
       *
       * @param array<int, array<string, mixed>> $tokens
       */
      private function consumeDefaultToken(
          string $selector,
          int $index,
          string $char,
          array &$tokens,
          ?callable $transform
      ):int {
          if(!$this->isIdentifierCharacter($char)) {
              return $index + 1;
          }

          return $this->consumeIdentifierToken(
              "element",
              $selector,
              $index,
              $tokens,
              $transform
          );
      }

      /**
       * Skips a run of whitespace and appends a "descendant" token when that
       * whitespace acts as a combinator.
       *
       * @param array<int, array<string, mixed>> $tokens
       */
      private function consumeWhitespace(
          string $selector,
          int $index,
          array &$tokens,
          ?callable $transform
      ):int {
          $length = strlen($selector);
          $nextIndex = $index;
          while($nextIndex < $length && ctype_space($selector[$nextIndex])) {
              $nextIndex++;
          }

          if($this->shouldEmitDescendantToken($selector, $tokens, $nextIndex)) {
              $tokens[] = $this->buildMatchPayload("descendant", " ", $transform);
          }

          return $nextIndex;
      }

      /**
       * Decides whether whitespace ending at $nextIndex is a descendant
       * combinator. It is not when it is leading or trailing whitespace, or
       * when it merely pads an explicit combinator, a comma or a ")".
       *
       * @param array<int, array<string, mixed>> $tokens
       */
      private function shouldEmitDescendantToken(
          string $selector,
          array $tokens,
          int $nextIndex
      ):bool {
          if($tokens === [] || !isset($selector[$nextIndex])) {
              return false;
          }

          if(in_array($selector[$nextIndex], self::NON_DESCENDANT_FOLLOWERS, true)) {
              return false;
          }

          // Tokens produced by a transform callback or by the attribute
          // builder are not guaranteed to carry a "type" key; treat a missing
          // type as "not a combinator" instead of raising a warning.
          $previousToken = $tokens[array_key_last($tokens)];
          $previousType = (string)($previousToken["type"] ?? "");

          return !in_array($previousType, self::COMBINATOR_TYPES, true);
      }

      /**
       * Reads the longest run of identifier characters starting at $index.
       *
       * @return array{0: string, 1: int} The identifier (possibly empty) and
       * the index of the first character after it.
       */
      private function readIdentifier(string $selector, int $index):array {
          $length = strlen($selector);
          $end = $index;
          while($end < $length && $this->isIdentifierCharacter($selector[$end])) {
              $end++;
          }

          return [substr($selector, $index, $end - $index), $end];
      }

      /**
       * Reads a pseudo-class (":name") or pseudo-element ("::name") starting
       * at $index, plus its parenthesised argument when one follows directly.
       *
       * @return array{0: array<int, array<string, mixed>>, 1: int} The tokens
       * (a "pseudo" or "pseudo-element" token, optionally followed by a
       * "pseudospecifier" token) and the index to continue from.
       */
      private function readPseudo(
          string $selector,
          int $index,
          ?callable $transform
      ):array {
          $isPseudoElement = ($selector[$index + 1] ?? null) === ":";
          $nameStart = $index + ($isPseudoElement ? 2 : 1);
          [$name, $nextIndex] = $this->readIdentifier($selector, $nameStart);

          $tokens = [
              $this->buildMatchPayload(
                  $isPseudoElement ? "pseudo-element" : "pseudo",
                  $name,
                  $transform
              ),
          ];

          if(($selector[$nextIndex] ?? null) === "(") {
              [$content, $nextIndex] = $this->readBalancedContent(
                  $selector,
                  $nextIndex,
                  "(",
                  ")"
              );
              $tokens[] = $this->buildMatchPayload(
                  "pseudospecifier",
                  $content,
                  $transform
              );
          }

          return [$tokens, $nextIndex];
      }

      /**
       * Reads the "[...]" group starting at $index and lets the attribute
       * token builder turn its inner content into a token.
       *
       * @return array{0: array<string, mixed>, 1: int}
       */
      private function readAttribute(
          string $selector,
          int $index,
          ?callable $transform
      ):array {
          [$content, $nextIndex] = $this->readBalancedContent(
              $selector,
              $index,
              "[",
              "]"
          );

          return [
              $this->attributeTokenBuilder->build($content, $transform),
              $nextIndex,
          ];
      }

      /**
       * Returns the text between the delimiter at $startIndex and its
       * matching closing delimiter, honouring nesting, quoted strings and
       * backslash escapes.
       *
       * The content is returned verbatim (quotes and backslashes included).
       * When the group is never closed, everything up to the end of the
       * selector is returned.
       *
       * @param int $startIndex Index of the opening delimiter itself.
       * @return array{0: string, 1: int} The inner content and the index of
       * the first character after the closing delimiter.
       */
      private function readBalancedContent(
          string $selector,
          int $startIndex,
          string $open,
          string $close
      ):array {
          $length = strlen($selector);
          $depth = 1;
          $content = "";
          $quote = null;

          for($index = $startIndex + 1; $index < $length; $index++) {
              $char = $selector[$index];

              // A backslash escapes the next character, so an escaped quote
              // or delimiter must not end the string or change the depth.
              if($char === "\\" && $index + 1 < $length) {
                  $content .= $char . $selector[$index + 1];
                  $index++;
                  continue;
              }

              if($quote !== null) {
                  // Inside a string only the matching quote is significant.
                  $content .= $char;
                  if($char === $quote) {
                      $quote = null;
                  }
                  continue;
              }

              if($char === "'" || $char === '"') {
                  $quote = $char;
              }
              elseif($char === $open) {
                  $depth++;
              }
              elseif($char === $close && --$depth === 0) {
                  return [$content, $index + 1];
              }

              $content .= $char;
          }

          return [$content, $length];
      }

      /**
       * Tells whether a single byte may be part of an identifier: ASCII
       * letters, digits, "_" and "-", plus any byte >= 0x80 so that
       * multi-byte UTF-8 characters stay inside the identifier they belong to.
       */
      private function isIdentifierCharacter(string $char):bool {
          return ctype_alnum($char)
              || $char === "_"
              || $char === "-"
              || ord($char) >= 0x80;
      }

      /**
       * Builds one token: the transform's return value when a transform is
       * given, otherwise the default type/content pair.
       *
       * @return array<string, mixed>
       */
      private function buildMatchPayload(
          string $groupKey,
          string $match,
          ?callable $transform
      ):array {
          if($transform !== null) {
              return $transform($groupKey, $match);
          }

          return ["type" => $groupKey, "content" => $match];
      }
  }