Parser.php 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348
  1. <?php
  2. declare(strict_types=1);
  3. namespace PHPHtmlParser\Dom;
  4. use PHPHtmlParser\Content;
  5. use PHPHtmlParser\Contracts\Dom\ParserInterface;
  6. use PHPHtmlParser\Dom\Node\AbstractNode;
  7. use PHPHtmlParser\Dom\Node\HtmlNode;
  8. use PHPHtmlParser\Dom\Node\TextNode;
  9. use PHPHtmlParser\DTO\TagDTO;
  10. use PHPHtmlParser\Enum\StringToken;
  11. use PHPHtmlParser\Exceptions\ChildNotFoundException;
  12. use PHPHtmlParser\Exceptions\CircularException;
  13. use PHPHtmlParser\Exceptions\ContentLengthException;
  14. use PHPHtmlParser\Exceptions\LogicalException;
  15. use PHPHtmlParser\Exceptions\StrictException;
  16. use PHPHtmlParser\Options;
  17. use stringEncode\Encode;
  18. class Parser implements ParserInterface
  19. {
  20. /**
  21. * Attempts to parse the html in content.
  22. *
  23. * @throws ChildNotFoundException
  24. * @throws CircularException
  25. * @throws ContentLengthException
  26. * @throws LogicalException
  27. * @throws StrictException
  28. */
  29. public function parse(Options $options, Content $content, int $size): AbstractNode
  30. {
  31. // add the root node
  32. $root = new HtmlNode('root');
  33. $root->setHtmlSpecialCharsDecode($options->isHtmlSpecialCharsDecode());
  34. $activeNode = $root;
  35. while ($activeNode !== null) {
  36. if ($activeNode && $activeNode->tag->name() === 'script'
  37. && $options->isCleanupInput() !== true
  38. ) {
  39. $str = $content->copyUntil('</');
  40. } else {
  41. $str = $content->copyUntil('<');
  42. }
  43. if ($str == '') {
  44. $tagDTO = $this->parseTag($options, $content, $size);
  45. if (!$tagDTO->isStatus()) {
  46. // we are done here
  47. $activeNode = null;
  48. continue;
  49. }
  50. // check if it was a closing tag
  51. if ($tagDTO->isClosing()) {
  52. $foundOpeningTag = true;
  53. $originalNode = $activeNode;
  54. while ($activeNode->getTag()->name() != $tagDTO->getTag()) {
  55. $activeNode = $activeNode->getParent();
  56. if ($activeNode === null) {
  57. // we could not find opening tag
  58. $activeNode = $originalNode;
  59. $foundOpeningTag = false;
  60. break;
  61. }
  62. }
  63. if ($foundOpeningTag) {
  64. $activeNode = $activeNode->getParent();
  65. }
  66. continue;
  67. }
  68. if ($tagDTO->getNode() === null) {
  69. continue;
  70. }
  71. /** @var AbstractNode $node */
  72. $node = $tagDTO->getNode();
  73. $activeNode->addChild($node);
  74. // check if node is self closing
  75. if (!$node->getTag()->isSelfClosing()) {
  76. $activeNode = $node;
  77. }
  78. } elseif ($options->isWhitespaceTextNode() ||
  79. \trim($str) != ''
  80. ) {
  81. // we found text we care about
  82. $textNode = new TextNode($str, $options->isRemoveDoubleSpace());
  83. $textNode->setHtmlSpecialCharsDecode($options->isHtmlSpecialCharsDecode());
  84. $activeNode->addChild($textNode);
  85. }
  86. }
  87. return $root;
  88. }
  89. /**
  90. * Attempts to detect the charset that the html was sent in.
  91. *
  92. * @throws ChildNotFoundException
  93. */
  94. public function detectCharset(Options $options, string $defaultCharset, AbstractNode $root): bool
  95. {
  96. // set the default
  97. $encode = new Encode();
  98. $encode->from($defaultCharset);
  99. $encode->to($defaultCharset);
  100. $enforceEncoding = $options->getEnforceEncoding();
  101. if ($enforceEncoding !== null) {
  102. // they want to enforce the given encoding
  103. $encode->from($enforceEncoding);
  104. $encode->to($enforceEncoding);
  105. return false;
  106. }
  107. /** @var AbstractNode $meta */
  108. $meta = $root->find('meta[http-equiv=Content-Type]', 0);
  109. if ($meta == null) {
  110. if (!$this->detectHTML5Charset($encode, $root)) {
  111. // could not find meta tag
  112. $root->propagateEncoding($encode);
  113. return false;
  114. }
  115. return true;
  116. }
  117. $content = $meta->getAttribute('content');
  118. if (\is_null($content)) {
  119. // could not find content
  120. $root->propagateEncoding($encode);
  121. return false;
  122. }
  123. $matches = [];
  124. if (\preg_match('/charset=([^;]+)/', $content, $matches)) {
  125. $encode->from(\trim($matches[1]));
  126. $root->propagateEncoding($encode);
  127. return true;
  128. }
  129. // no charset found
  130. $root->propagateEncoding($encode);
  131. return false;
  132. }
  133. /**
  134. * Attempt to parse a tag out of the content.
  135. *
  136. * @throws StrictException
  137. * @throws ContentLengthException
  138. * @throws LogicalException
  139. * @throws StrictException
  140. */
  141. private function parseTag(Options $options, Content $content, int $size): TagDTO
  142. {
  143. if ($content->char() != '<') {
  144. // we are not at the beginning of a tag
  145. return TagDTO::makeFromPrimitives();
  146. }
  147. // check if this is a closing tag
  148. try {
  149. $content->fastForward(1);
  150. } catch (ContentLengthException $exception) {
  151. // we are at the end of the file
  152. return TagDTO::makeFromPrimitives();
  153. }
  154. if ($content->char() == '/') {
  155. return $this->makeEndTag($content, $options);
  156. }
  157. if ($content->char() == '?') {
  158. // special setting tag
  159. $tag = $content->fastForward(1)
  160. ->copyByToken(StringToken::SLASH(), true);
  161. $tag = (new Tag($tag))
  162. ->setOpening('<?')
  163. ->setClosing(' ?>')
  164. ->selfClosing();
  165. } elseif($content->string(3) == '!--') {
  166. // comment tag
  167. $tag = $content->fastForward(3)
  168. ->copyByToken(StringToken::CLOSECOMMENT(), true);
  169. $tag = (new Tag($tag))
  170. ->setOpening('<!--')
  171. ->setClosing('-->')
  172. ->selfClosing();
  173. } else {
  174. $tag = \strtolower($content->copyByToken(StringToken::SLASH(), true));
  175. if (\trim($tag) == '') {
  176. // no tag found, invalid < found
  177. return TagDTO::makeFromPrimitives();
  178. }
  179. }
  180. $node = new HtmlNode($tag);
  181. $node->setHtmlSpecialCharsDecode($options->isHtmlSpecialCharsDecode());
  182. $this->setUpAttributes($content, $size, $node, $options, $tag);
  183. $content->skipByToken(StringToken::BLANK());
  184. if ($content->char() == '/') {
  185. // self closing tag
  186. $node->getTag()->selfClosing();
  187. $content->fastForward(1);
  188. } elseif (\in_array($node->getTag()->name(), $options->getSelfClosing(), true)) {
  189. // Should be a self closing tag, check if we are strict
  190. if ($options->isStrict()) {
  191. $character = $content->getPosition();
  192. throw new StrictException("Tag '" . $node->getTag()->name() . "' is not self closing! (character #$character)");
  193. }
  194. // We force self closing on this tag.
  195. $node->getTag()->selfClosing();
  196. // Should this tag use a trailing slash?
  197. if (\in_array($node->getTag()->name(), $options->getNoSlash(), true)) {
  198. $node->getTag()->noTrailingSlash();
  199. }
  200. }
  201. if ($content->canFastForward(1)) {
  202. $content->fastForward(1);
  203. }
  204. return TagDTO::makeFromPrimitives(true, false, $node);
  205. }
  206. /**
  207. * @throws ChildNotFoundException
  208. */
  209. private function detectHTML5Charset(Encode $encode, AbstractNode $root): bool
  210. {
  211. /** @var AbstractNode|null $meta */
  212. $meta = $root->find('meta[charset]', 0);
  213. if ($meta == null) {
  214. return false;
  215. }
  216. $encode->from(\trim($meta->getAttribute('charset')));
  217. $root->propagateEncoding($encode);
  218. return true;
  219. }
  220. /**
  221. * @throws ContentLengthException
  222. * @throws LogicalException
  223. */
  224. private function makeEndTag(Content $content, Options $options): TagDTO
  225. {
  226. $tag = $content->fastForward(1)
  227. ->copyByToken(StringToken::SLASH(), true);
  228. // move to end of tag
  229. $content->copyUntil('>');
  230. $content->fastForward(1);
  231. // check if this closing tag counts
  232. $tag = \strtolower($tag);
  233. if (\in_array($tag, $options->getSelfClosing(), true)) {
  234. return TagDTO::makeFromPrimitives(true);
  235. }
  236. return TagDTO::makeFromPrimitives(true, true, null, \strtolower($tag));
  237. }
  238. /**
  239. * @param string|Tag $tag
  240. *
  241. * @throws ContentLengthException
  242. * @throws LogicalException
  243. * @throws StrictException
  244. */
  245. private function setUpAttributes(Content $content, int $size, HtmlNode $node, Options $options, $tag): void
  246. {
  247. while (
  248. $content->char() != '>' &&
  249. $content->char() != '/'
  250. ) {
  251. $space = $content->skipByToken(StringToken::BLANK(), true);
  252. if (empty($space)) {
  253. try {
  254. $content->fastForward(1);
  255. } catch (ContentLengthException $exception) {
  256. // reached the end of the content
  257. break;
  258. }
  259. continue;
  260. }
  261. $name = $content->copyByToken(StringToken::EQUAL(), true);
  262. if ($name == '/') {
  263. break;
  264. }
  265. if (empty($name)) {
  266. $content->skipByToken(StringToken::BLANK());
  267. continue;
  268. }
  269. $content->skipByToken(StringToken::BLANK());
  270. if ($content->char() == '=') {
  271. $content->fastForward(1)
  272. ->skipByToken(StringToken::BLANK());
  273. switch ($content->char()) {
  274. case '"':
  275. $content->fastForward(1);
  276. $string = $content->copyUntil('"', true);
  277. do {
  278. $moreString = $content->copyUntilUnless('"', '=>');
  279. $string .= $moreString;
  280. } while (\strlen($moreString) > 0 && $content->getPosition() < $size);
  281. $content->fastForward(1);
  282. $node->getTag()->setAttribute($name, $string);
  283. break;
  284. case "'":
  285. $content->fastForward(1);
  286. $string = $content->copyUntil("'", true);
  287. do {
  288. $moreString = $content->copyUntilUnless("'", '=>');
  289. $string .= $moreString;
  290. } while (\strlen($moreString) > 0 && $content->getPosition() < $size);
  291. $content->fastForward(1);
  292. $node->getTag()->setAttribute($name, $string, false);
  293. break;
  294. default:
  295. $node->getTag()->setAttribute($name, $content->copyByToken(StringToken::ATTR(), true));
  296. break;
  297. }
  298. } else {
  299. // no value attribute
  300. if ($options->isStrict()) {
  301. // can't have this in strict html
  302. $character = $content->getPosition();
  303. throw new StrictException("Tag '$tag' has an attribute '$name' with out a value! (character #$character)");
  304. }
  305. $node->getTag()->setAttribute($name, null);
  306. if ($content->char() != '>') {
  307. $content->rewind(1);
  308. }
  309. }
  310. }
  311. }
  312. }