| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348 |
- <?php
- declare(strict_types=1);
- namespace PHPHtmlParser\Dom;
- use PHPHtmlParser\Content;
- use PHPHtmlParser\Contracts\Dom\ParserInterface;
- use PHPHtmlParser\Dom\Node\AbstractNode;
- use PHPHtmlParser\Dom\Node\HtmlNode;
- use PHPHtmlParser\Dom\Node\TextNode;
- use PHPHtmlParser\DTO\TagDTO;
- use PHPHtmlParser\Enum\StringToken;
- use PHPHtmlParser\Exceptions\ChildNotFoundException;
- use PHPHtmlParser\Exceptions\CircularException;
- use PHPHtmlParser\Exceptions\ContentLengthException;
- use PHPHtmlParser\Exceptions\LogicalException;
- use PHPHtmlParser\Exceptions\StrictException;
- use PHPHtmlParser\Options;
- use stringEncode\Encode;
- class Parser implements ParserInterface
- {
/**
 * Parses the html held by $content into a tree of nodes.
 *
 * A synthetic 'root' HtmlNode is created first. The content is then consumed
 * as an alternation of text runs and tags:
 *  - a text run becomes a TextNode child of the currently open node
 *    (whitespace-only runs are kept only if the options ask for them);
 *  - an opening tag becomes a child of the currently open node and, unless
 *    it is self closing, becomes the new open node;
 *  - a closing tag closes the nearest open ancestor with the same name; a
 *    closing tag that matches no open ancestor is ignored.
 * Parsing stops when no further tag can be read or when the root is closed.
 *
 * @param Options $options parser options consulted while building the tree
 * @param Content $content cursor over the raw html; it is advanced while parsing
 * @param int     $size    forwarded unchanged to the tag parser
 *
 * @return AbstractNode the synthetic root node
 *
 * @throws ChildNotFoundException
 * @throws CircularException
 * @throws ContentLengthException
 * @throws LogicalException
 * @throws StrictException
 */
public function parse(Options $options, Content $content, int $size): AbstractNode
{
    // add the root node
    $root = new HtmlNode('root');
    $root->setHtmlSpecialCharsDecode($options->isHtmlSpecialCharsDecode());

    $activeNode = $root;
    while ($activeNode !== null) {
        // Use the explicit getter, like the rest of this class, instead of
        // going through magic property access on the node.
        $insideRawScript = $activeNode->getTag()->name() === 'script'
            && $options->isCleanupInput() !== true;

        // Inside an uncleaned <script> a lone '<' is script text, so only
        // '</' is allowed to end the text run.
        $str = $content->copyUntil($insideRawScript ? '</' : '<');

        if ($str == '') {
            $tagDTO = $this->parseTag($options, $content, $size);
            if (!$tagDTO->isStatus()) {
                // no tag could be read, we are done here
                break;
            }

            if ($tagDTO->isClosing()) {
                // walk up from the open node to the nearest node that the
                // closing tag names
                $candidate = $activeNode;
                while ($candidate !== null
                    && $candidate->getTag()->name() != $tagDTO->getTag()
                ) {
                    $candidate = $candidate->getParent();
                }
                if ($candidate !== null) {
                    // found the opening tag: its parent is open again
                    $activeNode = $candidate->getParent();
                }
                // otherwise the stray closing tag is ignored

                continue;
            }

            /** @var AbstractNode|null $node */
            $node = $tagDTO->getNode();
            if ($node === null) {
                // the tag produced nothing to add to the tree
                continue;
            }

            $activeNode->addChild($node);

            // only a tag that is not self closing can receive children
            if (!$node->getTag()->isSelfClosing()) {
                $activeNode = $node;
            }
        } elseif ($options->isWhitespaceTextNode() ||
            \trim($str) != ''
        ) {
            // we found text we care about
            $textNode = new TextNode($str, $options->isRemoveDoubleSpace());
            $textNode->setHtmlSpecialCharsDecode($options->isHtmlSpecialCharsDecode());
            $activeNode->addChild($textNode);
        }
    }

    return $root;
}
/**
 * Attempts to detect the charset that the html was sent in.
 *
 * The encoder starts as a conversion from $defaultCharset to itself. If a
 * charset is found in a `<meta http-equiv=Content-Type>` content attribute,
 * or failing that in a `<meta charset>` tag, it becomes the source charset
 * of the encoder. Unless an encoding is enforced through the options, the
 * encoder is propagated to the tree whether or not a charset was found.
 *
 * @param Options      $options        options; a non-null enforced encoding disables detection
 * @param string       $defaultCharset charset used when nothing is detected
 * @param AbstractNode $root           root of the parsed tree
 *
 * @return bool true if a charset was detected from a meta tag, false otherwise
 *
 * @throws ChildNotFoundException
 */
public function detectCharset(Options $options, string $defaultCharset, AbstractNode $root): bool
{
    // set the default
    $encode = new Encode();
    $encode->from($defaultCharset);
    $encode->to($defaultCharset);

    $enforceEncoding = $options->getEnforceEncoding();
    if ($enforceEncoding !== null) {
        // they want to enforce the given encoding
        $encode->from($enforceEncoding);
        $encode->to($enforceEncoding);

        // NOTE(review): the encoder is not propagated to $root on this
        // path, so the two calls above have no visible effect here.
        // Confirm whether that is intended.
        return false;
    }

    /** @var AbstractNode|null $meta */
    $meta = $root->find('meta[http-equiv=Content-Type]', 0);
    if ($meta === null) {
        if ($this->detectHTML5Charset($encode, $root)) {
            // the HTML5 helper has already propagated the encoder
            return true;
        }

        // could not find any meta tag carrying a charset
        $root->propagateEncoding($encode);

        return false;
    }

    $detected = false;
    $content = $meta->getAttribute('content');
    $matches = [];
    // "charset" is matched case-insensitively and may have blanks around
    // the '='; the value may also be wrapped in quotes.
    if ($content !== null
        && \preg_match('/charset\s*=\s*([^;]+)/i', $content, $matches) === 1
    ) {
        $charset = \trim($matches[1], " \t\n\r\0\x0B\"'");
        if ($charset !== '') {
            $encode->from($charset);
            $detected = true;
        }
    }

    // propagate either the detected charset or the untouched default
    $root->propagateEncoding($encode);

    return $detected;
}
/**
 * Attempt to parse a tag out of the content.
 *
 * Expects the cursor to sit on '<'. Closing tags are delegated to
 * makeEndTag(). '<?...' and '<!--...' are turned into self closing tags
 * with their own opening/closing markers. Any other tag name is lower
 * cased. The attributes are then read, and the tag is marked self closing
 * either because of a trailing '/' or because its name is listed in the
 * self closing option.
 *
 * When no tag can be read, the result of TagDTO::makeFromPrimitives()
 * without arguments is returned.
 *
 * @param Options $options parser options (strictness, self closing and no-slash tag lists)
 * @param Content $content cursor over the raw html; it is advanced past the tag
 * @param int     $size    forwarded to the attribute reader
 *
 * @throws ContentLengthException
 * @throws LogicalException
 * @throws StrictException if strict mode is on and a tag from the self closing list has no trailing '/'
 */
private function parseTag(Options $options, Content $content, int $size): TagDTO
{
    // we are not at the beginning of a tag
    if ($content->char() != '<') {
        return TagDTO::makeFromPrimitives();
    }

    try {
        $content->fastForward(1);
    } catch (ContentLengthException $endOfContent) {
        // the '<' was the last character of the content
        return TagDTO::makeFromPrimitives();
    }

    // the character right after '<' decides what kind of tag this is
    $lead = $content->char();
    if ($lead == '/') {
        return $this->makeEndTag($content, $options);
    }

    if ($lead == '?') {
        // special setting tag
        $rawName = $content->fastForward(1)
            ->copyByToken(StringToken::SLASH(), true);
        $tag = (new Tag($rawName))
            ->setOpening('<?')
            ->setClosing(' ?>')
            ->selfClosing();
    } elseif ($content->string(3) == '!--') {
        // comment tag
        $rawComment = $content->fastForward(3)
            ->copyByToken(StringToken::CLOSECOMMENT(), true);
        $tag = (new Tag($rawComment))
            ->setOpening('<!--')
            ->setClosing('-->')
            ->selfClosing();
    } else {
        // regular tag, names are handled in lower case
        $tag = \strtolower($content->copyByToken(StringToken::SLASH(), true));
        if (\trim($tag) == '') {
            // no tag found, invalid < found
            return TagDTO::makeFromPrimitives();
        }
    }

    $node = new HtmlNode($tag);
    $node->setHtmlSpecialCharsDecode($options->isHtmlSpecialCharsDecode());
    $this->setUpAttributes($content, $size, $node, $options, $tag);

    $content->skipByToken(StringToken::BLANK());

    $nodeTag = $node->getTag();
    if ($content->char() == '/') {
        // explicit self closing tag
        $nodeTag->selfClosing();
        $content->fastForward(1);
    } elseif (\in_array($nodeTag->name(), $options->getSelfClosing(), true)) {
        // the name is in the self closing list but the '/' is missing
        if ($options->isStrict()) {
            $character = $content->getPosition();
            throw new StrictException("Tag '" . $nodeTag->name() . "' is not self closing! (character #$character)");
        }

        // not strict: force self closing on this tag
        $nodeTag->selfClosing();

        // some tags are rendered without the trailing slash
        if (\in_array($nodeTag->name(), $options->getNoSlash(), true)) {
            $nodeTag->noTrailingSlash();
        }
    }

    // step past the end of the tag when there is room to do so
    if ($content->canFastForward(1)) {
        $content->fastForward(1);
    }

    return TagDTO::makeFromPrimitives(true, false, $node);
}
/**
 * Looks for an HTML5 style `<meta charset="...">` tag and, when it carries a
 * usable value, sets that value as the source charset of $encode and
 * propagates the encoder to the tree.
 *
 * When false is returned, $encode is left untouched and nothing has been
 * propagated.
 *
 * @param Encode       $encode encoder to update and propagate
 * @param AbstractNode $root   root of the parsed tree
 *
 * @return bool true if a charset was found and propagated, false otherwise
 *
 * @throws ChildNotFoundException
 */
private function detectHTML5Charset(Encode $encode, AbstractNode $root): bool
{
    /** @var AbstractNode|null $meta */
    $meta = $root->find('meta[charset]', 0);
    if ($meta === null) {
        return false;
    }

    // A valueless attribute (`<meta charset>`) is stored as null by the
    // attribute reader; passing that to trim() under strict_types would
    // raise a TypeError.
    $charset = $meta->getAttribute('charset');
    if ($charset === null) {
        return false;
    }

    $charset = \trim($charset);
    if ($charset === '') {
        // an empty value names no charset
        return false;
    }

    $encode->from($charset);
    $root->propagateEncoding($encode);

    return true;
}
/**
 * Reads a closing tag. Expects the cursor to sit on the '/' of '</name>'.
 *
 * The tag name is read and lower cased, and the cursor is moved past the
 * next '>'. A closing tag whose name is in the self closing list does not
 * count as a closing tag: a DTO built with only the status flag is returned
 * for it. Otherwise the DTO is flagged as closing and carries the name.
 *
 * @param Content $content cursor over the raw html; it is advanced past the tag
 * @param Options $options provides the list of self closing tag names
 *
 * @throws ContentLengthException
 * @throws LogicalException
 */
private function makeEndTag(Content $content, Options $options): TagDTO
{
    // step over the '/' and read the tag name, handled in lower case
    $closingName = \strtolower(
        $content->fastForward(1)
            ->copyByToken(StringToken::SLASH(), true)
    );

    // move to end of tag
    $content->copyUntil('>');
    $content->fastForward(1);

    // check if this closing tag counts
    $isSelfClosingName = \in_array($closingName, $options->getSelfClosing(), true);

    return $isSelfClosingName
        ? TagDTO::makeFromPrimitives(true)
        : TagDTO::makeFromPrimitives(true, true, null, $closingName);
}
/**
 * Reads the attributes of the tag under the cursor and stores them on the
 * tag of $node. Reading stops at '>' or '/', or at the end of the content.
 *
 * Supported forms: name="value", name='value', name=value and a bare name,
 * which is stored with a null value. Single quoted values are stored by
 * passing false as third argument to setAttribute(); double quoted values
 * use its default.
 *
 * @param Content    $content cursor over the raw html; it is advanced while reading
 * @param int        $size    upper bound for the cursor position while a quoted value is extended
 * @param HtmlNode   $node    node whose tag receives the attributes
 * @param Options    $options provides the strict flag
 * @param string|Tag $tag     only used in the strict mode error message
 *
 * @throws ContentLengthException
 * @throws LogicalException
 * @throws StrictException if strict mode is on and an attribute has no value
 */
private function setUpAttributes(Content $content, int $size, HtmlNode $node, Options $options, $tag): void
{
    while (!($content->char() == '>' || $content->char() == '/')) {
        $blank = $content->skipByToken(StringToken::BLANK(), true);
        if (empty($blank)) {
            // no whitespace was skipped: step one character forward and
            // look again
            try {
                $content->fastForward(1);
            } catch (ContentLengthException $endOfContent) {
                // reached the end of the content
                break;
            }

            continue;
        }

        $name = $content->copyByToken(StringToken::EQUAL(), true);
        if ($name == '/') {
            break;
        }

        // blanks after the name are skipped whether or not a name was read
        $content->skipByToken(StringToken::BLANK());

        // NOTE(review): empty() is also true for the name '0', so an
        // attribute with that name is skipped here.
        if (empty($name)) {
            continue;
        }

        if ($content->char() != '=') {
            // no value attribute
            if ($options->isStrict()) {
                // can't have this in strict html
                $character = $content->getPosition();
                throw new StrictException("Tag '$tag' has an attribute '$name' with out a value! (character #$character)");
            }

            $node->getTag()->setAttribute($name, null);
            if ($content->char() != '>') {
                $content->rewind(1);
            }

            continue;
        }

        // step over '=' and any blanks in front of the value
        $content->fastForward(1)
            ->skipByToken(StringToken::BLANK());

        $opener = $content->char();
        if ($opener == '"') {
            $value = $this->readQuotedAttributeValue($content, $size, '"');
            $node->getTag()->setAttribute($name, $value);
        } elseif ($opener == "'") {
            $value = $this->readQuotedAttributeValue($content, $size, "'");
            $node->getTag()->setAttribute($name, $value, false);
        } else {
            // unquoted value
            $node->getTag()->setAttribute($name, $content->copyByToken(StringToken::ATTR(), true));
        }
    }
}

/**
 * Reads a quoted attribute value. Expects the cursor to sit on the opening
 * quote and leaves it one character past the position where reading
 * stopped.
 *
 * @param Content $content cursor over the raw html
 * @param int     $size    upper bound for the cursor position while the value is extended
 * @param string  $quote   the quote character that delimits the value
 *
 * @return string the value without its surrounding quotes
 *
 * @throws ContentLengthException
 * @throws LogicalException
 */
private function readQuotedAttributeValue(Content $content, int $size, string $quote): string
{
    // step over the opening quote
    $content->fastForward(1);
    $value = $content->copyUntil($quote, true);

    // keep appending for as long as copyUntilUnless() yields more text
    do {
        $chunk = $content->copyUntilUnless($quote, '=>');
        $value .= $chunk;
    } while (\strlen($chunk) > 0 && $content->getPosition() < $size);

    // step over the closing quote
    $content->fastForward(1);

    return $value;
}
- }
|