<?php
declare(strict_types=1);
namespace Pelago\Emogrifier\HtmlProcessor;
/**
 * Base class for HTML processors that can, e.g., remove, add or modify nodes or attributes.
 *
 * The "vanilla" subclass is the HtmlNormalizer.
 *
 * Instances are created through the named constructors `fromHtml()` and `fromDomDocument()`; the HTML is held
 * internally as a `\DOMDocument` and can be serialized again with `render()` or `renderBodyContent()`.
 *
 * @psalm-consistent-constructor
 */
abstract class AbstractHtmlProcessor
{
    /**
     * Document type that is prepended to HTML which does not contain one.
     *
     * @var string
     */
    protected const DEFAULT_DOCUMENT_TYPE = '<!DOCTYPE html>';

    /**
     * Meta tag that is inserted into HTML which has no `Content-Type` meta tag within its `<head>`.
     *
     * @var string
     */
    protected const CONTENT_TYPE_META_TAG = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">';

    /**
     * @var string Regular expression part to match tag names that PHP's DOMDocument implementation is not aware are
     *      self-closing. These are mostly HTML5 elements, but for completeness <command> (obsolete) and <keygen>
     *      (deprecated) are also included.
     *
     * @see https://bugs.php.net/bug.php?id=73175
     */
    protected const PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER = '(?:command|embed|keygen|source|track|wbr)';

    /**
     * Regular expression part to match tag names that may appear before the start of the `<body>` element. A start tag
     * for any other element would implicitly start the `<body>` element due to tag omission rules.
     *
     * @var string
     */
    protected const TAGNAME_ALLOWED_BEFORE_BODY_MATCHER
        = '(?:html|head|base|command|link|meta|noscript|script|style|template|title)';

    /**
     * Regular expression pattern to match an HTML comment, including delimiters and modifiers. An unterminated comment
     * is matched up to the end of the string.
     *
     * @var string
     */
    protected const HTML_COMMENT_PATTERN = '/<!--[^-]*+(?:-(?!->)[^-]*+)*+(?:-->|$)/';

    /**
     * Regular expression pattern to match an HTML `<template>` element, including delimiters and modifiers. An element
     * without an end tag is matched up to the end of the string.
     *
     * @var string
     */
    protected const HTML_TEMPLATE_ELEMENT_PATTERN
        = '%<template[\\s>][^<]*+(?:<(?!/template>)[^<]*+)*+(?:</template>|$)%i';

    /**
     * The document being processed; `null` until `setDomDocument()` has been called.
     *
     * @var ?\DOMDocument
     */
    protected $domDocument = null;

    /**
     * XPath helper bound to `$domDocument`; `null` until `setDomDocument()` has been called.
     *
     * @var ?\DOMXPath
     */
    private $xPath = null;

    /**
     * The constructor.
     *
     * Please use `::fromHtml` or `::fromDomDocument` instead.
     */
    private function __construct()
    {
    }

    /**
     * Builds a new instance from the given HTML.
     *
     * @param string $unprocessedHtml raw HTML, must be UTF-8-encoded, must not be empty
     *
     * @return static
     *
     * @throws \InvalidArgumentException if $unprocessedHtml is an empty string
     * @throws \RuntimeException if a regular expression operation fails while preparing the HTML
     */
    public static function fromHtml(string $unprocessedHtml): self
    {
        if ($unprocessedHtml === '') {
            throw new \InvalidArgumentException('The provided HTML must not be empty.', 1515763647);
        }

        $instance = new static();
        $instance->setHtml($unprocessedHtml);

        return $instance;
    }

    /**
     * Builds a new instance from the given DOM document.
     *
     * The document is used as is; none of the normalization that `fromHtml()` applies is performed here.
     *
     * @param \DOMDocument $document a DOM document returned by getDomDocument() of another instance
     *
     * @return static
     */
    public static function fromDomDocument(\DOMDocument $document): self
    {
        $instance = new static();
        $instance->setDomDocument($document);

        return $instance;
    }

    /**
     * Sets the HTML to process.
     *
     * @param string $html the HTML to process, must be UTF-8-encoded
     *
     * @throws \RuntimeException
     */
    private function setHtml(string $html): void
    {
        $this->createUnifiedDomDocument($html);
    }

    /**
     * Provides access to the internal DOMDocument representation of the HTML in its current state.
     *
     * @return \DOMDocument
     *
     * @throws \UnexpectedValueException if no document has been set yet
     */
    public function getDomDocument(): \DOMDocument
    {
        if (!$this->domDocument instanceof \DOMDocument) {
            $message = self::class . '::setDomDocument() has not yet been called on ' . static::class;
            throw new \UnexpectedValueException($message, 1570472239);
        }

        return $this->domDocument;
    }

    /**
     * Stores the given document and creates the matching XPath helper for it.
     *
     * @param \DOMDocument $domDocument
     */
    private function setDomDocument(\DOMDocument $domDocument): void
    {
        $this->domDocument = $domDocument;
        $this->xPath = new \DOMXPath($this->domDocument);
    }

    /**
     * Provides the XPath helper that belongs to the current document.
     *
     * @return \DOMXPath
     *
     * @throws \UnexpectedValueException if no document has been set yet
     */
    protected function getXPath(): \DOMXPath
    {
        if (!$this->xPath instanceof \DOMXPath) {
            $message = self::class . '::setDomDocument() has not yet been called on ' . static::class;
            throw new \UnexpectedValueException($message, 1617819086);
        }

        return $this->xPath;
    }

    /**
     * Renders the normalized and processed HTML.
     *
     * @return string
     *
     * @throws \RuntimeException if a regular expression operation fails
     */
    public function render(): string
    {
        $htmlWithPossibleErroneousClosingTags = $this->getDomDocument()->saveHTML();

        return $this->removeSelfClosingTagsClosingTags($htmlWithPossibleErroneousClosingTags);
    }

    /**
     * Renders the content of the BODY element of the normalized and processed HTML.
     *
     * The BODY element is serialized and its own start and end tags are then stripped from the result.
     *
     * @return string
     *
     * @throws \RuntimeException if there is no BODY element or a regular expression operation fails
     */
    public function renderBodyContent(): string
    {
        $htmlWithPossibleErroneousClosingTags = $this->getDomDocument()->saveHTML($this->getBodyElement());
        $bodyNodeHtml = $this->removeSelfClosingTagsClosingTags($htmlWithPossibleErroneousClosingTags);

        return $this->replaceByPattern('%</?+body(?:\\s[^>]*+)?+>%', '', $bodyNodeHtml, 1718000001);
    }

    /**
     * Eliminates any invalid closing tags for void elements from the given HTML.
     *
     * These end tags are emitted for the elements listed in `PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER`.
     *
     * @param string $html
     *
     * @return string
     *
     * @throws \RuntimeException if the regular expression operation fails
     */
    private function removeSelfClosingTagsClosingTags(string $html): string
    {
        return $this->replaceByPattern(
            '%</' . self::PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER . '>%',
            '',
            $html,
            1718000002
        );
    }

    /**
     * Returns the BODY element.
     *
     * This method assumes that there always is a BODY element.
     *
     * @return \DOMElement
     *
     * @throws \RuntimeException if the document has no BODY element
     */
    private function getBodyElement(): \DOMElement
    {
        $node = $this->getDomDocument()->getElementsByTagName('body')->item(0);
        if (!$node instanceof \DOMElement) {
            throw new \RuntimeException('There is no body element.', 1617922607);
        }

        return $node;
    }

    /**
     * Creates a DOM document from the given HTML and stores it in $this->domDocument.
     *
     * The DOM document will always have a BODY element and a document type.
     *
     * @param string $html
     *
     * @throws \RuntimeException
     */
    private function createUnifiedDomDocument(string $html): void
    {
        $this->createRawDomDocument($html);
        $this->ensureExistenceOfBodyElement();
    }

    /**
     * Creates a DOMDocument instance from the given HTML and stores it in $this->domDocument.
     *
     * @param string $html
     *
     * @throws \RuntimeException
     */
    private function createRawDomDocument(string $html): void
    {
        $domDocument = new \DOMDocument();
        $domDocument->strictErrorChecking = false;
        $domDocument->formatOutput = true;

        // Collect libxml errors internally while parsing so that they can be discarded,
        // then restore whatever error handling mode was active before.
        $libXmlState = \libxml_use_internal_errors(true);
        $domDocument->loadHTML($this->prepareHtmlForDomConversion($html));
        \libxml_clear_errors();
        \libxml_use_internal_errors($libXmlState);

        $this->setDomDocument($domDocument);
    }

    /**
     * Returns the HTML with added document type, Content-Type meta tag, and self-closing slashes, if needed,
     * ensuring that the HTML will be good for creating a DOM document from it.
     *
     * @param string $html
     *
     * @return string the unified HTML
     *
     * @throws \RuntimeException
     */
    private function prepareHtmlForDomConversion(string $html): string
    {
        $htmlWithSelfClosingSlashes = $this->ensurePhpUnrecognizedSelfClosingTagsAreXml($html);
        $htmlWithDocumentType = $this->ensureDocumentType($htmlWithSelfClosingSlashes);

        return $this->addContentTypeMetaTag($htmlWithDocumentType);
    }

    /**
     * Makes sure that the passed HTML has a document type, with lowercase "html".
     *
     * @param string $html
     *
     * @return string HTML with document type
     *
     * @throws \RuntimeException
     */
    private function ensureDocumentType(string $html): string
    {
        $hasDocumentType = \stripos($html, '<!DOCTYPE') !== false;
        if ($hasDocumentType) {
            return $this->normalizeDocumentType($html);
        }

        return self::DEFAULT_DOCUMENT_TYPE . $html;
    }

    /**
     * Makes sure the document type in the passed HTML has lowercase "html".
     *
     * @param string $html
     *
     * @return string HTML with normalized document type
     *
     * @throws \RuntimeException if the regular expression operation fails
     */
    private function normalizeDocumentType(string $html): string
    {
        // Limit to replacing the first occurrence: as an optimization; and in case an example exists as unescaped text.
        return $this->replaceByPattern(
            '/<!DOCTYPE\\s++html(?=[\\s>])/i',
            '<!DOCTYPE html',
            $html,
            1718000003,
            1
        );
    }

    /**
     * Adds a Content-Type meta tag for the charset.
     *
     * This method also ensures that there is a HEAD element.
     *
     * @param string $html
     *
     * @return string the HTML with the meta tag added
     *
     * @throws \RuntimeException if a regular expression operation fails
     */
    private function addContentTypeMetaTag(string $html): string
    {
        if ($this->hasContentTypeMetaTagInHead($html)) {
            return $html;
        }

        // We are trying to insert the meta tag to the right spot in the DOM.
        // If we just prepended it to the HTML, we would lose attributes set to the HTML tag.
        $hasHeadTag = \preg_match('/<head[\\s>]/i', $html);
        $hasHtmlTag = \stripos($html, '<html') !== false;

        if ($hasHeadTag) {
            // Put the meta tag directly after the existing `<head ...>` start tag.
            $reworkedHtml = $this->replaceByPattern(
                '/<head(?=[\\s>])([^>]*+)>/i',
                '<head$1>' . self::CONTENT_TYPE_META_TAG,
                $html,
                1718000004
            );
        } elseif ($hasHtmlTag) {
            // No `<head>` yet: create one directly after the `<html ...>` start tag.
            $reworkedHtml = $this->replaceByPattern(
                '/<html(.*?)>/is',
                '<html$1><head>' . self::CONTENT_TYPE_META_TAG . '</head>',
                $html,
                1718000005
            );
        } else {
            $reworkedHtml = self::CONTENT_TYPE_META_TAG . $html;
        }

        return $reworkedHtml;
    }

    /**
     * Tests whether the given HTML has a valid `Content-Type` metadata element within the `<head>` element. Due to tag
     * omission rules, HTML parsers are expected to end the `<head>` element and start the `<body>` element upon
     * encountering a start tag for any element which is permitted only within the `<body>`.
     *
     * @param string $html
     *
     * @return bool
     */
    private function hasContentTypeMetaTagInHead(string $html): bool
    {
        // Capture everything that precedes the first `Content-Type` meta tag (if there is one).
        \preg_match('%^.*?(?=<meta(?=\\s)[^>]*\\shttp-equiv=(["\']?+)Content-Type\\g{-1}[\\s/>])%is', $html, $matches);
        if (!isset($matches[0])) {
            return false;
        }

        $htmlBefore = $matches[0];
        try {
            // The meta tag is only within the `<head>` if the `<head>` has not ended before it.
            return !$this->hasEndOfHeadElement($htmlBefore);
        } catch (\RuntimeException $exception) {
            // If something unexpected occurs, assume the `Content-Type` that was found is valid.
            \trigger_error($exception->getMessage());

            return true;
        }
    }

    /**
     * Tests whether the `<head>` element ends within the given HTML. Due to tag omission rules, HTML parsers are
     * expected to end the `<head>` element and start the `<body>` element upon encountering a start tag for any element
     * which is permitted only within the `<body>`.
     *
     * @param string $html
     *
     * @return bool
     *
     * @throws \RuntimeException
     */
    private function hasEndOfHeadElement(string $html): bool
    {
        $headEndTagMatchCount
            = \preg_match('%<(?!' . self::TAGNAME_ALLOWED_BEFORE_BODY_MATCHER . '[\\s/>])\\w|</head>%i', $html);
        if (!\is_int($headEndTagMatchCount) || $headEndTagMatchCount <= 0) {
            return false;
        }

        // An exception to the implicit end of the `<head>` is any content within a `<template>` element, as well in
        // comments. As an optimization, this is only checked for if a potential `<head>` end tag is found.
        $htmlWithoutCommentsOrTemplates = $this->removeHtmlTemplateElements($this->removeHtmlComments($html));

        // If nothing was removed, the match found above is genuine; otherwise test again on the reduced HTML.
        return $htmlWithoutCommentsOrTemplates === $html
            || $this->hasEndOfHeadElement($htmlWithoutCommentsOrTemplates);
    }

    /**
     * Removes comments from the given HTML, including any which are unterminated, for which the remainder of the string
     * is removed.
     *
     * @param string $html
     *
     * @return string
     *
     * @throws \RuntimeException
     */
    private function removeHtmlComments(string $html): string
    {
        return $this->replaceByPattern(self::HTML_COMMENT_PATTERN, '', $html, 1616521475);
    }

    /**
     * Removes `<template>` elements from the given HTML, including any without an end tag, for which the remainder of
     * the string is removed.
     *
     * @param string $html
     *
     * @return string
     *
     * @throws \RuntimeException
     */
    private function removeHtmlTemplateElements(string $html): string
    {
        return $this->replaceByPattern(self::HTML_TEMPLATE_ELEMENT_PATTERN, '', $html, 1616519652);
    }

    /**
     * Makes sure that any self-closing tags not recognized as such by PHP's DOMDocument implementation have a
     * self-closing slash.
     *
     * Tag names are matched case-insensitively, as HTML tag names are not case-sensitive (e.g. `<WBR>` is handled the
     * same as `<wbr>`).
     *
     * @param string $html
     *
     * @return string HTML with problematic tags converted.
     *
     * @throws \RuntimeException if the regular expression operation fails
     */
    private function ensurePhpUnrecognizedSelfClosingTagsAreXml(string $html): string
    {
        // The match ends just before the closing `>` (lookahead) and only if no `/` is there already (lookbehind),
        // so appending `/` to the match turns `<wbr>` into `<wbr/>` and leaves `<wbr/>` alone.
        return $this->replaceByPattern(
            '%<' . self::PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER . '\\b[^>]*+(?<!/)(?=>)%i',
            '$0/',
            $html,
            1718000006
        );
    }

    /**
     * Checks that $this->domDocument has a BODY element and adds it if it is missing.
     *
     * @throws \UnexpectedValueException if there is no document, or the document has no HTML element
     */
    private function ensureExistenceOfBodyElement(): void
    {
        if ($this->getDomDocument()->getElementsByTagName('body')->item(0) instanceof \DOMElement) {
            return;
        }

        $htmlElement = $this->getDomDocument()->getElementsByTagName('html')->item(0);
        if (!$htmlElement instanceof \DOMElement) {
            throw new \UnexpectedValueException('There is no HTML element although there should be one.', 1569930853);
        }

        $htmlElement->appendChild($this->getDomDocument()->createElement('body'));
    }

    /**
     * Performs a regular expression search and replace, turning a PCRE failure into an exception.
     *
     * `preg_replace()` returns `null` when an error occurs; returning that from a method declared to return `string`
     * would otherwise surface as a `TypeError`.
     *
     * @param string $pattern the complete regular expression, including delimiters and modifiers
     * @param string $replacement the replacement, which may contain references such as `$0` or `$1`
     * @param string $subject the string to search in
     * @param int $errorCode the code for the exception that is thrown if the operation fails
     * @param int $limit the maximum number of replacements, or -1 for no limit
     *
     * @return string the subject with the replacements applied
     *
     * @throws \RuntimeException if `preg_replace()` does not return a string
     */
    private function replaceByPattern(
        string $pattern,
        string $replacement,
        string $subject,
        int $errorCode,
        int $limit = -1
    ): string {
        $result = \preg_replace($pattern, $replacement, $subject, $limit);
        if (!\is_string($result)) {
            throw new \RuntimeException('Internal PCRE error', $errorCode);
        }

        return $result;
    }
}