diff --git a/src/Behavior.php b/src/Behavior.php index 1cc88b7..d26c10c 100644 --- a/src/Behavior.php +++ b/src/Behavior.php @@ -15,6 +15,8 @@ namespace TYPO3\HtmlSanitizer; use LogicException; +use TYPO3\HtmlSanitizer\Behavior\CdataSection; +use TYPO3\HtmlSanitizer\Behavior\Comment; use TYPO3\HtmlSanitizer\Behavior\NodeInterface; use TYPO3\HtmlSanitizer\Behavior\Tag; @@ -74,11 +76,16 @@ class Behavior * Node names as array index, e.g. `['strong' => new Tag('strong', '#comment' => new Comment()]` * @var array */ - protected $nodes = [ + protected $nodes = []; + + public function __construct() + { // v2.1.0: adding `#comment` and `#cdata-section` hints for backward compatibility, will be removed with v3.0.0 - '#comment' => null, - '#cdata-section' => null, - ]; + $this->nodes = array_merge($this->nodes, [ + '#comment' => new Comment(), + '#cdata-section' => new CdataSection(), + ]); + } public function withFlags(int $flags): self { @@ -125,7 +132,6 @@ public function withNodes(NodeInterface ...$nodes): self if (!is_array($indexedNodes)) { return $this; } - $this->assertNodeUniqueness($indexedNodes); $target = clone $this; $target->nodes = array_merge($target->nodes, $indexedNodes); return $target; @@ -136,9 +142,8 @@ public function withoutNodes(NodeInterface ...$nodes): self $names = array_map([$this, 'getNodeName'], $nodes); $filteredNodes = array_filter( $this->nodes, - static function (?NodeInterface $node, string $name) use ($nodes, $names) { - return $node === null && !in_array($name, $names, true) - || $node !== null && !in_array($node, $nodes, true); + static function (NodeInterface $node, string $name) use ($nodes, $names) { + return !in_array($name, $names, true) && !in_array($node, $nodes, true); }, ARRAY_FILTER_USE_BOTH ); @@ -247,23 +252,6 @@ protected function assertScalarUniqueness(array $names): void } } - /** - * @param array $nodes - */ - protected function assertNodeUniqueness(array $nodes): void - { - $existingNodeNames = array_intersect_key(array_filter($this->nodes), $nodes); - if ($existingNodeNames !== []) { - throw new LogicException( - sprintf( - 'Cannot redeclare node names %s. Remove duplicates first', - implode(', ', array_keys($existingNodeNames)) - ), - 1625391217 - ); - } - } - protected function getNodeName(NodeInterface $node): string { return strtolower($node->getName()); diff --git a/src/Behavior/CdataSection.php b/src/Behavior/CdataSection.php index df2a432..7640dc1 100644 --- a/src/Behavior/CdataSection.php +++ b/src/Behavior/CdataSection.php @@ -14,13 +14,36 @@ namespace TYPO3\HtmlSanitizer\Behavior; +use DOMNode; +use DOMText; +use TYPO3\HtmlSanitizer\Behavior; +use TYPO3\HtmlSanitizer\Context; + /** * Model of CDATA node. */ -class CdataSection implements NodeInterface +class CdataSection implements NodeInterface, HandlerInterface { + /** + * @var bool + */ + protected $secure = true; + + public function __construct(bool $secure = true) + { + $this->secure = $secure; + } + public function getName(): string { return '#cdata-section'; } + + public function handle(NodeInterface $node, ?DOMNode $domNode, Context $context, Behavior $behavior = null): ?DOMNode + { + if (!$this->secure || $domNode === null) { + return $domNode; + } + return new DOMText(trim($domNode->nodeValue)); + } } diff --git a/src/Behavior/Comment.php b/src/Behavior/Comment.php index ea798ca..e271943 100644 --- a/src/Behavior/Comment.php +++ b/src/Behavior/Comment.php @@ -14,13 +14,36 @@ namespace TYPO3\HtmlSanitizer\Behavior; +use DOMComment; +use DOMNode; +use TYPO3\HtmlSanitizer\Behavior; +use TYPO3\HtmlSanitizer\Context; + /** * Model of comment node. */ -class Comment implements NodeInterface +class Comment implements NodeInterface, HandlerInterface { + /** + * @var bool + */ + protected $secure = true; + + public function __construct(bool $secure = true) + { + $this->secure = $secure; + } + public function getName(): string { return '#comment'; } + + public function handle(NodeInterface $node, ?DOMNode $domNode, Context $context, Behavior $behavior = null): ?DOMNode + { + if (!$this->secure || $domNode === null) { + return $domNode; + } + return new DOMComment(htmlspecialchars($domNode->textContent, ENT_QUOTES, 'UTF-8', false)); + } } diff --git a/src/Builder/CommonBuilder.php b/src/Builder/CommonBuilder.php index 16529e2..9e38f08 100644 --- a/src/Builder/CommonBuilder.php +++ b/src/Builder/CommonBuilder.php @@ -14,11 +14,8 @@ namespace TYPO3\HtmlSanitizer\Builder; -use DOMNode; -use DOMText; use TYPO3\HtmlSanitizer\Behavior; use TYPO3\HtmlSanitizer\Behavior\Attr\UriAttrValueBuilder; -use TYPO3\HtmlSanitizer\Behavior\NodeInterface; use TYPO3\HtmlSanitizer\Sanitizer; use TYPO3\HtmlSanitizer\Visitor\CommonVisitor; @@ -83,8 +80,7 @@ protected function createBehavior(): Behavior ->withName('common') ->withTags(...array_values($this->createBasicTags())) ->withTags(...array_values($this->createMediaTags())) - ->withTags(...array_values($this->createTableTags())) - ->withNodes(...array_values($this->createSpecialNodes())); + ->withTags(...array_values($this->createTableTags())); } protected function createBasicTags(): array @@ -211,23 +207,6 @@ protected function createTableTags(): array return $tags; } - /** - * @return array - */ - protected function createSpecialNodes(): array - { - $nodes = []; - $nodes['#cdata-section'] = (new Behavior\NodeHandler( - new Behavior\CdataSection(), - new Behavior\Handler\ClosureHandler( - static function (NodeInterface $node, ?DOMNode $domNode) { - return $domNode !== null ? new DOMText(trim($domNode->nodeValue)) : null; - } - ) - )); - return $nodes; - } - /** * @return Behavior\Attr[] */ diff --git a/src/Sanitizer.php b/src/Sanitizer.php index d3fb41a..5f299fb 100644 --- a/src/Sanitizer.php +++ b/src/Sanitizer.php @@ -41,6 +41,7 @@ class Sanitizer protected const mastermindsDefaultOptions = [ // Whether the serializer should aggressively encode all characters as entities. 'encode_entities' => false, + 'encode_attributes' => true, // Prevents the parser from automatically assigning the HTML5 namespace to the DOM document. // (adjusted due to https://github.com/Masterminds/html5-php/issues/181#issuecomment-643767471) 'disable_html_ns' => true, diff --git a/src/Serializer/Rules.php b/src/Serializer/Rules.php index 5888e4a..10ca0b0 100644 --- a/src/Serializer/Rules.php +++ b/src/Serializer/Rules.php @@ -45,6 +45,11 @@ class Rules extends OutputRules implements RulesInterface */ protected $initiator; + /** + * @var bool + */ + protected $encodeAttributes; + /** * @param Behavior $behavior * @param resource$output @@ -66,6 +71,7 @@ public static function create(Behavior $behavior, $output, array $options = []): public function __construct($output, $options = []) { $this->options = (array)$options; + $this->encodeAttributes = !empty($options['encode_attributes']); parent::__construct($output, $this->options); } @@ -158,6 +164,32 @@ public function text($domNode): void $this->wr($domNode->data); } + protected function enc($text, $attribute = false): string + { + if ($attribute && $this->encodeAttributes && !$this->encode) { + // In contrast to parent::enc() (when $this->encode is true), + // we are using htmlspecialchars() instead of htmlentities() as + // colons and slashes do not need to be aggressively escaped. + $value = htmlspecialchars( + $text, + ENT_HTML5 | ENT_SUBSTITUTE | ENT_QUOTES, + 'UTF-8', + // $double_encode: true + // (name is misleading, it actually means: disable-automagic/always-encode) + // Our input is always entity decoded by the parser and we do not + // want to consider our input to possibly contain valid entities + // we rather want to treat it as pure text, that is *always* to be encoded. + true + ); + // htmlspecialchars does escaping, but it doesn't match the requirements of + // HTML5 section 8.3 to ecape non breaking spaces + // https://www.w3.org/TR/2013/CR-html5-20130806/syntax.html#escapingString + $value = implode(' ', mb_split("\xc2\xa0", $value)); + return $value; + } + return parent::enc($text, $attribute); + } + /** * If the element has a declared namespace in the HTML, MathML or * SVG namespaces, we use the localName instead of the tagName. diff --git a/src/Visitor/CommonVisitor.php b/src/Visitor/CommonVisitor.php index 7ad0d0d..92c5602 100644 --- a/src/Visitor/CommonVisitor.php +++ b/src/Visitor/CommonVisitor.php @@ -86,6 +86,9 @@ public function enterNode(DOMNode $domNode): ?DOMNode if (!$node->shallHandleFirst()) { $domNode = $node->getHandler()->handle($node->getNode(), $domNode, $this->context, $this->behavior); } + } elseif ($node instanceof Behavior\HandlerInterface) { + $domNode = $node->handle($node, $domNode, $this->context, $this->behavior); + $domNode = $domNode instanceof DOMElement ? $this->enterDomElement($domNode, $node) : $domNode; } elseif ($domNode instanceof DOMElement) { $domNode = $this->enterDomElement($domNode, $node); } diff --git a/tests/BehaviorTest.php b/tests/BehaviorTest.php index 2e33d2e..4575d14 100644 --- a/tests/BehaviorTest.php +++ b/tests/BehaviorTest.php @@ -24,7 +24,6 @@ class BehaviorTest extends TestCase public function ambiguityIsDetectedDataProvider(): array { return [ - [ ['same'], ['same'], 1625391217 ], [ ['same', 'same'], [], 1625591503 ], [ ['same', 'same'], ['same'], 1625591503 ], [ [], ['same', 'same'], 1625591503 ], diff --git a/tests/CommonBuilderTest.php b/tests/CommonBuilderTest.php index 2feb80a..a4ca310 100644 --- a/tests/CommonBuilderTest.php +++ b/tests/CommonBuilderTest.php @@ -256,13 +256,21 @@ public function isSanitizedDataProvider(): array '', ], '#910' => [ - '', - '#cdata', + '', + '', ], '#911' => [ + '', + '', + ], + '#915' => [ '#text', '#text', ], + '#920' => [ + '', + '#cdata', + ], '#921' => [ '*/]]>', '<any><span data-value="value"></any>*/', @@ -287,6 +295,14 @@ public function isSanitizedDataProvider(): array 'value', '<any>value</any>', ], + '#935' => [ + '

value

', + '

value

', + ], + '#936' => [ + '

value

', + '

value

', + ], ]; } diff --git a/tests/ScenarioTest.php b/tests/ScenarioTest.php index 96c94fb..27771be 100644 --- a/tests/ScenarioTest.php +++ b/tests/ScenarioTest.php @@ -231,23 +231,41 @@ public static function commentsAreHandledDataProvider(): array return [ 'not allowed' => [ false, + null, Behavior::BLUNT, '
test
', '
test
' ], - 'allowed' => [ + 'allowed, insecure' => [ true, + false, Behavior::BLUNT, '
test
', '
test
' ], - 'not allowed, encoded' => [ + 'allowed, secure' => [ + true, + true, + Behavior::BLUNT, + '
test
', + '
test
' + ], + 'not allowed, encode invalid' => [ false, + null, Behavior::ENCODE_INVALID_COMMENT, '
test
', '
<!-- before -->test<!-- after -->
', ], - 'allowed, encoded' => [ + 'allowed, insecure, encode invalid' => [ + true, + false, + Behavior::ENCODE_INVALID_COMMENT, + '
test
', + '
test
' + ], + 'allowed, secure, encode invalid' => [ + true, true, Behavior::ENCODE_INVALID_COMMENT, '
test
', @@ -260,13 +278,13 @@ public static function commentsAreHandledDataProvider(): array * @test * @dataProvider commentsAreHandledDataProvider */ - public function commentsAreHandled(bool $allowed, int $flags, string $payload, string $expectation): void + public function commentsAreHandled(bool $allowed, ?bool $secure, int $flags, string $payload, string $expectation): void { $behavior = (new Behavior()) ->withFlags($flags) ->withName('scenario-test') ->withTags(new Behavior\Tag('div', Behavior\Tag::ALLOW_CHILDREN)); - $comment = new Behavior\Comment(); + $comment = new Behavior\Comment($secure ?? true); $behavior = $allowed ? $behavior->withNodes($comment) : $behavior->withoutNodes($comment); $sanitizer = new Sanitizer( $behavior, @@ -280,27 +298,45 @@ public static function cdataSectionsAreHandledDataProvider(): array return [ 'not allowed' => [ false, + null, Behavior::BLUNT, - '
test
', - '
test
' + '
.test.
', + '
.test.
' ], - 'allowed' => [ + 'allowed, insecure' => [ true, + false, Behavior::BLUNT, - '
test
', - '
test
' + '
.test.
', + '
.test.
' ], - 'not allowed, encoded' => [ + 'allowed, secure' => [ + true, + true, + Behavior::BLUNT, + '
.test.
', + '
before.test.after
' + ], + 'not allowed, encode invalid' => [ false, + null, Behavior::ENCODE_INVALID_CDATA_SECTION, - '
test
', - '
<![CDATA[ before ]]>test<![CDATA[ after ]]>
', + '
.test.
', + '
<![CDATA[ before ]]>.test.<![CDATA[ after ]]>
', ], - 'allowed, encoded' => [ + 'allowed, insecure, encode invalid' => [ true, + false, Behavior::ENCODE_INVALID_CDATA_SECTION, - '
test
', - '
test
' + '
.test.
', + '
.test.
' + ], + 'allowed, secure, encode invalid' => [ + true, + true, + Behavior::ENCODE_INVALID_CDATA_SECTION, + '
.test.
', + '
before.test.after
' ], ]; } @@ -309,13 +345,13 @@ public static function cdataSectionsAreHandledDataProvider(): array * @test * @dataProvider cdataSectionsAreHandledDataProvider */ - public function cdataSectionsAreHandled(bool $allowed, int $flags, string $payload, string $expectation): void + public function cdataSectionsAreHandled(bool $allowed, ?bool $secure, int $flags, string $payload, string $expectation): void { $behavior = (new Behavior()) ->withFlags($flags) ->withName('scenario-test') ->withTags(new Behavior\Tag('div', Behavior\Tag::ALLOW_CHILDREN)); - $cdataSection = new Behavior\CdataSection(); + $cdataSection = new Behavior\CdataSection($secure ?? true); $behavior = $allowed ? $behavior->withNodes($cdataSection) : $behavior->withoutNodes($cdataSection); $sanitizer = new Sanitizer( $behavior, @@ -482,4 +518,136 @@ public function iframeSandboxIsAllowed(): void ); self::assertSame($expectation, $sanitizer->sanitize($payload)); } + + public static function attributesAreEncodedDataProvider(): \Generator + { + yield 'preserve entities' => [ + '', + '', + ]; + yield 'encode single quotes' => [ + '', + '', + ]; + yield 'encode single quotes from entity' => [ + '', + '', + ]; + yield 'encode double quotes' => [ + "", + '', + ]; + yield 'preserve double quote encoding' => [ + '', + '', + ]; + yield 'preserve double encoded entities' => [ + '', + '', + ]; + yield 'preserve URLs without "agressive" entity encoding' => [ + '', + '', + ]; + yield 'encode tag specifiers' => [ + '', + '', + ]; + // Invalid input seems to be removed during parsing step (where?) + // therefore ENT_SUBSTITUTE can not operate during serialization + // @todo: check masterminds/html5-php whether that behavior is + // intended + //yield 'substitute invalid unicode in attributes' => [ + // "", + // "", + //]; + yield 'escape non breaking space' => [ + "", + '', + ]; + yield 'encodes json values' => [ + "
", + '
' + ]; + yield 'encodes json values containing html' => [ + "
", + '
' + ]; + + } + + /** + * @test + * @dataProvider attributesAreEncodedDataProvider + */ + public function attributesAreEncoded(string $payload, string $expectation): void + { + $behavior = (new Behavior()) + ->withFlags(Behavior::ENCODE_INVALID_TAG | Behavior::REMOVE_UNEXPECTED_CHILDREN) + ->withName('scenario-test') + ->withTags( + (new Behavior\Tag('a', Behavior\Tag::ALLOW_CHILDREN))->addAttrs( + new Behavior\Attr('id'), + new Behavior\Attr('title') + ), + (new Behavior\Tag('div', Behavior\Tag::ALLOW_CHILDREN))->addAttrs( + new Behavior\Attr('data-value') + ) + ); + + $sanitizer = new Sanitizer( + $behavior, + new CommonVisitor($behavior) + ); + self::assertSame($expectation, $sanitizer->sanitize($payload)); + } + + public static function specialTagsAreHandledDataProvider(): \Generator + { + yield 'noscript attribute' => [ + '