Skip to content
This repository has been archived by the owner on Sep 11, 2024. It is now read-only.

Fix links being parsed as markdown links improperly #7200

Merged
merged 9 commits into from
Nov 30, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 125 additions & 3 deletions src/Markdown.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ limitations under the License.

import * as commonmark from 'commonmark';
import { escape } from "lodash";
import { logger } from 'matrix-js-sdk/src/logger';
import * as linkify from 'linkifyjs';

const ALLOWED_HTML_TAGS = ['sub', 'sup', 'del', 'u'];

Expand All @@ -29,6 +31,9 @@ interface CommonmarkHtmlRendererInternal extends commonmark.HtmlRenderer {
link: (node: commonmark.Node, entering: boolean) => void;
html_inline: (node: commonmark.Node) => void; // eslint-disable-line camelcase
html_block: (node: commonmark.Node) => void; // eslint-disable-line camelcase
text: (node: commonmark.Node) => void;
out: (text: string) => void;
emph: (node: commonmark.Node) => void;
}

function isAllowedHtmlTag(node: commonmark.Node): boolean {
Expand Down Expand Up @@ -61,6 +66,33 @@ function isMultiLine(node: commonmark.Node): boolean {
return par.firstChild != par.lastChild;
}

function getTextUntilEndOrLinebreak(node: commonmark.Node) {
let currentNode = node;
let text = '';
while (currentNode !== null && currentNode.type !== 'softbreak' && currentNode.type !== 'linebreak') {
const { literal, type } = currentNode;
if (type === 'text' && literal) {
let n = 0;
let char = literal[n];
while (char !== ' ' && char !== null && n <= literal.length) {
if (char === ' ') {
break;
}
if (char) {
text += char;
}
n += 1;
char = literal[n];
}
if (char === ' ') {
break;
}
}
currentNode = currentNode.next;
}
return text;
}

/**
* Class that wraps commonmark, adding the ability to see whether
* a given message actually uses any markdown syntax or whether
Expand All @@ -70,11 +102,103 @@ export default class Markdown {
private input: string;
private parsed: commonmark.Node;

constructor(input) {
constructor(input: string) {
this.input = input;

const parser = new commonmark.Parser();
this.parsed = parser.parse(this.input);
this.parsed = this.repairLinks(this.parsed);
}

/**
* This method is modifying the parsed AST in such a way that links are always
* properly linkified instead of sometimes being wrongly emphasised in case
* if you were to write a link like the example below:
* https://my_weird-link_domain.domain.com
* ^ this link would be parsed to something like this:
* <a href="https://my">https://my</a><b>weird-link</b><a href="https://domain.domain.com">domain.domain.com</a>
* This method makes it so the link gets properly modified to a version where it is
* not emphasised until it actually ends.
* See: https://github.com/vector-im/element-web/issues/4674
* @param parsed
*/
private repairLinks(parsed: commonmark.Node) {
const walker = parsed.walker();
let event: commonmark.NodeWalkingStep = null;
let text = '';
let isInPara = false;
let previousNode: commonmark.Node | null = null;
let shouldUnlinkEmphasisNode = false;
while ((event = walker.next())) {
const { node } = event;
if (node.type === 'paragraph') {
if (event.entering) {
isInPara = true;
} else {
isInPara = false;
}
}
if (isInPara) {
// Clear saved string when line ends
if (
node.type === 'softbreak' ||
node.type === 'linebreak' ||
// Also start calculating the text from the beginning on any spaces
(node.type === 'text' && node.literal === ' ')
) {
text = '';
}
if (node.type === 'text') {
text += node.literal;
}
// We should not do this if previous node was not a textnode, as we can't combine it then.
if (node.type === 'emph' && previousNode.type === 'text') {
if (event.entering) {
const foundLinks = linkify.find(text);
for (const { value } of foundLinks) {
if (node.firstChild.literal) {
/**
* NOTE: This technically should unlink the emph node and create LINK nodes instead, adding all the next elements as siblings
* but this solution seems to work well and is hopefully slightly easier to understand too
*/
const nonEmphasizedText = `_${node.firstChild.literal}_`;
const f = getTextUntilEndOrLinebreak(node);
const newText = value + nonEmphasizedText + f;
const newLinks = linkify.find(newText);
// Should always find only one link here, if it finds more it means that the algorithm is broken
if (newLinks.length === 1) {
const emphasisTextNode = new commonmark.Node('text');
emphasisTextNode.literal = nonEmphasizedText;
previousNode.insertAfter(emphasisTextNode);
node.firstChild.literal = '';
event = node.walker().next();
// Remove `em` opening and closing nodes
node.unlink();
previousNode.insertAfter(event.node);
shouldUnlinkEmphasisNode = true;
} else {
logger.error(
"Markdown links escaping found too many links for following text: ",
text,
);
logger.error(
"Markdown links escaping found too many links for modified text: ",
newText,
);
}
}
}
} else {
if (shouldUnlinkEmphasisNode) {
node.unlink();
shouldUnlinkEmphasisNode = false;
}
}
}
}
previousNode = node;
}
return parsed;
}

isPlainText(): boolean {
Expand Down Expand Up @@ -120,9 +244,7 @@ export default class Markdown {
// you can nest them.
//
// Let's try sending with <p/>s anyway for now, though.

const realParagraph = renderer.paragraph;

renderer.paragraph = function(node: commonmark.Node, entering: boolean) {
// If there is only one top level node, just return the
// bare text: it's a single line of text and so should be
Expand Down
143 changes: 143 additions & 0 deletions test/Markdown-test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
/*
Copyright 2021 The Matrix.org Foundation C.I.C.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
import * as linkifyjs from 'linkifyjs';
import Markdown from "../src/Markdown";
import matrixLinkify from '../src/linkify-matrix';

beforeAll(() => {
// We need to call linkifier plugins before running those tests
matrixLinkify(linkifyjs);
});

describe("Markdown parser test", () => {
describe("fixing HTML links", () => {
const testString = [
"Test1:",
"#_foonetic_xkcd:matrix.org",
"http://google.com/_thing_",
"https://matrix.org/_matrix/client/foo/123_",
"#_foonetic_xkcd:matrix.org",
"",
"Test1A:",
"#_foonetic_xkcd:matrix.org",
"http://google.com/_thing_",
"https://matrix.org/_matrix/client/foo/123_",
"#_foonetic_xkcd:matrix.org",
"",
"Test2:",
"http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg",
"http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg",
"",
"Test3:",
"https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org",
"https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org",
].join("\n");

it('tests that links are getting properly HTML formatted', () => {
/* eslint-disable max-len */
const expectedResult = [
"<p>Test1:<br />#_foonetic_xkcd:matrix.org<br />http://google.com/_thing_<br />https://matrix.org/_matrix/client/foo/123_<br />#_foonetic_xkcd:matrix.org</p>",
"<p>Test1A:<br />#_foonetic_xkcd:matrix.org<br />http://google.com/_thing_<br />https://matrix.org/_matrix/client/foo/123_<br />#_foonetic_xkcd:matrix.org</p>",
"<p>Test2:<br />http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg<br />http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg</p>",
"<p>Test3:<br />https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org<br />https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org</p>",
"",
].join("\n");
/* eslint-enable max-len */
const md = new Markdown(testString);
expect(md.toHTML()).toEqual(expectedResult);
});
it('tests that links with autolinks are not touched at all and are still properly formatted', () => {
const test = [
"Test1:",
"<#_foonetic_xkcd:matrix.org>",
"<http://google.com/_thing_>",
"<https://matrix.org/_matrix/client/foo/123_>",
"<#_foonetic_xkcd:matrix.org>",
"",
"Test1A:",
"<#_foonetic_xkcd:matrix.org>",
"<http://google.com/_thing_>",
"<https://matrix.org/_matrix/client/foo/123_>",
"<#_foonetic_xkcd:matrix.org>",
"",
"Test2:",
"<http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg>",
"<http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg>",
"",
"Test3:",
"<https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org>",
"<https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org>",
].join("\n");
/* eslint-disable max-len */
/**
* NOTE: I'm not entirely sure if those "<"" and ">" should be visible in here for #_foonetic_xkcd:matrix.org
* but it seems to be actually working properly
*/
const expectedResult = [
"<p>Test1:<br />&lt;#_foonetic_xkcd:matrix.org&gt;<br /><a href=\"http://google.com/_thing_\">http://google.com/_thing_</a><br /><a href=\"https://matrix.org/_matrix/client/foo/123_\">https://matrix.org/_matrix/client/foo/123_</a><br />&lt;#_foonetic_xkcd:matrix.org&gt;</p>",
"<p>Test1A:<br />&lt;#_foonetic_xkcd:matrix.org&gt;<br /><a href=\"http://google.com/_thing_\">http://google.com/_thing_</a><br /><a href=\"https://matrix.org/_matrix/client/foo/123_\">https://matrix.org/_matrix/client/foo/123_</a><br />&lt;#_foonetic_xkcd:matrix.org&gt;</p>",
"<p>Test2:<br /><a href=\"http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg\">http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg</a><br /><a href=\"http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg\">http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg</a></p>",
"<p>Test3:<br /><a href=\"https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org\">https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org</a><br /><a href=\"https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org\">https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org</a></p>",
"",
].join("\n");
/* eslint-enable max-len */
const md = new Markdown(test);
expect(md.toHTML()).toEqual(expectedResult);
});

it('expects that links in codeblock are not modified', () => {
const expectedResult = [
'<pre><code class="language-Test1:">#_foonetic_xkcd:matrix.org',
'http://google.com/_thing_',
'https://matrix.org/_matrix/client/foo/123_',
'#_foonetic_xkcd:matrix.org',
'',
'Test1A:',
'#_foonetic_xkcd:matrix.org',
'http://google.com/_thing_',
'https://matrix.org/_matrix/client/foo/123_',
'#_foonetic_xkcd:matrix.org',
'',
'Test2:',
'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg',
'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg',
'',
'Test3:',
'https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org',
'https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org```',
'</code></pre>',
'',
].join('\n');
const md = new Markdown("```" + testString + "```");
expect(md.toHTML()).toEqual(expectedResult);
});

it('expects that links in one line will be "escaped" properly', () => {
/* eslint-disable max-len */
const testString = [
'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg' + " " + 'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg',
'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg' + " " + 'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg',
].join('\n');
const expectedResult = [
"http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg",
"http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg",
].join('<br />');
/* eslint-enable max-len */
const md = new Markdown(testString);
expect(md.toHTML()).toEqual(expectedResult);
});
});
});
1 change: 0 additions & 1 deletion test/editor/deserialize-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,6 @@ describe('editor/deserialize', function() {
it('code block with no trailing text', function() {
const html = "<pre><code>0xDEADBEEF\n</code></pre>\n";
const parts = normalize(parseEvent(htmlMessage(html), createPartCreator()));
console.log(parts);
expect(parts.length).toBe(5);
expect(parts[0]).toStrictEqual({ type: "plain", text: "```" });
expect(parts[1]).toStrictEqual({ type: "newline", text: "\n" });
Expand Down