♻️ Refactor: Add linkifier rule to inline chain for full links (#279)

chrisjsewell · web-flow · commit ea27cc86ca52 · 2023-06-02T20:02:17.000+02:00
Fixes collision of emphasis and linkifier (so `http://example.org/foo._bar_-_baz` is now a single link, not emphasized). Emails and fuzzy links are not affected by this. Implements upstream: markdown-it/markdown-it@6b58ec4
diff --git a/markdown_it/common/utils.py b/markdown_it/common/utils.py
@@ -304,3 +304,15 @@ def normalizeReference(string: str) -> str:
     # most notably, `__proto__`)
     #
     return string.lower().upper()
+
+
+LINK_OPEN_RE = re.compile(r"^<a[>\s]", flags=re.IGNORECASE)
+LINK_CLOSE_RE = re.compile(r"^</a\s*>", flags=re.IGNORECASE)
+
+
+def isLinkOpen(string: str) -> bool:  # True if `string` starts with `<a>` / `<a `
+    return LINK_OPEN_RE.search(string) is not None
+
+
+def isLinkClose(string: str) -> bool:  # True if `string` starts with `</a>`
+    return LINK_CLOSE_RE.search(string) is not None
diff --git a/markdown_it/parser_inline.py b/markdown_it/parser_inline.py
@@ -16,6 +16,7 @@
 # Parser rules
 _rules: list[tuple[str, RuleFunc]] = [
     ("text", rules_inline.text),
+    ("linkify", rules_inline.linkify),
     ("newline", rules_inline.newline),
     ("escape", rules_inline.escape),
     ("backticks", rules_inline.backtick),
diff --git a/markdown_it/presets/__init__.py b/markdown_it/presets/__init__.py
@@ -21,7 +21,7 @@ def make() -> PresetType:
         config = commonmark.make()
         config["components"]["core"]["rules"].append("linkify")
         config["components"]["block"]["rules"].append("table")
-        config["components"]["inline"]["rules"].append("strikethrough")
+        config["components"]["inline"]["rules"].extend(["strikethrough", "linkify"])
         config["components"]["inline"]["rules2"].append("strikethrough")
         config["options"]["linkify"] = True
         config["options"]["html"] = True
diff --git a/markdown_it/rules_core/linkify.py b/markdown_it/rules_core/linkify.py
@@ -1,41 +1,32 @@
+from __future__ import annotations
+
 import re
+from typing import Protocol
 
-from ..common.utils import arrayReplaceAt
+from ..common.utils import arrayReplaceAt, isLinkClose, isLinkOpen
 from ..token import Token
 from .state_core import StateCore
 
-LINK_OPEN_RE = re.compile(r"^<a[>\s]", flags=re.IGNORECASE)
-LINK_CLOSE_RE = re.compile(r"^</a\s*>", flags=re.IGNORECASE)
-
 HTTP_RE = re.compile(r"^http://")
 MAILTO_RE = re.compile(r"^mailto:")
 TEST_MAILTO_RE = re.compile(r"^mailto:", flags=re.IGNORECASE)
 
 
-def isLinkOpen(string: str) -> bool:
-    return bool(LINK_OPEN_RE.search(string))
-
-
-def isLinkClose(string: str) -> bool:
-    return bool(LINK_CLOSE_RE.search(string))
-
-
 def linkify(state: StateCore) -> None:
-    blockTokens = state.tokens
-
+    """Rule for identifying plain-text links."""
     if not state.md.options.linkify:
         return
 
     if not state.md.linkify:
         raise ModuleNotFoundError("Linkify enabled but not installed.")
 
-    for j in range(len(blockTokens)):
-        if blockTokens[j].type != "inline" or not state.md.linkify.pretest(
-            blockTokens[j].content
+    for inline_token in state.tokens:
+        if inline_token.type != "inline" or not state.md.linkify.pretest(
+            inline_token.content
         ):
             continue
 
-        tokens = blockTokens[j].children
+        tokens = inline_token.children
 
         htmlLinkLevel = 0
 
@@ -71,38 +62,47 @@ def linkify(state: StateCore) -> None:
                 currentToken.content
             ):
                 text = currentToken.content
-                links = state.md.linkify.match(text)
+                links: list[_LinkType] = state.md.linkify.match(text) or []
 
                 # Now split string to nodes
                 nodes = []
                 level = currentToken.level
                 lastPos = 0
 
-                for ln in range(len(links)):
-                    url = links[ln].url
+                # forbid escape sequence at the start of the string,
+                # this avoids http\://example.com/ from being linkified as
+                # http:<a href="//example.com/">//example.com/</a>
+                if (
+                    links
+                    and links[0].index == 0
+                    and i > 0
+                    and tokens[i - 1].type == "text_special"
+                ):
+                    links = links[1:]
+
+                for link in links:
+                    url = link.url
                     fullUrl = state.md.normalizeLink(url)
                     if not state.md.validateLink(fullUrl):
                         continue
 
-                    urlText = links[ln].text
+                    urlText = link.text
 
                     # Linkifier might send raw hostnames like "example.com", where url
                     # starts with domain name. So we prepend http:// in those cases,
                     # and remove it afterwards.
-                    if not links[ln].schema:
+                    if not link.schema:
                         urlText = HTTP_RE.sub(
                             "", state.md.normalizeLinkText("http://" + urlText)
                         )
-                    elif links[ln].schema == "mailto:" and TEST_MAILTO_RE.search(
-                        urlText
-                    ):
+                    elif link.schema == "mailto:" and TEST_MAILTO_RE.search(urlText):
                         urlText = MAILTO_RE.sub(
                             "", state.md.normalizeLinkText("mailto:" + urlText)
                         )
                     else:
                         urlText = state.md.normalizeLinkText(urlText)
 
-                    pos = links[ln].index
+                    pos = link.index
 
                     if pos > lastPos:
                         token = Token("text", "", 0)
@@ -130,12 +130,20 @@ def linkify(state: StateCore) -> None:
                     token.info = "auto"
                     nodes.append(token)
 
-                    lastPos = links[ln].last_index
+                    lastPos = link.last_index
 
                 if lastPos < len(text):
                     token = Token("text", "", 0)
                     token.content = text[lastPos:]
                     token.level = level
                     nodes.append(token)
 
-                blockTokens[j].children = tokens = arrayReplaceAt(tokens, i, nodes)
+                inline_token.children = tokens = arrayReplaceAt(tokens, i, nodes)
+
+
+class _LinkType(Protocol):  # structural type of the match objects `linkify` reads
+    url: str  # passed to `normalizeLink` and then `validateLink`
+    text: str  # matched text; run through `normalizeLinkText` before use
+    index: int  # offset of the match start within the token content
+    last_index: int  # offset where the match ends; becomes the next `lastPos`
+    schema: str | None  # falsy for scheme-less matches; compared to "mailto:"
diff --git a/markdown_it/rules_inline/__init__.py b/markdown_it/rules_inline/__init__.py
@@ -3,6 +3,7 @@
     "text",
     "fragments_join",
     "link_pairs",
+    "linkify",
     "escape",
     "newline",
     "backtick",
@@ -24,6 +25,7 @@
 from .html_inline import html_inline
 from .image import image
 from .link import link
+from .linkify import linkify
 from .newline import newline
 from .state_inline import StateInline
 from .text import text
diff --git a/markdown_it/rules_inline/html_inline.py b/markdown_it/rules_inline/html_inline.py
@@ -1,5 +1,6 @@
 # Process html tags
 from ..common.html_re import HTML_TAG_RE
+from ..common.utils import isLinkClose, isLinkOpen
 from .state_inline import StateInline
 
 
@@ -33,5 +34,10 @@ def html_inline(state: StateInline, silent: bool) -> bool:
         token = state.push("html_inline", "", 0)
         token.content = state.src[pos : pos + len(match.group(0))]
 
+        if isLinkOpen(token.content):
+            state.linkLevel += 1
+        if isLinkClose(token.content):
+            state.linkLevel -= 1
+
     state.pos += len(match.group(0))
     return True
diff --git a/markdown_it/rules_inline/link.py b/markdown_it/rules_inline/link.py
@@ -140,7 +140,9 @@ def link(state: StateInline, silent: bool) -> bool:
         if label and state.md.options.get("store_labels", False):
             token.meta["label"] = label
 
+        state.linkLevel += 1
         state.md.inline.tokenize(state)
+        state.linkLevel -= 1
 
         token = state.push("link_close", "a", -1)
 
diff --git a/markdown_it/rules_inline/linkify.py b/markdown_it/rules_inline/linkify.py
@@ -0,0 +1,61 @@
+"""Process links like https://example.org/"""
+import re
+
+from .state_inline import StateInline
+
+# RFC3986: scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
+SCHEME_RE = re.compile(r"(?:^|[^a-z0-9.+-])([a-z][a-z0-9.+-]*)$", re.IGNORECASE)
+
+
+def linkify(state: StateInline, silent: bool) -> bool:
+    """Turn a ``scheme://`` URL whose ``://`` is at ``state.pos`` into a link.
+
+    Returns True and advances ``state.pos`` past the URL on success; tokens are
+    only pushed when ``silent`` is false. Raises ModuleNotFoundError when the
+    linkify option is on but ``state.md.linkify`` is unset.
+    """
+    # linkLevel > 0 means we are inside an <a> tag or a markdown link
+    if not state.md.options.linkify or state.linkLevel > 0:
+        return False
+    if not state.md.linkify:
+        raise ModuleNotFoundError("Linkify enabled but not installed.")
+
+    pos = state.pos
+    if (pos + 3) > state.posMax or state.src[pos : pos + 3] != "://":
+        return False
+
+    # `search`, not `match`: the scheme is the *tail* of the pending text
+    if not (match := SCHEME_RE.search(state.pending)):
+        return False
+
+    proto = match.group(1)
+    if not (link := state.md.linkify.match_at_start(state.src[pos - len(proto) :])):
+        return False
+
+    # disallow '*' at the end of the link (conflicts with emphasis)
+    url: str = link.url.rstrip("*")
+    # a URL no longer than its scheme would leave state.pos stuck or moving back
+    if len(url) <= len(proto):
+        return False
+
+    full_url = state.md.normalizeLink(url)
+    if not state.md.validateLink(full_url):
+        return False
+
+    if not silent:
+        state.pending = state.pending[: -len(proto)]
+
+        token = state.push("link_open", "a", 1)
+        token.attrs = {"href": full_url}
+        token.markup = "linkify"
+        token.info = "auto"
+
+        token = state.push("text", "", 0)
+        token.content = state.md.normalizeLinkText(url)
+
+        token = state.push("link_close", "a", -1)
+        token.markup = "linkify"
+        token.info = "auto"
+
+    state.pos += len(url) - len(proto)
+    return True
diff --git a/markdown_it/rules_inline/state_inline.py b/markdown_it/rules_inline/state_inline.py
@@ -70,6 +70,10 @@ def __init__(
         self.backticks: dict[int, int] = {}
         self.backticksScanned = False
 
+        # Counter used to disable inline linkify-it execution
+        # inside <a> and markdown links
+        self.linkLevel = 0
+
     def __repr__(self) -> str:
         return (
             f"{self.__class__.__name__}"
diff --git a/tests/test_api/test_main.py b/tests/test_api/test_main.py
@@ -30,6 +30,7 @@ def test_get_rules():
         ],
         "inline": [
             "text",
+            "linkify",
             "newline",
             "escape",
             "backticks",
diff --git a/tests/test_port/fixtures/linkify.md b/tests/test_port/fixtures/linkify.md