1- import html
21import re
32from typing import Callable , Optional
43from urllib .parse import urlparse , urlunparse , quote , unquote # noqa: F401
54
6- from . utils import ESCAPABLE
5+ import mdurl
76
8- # TODO below we port the use of the JS packages:
9- # var mdurl = require('mdurl')
10- # var punycode = require('punycode')
11- #
12- # e.g. mdurl: parsed = mdurl.parse(url, True)
13- #
14- # but need to check these fixes from https://www.npmjs.com/package/mdurl:
15- #
16- # Parse url string. Similar to node's url.parse,
17- # but without any normalizations and query string parse.
18- # url - input url (string)
19- # slashesDenoteHost - if url starts with //, expect a hostname after it. Optional, false.
20- # Difference with node's url:
7+ from .. import _punycode
218
22- # No leading slash in paths, e.g. in url.parse('http://foo?bar') pathname is ``, not /
23- # Backslashes are not replaced with slashes, so http:\\example.org\ is treated like a relative path
24- # Trailing colon is treated like a part of the path, i.e. in http://example.org:foo pathname is :foo
25- # Nothing is URL-encoded in the resulting object,
26- # (in joyent/node some chars in auth and paths are encoded)
27- # url.parse() does not have parseQueryString argument
28- # Removed extraneous result properties: host, path, query, etc.,
29- # which can be constructed using other parts of the url.
309
31-
32- # ################# Copied from Commonmark.py #################
33-
34- ENTITY = "&(?:#x[a-f0-9]{1,6}|#[0-9]{1,7}|[a-z][a-z0-9]{1,31});"
35- reBackslashOrAmp = re .compile (r"[\\&]" )
36- reEntityOrEscapedChar = re .compile (
37- "\\ \\ " + "[" + ESCAPABLE + "]|" + ENTITY , re .IGNORECASE
38- )
39-
40-
41- def unescape_char (s : str ) -> str :
42- if s [0 ] == "\\ " :
43- return s [1 ]
44- else :
45- return html .unescape (s )
46-
47-
48- def unescape_string (s : str ) -> str :
49- """Replace entities and backslash escapes with literal characters."""
50- if re .search (reBackslashOrAmp , s ):
51- return re .sub (reEntityOrEscapedChar , lambda m : unescape_char (m .group ()), s )
52- else :
53- return s
54-
55-
56- def normalize_uri (uri : str ) -> str :
57- return quote (uri , safe = "/@:+?=&()%#*," )
58-
59-
60- ##################
61-
62-
63- RECODE_HOSTNAME_FOR = ("http" , "https" , "mailto" )
64-
65-
66- def unescape_normalize_uri (x : str ) -> str :
67- return normalize_uri (unescape_string (x ))
10+ RECODE_HOSTNAME_FOR = ("http:" , "https:" , "mailto:" )
6811
6912
7013def normalizeLink (url : str ) -> str :
@@ -75,91 +18,49 @@ def normalizeLink(url: str) -> str:
7518 [label]: destination 'title'
7619 ^^^^^^^^^^^
7720 """
78- (scheme , netloc , path , params , query , fragment ) = urlparse (url )
79- if scheme in RECODE_HOSTNAME_FOR :
80- url = urlunparse (
81- (
82- scheme ,
83- unescape_normalize_uri (netloc ),
84- normalize_uri (path ),
85- unescape_normalize_uri (params ),
86- normalize_uri (query ),
87- unescape_normalize_uri (fragment ),
88- )
89- )
90- else :
91- url = unescape_normalize_uri (url )
92-
93- return url
94-
95- # TODO the selective encoding below should probably be done here,
96- # something like:
97- # url_check = urllib.parse.urlparse(destination)
98- # if url_check.scheme in RECODE_HOSTNAME_FOR: ...
99-
100- # parsed = urlparse(url)
101- # if parsed.hostname:
102- # # Encode hostnames in urls like:
103- # # `http:#host/`, `https:#host/`, `mailto:user@host`, `#host/`
104- # #
105- # # We don't encode unknown schemas, because it's likely that we encode
106- # # something we shouldn't (e.g. `skype:name` treated as `skype:host`)
107- # #
108- # if (not parsed.scheme) or parsed.scheme in RECODE_HOSTNAME_FOR:
109- # try:
110- # parsed.hostname = punycode.toASCII(parsed.hostname)
111- # except Exception:
112- # pass
113- # return quote(urlunparse(parsed))
114-
115-
116- def unescape_unquote (x : str ) -> str :
117- return unquote (unescape_string (x ))
118-
119-
120- def normalizeLinkText (link : str ) -> str :
21+ parsed = mdurl .parse (url , slashes_denote_host = True )
22+
23+ if parsed .hostname :
24+ # Encode hostnames in urls like:
25+ # `http://host/`, `https://host/`, `mailto:user@host`, `//host/`
26+ #
27+ # We don't encode unknown schemes, because it's likely that we encode
28+ # something we shouldn't (e.g. `skype:name` treated as `skype:host`)
29+ #
30+ if not parsed .protocol or parsed .protocol in RECODE_HOSTNAME_FOR :
31+ try :
32+ parsed = parsed ._replace (hostname = _punycode .to_ascii (parsed .hostname ))
33+ except Exception :
34+ pass
35+
36+ return mdurl .encode (mdurl .format (parsed ))
37+
38+
39+ def normalizeLinkText (url : str ) -> str :
12140 """Normalize autolink content
12241
12342 ::
12443
12544 <destination>
12645 ~~~~~~~~~~~
12746 """
128- (scheme , netloc , path , params , query , fragment ) = urlparse (link )
129- if scheme in RECODE_HOSTNAME_FOR :
130- url = urlunparse (
131- (
132- scheme ,
133- unescape_unquote (netloc ),
134- unquote (path ),
135- unescape_unquote (params ),
136- unquote (query ),
137- unescape_unquote (fragment ),
138- )
139- )
140- else :
141- url = unescape_unquote (link )
142- return url
143-
144- # TODO the selective encoding below should probably be done here,
145- # something like:
146- # url_check = urllib.parse.urlparse(destination)
147- # if url_check.scheme in RECODE_HOSTNAME_FOR: ...
148-
149- # parsed = urlparse(url)
150- # if parsed.hostname:
151- # # Encode hostnames in urls like:
152- # # `http:#host/`, `https:#host/`, `mailto:user@host`, `#host/`
153- # #
154- # # We don't encode unknown schemas, because it's likely that we encode
155- # # something we shouldn't (e.g. `skype:name` treated as `skype:host`)
156- # #
157- # if (not parsed.protocol) or parsed.protocol in RECODE_HOSTNAME_FOR:
158- # try:
159- # parsed.hostname = punycode.toUnicode(parsed.hostname)
160- # except Exception:
161- # pass
162- # return unquote(urlunparse(parsed))
47+ parsed = mdurl .parse (url , slashes_denote_host = True )
48+
49+ if parsed .hostname :
50+ # Decode punycode hostnames in urls like:
51+ # `http://host/`, `https://host/`, `mailto:user@host`, `//host/`
52+ #
53+ # We don't decode unknown schemes, because it's likely that we decode
54+ # something we shouldn't (e.g. `skype:name` treated as `skype:host`)
55+ #
56+ if not parsed .protocol or parsed .protocol in RECODE_HOSTNAME_FOR :
57+ try :
58+ parsed = parsed ._replace (hostname = _punycode .to_unicode (parsed .hostname ))
59+ except Exception :
60+ pass
61+
62+ # add '%' to exclude list because of https://github.com/markdown-it/markdown-it/issues/720
63+ return mdurl .decode (mdurl .format (parsed ), mdurl .DECODE_DEFAULT_CHARS + "%" )
16364
16465
16566BAD_PROTO_RE = re .compile (r"^(vbscript|javascript|file|data):" )
0 commit comments