From b48d3f78f55020dfa43d5cf95c1b8a1ab4da1773 Mon Sep 17 00:00:00 2001 From: Alessandro Motta Date: Fri, 30 Jun 2023 16:17:56 +0200 Subject: [PATCH 1/3] Add support for WikiLinks This commit adds support for parsing "WikiLinks". Specifically, it introduces `current-parse-wikilinks?`, a parameter to enable the parsing of "internal links" and "piped links" as defined by MediaWiki. See https://www.mediawiki.org/wiki/Help:Links --- commonmark-doc/scribblings/commonmark.scrbl | 21 ++++- .../commonmark/private/scribble-render.rkt | 2 + commonmark-lib/commonmark/parse.rkt | 6 +- .../commonmark/private/parse/inline.rkt | 77 ++++++++++++++++++- commonmark-lib/commonmark/private/render.rkt | 5 +- commonmark-lib/commonmark/private/struct.rkt | 7 +- commonmark-lib/commonmark/render/html.rkt | 2 + commonmark-lib/commonmark/struct.rkt | 3 +- .../tests/commonmark/parse/wikilink.rkt | 19 +++++ 9 files changed, 132 insertions(+), 10 deletions(-) create mode 100644 commonmark-test/tests/commonmark/parse/wikilink.rkt diff --git a/commonmark-doc/scribblings/commonmark.scrbl b/commonmark-doc/scribblings/commonmark.scrbl index 5a1c0d9..e52b4cd 100644 --- a/commonmark-doc/scribblings/commonmark.scrbl +++ b/commonmark-doc/scribblings/commonmark.scrbl @@ -68,6 +68,9 @@ @(define-syntax-rule (cm-examples body ...) (examples #:eval (make-commonmark-eval) #:once body ...)) +@(define MediaWiki @hyperlink["https://www.mediawiki.org/"]{MediaWiki}) +@(define WikiLink @hyperlink["https://www.mediawiki.org/wiki/Help:Links#Internal_links"]{WikiLink}) + @defmodule[commonmark]{ The @racketmodname[commonmark] library implements a @|CommonMark|-compliant Markdown parser. Currently, it passes all test cases in @hyperlink["https://spec.commonmark.org/0.30/"]{v0.30 of the specification}. By default, only the Markdown features specified by @CommonMark are supported, but non-standard support for @tech{footnotes} can be optionally enabled; see the @secref{extensions} section of this manual for more details. 
@@ -145,6 +148,16 @@ Note that the value of @racket[current-parse-footnotes?] only affects parsing, @ @history[#:added "1.1"]} +@defboolparam[current-parse-wikilinks? parse-wikilinks? #:value #f]{ +Enables or disables the parsing of @|WikiLink|s. Specifically, setting this @reftech{parameter} to a value other than @racket[#f] enables the parsing of @MediaWiki internal links and piped links as @tech{wikilink}s. More complex MediaWiki link types, such as the pipe trick or word-ending links, are not supported. + +Importantly, @tech{wikilink} parsing breaks compliance with the @CommonMark specification! + +@(cm-examples + #:label "Example:" + (parameterize ([current-parse-wikilinks? #t]) + (string->document "[[link]] and [[link|label]]")))} + @section[#:tag "rendering-html"]{Rendering HTML} @declare-exporting[commonmark/render/html commonmark] @defmodule[commonmark/render/html #:no-declare]{ @@ -310,7 +323,7 @@ A @deftech{thematic break} is a @tech{block}. It is usually rendered as a horizo @defproc[(inline? [v any/c]) boolean?]{ @see-cm[@tech{inline content} @cm-section{Blocks and inlines}] -Returns @racket[#t] if @racket[v] is @deftech{inline content}: a @reftech{string}, @tech{italic span}, @tech{bold span}, @tech{code span}, @tech{link}, @tech{image}, @tech{footnote reference}, @tech{HTML span}, @tech{hard line break}, or @reftech{list} of @tech{inline content}. Otherwise, returns @racket[#f].} +Returns @racket[#t] if @racket[v] is @deftech{inline content}: a @reftech{string}, @tech{italic span}, @tech{bold span}, @tech{code span}, @tech{link}, @tech{wikilink}, @tech{image}, @tech{footnote reference}, @tech{HTML span}, @tech{hard line break}, or @reftech{list} of @tech{inline content}. 
Otherwise, returns @racket[#f].} @defstruct*[italic ([content inline?]) #:transparent]{ @see-cm[@tech{italic spans} @cm-section{Emphasis and strong emphasis}] @@ -344,6 +357,12 @@ A @tech{footnote reference} is @tech{inline content} that references a @tech{foo @history[#:added "1.1"]} +@defstruct*[wikilink ([content inline?] [dest string?]) #:transparent]{ + +A @deftech{wikilink} is @tech{inline content} that contains nested @tech{inline content} and a link destination. Following the definitions of @|WikiLink|s in @MediaWiki, the content is identical to the link destination in the case of internal links. + +In HTML output, a @tech{wikilink} corresponds to an @tt{<a>} element.} + @defstruct*[html ([content string?]) #:transparent]{ @see-cm[@tech{HTML spans} @cm-section{Raw HTML}] diff --git a/commonmark-doc/scribblings/commonmark/private/scribble-render.rkt b/commonmark-doc/scribblings/commonmark/private/scribble-render.rkt index 2ccad49..64cb6cf 100644 --- a/commonmark-doc/scribblings/commonmark/private/scribble-render.rkt +++ b/commonmark-doc/scribblings/commonmark/private/scribble-render.rkt @@ -126,6 +126,8 @@ (link-element #f _ (footnote-definition-tag label)) (target-element #f _ (footnote-reference-tag label ref-num)) (element 'superscript))) + (define/override (render-wikilink content dest) + (element (style #f (list (make-target-url dest))) content)) (define/override (render-footnote-definition blocks label ref-count) (define multiple-refs? (> ref-count 1)) diff --git a/commonmark-lib/commonmark/parse.rkt b/commonmark-lib/commonmark/parse.rkt index f4e2b90..d9a495c 100644 --- a/commonmark-lib/commonmark/parse.rkt +++ b/commonmark-lib/commonmark/parse.rkt @@ -2,10 +2,12 @@ (require racket/contract "private/struct.rkt" - "private/parse/block.rkt") + "private/parse/block.rkt" + "private/parse/inline.rkt") (provide (contract-out [read-document (-> input-port? document?)] [string->document (-> string? document?)] - [current-parse-footnotes? 
(parameter/c any/c boolean?)])) + [current-parse-footnotes? (parameter/c any/c boolean?)] + [current-parse-wikilinks? (parameter/c any/c boolean?)])) diff --git a/commonmark-lib/commonmark/private/parse/inline.rkt b/commonmark-lib/commonmark/private/parse/inline.rkt index 3255b0d..b69dc4f 100644 --- a/commonmark-lib/commonmark/private/parse/inline.rkt +++ b/commonmark-lib/commonmark/private/parse/inline.rkt @@ -10,6 +10,7 @@ "common.rkt") (provide string->inline + current-parse-wikilinks? (struct-out link-reference)) ;; ----------------------------------------------------------------------------- @@ -84,7 +85,30 @@ be wrong for multibyte characters. Fortunately, enabling line counting has the convenient side effect of tracking positions in characters rather than bytes, which explains why we need to call -`port-count-lines!` even though we never actually use line information. |# +`port-count-lines!` even though we never actually use line information. + + +Note [Nested WikiLinks] +~~~~~~~~~~~~~~~~~~~~~~~ +How should we handle nested WikiLinks, such as [[this [[nested]] example]]? + +MediaWiki, the engine behind Wikipedia, renders the above example as: + +

[[this <a>nested</a> example]]

+ +In other words, MediaWiki avoids nested links by only considering the inner-most +link. This is consistent with the handling of nested links in CommonMark. + +Other parsers handle nested WikiLinks differently. Obsidian, for instance, +matches double-brackets "greedily" and parses the above example as + +

this [[nested example]]

. + +Our implementation attempts to match the behavior of MediaWiki. However, we do +not (yet) handle more complex MediaWiki features, such as the "pipe trick": +https://en.wikipedia.org/wiki/Help:Pipe_trick |# + +(define current-parse-wikilinks? (make-parameter #f (λ (x) (and x #t)))) (struct link-reference (dest title) #:transparent) @@ -118,6 +142,31 @@ positions in characters rather than bytes, which explains why we need to call [(or (? eof-object?) 'link-close) (values '() node last-char #f)] + ['wikilink-open + (define open-text "[[") + (match-define-values [_ _ open-pos] (port-next-location in)) + (let loop ([last-char last-char] [nodes '()] [has-link? #f]) + (define-values [nodes* closer* last-char* has-link?*] (read-sequence last-char)) + (match closer* + [(? eof-object?) + (values (cons open-text (append nodes nodes*)) closer* last-char* has-link?*)] + ['link-close + (cond + [(eqv? (peek-char in) #\]) + (read-char in) + (cond + [(or has-link? has-link?*) + (match-define-values [nodes** closer** last-char** _] (read-sequence last-char*)) + (values (cons open-text (append nodes nodes* (cons "]]" nodes**))) closer** last-char** #t)] + [else + (match-define-values [_ _ close-pos] (port-next-location in)) + (define inner-text (substring str (sub1 open-pos) (- close-pos 3))) + (define node (parse-wikilink inner-text)) + (define-values [nodes** closer** last-char** has-link?**] (read-sequence last-char*)) + (values (cons node nodes**) closer** last-char** (or (wikilink? node) has-link?**))])] + [else + (loop last-char* (append nodes nodes* '("]")) (or has-link? has-link?*))])]))] + [(or 'link-open 'image-open) (define image? (eq? node 'image-open)) (define open-text (if image? "![" "[")) @@ -254,8 +303,16 @@ positions in characters rather than bytes, which explains why we need to call (values (delimiter-run c len len opener? closer?) c)] - ;; § 6.3 Links - [#\[ (read-char in) (values 'link-open #\[)] + ;; § 6.3 Links (and WikiLinks) + [#\[ + (read-char in) + (cond + [(and (current-parse-wikilinks?) 
+ (eqv? (peek-char in) #\[)) + (read-char in) + (values 'wikilink-open #\[)] + [else + (values 'link-open #\[)])] [#\] (read-char in) (values 'link-close #\])] [#\! @@ -310,6 +367,20 @@ positions in characters rather than bytes, which explains why we need to call "next char" (peek-char in) "expected regexp" rx)])) + (define (parse-wikilink inner-text) + (define (process-label label-text) + (define label-node + (string->inline label-text + #:link-defns #hash() + #:footnote-defns #hash())) + (process-emphasis (list label-node))) + + (match (string-split inner-text #px"\\|" #:trim? #f #:repeat? #f) + [(list target label) (wikilink (process-label label) target)] + [(list target) (wikilink (process-label target) target)] + ; MediaWiki does not parse the string "[[]]" as link. + [(list) "[[]]"])) + (define (try-read-link-target content-label-str) (or ;; Full reference links diff --git a/commonmark-lib/commonmark/private/render.rkt b/commonmark-lib/commonmark/private/render.rkt index 3ac8998..d5fa110 100644 --- a/commonmark-lib/commonmark/private/render.rkt +++ b/commonmark-lib/commonmark/private/render.rkt @@ -86,6 +86,7 @@ render-image render-html render-footnote-reference + render-wikilink render-footnote-definition) @@ -156,7 +157,9 @@ (render-html content)] [(footnote-reference label) (match-define (footnote-info defn-num ref-num) (resolve-footnote-reference label)) - (render-footnote-reference label defn-num ref-num)])))) + (render-footnote-reference label defn-num ref-num)] + [(wikilink content dest) + (render-wikilink (render-inline content) dest)])))) (define/public (render-inlines contents) (for*/list ([content (in-list contents)] diff --git a/commonmark-lib/commonmark/private/struct.rkt b/commonmark-lib/commonmark/private/struct.rkt index 5806925..0858f62 100644 --- a/commonmark-lib/commonmark/private/struct.rkt +++ b/commonmark-lib/commonmark/private/struct.rkt @@ -20,7 +20,8 @@ (struct-out link) (struct-out image) (struct-out html) - (struct-out 
footnote-reference)) + (struct-out footnote-reference) + (struct-out wikilink)) ;; ----------------------------------------------------------------------------- @@ -57,7 +58,8 @@ (link? v) (image? v) (html? v) - (footnote-reference? v))) + (footnote-reference? v) + (wikilink? v))) (define-values [line-break line-break?] (let () @@ -70,3 +72,4 @@ (struct image (description source title) #:transparent) (struct html (content) #:transparent) (struct footnote-reference (label) #:transparent) +(struct wikilink (content dest) #:transparent) diff --git a/commonmark-lib/commonmark/render/html.rkt b/commonmark-lib/commonmark/render/html.rkt index fd44525..5c536b6 100644 --- a/commonmark-lib/commonmark/render/html.rkt +++ b/commonmark-lib/commonmark/render/html.rkt @@ -98,6 +98,8 @@ (a ([id ,(footnote-reference-anchor label ref-num)] [href ,(~a "#" (footnote-definition-anchor (uri-path-segment-encode label)))]) ,(~a defn-num)))) + (define/override (render-wikilink content dest) + `(a ([href ,dest]) ,@content)) (define/override (render-footnote-definition blocks label ref-count) (define encoded-label (uri-path-segment-encode label)) diff --git a/commonmark-lib/commonmark/struct.rkt b/commonmark-lib/commonmark/struct.rkt index 252553c..e0aa072 100644 --- a/commonmark-lib/commonmark/struct.rkt +++ b/commonmark-lib/commonmark/struct.rkt @@ -26,4 +26,5 @@ (struct link ([content inline?] [dest string?] [title (or/c string? #f)])) (struct image ([description inline?] [source string?] [title (or/c string? #f)])) (struct html ([content string?])) - (struct footnote-reference ([label string?])))) + (struct footnote-reference ([label string?])) + (struct wikilink ([content inline?] 
[dest string?])))) diff --git a/commonmark-test/tests/commonmark/parse/wikilink.rkt b/commonmark-test/tests/commonmark/parse/wikilink.rkt new file mode 100644 index 0000000..9910610 --- /dev/null +++ b/commonmark-test/tests/commonmark/parse/wikilink.rkt @@ -0,0 +1,19 @@ +#lang racket/base + +(require commonmark + commonmark/struct + rackunit) + +(parameterize ([current-parse-wikilinks? #t]) + (check-equal? (string->document "[[example]]") + (document (list (paragraph (wikilink "example" "example"))) '())) + (check-equal? (string->document "[[destination|label]]") + (document (list (paragraph (wikilink "label" "destination"))) '())) + (check-equal? (string->document "[[destination|label with **bold** markup]]") + (document (list (paragraph (wikilink (list "label with " (bold "bold") " markup") "destination"))) '())) + (check-equal? (string->document "[[lorem [[link]] ipsum]]") + (document (list (paragraph (list "[[lorem " (wikilink "link" "link") " ipsum]]"))) '())) + (check-equal? (string->document "[[unclosed") + (document (list (paragraph "[[unclosed")) '())) + (check-equal? (string->document "[[]]") + (document (list (paragraph "[[]]")) '()))) \ No newline at end of file From 4478a32658f009b4a6616cbb20400c4eef107e7f Mon Sep 17 00:00:00 2001 From: Alessandro Motta Date: Thu, 2 Jan 2025 20:21:35 +0100 Subject: [PATCH 2/3] Fix edge cases in parsing of WikiLinks --- .../commonmark/private/parse/inline.rkt | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/commonmark-lib/commonmark/private/parse/inline.rkt b/commonmark-lib/commonmark/private/parse/inline.rkt index b69dc4f..167ebcf 100644 --- a/commonmark-lib/commonmark/private/parse/inline.rkt +++ b/commonmark-lib/commonmark/private/parse/inline.rkt @@ -376,10 +376,21 @@ not (yet) handle more complex Media Wiki features, such as the "pipe trick": (process-emphasis (list label-node))) (match (string-split inner-text #px"\\|" #:trim? #f #:repeat? 
#f) - [(list target label) (wikilink (process-label label) target)] + ; The following rules are based on the behavior of MediaWiki: + ; 1. "[[]]" is not parsed as link. + [(list) "[[]]"] + ; 2. "[[Link]]" is parsed as link with target and label "Link". [(list target) (wikilink (process-label target) target)] - ; MediaWiki does not parse the string "[[]]" as link. - [(list) "[[]]"])) + ; 3. "[[|Link]]" is not parsed as link. + [(list "" _) (string-append "[[" inner-text "]]")] + ; 4. "[[Link|]]" is parsed as "[[Link]]" (see rule 2). + [(list target "") (wikilink (process-label target) target)] + ; 5. "[[Link|Label]]" is parsed as link with target "Link" and label "Label". + [(list target label) (wikilink (process-label label) target)] + ; 6. "[[Link|La|bel]]" is parsed as link with target "Link" and label "La|bel". + [(list* target label-parts) + (let ([label (string-join label-parts "|")]) + (wikilink (process-label label) target))])) (define (try-read-link-target content-label-str) (or From 4970c1decb92ee8902c9ee4115180700c897349f Mon Sep 17 00:00:00 2001 From: Alessandro Motta Date: Thu, 2 Jan 2025 21:13:46 +0100 Subject: [PATCH 3/3] Simplify parsing of WikiLinks --- .../commonmark/private/parse/inline.rkt | 40 +++++++++++-------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/commonmark-lib/commonmark/private/parse/inline.rkt b/commonmark-lib/commonmark/private/parse/inline.rkt index 167ebcf..e705074 100644 --- a/commonmark-lib/commonmark/private/parse/inline.rkt +++ b/commonmark-lib/commonmark/private/parse/inline.rkt @@ -374,23 +374,29 @@ not (yet) handle more complex Media Wiki features, such as the "pipe trick": #:link-defns #hash() #:footnote-defns #hash())) (process-emphasis (list label-node))) - - (match (string-split inner-text #px"\\|" #:trim? #f #:repeat? #f) - ; The following rules are based on the behavior of MediaWiki: - ; 1. "[[]]" is not parsed as link. - [(list) "[[]]"] - ; 2. 
"[[Link]]" is parsed as link with target and label "Link". - [(list target) (wikilink (process-label target) target)] - ; 3. "[[|Link]]" is not parsed as link. - [(list "" _) (string-append "[[" inner-text "]]")] - ; 4. "[[Link|]]" is parsed as "[[Link]]" (see rule 2). - [(list target "") (wikilink (process-label target) target)] - ; 5. "[[Link|Label]]" is parsed as link with target "Link" and label "Label". - [(list target label) (wikilink (process-label label) target)] - ; 6. "[[Link|La|bel]]" is parsed as link with target "Link" and label "La|bel". - [(list* target label-parts) - (let ([label (string-join label-parts "|")]) - (wikilink (process-label label) target))])) + + (define (string-split-on-first str char) + (define char-pos + (for/first + ([p (in-naturals)] + [c (in-string str)] + #:when (char=? char c)) + p)) + + (if char-pos + (let ([head (substring str 0 char-pos)] + [tail (substring str (add1 char-pos))]) + (cons head tail)) + (cons str ""))) + + (match (string-split-on-first inner-text #\|) + ; "[[]]" and "[[|Link]]" are not parsed as links. + [(cons "" _) (string-append "[[" inner-text "]]")] + ; "[[Link]]" and "[[Link|]]" are parsed as links with target and label "Link". + [(cons target+label "") (wikilink (process-label target+label) target+label)] + ; "[[Target|Label]]" is parsed as link with target "Target" and label "Label". + ; "[[Target|La|bel]]" is parsed as link with target "Target" and label "La|bel". + [(cons target label) (wikilink (process-label label) target)])) (define (try-read-link-target content-label-str) (or