Handling of numbered markdown lists.

Lukasz Anforowicz · calebcartwright · commit f4201ef2cb89 · 2023-06-19T20:50:01.000-05:00
Fixes issue rust-lang#5416
diff --git a/src/comment.rs b/src/comment.rs
@@ -432,12 +432,18 @@ impl CodeBlockAttribute {
 
 /// Block that is formatted as an item.
 ///
-/// An item starts with either a star `*` a dash `-` a greater-than `>` or a plus '+'.
+/// An item starts with either a star `*`, a dash `-`, a greater-than `>`, a plus '+', or a number
+/// `12.` or `34)` (with at most 2 digits). An item represents CommonMark's ["list
+/// items"](https://spec.commonmark.org/0.30/#list-items) and/or ["block
+/// quotes"](https://spec.commonmark.org/0.30/#block-quotes), but note that only a subset of
+/// CommonMark is recognized - see the doc comment of [`ItemizedBlock::get_marker_length`] for more
+/// details.
+///
 /// Different level of indentation are handled by shrinking the shape accordingly.
 struct ItemizedBlock {
     /// the lines that are identified as part of an itemized block
     lines: Vec<String>,
-    /// the number of characters (typically whitespaces) up to the item sigil
+    /// the number of characters (typically whitespaces) up to the item marker
     indent: usize,
     /// the string that marks the start of an item
     opener: String,
@@ -446,37 +452,70 @@ struct ItemizedBlock {
 }
 
 impl ItemizedBlock {
-    /// Returns `true` if the line is formatted as an item
-    fn is_itemized_line(line: &str) -> bool {
-        let trimmed = line.trim_start();
+    /// Checks whether the `trimmed` line includes an item marker. Returns `None` if there is no
+    /// marker. Returns the length of the marker (in bytes) if one is present. Note that the length
+    /// includes the whitespace that follows the marker, for example the marker in `"* list item"`
+    /// has the length of 2.
+    ///
+    /// This function recognizes item markers that correspond to CommonMark's
+    /// ["bullet list marker"](https://spec.commonmark.org/0.30/#bullet-list-marker),
+    /// ["block quote marker"](https://spec.commonmark.org/0.30/#block-quote-marker), and/or
+    /// ["ordered list marker"](https://spec.commonmark.org/0.30/#ordered-list-marker).
+    ///
+    /// Compared to CommonMark specification, the number of digits that are allowed in an ["ordered
+    /// list marker"](https://spec.commonmark.org/0.30/#ordered-list-marker) is more limited (to at
+    /// most 2 digits). Limiting the length of the marker helps reduce the risk of recognizing
+    /// arbitrary numbers as markers. See also
+    /// <https://talk.commonmark.org/t/blank-lines-before-lists-revisited/1990> which gives the
+    /// following example where a number (i.e. "1868") doesn't signify an ordered list:
+    /// ```md
+    /// The Captain died in
+    /// 1868. He wes buried in...
+    /// ```
+    fn get_marker_length(trimmed: &str) -> Option<usize> {
+        // https://spec.commonmark.org/0.30/#bullet-list-marker or
+        // https://spec.commonmark.org/0.30/#block-quote-marker
         let itemized_start = ["* ", "- ", "> ", "+ "];
-        itemized_start.iter().any(|s| trimmed.starts_with(s))
+        if itemized_start.iter().any(|s| trimmed.starts_with(s)) {
+            return Some(2); // All items in `itemized_start` have length 2.
+        }
+
+        // https://spec.commonmark.org/0.30/#ordered-list-marker, where at most 2 digits are
+        // allowed.
+        for suffix in [". ", ") "] {
+            if let Some((prefix, _)) = trimmed.split_once(suffix) {
+                if prefix.len() <= 2 && prefix.chars().all(|c| char::is_ascii_digit(&c)) {
+                    return Some(prefix.len() + suffix.len());
+                }
+            }
+        }
+
+        None // No markers found.
     }
 
-    /// Creates a new ItemizedBlock described with the given line.
-    /// The `is_itemized_line` needs to be called first.
-    fn new(line: &str) -> ItemizedBlock {
-        let space_to_sigil = line.chars().take_while(|c| c.is_whitespace()).count();
-        // +2 = '* ', which will add the appropriate amount of whitespace to keep itemized
-        // content formatted correctly.
-        let mut indent = space_to_sigil + 2;
+    /// Creates a new `ItemizedBlock` described with the given `line`.
+    /// Returns `None` if `line` doesn't start an item.
+    fn new(line: &str) -> Option<ItemizedBlock> {
+        let marker_length = ItemizedBlock::get_marker_length(line.trim_start())?;
+        let space_to_marker = line.chars().take_while(|c| c.is_whitespace()).count();
+        let mut indent = space_to_marker + marker_length;
         let mut line_start = " ".repeat(indent);
 
         // Markdown blockquote start with a "> "
         if line.trim_start().starts_with(">") {
             // remove the original +2 indent because there might be multiple nested block quotes
             // and it's easier to reason about the final indent by just taking the length
-            // of th new line_start. We update the indent because it effects the max width
+            // of the new line_start. We update the indent because it effects the max width
             // of each formatted line.
             line_start = itemized_block_quote_start(line, line_start, 2);
             indent = line_start.len();
         }
-        ItemizedBlock {
+        Some(ItemizedBlock {
             lines: vec![line[indent..].to_string()],
             indent,
             opener: line[..indent].to_string(),
             line_start,
-        }
+        })
     }
 
     /// Returns a `StringFormat` used for formatting the content of an item.
@@ -495,7 +534,7 @@ impl ItemizedBlock {
     /// Returns `true` if the line is part of the current itemized block.
     /// If it is, then it is added to the internal lines list.
     fn add_line(&mut self, line: &str) -> bool {
-        if !ItemizedBlock::is_itemized_line(line)
+        if ItemizedBlock::get_marker_length(line.trim_start()).is_none()
             && self.indent <= line.chars().take_while(|c| c.is_whitespace()).count()
         {
             self.lines.push(line.to_string());
@@ -766,10 +805,11 @@ impl<'a> CommentRewrite<'a> {
         self.item_block = None;
         if let Some(stripped) = line.strip_prefix("```") {
             self.code_block_attr = Some(CodeBlockAttribute::new(stripped))
-        } else if self.fmt.config.wrap_comments() && ItemizedBlock::is_itemized_line(line) {
-            let ib = ItemizedBlock::new(line);
-            self.item_block = Some(ib);
-            return false;
+        } else if self.fmt.config.wrap_comments() {
+            if let Some(ib) = ItemizedBlock::new(line) {
+                self.item_block = Some(ib);
+                return false;
+            }
         }
 
         if self.result == self.opener {
@@ -2020,4 +2060,96 @@ fn main() {
 "#;
         assert_eq!(s, filter_normal_code(s_with_comment));
     }
+
+    #[test]
+    fn test_itemized_block_first_line_handling() {
+        fn run_test(
+            test_input: &str,
+            expected_line: &str,
+            expected_indent: usize,
+            expected_opener: &str,
+            expected_line_start: &str,
+        ) {
+            let block = ItemizedBlock::new(test_input).unwrap();
+            assert_eq!(1, block.lines.len(), "test_input: {:?}", test_input);
+            assert_eq!(
+                expected_line, &block.lines[0],
+                "test_input: {:?}",
+                test_input
+            );
+            assert_eq!(
+                expected_indent, block.indent,
+                "test_input: {:?}",
+                test_input
+            );
+            assert_eq!(
+                expected_opener, &block.opener,
+                "test_input: {:?}",
+                test_input
+            );
+            assert_eq!(
+                expected_line_start, &block.line_start,
+                "test_input: {:?}",
+                test_input
+            );
+        }
+
+        run_test("- foo", "foo", 2, "- ", "  ");
+        run_test("* foo", "foo", 2, "* ", "  ");
+        run_test("> foo", "foo", 2, "> ", "> ");
+
+        run_test("1. foo", "foo", 3, "1. ", "   ");
+        run_test("12. foo", "foo", 4, "12. ", "    ");
+        run_test("1) foo", "foo", 3, "1) ", "   ");
+        run_test("12) foo", "foo", 4, "12) ", "    ");
+
+        run_test("    - foo", "foo", 6, "    - ", "      ");
+
+        // https://spec.commonmark.org/0.30 says: "A start number may begin with 0s":
+        run_test("0. foo", "foo", 3, "0. ", "   ");
+        run_test("01. foo", "foo", 4, "01. ", "    ");
+    }
+
+    #[test]
+    fn test_itemized_block_nonobvious_markers_are_rejected() {
+        let test_inputs = vec![
+            // Non-numeric item markers (e.g. `a.` or `iv.`) are not allowed by
+            // https://spec.commonmark.org/0.30/#ordered-list-marker. We also note that allowing
+            // them would risk misidentifying regular words as item markers. See also the
+            // discussion in https://talk.commonmark.org/t/blank-lines-before-lists-revisited/1990
+            "word.  rest of the paragraph.",
+            "a.  maybe this is a list item?  maybe not?",
+            "iv.  maybe this is a list item?  maybe not?",
+            // Numbers with 3 or more digits are not recognized as item markers, to avoid
+            // formatting the following example as a list:
+            //
+            // ```
+            // The Captain died in
+            // 1868. He was buried in...
+            // ```
+            "123.  only 2-digit numbers are recognized as item markers.",
+            // Parens:
+            "123)  giving some coverage to parens as well.",
+            "a)  giving some coverage to parens as well.",
+            // https://spec.commonmark.org/0.30 says that "at least one space or tab is needed
+            // between the list marker and any following content":
+            "1.Not a list item.",
+            "1.2.3. Not a list item.",
+            "1)Not a list item.",
+            "-Not a list item.",
+            "+Not a list item.",
+            "+1 not a list item.",
+            // https://spec.commonmark.org/0.30 says: "A start number may not be negative":
+            "-1. Not a list item.",
+            "-1 Not a list item.",
+        ];
+        for line in test_inputs.iter() {
+            let maybe_block = ItemizedBlock::new(line);
+            assert!(
+                maybe_block.is_none(),
+                "The following line shouldn't be classified as a list item: {}",
+                line
+            );
+        }
+    }
 }
diff --git a/tests/source/itemized-blocks/no_wrap.rs b/tests/source/itemized-blocks/no_wrap.rs
@@ -1,7 +1,7 @@
 // rustfmt-normalize_comments: true
 // rustfmt-format_code_in_doc_comments: true
 
-//! This is a list:
+//! This is an itemized markdown list (see also issue #3224):
 //!  * Outer
 //!  * Outer
 //!   * Inner
@@ -13,6 +13,40 @@
 //! - when the log level is info, the level name is green and the rest of the line is white
 //! - when the log level is debug, the whole line is white
 //! - when the log level is trace, the whole line is gray ("bright black")
+//!
+//! This is a numbered markdown list (see also issue #5416):
+//! 1. Long long long long long long long long long long long long long long long long long line
+//! 2. Another very long long long long long long long long long long long long long long long line
+//! 3. Nested list
+//!    1. Long long long long long long long long long long long long long long long long line
+//!    2. Another very long long long long long long long long long long long long long long line
+//! 4. Last item
+//!
+//! Using the ')' instead of '.' character after the number:
+//! 1) Long long long long long long long long long long long long long long long long long line
+//! 2) Another very long long long long long long long long long long long long long long long line
+//!
+//! Deep list that mixes various bullet and number formats:
+//! 1. First level with a long long long long long long long long long long long long long long
+//!    long long long line
+//! 2. First level with another very long long long long long long long long long long long long
+//!    long long long line
+//!     * Second level with a long long long long long long long long long long long long long
+//!       long long long line
+//!     * Second level with another very long long long long long long long long long long long
+//!       long long long line
+//!         1) Third level with a long long long long long long long long long long long long long
+//!            long long long line
+//!         2) Third level with another very long long long long long long long long long long
+//!            long long long long line
+//!             - Forth level with a long long long long long long long long long long long long
+//!               long long long long line
+//!             - Forth level with another very long long long long long long long long long long
+//!               long long long long line
+//!         3) One more item at the third level
+//!         4) Last item of the third level
+//!     * Last item of second level
+//! 3. Last item of first level
 
 /// All the parameters ***except for `from_theater`*** should be inserted as sent by the remote
 /// theater, i.e., as passed to [`Theater::send`] on the remote actor:
diff --git a/tests/source/itemized-blocks/wrap.rs b/tests/source/itemized-blocks/wrap.rs
@@ -2,7 +2,7 @@
 // rustfmt-format_code_in_doc_comments: true
 // rustfmt-max_width: 50
 
-//! This is a list:
+//! This is an itemized markdown list (see also issue #3224):
 //!  * Outer
 //!  * Outer
 //!   * Inner
@@ -14,6 +14,40 @@
 //! - when the log level is info, the level name is green and the rest of the line is white
 //! - when the log level is debug, the whole line is white
 //! - when the log level is trace, the whole line is gray ("bright black")
+//!
+//! This is a numbered markdown list (see also issue #5416):
+//! 1. Long long long long long long long long long long long long long long long long long line
+//! 2. Another very long long long long long long long long long long long long long long long line
+//! 3. Nested list
+//!    1. Long long long long long long long long long long long long long long long long line
+//!    2. Another very long long long long long long long long long long long long long long line
+//! 4. Last item
+//!
+//! Using the ')' instead of '.' character after the number:
+//! 1) Long long long long long long long long long long long long long long long long long line
+//! 2) Another very long long long long long long long long long long long long long long long line
+//!
+//! Deep list that mixes various bullet and number formats:
+//! 1. First level with a long long long long long long long long long long long long long long
+//!    long long long line
+//! 2. First level with another very long long long long long long long long long long long long
+//!    long long long line
+//!     * Second level with a long long long long long long long long long long long long long
+//!       long long long line
+//!     * Second level with another very long long long long long long long long long long long
+//!       long long long line
+//!         1) Third level with a long long long long long long long long long long long long long
+//!            long long long line
+//!         2) Third level with another very long long long long long long long long long long
+//!            long long long long line
+//!             - Forth level with a long long long long long long long long long long long long
+//!               long long long long line
+//!             - Forth level with another very long long long long long long long long long long
+//!               long long long long line
+//!         3) One more item at the third level
+//!         4) Last item of the third level
+//!     * Last item of second level
+//! 3. Last item of first level
 
 // This example shows how to configure fern to output really nicely colored logs
 // - when the log level is error, the whole line is red
diff --git a/tests/target/itemized-blocks/no_wrap.rs b/tests/target/itemized-blocks/no_wrap.rs
@@ -1,7 +1,7 @@
 // rustfmt-normalize_comments: true
 // rustfmt-format_code_in_doc_comments: true
 
-//! This is a list:
+//! This is an itemized markdown list (see also issue #3224):
 //!  * Outer
 //!  * Outer
 //!   * Inner
@@ -13,6 +13,40 @@
 //! - when the log level is info, the level name is green and the rest of the line is white
 //! - when the log level is debug, the whole line is white
 //! - when the log level is trace, the whole line is gray ("bright black")
+//!
+//! This is a numbered markdown list (see also issue #5416):
+//! 1. Long long long long long long long long long long long long long long long long long line
+//! 2. Another very long long long long long long long long long long long long long long long line
+//! 3. Nested list
+//!    1. Long long long long long long long long long long long long long long long long line
+//!    2. Another very long long long long long long long long long long long long long long line
+//! 4. Last item
+//!
+//! Using the ')' instead of '.' character after the number:
+//! 1) Long long long long long long long long long long long long long long long long long line
+//! 2) Another very long long long long long long long long long long long long long long long line
+//!
+//! Deep list that mixes various bullet and number formats:
+//! 1. First level with a long long long long long long long long long long long long long long
+//!    long long long line
+//! 2. First level with another very long long long long long long long long long long long long
+//!    long long long line
+//!     * Second level with a long long long long long long long long long long long long long
+//!       long long long line
+//!     * Second level with another very long long long long long long long long long long long
+//!       long long long line
+//!         1) Third level with a long long long long long long long long long long long long long
+//!            long long long line
+//!         2) Third level with another very long long long long long long long long long long
+//!            long long long long line
+//!             - Forth level with a long long long long long long long long long long long long
+//!               long long long long line
+//!             - Forth level with another very long long long long long long long long long long
+//!               long long long long line
+//!         3) One more item at the third level
+//!         4) Last item of the third level
+//!     * Last item of second level
+//! 3. Last item of first level
 
 /// All the parameters ***except for `from_theater`*** should be inserted as sent by the remote
 /// theater, i.e., as passed to [`Theater::send`] on the remote actor:
diff --git a/tests/target/itemized-blocks/wrap.rs b/tests/target/itemized-blocks/wrap.rs