choldgraf · hukkin · Nov 11, 2024 · Nov 11, 2024 · Nov 11, 2024
diff --git a/docs/users/changelog.md b/docs/users/changelog.md
@@ -12,6 +12,8 @@ Note that there is currently no guarantee for a stable Markdown formatting style
   - Incorrect line wrap on lines right after a hard break.
     Thank you, [MDW](https://github.com/mdeweerd), for the issue.
   - Adding an extra leading space to paragraphs that start with space in line wrap modes.
+  - An error on an empty paragraph (Unicode space only) surrounded by non-paragraph elements.
+    Thank you, [Nico Schlömer](https://github.com/nschloe), for the issue.
 - Added
   - Plugin interface: `mdformat.plugins.ParserExtensionInterface.add_cli_argument_group`.
     With this plugins can now read CLI arguments merged with values from `.mdformat.toml`.

diff --git a/src/mdformat/_util.py b/src/mdformat/_util.py
@@ -13,7 +13,10 @@
 
 NULL_CTX = nullcontext()
 EMPTY_MAP: MappingProxyType = MappingProxyType({})
+
 RE_NEWLINES = re.compile(r"\r\n|\r|\n")
+RE_HTML_START_SPACE_PREFIX = re.compile(r" (<[a-zA-Z][-a-zA-Z0-9]*>)")
+RE_HTML_END_SPACE_SUFFIX = re.compile(r"(</[a-zA-Z][-a-zA-Z0-9]*>) ")
 
 
 def build_mdit(
@@ -62,17 +65,11 @@ def is_md_equal(
     for key, text in [("md1", md1), ("md2", md2)]:
         html = mdit.render(text)
 
-        # The HTML can start with whitespace if Markdown starts with raw HTML
-        # preceded by whitespace. This whitespace should be safe to lstrip.
-        # Also, the trailing newline we add at the end of a document that ends
-        # in a raw html block not followed by a newline, seems to propagate to
-        # an HTML rendering. This newline should be safe to rstrip.
-        html = html.strip()
-
         # Remove codeblocks because code formatter plugins do arbitrary changes.
-        for codeclass in codeformatters:
+        if codeformatters:
+            langs_re = "|".join(re.escape(lang) for lang in codeformatters)
             html = re.sub(
-                f'<code class="language-{codeclass}">.*</code>',
+                rf'<code class="language-(?:{langs_re})">.*</code>',
                 "",
                 html,
                 flags=re.DOTALL,
@@ -85,17 +82,19 @@ def is_md_equal(
         html = html.replace("<p> ", "<p>")
         html = html.replace(" </p>", "</p>")
 
-        # Also strip whitespace leading/trailing the <p> elements so that we can
-        # safely remove empty paragraphs below without introducing extra whitespace.
-        html = html.replace(" <p>", "<p>")
-        html = html.replace("</p> ", "</p>")
+        # Also remove a space preceding an opening tag, and a space
+        # following a closing tag, so that we can safely remove empty
+        # paragraphs below without introducing extra whitespace.
+        html = RE_HTML_END_SPACE_SUFFIX.sub(r"\g<1>", html)
+        html = RE_HTML_START_SPACE_PREFIX.sub(r"\g<1>", html)
 
         # empty p elements should be ignored by user agents
         # (https://www.w3.org/TR/REC-html40/struct/text.html#edef-P)
         html = html.replace("<p></p>", "")
 
-        # If it's nothing but whitespace, it's equal
-        html = re.sub(r"^\s+$", "", html)
+        # Leading and trailing whitespace should be safe to ignore. This
+        # also makes any documents that are whitespace-only equal.
+        html = html.strip()
 
         html_texts[key] = html
 

diff --git a/tests/test_api.py b/tests/test_api.py
@@ -54,7 +54,7 @@ def test_fmt_string():
         pytest.param("a\n\n\xa0\n\nb"),  # lone NBSP between two paragraphs
         pytest.param("\xa0\n\n# heading"),  # lone NBSP followed by a heading
         pytest.param(
-            "```\na\n```\n\u2003\n# A\n", marks=pytest.mark.xfail()
+            "```\na\n```\n\u2003\n# A\n"
         ),  # em space surrounded by code and header
     ],
 )

diff --git a/tests/test_util.py b/tests/test_util.py
@@ -0,0 +1,23 @@
+from mdformat._util import is_md_equal
+
+
+def test_is_md_equal():
+    md1 = """
+paragraph
+
+```js
+console.log()
+```
+
+paragr
+"""
+    md2 = """
+paragraph
+
+```js
+bonsole.l()g
+```
+
+paragr"""
+    assert not is_md_equal(md1, md2)
+    assert is_md_equal(md1, md2, codeformatters=("js", "go"))