@@ -31,42 +31,79 @@ function ConvertTo-ZtMarkdown {
3131
3232 $md = $Html
3333
34- # Convert anchor tags to Markdown links
34+ # Convert <a href="url">text</a> → [text](url)
35+ # <a\s+[^>]*href="([^"]*)"[^>]*> — opening <a> tag with any attributes; captures href value in group 1
36+ # ([^<]*) — captures the visible link text (no nested tags) in group 2
37+ # </a> — closing tag
3538 $md = [regex ]::Replace($md , ' <a\s+[^>]*href="([^"]*)"[^>]*>([^<]*)</a>' , ' [$2]($1)' )
3639
37- # Convert ordered list items with sequential numbering
38- $liCounter = [ref ]0
39- $md = [regex ]::Replace($md , ' <li[^>]*>' , {
40- $liCounter.Value ++
41- " `n $ ( $liCounter.Value ) . "
40+ # Process <ol> and <ul> blocks: numbered items for <ol>, '-' bullets for <ul>.
41+ # The counter resets to 0 for each list so separate <ol> blocks both start at 1.
42+ # (?si) — s: dot matches newlines (multi-line list content); i: case-insensitive tags
43+ # <(ol|ul)[^>]*> — opening list tag; captures tag name (ol/ul) in group 1
44+ # (.*?) — lazily captures everything inside the list in group 2
45+ # </\1> — closing tag matching the same tag name captured in group 1 (ol or ul)
46+ $md = [regex ]::Replace($md , ' (?si)<(ol|ul)[^>]*>(.*?)</\1>' , {
47+ param ($m )
48+ $isOrdered = $m.Groups [1 ].Value -ieq ' ol'
49+ $inner = $m.Groups [2 ].Value
50+ $idx = 0
51+ $liOut = [System.Text.StringBuilder ]::new()
52+ $last = 0
53+ # (?si)<li[^>]*>(.*?)</li> — matches each list item; group 1 is the item content
54+ foreach ($li in ([regex ]' (?si)<li[^>]*>(.*?)</li>' ).Matches($inner )) {
55+ $null = $liOut.Append ($inner.Substring ($last , $li.Index - $last ))
56+ $content = $li.Groups [1 ].Value.Trim()
57+ if ($isOrdered ) { $idx ++ ; $null = $liOut.Append (" `n $idx . $content " ) }
58+ else { $null = $liOut.Append (" `n - $content " ) }
59+ $last = $li.Index + $li.Length
60+ }
61+ if ($last -lt $inner.Length ) { $null = $liOut.Append ($inner.Substring ($last )) }
62+ # Remove any <li> or </li> tags not matched above (e.g. malformed HTML without closing </li>)
63+ $processedInner = [regex ]::Replace($liOut.ToString (), ' </?li[^>]*>' , ' ' )
64+ return $processedInner + " `n "
4265 })
4366
44- $md = $md -replace ' </li>' , ' '
45- $md = $md -replace ' <[ou]l[^>]*>' , ' '
46- $md = $md -replace ' </[ou]l>' , ' '
67+ # <br\s*/?> — self-closing or open <br>, with optional whitespace before the slash
4768 $md = $md -replace ' <br\s*/?>' , " `n "
69+
70+ # </?p[^>]*> — opening or closing <p> with any attributes
71+ # </?div[^>]*> — opening or closing <div> with any attributes
4872 $md = $md -replace ' </?p[^>]*>' , " `n "
4973 $md = $md -replace ' </?div[^>]*>' , " `n "
74+
75+ # <(?:b|strong)[^>]*>([^<]*)</(?:b|strong)> — bold tags wrapping plain text; group 1 = content
76+ # (?:...) is a non-capturing group so $1 refers to the text content, not the tag name
5077 $md = $md -replace ' <(?:b|strong)[^>]*>([^<]*)</(?:b|strong)>' , ' **$1**'
78+
79+ # <(?:i|em)[^>]*>([^<]*)</(?:i|em)> — italic tags wrapping plain text; group 1 = content
5180 $md = $md -replace ' <(?:i|em)[^>]*>([^<]*)</(?:i|em)>' , ' *$1*'
5281
53- # Strip any remaining HTML tags
82+ # <[^>]+> — any remaining HTML tag: one or more non-'>' characters between angle brackets
5483 $md = $md -replace ' <[^>]+>' , ' '
5584
5685 # Decode HTML entities
57- $md = $md -replace ' &' , ' &'
58- $md = $md -replace ' <' , ' <'
59- $md = $md -replace ' >' , ' >'
60- $md = $md -replace ' "' , ' "'
61- $md = $md -replace ' '' , " '"
62- $md = $md -replace ' ' , ' '
63-
64- # Convert bare URLs (not already wrapped in a Markdown link) → [url](url)
86+ $md = $md -replace ' &' , ' &'
87+ $md = $md -replace ' <' , ' <'
88+ $md = $md -replace ' >' , ' >'
89+ $md = $md -replace ' "' , ' "'
90+ $md = $md -replace ' '' , " '"
91+ $md = $md -replace ' ' , ' '
92+
93+ # Convert bare URLs that are not already inside a Markdown link → [url](url)
94+ # (?<!\() — negative lookbehind: not preceded by '(' (already a Markdown link)
95+ # (https?://[^\s<>"\[\]()]+?) — captures the URL lazily (stops before whitespace or special chars)
96+ # ([.,;]?) — optionally captures a trailing punctuation character
97+ # (?=\s|$) — lookahead: must be followed by whitespace or end of string
6598 $md = [regex ]::Replace($md , ' (?<!\()(https?://[^\s<>"\[\]()]+?)([.,;]?)(?=\s|$)' , ' [${1}](${1})${2}' )
6699
67100 # Break inline numbered steps onto separate lines: "text 2. Word" → "text\n2. Word"
101+ # (?<=\S) — positive lookbehind: preceded by a non-whitespace character (mid-sentence)
102+ # (\d{1,2})\. — captures 1–2 digit number followed by a literal dot
103+ # ([A-Z]) — captures the uppercase letter that starts the next step in group 2; this is a normal consuming capture (not a lookahead), so the letter is re-inserted via $2
68104 $md = [regex ]::Replace($md , ' (?<=\S) (\d{1,2})\. ([A-Z])' , " `n " + ' $1. $2' )
69105
106+ # Trim each line and remove blank lines
70107 $md = ($md -split " `n " | ForEach-Object { $_.Trim () } | Where-Object { $_ -ne ' ' }) -join " `n "
71108
72109 return $md
0 commit comments