diff --git a/functions/core/html_to_markdown.go b/functions/core/html_to_markdown.go new file mode 100644 index 00000000..48c2e074 --- /dev/null +++ b/functions/core/html_to_markdown.go @@ -0,0 +1,281 @@ +package core + +import ( + "html" + "regexp" + "strings" + + "golang.org/x/net/html/atom" + + ghtml "golang.org/x/net/html" +) + +// skipAtoms lists the element atoms (script, style, noscript) whose entire subtree is discarded. +var skipAtoms = map[atom.Atom]bool{ + atom.Script: true, + atom.Style: true, + atom.Noscript: true, +} + +// HtmlToMarkdown converts an HTML string to readable markdown (best-effort). +// +// Supported elements (in priority order): +// -

<h1>..<h6> → ATX headings (#..######)
+//   - <p> → paragraph separated by blank line
+//   - <a href> → [text](href)
+//   - <strong>, <b> → **text**
+//   - <em>, <i> → *text*
+//   - <code> → `text`
+//   - <pre><code> → fenced code block
+//   -