feat(go): html_to_markdown + extract_iocs

functions/core/html_to_markdown: convierte HTML a Markdown limpio (Go puro; solo depende de golang.org/x/net/html). Útil como preparación para LLMs y para indexar contenido web. functions/cybersecurity/extract_iocs + types/cybersecurity/ioc: extrae indicators of compromise (IPs, domains, URLs, hashes, emails, CVEs, crypto wallets) de texto libre. Devuelve []IOC tipado. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-04 11:51:51 +02:00
parent 3de82c53c1
commit b04bb846c7
7 changed files with 1471 additions and 0 deletions
@@ -0,0 +1,281 @@
+package core
+
+import (
+	"html"
+	"regexp"
+	"strings"
+
+	"golang.org/x/net/html/atom"
+
+	ghtml "golang.org/x/net/html"
+)
+
+// skipAtoms lists the elements whose whole subtree is left out of the output.
+var skipAtoms = map[atom.Atom]bool{
+	atom.Noscript: true,
+	atom.Script:   true,
+	atom.Style:    true,
+}
+
+// HtmlToMarkdown converts an HTML string to readable markdown (best-effort).
+//
+// Supported elements (in priority order):
+//   - <h1>..<h6>  → ATX headings (#..######)
+//   - <p>         → paragraph separated by blank line
+//   - <a href>    → [text](href)
+//   - <strong>,<b> → **text**
+//   - <em>,<i>    → *text*
+//   - <code>      → `text`
+//   - <pre>       → fenced code block
+//   - <ul>/<ol>/<li> → bullet or numbered list
+//   - <br>        → newline
+//   - <hr>        → ---
+//   - <img alt src> → ![alt](src)
+//   - <blockquote> → > text
+//
+// Skipped: <script>, <style>, <noscript> (and their subtrees).
+// Everything else (div, span, section, etc.) emits descendant text without markup.
+// Multiple whitespace in text nodes is collapsed to a single space.
+// HTML entities are decoded via html.UnescapeString.
+func HtmlToMarkdown(raw string) string {
+	doc, err := ghtml.Parse(strings.NewReader(raw))
+	if err != nil {
+		// If parsing fails, return the raw string stripped of tags as a fallback.
+		return stripTags(raw)
+	}
+
+	var b strings.Builder
+	walkNode(&b, doc, &walkState{})
+
+	// Collapse 3+ consecutive newlines → 2.
+	result := collapseNewlines(b.String())
+	return strings.TrimSpace(result)
+}
+
+// walkState is the mutable context threaded through the recursive DOM walk.
+type walkState struct {
+	skipDepth  int         // >0 while inside a skipped subtree (see skipAtoms)
+	inPre      bool        // true while rendering the children of a <pre>
+	blockquote int         // current <blockquote> nesting depth
+	listStack  []atom.Atom // enclosing lists, innermost last (atom.Ul or atom.Ol)
+}
+
+// walkNode appends the Markdown rendering of n (and its subtree) to b.
+//
+// Comments produce nothing. Text nodes are written unless a skipped subtree
+// is active; outside <pre> their whitespace runs are collapsed first.
+// Elements are either skipped (skipAtoms), traversed silently while inside a
+// skipped subtree, or handed to renderElement. Every other node type
+// (document, doctype, ...) just has its children walked.
+func walkNode(b *strings.Builder, n *ghtml.Node, s *walkState) {
+	if n.Type == ghtml.CommentNode {
+		return
+	}
+
+	if n.Type == ghtml.TextNode {
+		if s.skipDepth > 0 {
+			return
+		}
+		// NOTE(review): the x/net/html parser presumably hands back text nodes
+		// that are already entity-decoded; if so this call decodes a second
+		// time (e.g. source "&amp;lt;" would come out as "<") — confirm.
+		content := html.UnescapeString(n.Data)
+		if !s.inPre {
+			content = collapseWhitespace(content)
+		}
+		b.WriteString(content)
+		return
+	}
+
+	if n.Type != ghtml.ElementNode {
+		walkChildren(b, n, s)
+		return
+	}
+
+	tag := n.DataAtom
+	switch {
+	case skipAtoms[tag]:
+		// Keep descending so nested elements are visited, but with output muted.
+		s.skipDepth++
+		walkChildren(b, n, s)
+		s.skipDepth--
+	case s.skipDepth > 0:
+		walkChildren(b, n, s)
+	default:
+		renderElement(b, n, s, tag)
+	}
+}
+
+// renderElement writes the Markdown for element n (whose tag atom is a) to b,
+// recursing into its children through walkChildren.
+//
+// Block elements (headings, <p>, <pre>, <hr>, <blockquote>) surround
+// themselves with blank lines; the caller is expected to squeeze the surplus
+// newlines afterwards. Elements without a dedicated case only contribute the
+// text of their descendants.
+func renderElement(b *strings.Builder, n *ghtml.Node, s *walkState, a atom.Atom) {
+	switch a {
+	case atom.H1, atom.H2, atom.H3, atom.H4, atom.H5, atom.H6:
+		b.WriteString("\n\n" + strings.Repeat("#", headingLevel(a)) + " ")
+		walkChildren(b, n, s)
+		b.WriteString("\n\n")
+
+	case atom.P:
+		b.WriteString("\n\n")
+		walkChildren(b, n, s)
+		b.WriteString("\n\n")
+
+	case atom.Br:
+		b.WriteString("\n")
+
+	case atom.Hr:
+		b.WriteString("\n\n---\n\n")
+
+	case atom.Pre:
+		// Restore the previous flag instead of forcing false, so a <pre> nested
+		// in another <pre> does not re-enable whitespace collapsing for the
+		// remainder of the outer block.
+		wasPre := s.inPre
+		s.inPre = true
+		b.WriteString("\n\n```\n")
+		walkChildren(b, n, s)
+		b.WriteString("\n```\n\n")
+		s.inPre = wasPre
+
+	case atom.Code:
+		if s.inPre {
+			// Already inside a fence: backticks would end up in the code itself.
+			walkChildren(b, n, s)
+		} else {
+			b.WriteString("`")
+			walkChildren(b, n, s)
+			b.WriteString("`")
+		}
+
+	case atom.Strong, atom.B:
+		b.WriteString("**")
+		walkChildren(b, n, s)
+		b.WriteString("**")
+
+	case atom.Em, atom.I:
+		b.WriteString("*")
+		walkChildren(b, n, s)
+		b.WriteString("*")
+
+	case atom.A:
+		href := attrVal(n, "href")
+		var text strings.Builder
+		walkChildren(&text, n, s)
+		// Markdown link text cannot span blank lines, so fold whatever block
+		// content the anchor wrapped onto a single line.
+		linkText := strings.Join(strings.Fields(text.String()), " ")
+		if href == "" {
+			// Anchor without a destination (e.g. <a name="top">): keep the
+			// text but do not emit a dangling "[text]()".
+			b.WriteString(linkText)
+		} else {
+			if linkText == "" {
+				linkText = href
+			}
+			b.WriteString("[" + linkText + "](" + href + ")")
+		}
+
+	case atom.Img:
+		src := attrVal(n, "src")
+		if src == "" {
+			// Fall back to data-src when the src attribute is missing or empty.
+			src = attrVal(n, "data-src")
+		}
+		alt := attrVal(n, "alt")
+		b.WriteString("![" + alt + "](" + src + ")")
+
+	case atom.Ul, atom.Ol:
+		s.listStack = append(s.listStack, a)
+		b.WriteString("\n")
+		walkChildren(b, n, s)
+		b.WriteString("\n")
+		s.listStack = s.listStack[:len(s.listStack)-1]
+
+	case atom.Li:
+		// Every ordered item is numbered "1."; Markdown renderers renumber.
+		marker := "- "
+		if depth := len(s.listStack); depth > 0 && s.listStack[depth-1] == atom.Ol {
+			marker = "1. "
+		}
+		// Indent items of nested lists so the nesting survives in Markdown.
+		b.WriteString("\n" + listIndent(s.listStack) + marker)
+		walkChildren(b, n, s)
+
+	case atom.Blockquote:
+		s.blockquote++
+		b.WriteString("\n\n")
+		// Render the children into a scratch buffer so that every resulting
+		// line can be prefixed with "> ".
+		var inner strings.Builder
+		walkChildren(&inner, n, s)
+		s.blockquote--
+		b.WriteString(prefixLines(strings.TrimSpace(inner.String()), "> "))
+		b.WriteString("\n\n")
+
+	default:
+		// div, span, section, article, header, footer, nav, etc.:
+		// emit descendant text without any markup.
+		walkChildren(b, n, s)
+	}
+}
+
+// listIndent returns the leading spaces for an <li> whose enclosing lists are
+// stack (innermost last): the summed marker widths of every ancestor list, so
+// the item lines up under the content of its parent item. Items of a
+// top-level list (or an <li> outside any list) get no indentation.
+func listIndent(stack []atom.Atom) string {
+	width := 0
+	for i := 0; i < len(stack)-1; i++ {
+		if stack[i] == atom.Ol {
+			width += len("1. ")
+		} else {
+			width += len("- ")
+		}
+	}
+	return strings.Repeat(" ", width)
+}
+
+// walkChildren renders every child of n, in document order, into b.
+//
+// walkNode requires a concrete *strings.Builder. When b is one, children are
+// written to it directly. For any other writer the children are rendered
+// into a scratch builder and forwarded in a single WriteString call, instead
+// of being silently dropped. A nil b is a no-op.
+func walkChildren(b interface{ WriteString(string) (int, error) }, n *ghtml.Node, s *walkState) {
+	sb, direct := b.(*strings.Builder)
+	if !direct {
+		if b == nil {
+			return
+		}
+		sb = new(strings.Builder)
+	}
+	for c := n.FirstChild; c != nil; c = c.NextSibling {
+		walkNode(sb, c, s)
+	}
+	if !direct {
+		// This function has no error return; a failing writer loses output
+		// exactly as it would have before, so the error is deliberately dropped.
+		_, _ = b.WriteString(sb.String())
+	}
+}
+
+// headingLevel returns the heading depth (1..6) for atom.H1..atom.H6.
+// Any other atom yields 1.
+func headingLevel(a atom.Atom) int {
+	headings := [...]atom.Atom{atom.H1, atom.H2, atom.H3, atom.H4, atom.H5, atom.H6}
+	for i, h := range headings {
+		if a == h {
+			return i + 1
+		}
+	}
+	return 1
+}
+
+// attrVal looks up the first attribute of n whose key equals name and
+// returns its value; it returns "" when n has no such attribute.
+func attrVal(n *ghtml.Node, name string) string {
+	for i := range n.Attr {
+		if attr := &n.Attr[i]; attr.Key == name {
+			return attr.Val
+		}
+	}
+	return ""
+}
+
+// wsRe matches a run of HTML ASCII whitespace: space, tab, form feed, CR, LF.
+var wsRe = regexp.MustCompile(`[ \t\f\r\n]+`)
+
+// collapseWhitespace replaces every run of HTML ASCII whitespace in s with a
+// single space. Leading and trailing runs are collapsed too, not removed.
+func collapseWhitespace(s string) string {
+	return wsRe.ReplaceAllLiteralString(s, " ")
+}
+
+// manyNL matches three or more consecutive newline characters.
+var manyNL = regexp.MustCompile(`\n{3,}`)
+
+// collapseNewlines squeezes every run of three or more newlines in s down to
+// exactly two, i.e. at most one blank line between blocks.
+func collapseNewlines(s string) string {
+	const blankLine = "\n\n"
+	return manyNL.ReplaceAllLiteralString(s, blankLine)
+}
+
+// prefixLines puts prefix in front of each line of s that contains something
+// other than whitespace. Whitespace-only lines are emitted as empty lines.
+func prefixLines(s, prefix string) string {
+	var sb strings.Builder
+	for i, line := range strings.Split(s, "\n") {
+		if i > 0 {
+			sb.WriteByte('\n')
+		}
+		if strings.TrimSpace(line) != "" {
+			sb.WriteString(prefix)
+			sb.WriteString(line)
+		}
+	}
+	return sb.String()
+}
+
+// tagRe matches "<", one or more characters other than ">", then ">".
+var tagRe = regexp.MustCompile(`<[^>]+>`)
+
+// stripTags deletes everything in s that looks like an HTML tag. It is the
+// last-resort fallback used when the input cannot be parsed.
+func stripTags(s string) string {
+	const nothing = ""
+	return tagRe.ReplaceAllLiteralString(s, nothing)
+}
@@ -0,0 +1,92 @@
+---
+name: html_to_markdown
+kind: function
+lang: go
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "func HtmlToMarkdown(html string) string"
+description: "Convierte HTML a markdown legible. Parser recursivo del DOM via golang.org/x/net/html. MVP best-effort: soporta headings, parrafos, links, strong/em, code, pre, listas, blockquote, img, br, hr. Omite script/style/noscript y sus descendientes. Texto plano con whitespace colapsado. Entidades HTML decodificadas."
+tags: [html, markdown, converter, parsing, text, core]
+params:
+  - name: html
+    desc: "String HTML completo o fragmento a convertir. Puede incluir doctype, head y body, o ser solo un fragmento de markup."
+output: "Markdown legible derivado del HTML, mejor esfuerzo. Headings ATX, links en formato [text](href), listas con - o 1., bloques de codigo con backticks. Multiples lineas en blanco colapsadas a una. Nunca retorna error."
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports:
+  - "golang.org/x/net/html"
+  - "golang.org/x/net/html/atom"
+tested: true
+tests:
+  - "h1 heading"
+  - "h2 heading"
+  - "h3 heading"
+  - "h4 heading"
+  - "h5 heading"
+  - "h6 heading"
+  - "paragraph"
+  - "link"
+  - "link with no text falls back to href"
+  - "strong"
+  - "b tag"
+  - "em"
+  - "i tag"
+  - "code inline"
+  - "pre block"
+  - "pre block preserves content"
+  - "unordered list"
+  - "unordered list second item"
+  - "ordered list"
+  - "ordered list second item"
+  - "br becomes newline"
+  - "hr becomes dashes"
+  - "img with alt and src"
+  - "img with empty alt"
+  - "blockquote"
+  - "script tag skipped"
+  - "script content not in output"
+  - "style tag skipped"
+  - "noscript skipped"
+  - "div wrapping does not add markup"
+  - "html entities decoded"
+  - "multiple blank lines collapsed"
+  - "nested strong inside link"
+  - "html comment skipped"
+test_file_path: "functions/core/html_to_markdown_test.go"
+file_path: "functions/core/html_to_markdown.go"
+---
+
+## Ejemplo
+
+```go
+md := HtmlToMarkdown(`<h1>Title</h1><p>Hello <strong>world</strong>.</p><a href="/path">link</a>`)
+// md = "# Title\n\nHello **world**.\n\n[link](/path)"
+```
+
+## Notas
+
+Funcion pura sin efectos secundarios. No extrae el "articulo principal" como hace el Python
+con readabilipy/readability — convierte el HTML completo tal como llega. Para el caso de
+uso del enricher fetch_webpage esto es suficiente: el HTML ya fue descargado por el caller.
+
+Elementos soportados (por prioridad):
+- h1..h6 → # .. ######
+- p → parrafo con linea en blanco antes y despues
+- a href → [text](href)
+- strong/b → **text**
+- em/i → *text*
+- code (inline) → `text`
+- pre → bloque con fences
+- ul/ol/li → listas con - o 1.
+- br → newline
+- hr → ---
+- img alt src → ![alt](src) (tambien lee data-src como fallback)
+- blockquote → > text (prefija cada linea)
+
+Skipped (arbol completo ignorado): script, style, noscript.
+Comentarios HTML: ignorados.
+Todo lo demas (div, span, section, article, etc.): emite texto descendiente sin marcado.
@@ -0,0 +1,212 @@
+package core
+
+import (
+	"strings"
+	"testing"
+)
+
+// TestHtmlToMarkdown is a table-driven test of HtmlToMarkdown. Each case can
+// combine three independent checks; an empty field disables that check.
+func TestHtmlToMarkdown(t *testing.T) {
+	cases := []struct {
+		name     string
+		html     string
+		contains string // substring that must appear in the output
+		exact    string // expected output after trimming surrounding whitespace
+		absent   string // substring that must NOT appear in the output
+	}{
+		{
+			name:  "h1 heading",
+			html:  "<h1>Hello World</h1>",
+			exact: "# Hello World",
+		},
+		{
+			name:  "h2 heading",
+			html:  "<h2>Section</h2>",
+			exact: "## Section",
+		},
+		{
+			name:  "h3 heading",
+			html:  "<h3>Sub</h3>",
+			exact: "### Sub",
+		},
+		{
+			name:  "h4 heading",
+			html:  "<h4>Deep</h4>",
+			exact: "#### Deep",
+		},
+		{
+			name:  "h5 heading",
+			html:  "<h5>Deeper</h5>",
+			exact: "##### Deeper",
+		},
+		{
+			name:  "h6 heading",
+			html:  "<h6>Deepest</h6>",
+			exact: "###### Deepest",
+		},
+		{
+			name:     "paragraph",
+			html:     "<p>Hello paragraph.</p>",
+			contains: "Hello paragraph.",
+		},
+		{
+			name:  "link",
+			html:  `<a href="https://example.com">click here</a>`,
+			exact: "[click here](https://example.com)",
+		},
+		{
+			name:     "link with no text falls back to href",
+			html:     `<a href="https://example.com"></a>`,
+			contains: "https://example.com",
+		},
+		{
+			name:  "strong",
+			html:  "<strong>bold text</strong>",
+			exact: "**bold text**",
+		},
+		{
+			name:  "b tag",
+			html:  "<b>also bold</b>",
+			exact: "**also bold**",
+		},
+		{
+			name:  "em",
+			html:  "<em>italic text</em>",
+			exact: "*italic text*",
+		},
+		{
+			name:  "i tag",
+			html:  "<i>also italic</i>",
+			exact: "*also italic*",
+		},
+		{
+			name:  "code inline",
+			html:  "<code>fmt.Println()</code>",
+			exact: "`fmt.Println()`",
+		},
+		{
+			name:     "pre block",
+			html:     "<pre>func main() {\n  println()\n}</pre>",
+			contains: "```",
+		},
+		{
+			name:     "pre block preserves content",
+			html:     "<pre>func main() {\n  println()\n}</pre>",
+			contains: "func main()",
+		},
+		{
+			name:     "unordered list",
+			html:     "<ul><li>Apple</li><li>Banana</li></ul>",
+			contains: "- Apple",
+		},
+		{
+			name:     "unordered list second item",
+			html:     "<ul><li>Apple</li><li>Banana</li></ul>",
+			contains: "- Banana",
+		},
+		{
+			name:     "ordered list",
+			html:     "<ol><li>First</li><li>Second</li></ol>",
+			contains: "1. First",
+		},
+		{
+			name:     "ordered list second item",
+			html:     "<ol><li>First</li><li>Second</li></ol>",
+			contains: "1. Second",
+		},
+		{
+			// Exact match: a bare "contains newline" check would also pass if
+			// the newline came from anywhere else.
+			name:  "br becomes newline",
+			html:  "line one<br>line two",
+			exact: "line one\nline two",
+		},
+		{
+			name:     "hr becomes dashes",
+			html:     "<hr>",
+			contains: "---",
+		},
+		{
+			name:  "img with alt and src",
+			html:  `<img alt="logo" src="https://example.com/logo.png">`,
+			exact: "![logo](https://example.com/logo.png)",
+		},
+		{
+			name:  "img with empty alt",
+			html:  `<img alt="" src="photo.jpg">`,
+			exact: "![](photo.jpg)",
+		},
+		{
+			name:     "blockquote",
+			html:     "<blockquote>A wise saying.</blockquote>",
+			contains: "> A wise saying.",
+		},
+		{
+			name:     "script tag skipped",
+			html:     "<p>visible</p><script>alert('x')</script>",
+			contains: "visible",
+		},
+		{
+			name:   "script content not in output",
+			html:   "<p>visible</p><script>alert('x')</script>",
+			absent: "alert",
+		},
+		{
+			name:     "style tag skipped",
+			html:     "<style>body{color:red}</style><p>text</p>",
+			contains: "text",
+			absent:   "color",
+		},
+		{
+			name:     "noscript skipped",
+			html:     "<noscript>enable js</noscript><p>main</p>",
+			contains: "main",
+			absent:   "enable js",
+		},
+		{
+			name:     "div wrapping does not add markup",
+			html:     "<div><p>content</p></div>",
+			contains: "content",
+		},
+		{
+			name:     "html entities decoded",
+			html:     "<p>5 &gt; 3 &amp; 1 &lt; 2</p>",
+			contains: "5 > 3 & 1 < 2",
+		},
+		{
+			// Adjacent paragraphs emit four newlines between them; exactly one
+			// blank line must remain.
+			name:  "multiple blank lines collapsed",
+			html:  "<p>a</p><p>b</p>",
+			exact: "a\n\nb",
+		},
+		{
+			name:     "nested strong inside link",
+			html:     `<a href="/path"><strong>bold link</strong></a>`,
+			contains: "[**bold link**](/path)",
+		},
+		{
+			name:     "html comment skipped",
+			html:     "<!-- hidden -->visible",
+			contains: "visible",
+			absent:   "hidden",
+		},
+	}
+
+	for _, tc := range cases {
+		tc := tc // per-iteration copy for toolchains older than Go 1.22
+		t.Run(tc.name, func(t *testing.T) {
+			got := HtmlToMarkdown(tc.html)
+
+			if tc.exact != "" {
+				if trimmed := strings.TrimSpace(got); trimmed != tc.exact {
+					t.Errorf("exact mismatch\n  got:  %q\n  want: %q", trimmed, tc.exact)
+				}
+			}
+			if tc.contains != "" && !strings.Contains(got, tc.contains) {
+				t.Errorf("missing expected substring\n  got:      %q\n  expected: %q", got, tc.contains)
+			}
+			if tc.absent != "" && strings.Contains(got, tc.absent) {
+				t.Errorf("unexpected substring in output\n  got:       %q\n  forbidden: %q", got, tc.absent)
+			}
+		})
+	}
+}