feat(go): html_to_markdown + extract_iocs

functions/core/html_to_markdown: convierte HTML a Markdown limpio (Go puro; la
única dependencia externa es golang.org/x/net/html). Útil como preparación para
LLMs y para indexar contenido web.

functions/cybersecurity/extract_iocs + types/cybersecurity/ioc: extrae
indicators of compromise (IPs, domains, URLs, hashes, emails, CVEs,
crypto wallets) de texto libre. Devuelve []IOC tipado.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-04 11:51:51 +02:00
parent 3de82c53c1
commit b04bb846c7
7 changed files with 1471 additions and 0 deletions
+281
View File
@@ -0,0 +1,281 @@
package core
import (
"html"
"regexp"
"strings"
"golang.org/x/net/html/atom"
ghtml "golang.org/x/net/html"
)
// skipAtoms is the set of element tags whose entire subtree is discarded.
// walkNode increments walkState.skipDepth while it is inside one of these
// elements, and text nodes are dropped whenever that depth is non-zero.
var skipAtoms = map[atom.Atom]bool{
atom.Script: true,
atom.Style: true,
atom.Noscript: true,
}
// HtmlToMarkdown renders an HTML document or fragment as readable Markdown on
// a best-effort basis. It never returns an error: if the input cannot be
// parsed, the raw input with its tags stripped is returned instead.
//
// Element mapping:
//   - <h1>..<h6>      → ATX headings (# .. ######)
//   - <p>             → paragraph surrounded by blank lines
//   - <a href>        → [text](href)
//   - <strong>, <b>   → **text**
//   - <em>, <i>       → *text*
//   - <code>          → `text`
//   - <pre>           → fenced code block
//   - <ul>/<ol>/<li>  → bullet or numbered list
//   - <br>            → newline
//   - <hr>            → ---
//   - <img alt src>   → ![alt](src)
//   - <blockquote>    → lines prefixed with "> "
//
// <script>, <style> and <noscript> are dropped together with their subtrees,
// and HTML comments are ignored. Any other element (div, span, section, ...)
// contributes only the text of its descendants. Outside <pre>, runs of
// whitespace in text are collapsed to a single space, and text is passed
// through html.UnescapeString. In the result, runs of three or more newlines
// are reduced to two and surrounding whitespace is trimmed.
func HtmlToMarkdown(raw string) string {
	root, parseErr := ghtml.Parse(strings.NewReader(raw))
	if parseErr != nil {
		// Last resort: hand back the input with its markup removed.
		return stripTags(raw)
	}

	out := new(strings.Builder)
	walkNode(out, root, new(walkState))

	return strings.TrimSpace(collapseNewlines(out.String()))
}
// walkState carries mutable context through the recursive walk. A single
// instance is threaded through walkNode, renderElement and walkChildren.
type walkState struct {
// inPre is true while rendering inside a <pre>; text nodes are then written
// verbatim instead of having their whitespace collapsed.
inPre bool
// listStack records the enclosing list elements (atom.Ul or atom.Ol),
// innermost last; <li> consults the top entry to pick its marker.
listStack []atom.Atom
// blockquote is the current <blockquote> nesting depth. renderElement
// increments and decrements it, but nothing in the visible code reads it.
blockquote int
// skipDepth counts the skipped elements (see skipAtoms) that enclose the
// current node; output is produced only while it is zero.
skipDepth int
}
// walkNode renders node n, and recursively its descendants, into b.
//
// Comments produce nothing. Text nodes are written unless a skipped element
// (see skipAtoms) encloses them; outside <pre> their whitespace runs are
// collapsed to one space. Elements are dispatched to renderElement, except
// that skipped elements and everything below them are only traversed with
// skipDepth raised so that no text is emitted. Every other node type
// (document, doctype, ...) simply has its children walked.
func walkNode(b *strings.Builder, n *ghtml.Node, s *walkState) {
	if n.Type == ghtml.CommentNode {
		return
	}

	if n.Type == ghtml.TextNode {
		if s.skipDepth > 0 {
			return
		}
		// NOTE(review): golang.org/x/net/html appears to store text-node data
		// already entity-decoded, so this second decode may turn input such as
		// "&amp;lt;" into "<" — confirm against the parser docs.
		content := html.UnescapeString(n.Data)
		if !s.inPre {
			content = collapseWhitespace(content)
		}
		// Writing an empty string is a no-op, so no emptiness check is needed.
		b.WriteString(content)
		return
	}

	if n.Type != ghtml.ElementNode {
		walkChildren(b, n, s)
		return
	}

	tag := n.DataAtom
	switch {
	case skipAtoms[tag]:
		s.skipDepth++
		walkChildren(b, n, s)
		s.skipDepth--
	case s.skipDepth > 0:
		walkChildren(b, n, s)
	default:
		renderElement(b, n, s, tag)
	}
}
// renderElement writes the Markdown for element n (whose tag atom is a) to b,
// recursing into its children through walkChildren.
//
// Block-level constructs are surrounded by blank lines; the caller is expected
// to collapse the resulting newline runs afterwards. Elements without a
// dedicated case (div, span, section, ...) contribute only their descendants'
// output.
//
// Notable behaviors:
//   - <pre> sets s.inPre for the duration of its subtree and then restores the
//     previous value, so an enclosing <pre> is not switched off prematurely.
//   - <a> without an href (named anchors, placeholder links) emits its text
//     as-is rather than a broken "[text]()" link. An <a> with an href but no
//     text uses the href as link text.
//   - <li> is indented four spaces per enclosing list level beyond the first,
//     so nested lists stay nested in the Markdown output. Ordered items always
//     use the "1. " marker and rely on Markdown auto-numbering.
//   - <img> falls back to the data-src attribute when src is empty.
func renderElement(b *strings.Builder, n *ghtml.Node, s *walkState, a atom.Atom) {
	switch a {
	case atom.H1, atom.H2, atom.H3, atom.H4, atom.H5, atom.H6:
		level := headingLevel(a)
		prefix := strings.Repeat("#", level) + " "
		b.WriteString("\n\n" + prefix)
		walkChildren(b, n, s)
		b.WriteString("\n\n")

	case atom.P:
		b.WriteString("\n\n")
		walkChildren(b, n, s)
		b.WriteString("\n\n")

	case atom.Br:
		b.WriteString("\n")

	case atom.Hr:
		b.WriteString("\n\n---\n\n")

	case atom.Pre:
		// Restore (rather than clear) the flag so nesting cannot disable
		// verbatim mode for the remainder of an outer <pre>.
		wasInPre := s.inPre
		s.inPre = true
		b.WriteString("\n\n```\n")
		walkChildren(b, n, s)
		b.WriteString("\n```\n\n")
		s.inPre = wasInPre

	case atom.Code:
		if s.inPre {
			// The surrounding fence already marks this as code.
			walkChildren(b, n, s)
		} else {
			b.WriteString("`")
			walkChildren(b, n, s)
			b.WriteString("`")
		}

	case atom.Strong, atom.B:
		b.WriteString("**")
		walkChildren(b, n, s)
		b.WriteString("**")

	case atom.Em, atom.I:
		b.WriteString("*")
		walkChildren(b, n, s)
		b.WriteString("*")

	case atom.A:
		href := attrVal(n, "href")
		// Render the children separately so the link text can be inspected.
		var text strings.Builder
		walkChildren(&text, n, s)
		if href == "" {
			// No target: pass the content through untouched instead of
			// emitting an empty-destination link.
			b.WriteString(text.String())
			break
		}
		linkText := strings.TrimSpace(text.String())
		if linkText == "" {
			linkText = href
		}
		b.WriteString("[" + linkText + "](" + href + ")")

	case atom.Img:
		src := attrVal(n, "src")
		if src == "" {
			// Lazy-loading pages commonly keep the real URL in data-src.
			src = attrVal(n, "data-src")
		}
		alt := attrVal(n, "alt")
		b.WriteString("![" + alt + "](" + src + ")")

	case atom.Ul, atom.Ol:
		s.listStack = append(s.listStack, a)
		b.WriteString("\n")
		walkChildren(b, n, s)
		b.WriteString("\n")
		s.listStack = s.listStack[:len(s.listStack)-1]

	case atom.Li:
		depth := len(s.listStack)
		marker := "- "
		if depth > 0 && s.listStack[depth-1] == atom.Ol {
			marker = "1. "
		}
		// Four spaces per extra level nests correctly under both "- " and
		// "1. " parents.
		indent := ""
		if depth > 1 {
			indent = strings.Repeat("    ", depth-1)
		}
		b.WriteString("\n" + indent + marker)
		walkChildren(b, n, s)

	case atom.Blockquote:
		s.blockquote++
		b.WriteString("\n\n")
		// Capture children into a temp buffer, then prefix each line with "> ".
		var inner strings.Builder
		walkChildren(&inner, n, s)
		s.blockquote--
		quoted := prefixLines(strings.TrimSpace(inner.String()), "> ")
		b.WriteString(quoted)
		b.WriteString("\n\n")

	default:
		// div, span, section, article, header, footer, nav, etc.:
		// emit descendant output without any markup of their own.
		walkChildren(b, n, s)
	}
}
// walkChildren renders every direct child of n, in document order, into b by
// calling walkNode on each one.
//
// The writer is taken as a concrete *strings.Builder, which is what every
// caller passes and what walkNode requires. The previous anonymous-interface
// parameter silently produced no output for any other writer type; that
// mistake is now a compile-time error.
func walkChildren(b *strings.Builder, n *ghtml.Node, s *walkState) {
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		walkNode(b, c, s)
	}
}
// headingLevel returns the heading depth (1 through 6) for the atoms h1..h6.
// Any other atom yields 1.
func headingLevel(a atom.Atom) int {
	headings := [...]atom.Atom{atom.H1, atom.H2, atom.H3, atom.H4, atom.H5, atom.H6}
	for i, h := range headings {
		if h == a {
			return i + 1
		}
	}
	return 1
}
// attrVal looks up the first attribute of n whose key equals name exactly and
// returns its value. It returns "" when no such attribute exists.
func attrVal(n *ghtml.Node, name string) string {
	for i := range n.Attr {
		if attr := &n.Attr[i]; attr.Key == name {
			return attr.Val
		}
	}
	return ""
}
// wsRe matches one or more consecutive spaces, tabs, carriage returns or
// newlines.
var wsRe = regexp.MustCompile(`[ \t\r\n]+`)

// collapseWhitespace replaces every run of spaces, tabs, CRs and LFs in s with
// a single space. Leading and trailing runs are collapsed too, not removed.
func collapseWhitespace(s string) string {
	const single = " "
	return wsRe.ReplaceAllLiteralString(s, single)
}
// manyNL matches three or more consecutive newline characters.
var manyNL = regexp.MustCompile(`\n{3,}`)

// collapseNewlines shrinks every run of three or more newlines in s down to
// exactly two, i.e. at most one blank line between blocks.
func collapseNewlines(s string) string {
	const blankLine = "\n\n"
	return manyNL.ReplaceAllLiteralString(s, blankLine)
}
// prefixLines puts prefix in front of each line of s that contains something
// other than whitespace. Lines that are empty or whitespace-only are emitted
// as empty lines with no prefix. Lines are split and rejoined on "\n".
func prefixLines(s, prefix string) string {
	rows := strings.Split(s, "\n")
	for i, row := range rows {
		if strings.TrimSpace(row) == "" {
			rows[i] = ""
			continue
		}
		rows[i] = prefix + row
	}
	return strings.Join(rows, "\n")
}
// tagRe matches anything that looks like an HTML tag: "<", one or more
// characters other than ">", then ">".
var tagRe = regexp.MustCompile(`<[^>]+>`)

// stripTags is a last-resort fallback for input that could not be parsed: it
// removes everything that looks like an HTML tag and then decodes HTML
// entities, so the fallback text is consistent with the entity decoding done
// on the normal rendering path.
//
// Tags are removed before decoding on purpose: an escaped sequence such as
// "&lt;b&gt;" is literal text and must survive as "<b>" instead of being
// stripped as markup.
func stripTags(s string) string {
	return html.UnescapeString(tagRe.ReplaceAllString(s, ""))
}
+92
View File
@@ -0,0 +1,92 @@
---
name: html_to_markdown
kind: function
lang: go
domain: core
version: "1.0.0"
purity: pure
signature: "func HtmlToMarkdown(raw string) string"
description: "Convierte HTML a markdown legible. Parser recursivo del DOM via golang.org/x/net/html. MVP best-effort: soporta headings, parrafos, links, strong/em, code, pre, listas, blockquote, img, br, hr. Skippea script/style/noscript y sus descendientes. Texto plano con whitespace colapsado. Entidades HTML decodificadas."
tags: [html, markdown, converter, parsing, text, core]
params:
- name: raw
desc: "String HTML completo o fragmento a convertir. Puede incluir doctype, head y body, o ser solo un fragmento de markup."
output: "Markdown legible derivado del HTML, mejor esfuerzo. Headings ATX, links en formato [text](href), listas con - o 1., bloques de codigo con backticks. Multiples lineas en blanco colapsadas a una. Nunca retorna error."
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports:
- "golang.org/x/net/html"
- "golang.org/x/net/html/atom"
tested: true
tests:
- "h1 heading"
- "h2 heading"
- "h3 heading"
- "h4 heading"
- "h5 heading"
- "h6 heading"
- "paragraph"
- "link"
- "link with no text falls back to href"
- "strong"
- "b tag"
- "em"
- "i tag"
- "code inline"
- "pre block"
- "pre block preserves content"
- "unordered list"
- "unordered list second item"
- "ordered list"
- "ordered list second item"
- "br becomes newline"
- "hr becomes dashes"
- "img with alt and src"
- "img with empty alt"
- "blockquote"
- "script tag skipped"
- "script content not in output"
- "style tag skipped"
- "noscript skipped"
- "div wrapping does not add markup"
- "html entities decoded"
- "multiple blank lines collapsed"
- "nested strong inside link"
- "html comment skipped"
test_file_path: "functions/core/html_to_markdown_test.go"
file_path: "functions/core/html_to_markdown.go"
---
## Ejemplo
```go
md := HtmlToMarkdown(`<h1>Title</h1><p>Hello <strong>world</strong>.</p><a href="/path">link</a>`)
// md = "# Title\n\nHello **world**.\n\n[link](/path)"
```
## Notas
Funcion pura sin efectos secundarios. No extrae el "articulo principal" como hace el Python
con readabilipy/readability — convierte el HTML completo tal como llega. Para el caso de
uso del enricher fetch_webpage esto es suficiente: el HTML ya fue descargado por el caller.
Elementos soportados (por prioridad):
- h1..h6 → # .. ######
- p → parrafo con linea en blanco antes y despues
- a href → [text](href)
- strong/b → **text**
- em/i → *text*
- code (inline) → `text`
- pre → bloque con fences
- ul/ol/li → listas con - o 1.
- br → newline
- hr → ---
- img alt src → ![alt](src) (tambien lee data-src como fallback)
- blockquote → > text (prefija cada linea)
Skipped (arbol completo ignorado): script, style, noscript.
Comentarios HTML: ignorados.
Todo lo demas (div, span, section, article, etc.): emite texto descendiente sin marcado.
+212
View File
@@ -0,0 +1,212 @@
package core
import (
"strings"
"testing"
)
// TestHtmlToMarkdown is a table-driven test of HtmlToMarkdown.
//
// Each case may set any combination of three checks:
//   - exact:    the whitespace-trimmed output must equal this string;
//   - contains: the output must contain this substring;
//   - absent:   the output must NOT contain this substring.
//
// The absent check lets the "skipped" cases (script, style, noscript,
// comments) verify that hidden content is really dropped, rather than only
// checking that the visible text survived.
func TestHtmlToMarkdown(t *testing.T) {
	cases := []struct {
		name     string
		html     string
		contains string // expected substring in output (not exact match)
		exact    string // if non-empty, check trimmed exact equality
		absent   string // if non-empty, substring that must not appear in output
	}{
		{
			name:  "h1 heading",
			html:  "<h1>Hello World</h1>",
			exact: "# Hello World",
		},
		{
			name:  "h2 heading",
			html:  "<h2>Section</h2>",
			exact: "## Section",
		},
		{
			name:  "h3 heading",
			html:  "<h3>Sub</h3>",
			exact: "### Sub",
		},
		{
			name:  "h4 heading",
			html:  "<h4>Deep</h4>",
			exact: "#### Deep",
		},
		{
			name:  "h5 heading",
			html:  "<h5>Deeper</h5>",
			exact: "##### Deeper",
		},
		{
			name:  "h6 heading",
			html:  "<h6>Deepest</h6>",
			exact: "###### Deepest",
		},
		{
			name:     "paragraph",
			html:     "<p>Hello paragraph.</p>",
			contains: "Hello paragraph.",
		},
		{
			name:  "link",
			html:  `<a href="https://example.com">click here</a>`,
			exact: "[click here](https://example.com)",
		},
		{
			name:     "link with no text falls back to href",
			html:     `<a href="https://example.com"></a>`,
			contains: "https://example.com",
		},
		{
			name:  "strong",
			html:  "<strong>bold text</strong>",
			exact: "**bold text**",
		},
		{
			name:  "b tag",
			html:  "<b>also bold</b>",
			exact: "**also bold**",
		},
		{
			name:  "em",
			html:  "<em>italic text</em>",
			exact: "*italic text*",
		},
		{
			name:  "i tag",
			html:  "<i>also italic</i>",
			exact: "*also italic*",
		},
		{
			name:  "code inline",
			html:  "<code>fmt.Println()</code>",
			exact: "`fmt.Println()`",
		},
		{
			name:     "pre block",
			html:     "<pre>func main() {\n println()\n}</pre>",
			contains: "```",
		},
		{
			name:     "pre block preserves content",
			html:     "<pre>func main() {\n println()\n}</pre>",
			contains: "func main()",
		},
		{
			name:     "unordered list",
			html:     "<ul><li>Apple</li><li>Banana</li></ul>",
			contains: "- Apple",
		},
		{
			name:     "unordered list second item",
			html:     "<ul><li>Apple</li><li>Banana</li></ul>",
			contains: "- Banana",
		},
		{
			name:     "ordered list",
			html:     "<ol><li>First</li><li>Second</li></ol>",
			contains: "1. First",
		},
		{
			name:     "ordered list second item",
			html:     "<ol><li>First</li><li>Second</li></ol>",
			contains: "1. Second",
		},
		{
			name:  "br becomes newline",
			html:  "line one<br>line two",
			exact: "line one\nline two",
		},
		{
			name:     "hr becomes dashes",
			html:     "<hr>",
			contains: "---",
		},
		{
			name:  "img with alt and src",
			html:  `<img alt="logo" src="https://example.com/logo.png">`,
			exact: "![logo](https://example.com/logo.png)",
		},
		{
			name:  "img with empty alt",
			html:  `<img alt="" src="photo.jpg">`,
			exact: "![](photo.jpg)",
		},
		{
			name:     "blockquote",
			html:     "<blockquote>A wise saying.</blockquote>",
			contains: "> A wise saying.",
		},
		{
			name:     "script tag skipped",
			html:     "<p>visible</p><script>alert('x')</script>",
			contains: "visible",
		},
		{
			name:   "script content not in output",
			html:   "<p>visible</p><script>alert('x')</script>",
			absent: "alert",
		},
		{
			name:     "style tag skipped",
			html:     "<style>body{color:red}</style><p>text</p>",
			contains: "text",
			absent:   "color",
		},
		{
			name:     "noscript skipped",
			html:     "<noscript>enable js</noscript><p>main</p>",
			contains: "main",
			absent:   "enable js",
		},
		{
			name:     "div wrapping does not add markup",
			html:     "<div><p>content</p></div>",
			contains: "content",
		},
		{
			name:     "html entities decoded",
			html:     "<p>5 &gt; 3 &amp; 1 &lt; 2</p>",
			contains: "5 > 3 & 1 < 2",
		},
		{
			name:  "multiple blank lines collapsed",
			html:  "<p>a</p><p>b</p>",
			exact: "a\n\nb",
		},
		{
			name:     "nested strong inside link",
			html:     `<a href="/path"><strong>bold link</strong></a>`,
			contains: "[**bold link**](/path)",
		},
		{
			name:     "html comment skipped",
			html:     "<!-- hidden -->visible",
			contains: "visible",
			absent:   "hidden",
		},
	}

	for _, tc := range cases {
		tc := tc // capture per-iteration copy for toolchains before Go 1.22
		t.Run(tc.name, func(t *testing.T) {
			got := HtmlToMarkdown(tc.html)

			if tc.exact != "" {
				if trimmed := strings.TrimSpace(got); trimmed != tc.exact {
					t.Errorf("exact mismatch\n got: %q\n want: %q", trimmed, tc.exact)
				}
			}
			if tc.contains != "" && !strings.Contains(got, tc.contains) {
				t.Errorf("missing expected substring\n got: %q\n expected: %q", got, tc.contains)
			}
			if tc.absent != "" && strings.Contains(got, tc.absent) {
				t.Errorf("unexpected substring in output\n got: %q\n must not contain: %q", got, tc.absent)
			}
		})
	}
}