feat(go): html_to_markdown + extract_iocs
functions/core/html_to_markdown: convierte HTML a Markdown limpio (golang-only sin dependencias externas). util como prep para LLMs y para indexar contenido web. functions/cybersecurity/extract_iocs + types/cybersecurity/ioc: extrae indicators of compromise (IPs, domains, URLs, hashes, emails, CVEs, crypto wallets) de texto libre. Devuelve []IOC tipado. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,281 @@
|
||||
package core
|
||||
|
||||
import (
|
||||
"html"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/net/html/atom"
|
||||
|
||||
ghtml "golang.org/x/net/html"
|
||||
)
|
||||
|
||||
// skipAtoms are tags whose entire subtree is discarded.
|
||||
var skipAtoms = map[atom.Atom]bool{
|
||||
atom.Script: true,
|
||||
atom.Style: true,
|
||||
atom.Noscript: true,
|
||||
}
|
||||
|
||||
// HtmlToMarkdown converts an HTML string to readable markdown (best-effort).
|
||||
//
|
||||
// Supported elements (in priority order):
|
||||
// - <h1>..<h6> → ATX headings (#..######)
|
||||
// - <p> → paragraph separated by blank line
|
||||
// - <a href> → [text](href)
|
||||
// - <strong>,<b> → **text**
|
||||
// - <em>,<i> → *text*
|
||||
// - <code> → `text`
|
||||
// - <pre> → fenced code block
|
||||
// - <ul>/<ol>/<li> → bullet or numbered list
|
||||
// - <br> → newline
|
||||
// - <hr> → ---
|
||||
// - <img alt src> → 
|
||||
// - <blockquote> → > text
|
||||
//
|
||||
// Skipped: <script>, <style>, <noscript> (and their subtrees).
|
||||
// Everything else (div, span, section, etc.) emits descendant text without markup.
|
||||
// Multiple whitespace in text nodes is collapsed to a single space.
|
||||
// HTML entities are decoded via html.UnescapeString.
|
||||
func HtmlToMarkdown(raw string) string {
|
||||
doc, err := ghtml.Parse(strings.NewReader(raw))
|
||||
if err != nil {
|
||||
// If parsing fails, return the raw string stripped of tags as a fallback.
|
||||
return stripTags(raw)
|
||||
}
|
||||
|
||||
var b strings.Builder
|
||||
walkNode(&b, doc, &walkState{})
|
||||
|
||||
// Collapse 3+ consecutive newlines → 2.
|
||||
result := collapseNewlines(b.String())
|
||||
return strings.TrimSpace(result)
|
||||
}
|
||||
|
||||
// walkState carries context through the recursive walk.
|
||||
type walkState struct {
|
||||
inPre bool
|
||||
listStack []atom.Atom // ul or ol
|
||||
blockquote int // nesting depth
|
||||
skipDepth int // subtree skip depth
|
||||
}
|
||||
|
||||
func walkNode(b *strings.Builder, n *ghtml.Node, s *walkState) {
|
||||
switch n.Type {
|
||||
case ghtml.CommentNode:
|
||||
return // skip HTML comments
|
||||
|
||||
case ghtml.TextNode:
|
||||
if s.skipDepth > 0 {
|
||||
return
|
||||
}
|
||||
text := html.UnescapeString(n.Data)
|
||||
if s.inPre {
|
||||
b.WriteString(text)
|
||||
} else {
|
||||
// Collapse whitespace sequences to a single space.
|
||||
text = collapseWhitespace(text)
|
||||
if text != "" {
|
||||
b.WriteString(text)
|
||||
}
|
||||
}
|
||||
return
|
||||
|
||||
case ghtml.ElementNode:
|
||||
a := n.DataAtom
|
||||
if skipAtoms[a] {
|
||||
s.skipDepth++
|
||||
walkChildren(b, n, s)
|
||||
s.skipDepth--
|
||||
return
|
||||
}
|
||||
if s.skipDepth > 0 {
|
||||
walkChildren(b, n, s)
|
||||
return
|
||||
}
|
||||
renderElement(b, n, s, a)
|
||||
return
|
||||
}
|
||||
|
||||
// For document, doctype, etc.: just walk children.
|
||||
walkChildren(b, n, s)
|
||||
}
|
||||
|
||||
func renderElement(b *strings.Builder, n *ghtml.Node, s *walkState, a atom.Atom) {
|
||||
switch a {
|
||||
case atom.H1, atom.H2, atom.H3, atom.H4, atom.H5, atom.H6:
|
||||
level := headingLevel(a)
|
||||
prefix := strings.Repeat("#", level) + " "
|
||||
b.WriteString("\n\n" + prefix)
|
||||
walkChildren(b, n, s)
|
||||
b.WriteString("\n\n")
|
||||
|
||||
case atom.P:
|
||||
b.WriteString("\n\n")
|
||||
walkChildren(b, n, s)
|
||||
b.WriteString("\n\n")
|
||||
|
||||
case atom.Br:
|
||||
b.WriteString("\n")
|
||||
|
||||
case atom.Hr:
|
||||
b.WriteString("\n\n---\n\n")
|
||||
|
||||
case atom.Pre:
|
||||
s.inPre = true
|
||||
b.WriteString("\n\n```\n")
|
||||
walkChildren(b, n, s)
|
||||
b.WriteString("\n```\n\n")
|
||||
s.inPre = false
|
||||
|
||||
case atom.Code:
|
||||
if s.inPre {
|
||||
walkChildren(b, n, s)
|
||||
} else {
|
||||
b.WriteString("`")
|
||||
walkChildren(b, n, s)
|
||||
b.WriteString("`")
|
||||
}
|
||||
|
||||
case atom.Strong, atom.B:
|
||||
b.WriteString("**")
|
||||
walkChildren(b, n, s)
|
||||
b.WriteString("**")
|
||||
|
||||
case atom.Em, atom.I:
|
||||
b.WriteString("*")
|
||||
walkChildren(b, n, s)
|
||||
b.WriteString("*")
|
||||
|
||||
case atom.A:
|
||||
href := attrVal(n, "href")
|
||||
var text strings.Builder
|
||||
walkChildren(&text, n, s)
|
||||
linkText := strings.TrimSpace(text.String())
|
||||
if linkText == "" {
|
||||
linkText = href
|
||||
}
|
||||
b.WriteString("[" + linkText + "](" + href + ")")
|
||||
|
||||
case atom.Img:
|
||||
src := attrVal(n, "src")
|
||||
if src == "" {
|
||||
src = attrVal(n, "data-src")
|
||||
}
|
||||
alt := attrVal(n, "alt")
|
||||
b.WriteString("")
|
||||
|
||||
case atom.Ul:
|
||||
s.listStack = append(s.listStack, atom.Ul)
|
||||
b.WriteString("\n")
|
||||
walkChildren(b, n, s)
|
||||
b.WriteString("\n")
|
||||
s.listStack = s.listStack[:len(s.listStack)-1]
|
||||
|
||||
case atom.Ol:
|
||||
s.listStack = append(s.listStack, atom.Ol)
|
||||
b.WriteString("\n")
|
||||
walkChildren(b, n, s)
|
||||
b.WriteString("\n")
|
||||
s.listStack = s.listStack[:len(s.listStack)-1]
|
||||
|
||||
case atom.Li:
|
||||
prefix := "- "
|
||||
if len(s.listStack) > 0 && s.listStack[len(s.listStack)-1] == atom.Ol {
|
||||
prefix = "1. "
|
||||
}
|
||||
b.WriteString("\n" + prefix)
|
||||
walkChildren(b, n, s)
|
||||
|
||||
case atom.Blockquote:
|
||||
s.blockquote++
|
||||
b.WriteString("\n\n")
|
||||
// Capture children into a temp buffer then prefix each line with "> ".
|
||||
var inner strings.Builder
|
||||
walkChildren(&inner, n, s)
|
||||
s.blockquote--
|
||||
quoted := prefixLines(strings.TrimSpace(inner.String()), "> ")
|
||||
b.WriteString(quoted)
|
||||
b.WriteString("\n\n")
|
||||
|
||||
default:
|
||||
// div, span, section, article, header, footer, nav, etc.
|
||||
// Just emit descendant text without any markup.
|
||||
walkChildren(b, n, s)
|
||||
}
|
||||
}
|
||||
|
||||
func walkChildren(b interface{ WriteString(string) (int, error) }, n *ghtml.Node, s *walkState) {
|
||||
// We accept a strings.Builder-like writer.
|
||||
sb, ok := b.(*strings.Builder)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||
walkNode(sb, c, s)
|
||||
}
|
||||
}
|
||||
|
||||
// headingLevel maps h1..h6 atom to 1..6.
|
||||
func headingLevel(a atom.Atom) int {
|
||||
switch a {
|
||||
case atom.H1:
|
||||
return 1
|
||||
case atom.H2:
|
||||
return 2
|
||||
case atom.H3:
|
||||
return 3
|
||||
case atom.H4:
|
||||
return 4
|
||||
case atom.H5:
|
||||
return 5
|
||||
case atom.H6:
|
||||
return 6
|
||||
}
|
||||
return 1
|
||||
}
|
||||
|
||||
// attrVal returns the value of the named attribute, or "".
|
||||
func attrVal(n *ghtml.Node, name string) string {
|
||||
for _, a := range n.Attr {
|
||||
if a.Key == name {
|
||||
return a.Val
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// wsRe matches one or more consecutive spaces, tabs, carriage returns or
// newlines.
var wsRe = regexp.MustCompile(`[ \t\r\n]+`)

// collapseWhitespace replaces every run of whitespace in s with one space.
// Leading and trailing runs are collapsed too, not removed.
func collapseWhitespace(s string) string {
	const single = " "
	return wsRe.ReplaceAllLiteralString(s, single)
}
|
||||
|
||||
// manyNL matches a run of three or more consecutive newlines.
var manyNL = regexp.MustCompile(`\n{3,}`)

// collapseNewlines shortens every run of three or more newlines in s to
// exactly two, so at most one blank line separates blocks.
func collapseNewlines(s string) string {
	const blankLine = "\n\n"
	return manyNL.ReplaceAllLiteralString(s, blankLine)
}
|
||||
|
||||
// prefixLines puts prefix in front of each line of s that contains something
// other than whitespace. Lines that are empty or whitespace-only become empty
// lines. Lines are delimited by "\n".
func prefixLines(s, prefix string) string {
	rows := strings.Split(s, "\n")
	for i, row := range rows {
		if strings.TrimSpace(row) == "" {
			rows[i] = ""
			continue
		}
		rows[i] = prefix + row
	}
	return strings.Join(rows, "\n")
}
|
||||
|
||||
// tagRe matches "<", one or more characters other than ">", then ">".
var tagRe = regexp.MustCompile(`<[^>]+>`)

// stripTags deletes everything in s that matches tagRe. It is the fallback
// used when the HTML cannot be parsed; the remaining text is returned as is.
func stripTags(s string) string {
	const nothing = ""
	return tagRe.ReplaceAllLiteralString(s, nothing)
}
|
||||
@@ -0,0 +1,92 @@
|
||||
---
|
||||
name: html_to_markdown
|
||||
kind: function
|
||||
lang: go
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "func HtmlToMarkdown(html string) string"
|
||||
description: "Convierte HTML a markdown legible. Parser recursivo del DOM via golang.org/x/net/html. MVP best-effort: soporta headings, parrafos, links, strong/em, code, pre, listas, blockquote, img, br, hr. Skippea script/style/noscript y sus descendientes. Texto plano con whitespace colapsado. Entidades HTML decodificadas."
|
||||
tags: [html, markdown, converter, parsing, text, core]
|
||||
params:
|
||||
- name: html
|
||||
desc: "String HTML completo o fragmento a convertir. Puede incluir doctype, head y body, o ser solo un fragmento de markup."
|
||||
output: "Markdown legible derivado del HTML, mejor esfuerzo. Headings ATX, links en formato [text](href), listas con - o 1., bloques de codigo con backticks. Multiples lineas en blanco colapsadas a una. Nunca retorna error."
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports:
|
||||
- "golang.org/x/net/html"
|
||||
- "golang.org/x/net/html/atom"
|
||||
tested: true
|
||||
tests:
|
||||
- "h1 heading"
|
||||
- "h2 heading"
|
||||
- "h3 heading"
|
||||
- "h4 heading"
|
||||
- "h5 heading"
|
||||
- "h6 heading"
|
||||
- "paragraph"
|
||||
- "link"
|
||||
- "link with no text falls back to href"
|
||||
- "strong"
|
||||
- "b tag"
|
||||
- "em"
|
||||
- "i tag"
|
||||
- "code inline"
|
||||
- "pre block"
|
||||
- "pre block preserves content"
|
||||
- "unordered list"
|
||||
- "unordered list second item"
|
||||
- "ordered list"
|
||||
- "ordered list second item"
|
||||
- "br becomes newline"
|
||||
- "hr becomes dashes"
|
||||
- "img with alt and src"
|
||||
- "img with empty alt"
|
||||
- "blockquote"
|
||||
- "script tag skipped"
|
||||
- "script content not in output"
|
||||
- "style tag skipped"
|
||||
- "noscript skipped"
|
||||
- "div wrapping does not add markup"
|
||||
- "html entities decoded"
|
||||
- "multiple blank lines collapsed"
|
||||
- "nested strong inside link"
|
||||
- "html comment skipped"
|
||||
test_file_path: "functions/core/html_to_markdown_test.go"
|
||||
file_path: "functions/core/html_to_markdown.go"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```go
|
||||
md := HtmlToMarkdown(`<h1>Title</h1><p>Hello <strong>world</strong>.</p><a href="/path">link</a>`)
|
||||
// md = "# Title\n\nHello **world**.\n\n[link](/path)"
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Funcion pura sin efectos secundarios. No extrae el "articulo principal" como hace el Python
|
||||
con readabilipy/readability — convierte el HTML completo tal como llega. Para el caso de
|
||||
uso del enricher fetch_webpage esto es suficiente: el HTML ya fue descargado por el caller.
|
||||
|
||||
Elementos soportados (por prioridad):
|
||||
- h1..h6 → # .. ######
|
||||
- p → parrafo con linea en blanco antes y despues
|
||||
- a href → [text](href)
|
||||
- strong/b → **text**
|
||||
- em/i → *text*
|
||||
- code (inline) → `text`
|
||||
- pre → bloque con fences
|
||||
- ul/ol/li → listas con - o 1.
|
||||
- br → newline
|
||||
- hr → ---
|
||||
- img alt src → `![alt](src)` (tambien lee data-src como fallback)
|
||||
- blockquote → > text (prefija cada linea)
|
||||
|
||||
Skipped (arbol completo ignorado): script, style, noscript.
|
||||
Comentarios HTML: ignorados.
|
||||
Todo lo demas (div, span, section, article, etc.): emite texto descendiente sin marcado.
|
||||
@@ -0,0 +1,212 @@
|
||||
package core
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestHtmlToMarkdown(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
html string
|
||||
contains string // expected substring in output (not exact match)
|
||||
exact string // if non-empty, check trimmed exact equality
|
||||
}{
|
||||
{
|
||||
name: "h1 heading",
|
||||
html: "<h1>Hello World</h1>",
|
||||
exact: "# Hello World",
|
||||
},
|
||||
{
|
||||
name: "h2 heading",
|
||||
html: "<h2>Section</h2>",
|
||||
exact: "## Section",
|
||||
},
|
||||
{
|
||||
name: "h3 heading",
|
||||
html: "<h3>Sub</h3>",
|
||||
exact: "### Sub",
|
||||
},
|
||||
{
|
||||
name: "h4 heading",
|
||||
html: "<h4>Deep</h4>",
|
||||
exact: "#### Deep",
|
||||
},
|
||||
{
|
||||
name: "h5 heading",
|
||||
html: "<h5>Deeper</h5>",
|
||||
exact: "##### Deeper",
|
||||
},
|
||||
{
|
||||
name: "h6 heading",
|
||||
html: "<h6>Deepest</h6>",
|
||||
exact: "###### Deepest",
|
||||
},
|
||||
{
|
||||
name: "paragraph",
|
||||
html: "<p>Hello paragraph.</p>",
|
||||
contains: "Hello paragraph.",
|
||||
},
|
||||
{
|
||||
name: "link",
|
||||
html: `<a href="https://example.com">click here</a>`,
|
||||
exact: "[click here](https://example.com)",
|
||||
},
|
||||
{
|
||||
name: "link with no text falls back to href",
|
||||
html: `<a href="https://example.com"></a>`,
|
||||
contains: "https://example.com",
|
||||
},
|
||||
{
|
||||
name: "strong",
|
||||
html: "<strong>bold text</strong>",
|
||||
exact: "**bold text**",
|
||||
},
|
||||
{
|
||||
name: "b tag",
|
||||
html: "<b>also bold</b>",
|
||||
exact: "**also bold**",
|
||||
},
|
||||
{
|
||||
name: "em",
|
||||
html: "<em>italic text</em>",
|
||||
exact: "*italic text*",
|
||||
},
|
||||
{
|
||||
name: "i tag",
|
||||
html: "<i>also italic</i>",
|
||||
exact: "*also italic*",
|
||||
},
|
||||
{
|
||||
name: "code inline",
|
||||
html: "<code>fmt.Println()</code>",
|
||||
exact: "`fmt.Println()`",
|
||||
},
|
||||
{
|
||||
name: "pre block",
|
||||
html: "<pre>func main() {\n println()\n}</pre>",
|
||||
contains: "```",
|
||||
},
|
||||
{
|
||||
name: "pre block preserves content",
|
||||
html: "<pre>func main() {\n println()\n}</pre>",
|
||||
contains: "func main()",
|
||||
},
|
||||
{
|
||||
name: "unordered list",
|
||||
html: "<ul><li>Apple</li><li>Banana</li></ul>",
|
||||
contains: "- Apple",
|
||||
},
|
||||
{
|
||||
name: "unordered list second item",
|
||||
html: "<ul><li>Apple</li><li>Banana</li></ul>",
|
||||
contains: "- Banana",
|
||||
},
|
||||
{
|
||||
name: "ordered list",
|
||||
html: "<ol><li>First</li><li>Second</li></ol>",
|
||||
contains: "1. First",
|
||||
},
|
||||
{
|
||||
name: "ordered list second item",
|
||||
html: "<ol><li>First</li><li>Second</li></ol>",
|
||||
contains: "1. Second",
|
||||
},
|
||||
{
|
||||
name: "br becomes newline",
|
||||
html: "line one<br>line two",
|
||||
contains: "\n",
|
||||
},
|
||||
{
|
||||
name: "hr becomes dashes",
|
||||
html: "<hr>",
|
||||
contains: "---",
|
||||
},
|
||||
{
|
||||
name: "img with alt and src",
|
||||
html: `<img alt="logo" src="https://example.com/logo.png">`,
|
||||
exact: "",
|
||||
},
|
||||
{
|
||||
name: "img with empty alt",
|
||||
html: `<img alt="" src="photo.jpg">`,
|
||||
exact: "",
|
||||
},
|
||||
{
|
||||
name: "blockquote",
|
||||
html: "<blockquote>A wise saying.</blockquote>",
|
||||
contains: "> A wise saying.",
|
||||
},
|
||||
{
|
||||
name: "script tag skipped",
|
||||
html: "<p>visible</p><script>alert('x')</script>",
|
||||
contains: "visible",
|
||||
},
|
||||
{
|
||||
name: "script content not in output",
|
||||
html: "<p>visible</p><script>alert('x')</script>",
|
||||
// The word alert should NOT appear
|
||||
},
|
||||
{
|
||||
name: "style tag skipped",
|
||||
html: "<style>body{color:red}</style><p>text</p>",
|
||||
contains: "text",
|
||||
},
|
||||
{
|
||||
name: "noscript skipped",
|
||||
html: "<noscript>enable js</noscript><p>main</p>",
|
||||
contains: "main",
|
||||
},
|
||||
{
|
||||
name: "div wrapping does not add markup",
|
||||
html: "<div><p>content</p></div>",
|
||||
contains: "content",
|
||||
},
|
||||
{
|
||||
name: "html entities decoded",
|
||||
html: "<p>5 > 3 & 1 < 2</p>",
|
||||
contains: "5 > 3 & 1 < 2",
|
||||
},
|
||||
{
|
||||
name: "multiple blank lines collapsed",
|
||||
html: "<p>a</p><p>b</p>",
|
||||
contains: "a",
|
||||
},
|
||||
{
|
||||
name: "nested strong inside link",
|
||||
html: `<a href="/path"><strong>bold link</strong></a>`,
|
||||
contains: "[**bold link**](/path)",
|
||||
},
|
||||
{
|
||||
name: "html comment skipped",
|
||||
html: "<!-- hidden -->visible",
|
||||
contains: "visible",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range cases {
|
||||
tc := tc
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
got := HtmlToMarkdown(tc.html)
|
||||
trimmed := strings.TrimSpace(got)
|
||||
|
||||
if tc.exact != "" {
|
||||
if trimmed != tc.exact {
|
||||
t.Errorf("exact mismatch\n got: %q\n want: %q", trimmed, tc.exact)
|
||||
}
|
||||
}
|
||||
if tc.contains != "" {
|
||||
if !strings.Contains(got, tc.contains) {
|
||||
t.Errorf("missing expected substring\n got: %q\n expected: %q", got, tc.contains)
|
||||
}
|
||||
}
|
||||
|
||||
// Special case: script content must NOT appear in output.
|
||||
if tc.name == "script content not in output" {
|
||||
if strings.Contains(got, "alert") {
|
||||
t.Errorf("script content leaked into output: %q", got)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user