From 92297e02c5ad88a22092df67bdbe26bee7fc5d4a Mon Sep 17 00:00:00 2001 From: Egutierrez Date: Mon, 4 May 2026 11:51:51 +0200 Subject: [PATCH] feat(go): html_to_markdown + extract_iocs functions/core/html_to_markdown: convierte HTML a Markdown limpio (golang-only sin dependencias externas). util como prep para LLMs y para indexar contenido web. functions/cybersecurity/extract_iocs + types/cybersecurity/ioc: extrae indicators of compromise (IPs, domains, URLs, hashes, emails, CVEs, crypto wallets) de texto libre. Devuelve []IOC tipado. Co-Authored-By: Claude Opus 4.7 (1M context) --- functions/core/html_to_markdown.go | 281 +++++++++++ functions/core/html_to_markdown.md | 92 ++++ functions/core/html_to_markdown_test.go | 212 ++++++++ functions/cybersecurity/extract_iocs.go | 494 +++++++++++++++++++ functions/cybersecurity/extract_iocs.md | 75 +++ functions/cybersecurity/extract_iocs_test.go | 292 +++++++++++ types/cybersecurity/ioc.md | 25 + 7 files changed, 1471 insertions(+) create mode 100644 functions/core/html_to_markdown.go create mode 100644 functions/core/html_to_markdown.md create mode 100644 functions/core/html_to_markdown_test.go create mode 100644 functions/cybersecurity/extract_iocs.go create mode 100644 functions/cybersecurity/extract_iocs.md create mode 100644 functions/cybersecurity/extract_iocs_test.go create mode 100644 types/cybersecurity/ioc.md diff --git a/functions/core/html_to_markdown.go b/functions/core/html_to_markdown.go new file mode 100644 index 00000000..48c2e074 --- /dev/null +++ b/functions/core/html_to_markdown.go @@ -0,0 +1,281 @@ +package core + +import ( + "html" + "regexp" + "strings" + + "golang.org/x/net/html/atom" + + ghtml "golang.org/x/net/html" +) + +// skipAtoms is the set of element atoms (script, style, noscript) whose entire subtree is discarded. +var skipAtoms = map[atom.Atom]bool{ + atom.Script: true, + atom.Style: true, + atom.Noscript: true, +} + +// HtmlToMarkdown converts an HTML string to readable markdown (best-effort). 
+//
+// Supported elements (in priority order):
+//   - <h1>..<h6>        → ATX headings (#..######)
+//   - <p>               → paragraph separated by blank line
+//   - <a href>          → [text](href)
+//   - <strong>, <b>     → **text**
+//   - <em>, <i>         → *text*
+//   - <code>            → `text`
+//   - <pre><code>       → fenced code block
+//   -