diff --git a/functions/core/html_to_markdown.go b/functions/core/html_to_markdown.go
new file mode 100644
index 00000000..48c2e074
--- /dev/null
+++ b/functions/core/html_to_markdown.go
@@ -0,0 +1,281 @@
+package core
+
+import (
+ "html"
+ "regexp"
+ "strings"
+
+ "golang.org/x/net/html/atom"
+
+ ghtml "golang.org/x/net/html"
+)
+
+// skipAtoms are tags whose entire subtree is discarded.
+// The map is used as a set: only membership matters, every stored value is
+// true. It currently lists <script>, <style> and <noscript>.
+var skipAtoms = map[atom.Atom]bool{
+ atom.Script: true,
+ atom.Style: true,
+ atom.Noscript: true,
+}
+
+// HtmlToMarkdown converts an HTML string to readable markdown (best-effort).
+//
+// Supported elements (in priority order):
+// - <h1>..<h6> → ATX headings (#..######)
+// - <p> → paragraph separated by blank line
+// - <a href> → [text](href)
+// - <strong>, <b> → **text**
+// - <em>, <i> → *text*
+// - <code> → `text`
+// - <pre> → fenced code block
+// - <ul>/<ol>/<li> → bullet or numbered list
+// - <br> → newline
+// - <hr> → ---
+// - <img> → ![alt](src)
+// - <blockquote> → > text
+//
+// Skipped: ",
+ contains: "visible",
+ },
+ {
+ name: "script content not in output",
+ html: "visible
",
+ // The word alert should NOT appear
+ },
+ {
+ name: "style tag skipped",
+ html: "text
",
+ contains: "text",
+ },
+ {
+ name: "noscript skipped",
+ html: "main
",
+ contains: "main",
+ },
+ {
+ name: "div wrapping does not add markup",
+ html: "",
+ contains: "content",
+ },
+ {
+ name: "html entities decoded",
+ html: "5 > 3 & 1 < 2
",
+ contains: "5 > 3 & 1 < 2",
+ },
+ {
+ name: "multiple blank lines collapsed",
+ html: "a
b
",
+ contains: "a",
+ },
+ {
+ name: "nested strong inside link",
+ html: `bold link`,
+ contains: "[**bold link**](/path)",
+ },
+ {
+ name: "html comment skipped",
+ html: "visible",
+ contains: "visible",
+ },
+ }
+
+ for _, tc := range cases {
+ tc := tc
+ t.Run(tc.name, func(t *testing.T) {
+ got := HtmlToMarkdown(tc.html)
+ trimmed := strings.TrimSpace(got)
+
+ if tc.exact != "" {
+ if trimmed != tc.exact {
+ t.Errorf("exact mismatch\n got: %q\n want: %q", trimmed, tc.exact)
+ }
+ }
+ if tc.contains != "" {
+ if !strings.Contains(got, tc.contains) {
+ t.Errorf("missing expected substring\n got: %q\n expected: %q", got, tc.contains)
+ }
+ }
+
+ // Special case: script content must NOT appear in output.
+ if tc.name == "script content not in output" {
+ if strings.Contains(got, "alert") {
+ t.Errorf("script content leaked into output: %q", got)
+ }
+ }
+ })
+ }
+}
diff --git a/functions/cybersecurity/extract_iocs.go b/functions/cybersecurity/extract_iocs.go
new file mode 100644
index 00000000..dde0296b
--- /dev/null
+++ b/functions/cybersecurity/extract_iocs.go
@@ -0,0 +1,494 @@
+package cybersecurity
+
+import (
+ "net"
+ "regexp"
+ "sort"
+)
+
+// IoC represents a single Indicator of Compromise extracted from text.
+// Type is one of: "email", "ip_address", "domain", "file_hash",
+// "crypto_wallet", "cve_id", "mac_address", "phone_number".
+// Start and End are byte offsets into the original text.
+// Extra holds type-specific fields (e.g. "algorithm" for file_hash,
+// "asset" for crypto_wallet).
+//
+// Every extractor in this file fills Value with text[Start:End], so the three
+// fields always describe the same half-open byte range of the input.
+type IoC struct {
+ // Type is the IoC category name (see the list above).
+ Type string
+ // Value is the matched text exactly as it appears in the input.
+ Value string
+ // Start is the byte offset of the first byte of Value.
+ Start int
+ // End is the byte offset one past the last byte of Value.
+ End int
+ // Extra is nil unless the extractor attaches type-specific details.
+ Extra map[string]string // optional: algorithm, asset, etc.
+}
+
+// --- compiled regexes (module-level, compiled once) ---
+
+// Most patterns below share one shape: a non-capturing one-character
+// "context" (or ^ / $) on each side of capturing group 1, which holds the
+// actual value. findAll reports the span of group 1 only.
+var (
+ // IPv4 candidate: four dot-separated groups of 1-3 digits. The octet range
+ // is not checked here; extractIPAddresses validates with net.ParseIP.
+ reIPv4 = regexp.MustCompile(`\b\d{1,3}(?:\.\d{1,3}){3}\b`)
+
+ // IPv6: at least two colon-separated groups of hex digits.
+ // An optional %zone suffix is part of the captured value.
+ reIPv6 = regexp.MustCompile(
+ `(?:^|[^0-9A-Fa-f:])` +
+ `([0-9A-Fa-f]{0,4}(?::[0-9A-Fa-f]{0,4}){2,7}(?:%[0-9A-Za-z]+)?)` +
+ `(?:[^0-9A-Fa-f:]|$)`,
+ )
+
+ // Email: local part, '@', then two or more dot-separated host labels.
+ reEmail = regexp.MustCompile(
+ `(?:^|[^A-Za-z0-9._%+\-])` +
+ `([A-Za-z0-9._%+\-]+@[A-Za-z0-9](?:[A-Za-z0-9\-]*[A-Za-z0-9])?` +
+ `(?:\.[A-Za-z0-9](?:[A-Za-z0-9\-]*[A-Za-z0-9])?)+)` +
+ `(?:[^A-Za-z0-9._%+\-]|$)`,
+ )
+
+ // Domain label: starts and ends with alnum, internal can have hyphens.
+ _label = `[A-Za-z0-9](?:[A-Za-z0-9\-]{0,61}[A-Za-z0-9])?`
+ // Domain: one or more labels followed by an alphabetic TLD of 2-63
+ // letters. extractDomains additionally requires the TLD to be in validTLDs.
+ reDomain = regexp.MustCompile(
+ `(?:^|[^A-Za-z0-9.\-])` +
+ `((?:` + _label + `\.)+[A-Za-z]{2,63})` +
+ `(?:[^A-Za-z0-9.\-]|$)`,
+ )
+
+ // Hex hashes: 32, 40, 64, or 128 chars.
+ // The pattern itself accepts any run of 32-128 hex digits; the exact-length
+ // filter is applied in extractFileHashes.
+ reHash = regexp.MustCompile(`\b([A-Fa-f0-9]{32,128})\b`)
+
+ // Crypto wallets.
+ reBTCLegacy = regexp.MustCompile(`(?:^|[^A-Za-z0-9])([13][1-9A-HJ-NP-Za-km-z]{25,34})(?:[^A-Za-z0-9]|$)`)
+ reBTCBech32 = regexp.MustCompile(`(?:^|[^A-Za-z0-9])(bc1[02-9ac-hj-np-z]{6,87})(?:[^A-Za-z0-9]|$)`)
+ reETH = regexp.MustCompile(`(?:^|[^A-Za-z0-9])(0x[a-fA-F0-9]{40})(?:[^A-Za-z0-9]|$)`)
+
+ // CVE identifier: "CVE-", a 4-digit year, "-", then 4-7 digits.
+ reCVE = regexp.MustCompile(`(?:^|[^A-Za-z0-9])(CVE-\d{4}-\d{4,7})(?:[^A-Za-z0-9]|$)`)
+
+ // MAC address: six hex pairs separated by ':' or '-'. The pattern allows
+ // the two separators to be mixed; extractMACAddresses rejects such matches.
+ reMAC = regexp.MustCompile(
+ `(?:^|[^A-Fa-f0-9:\-])([A-Fa-f0-9]{2}[:\-](?:[A-Fa-f0-9]{2}[:\-]){4}[A-Fa-f0-9]{2})(?:[^A-Fa-f0-9:\-]|$)`,
+ )
+
+ // International phone number: '+', 1-3 digits, then 2-5 groups of 1-4
+ // digits optionally separated by whitespace or '-'.
+ reE164 = regexp.MustCompile(
+ `(?:^|[^A-Za-z0-9])(\+\d{1,3}[\s\-]?\d{1,4}(?:[\s\-]?\d{1,4}){1,4})(?:[^A-Za-z0-9]|$)`,
+ )
+ // Local 9-digit number starting with 6-9, written as 3+3+3 digits with
+ // optional whitespace or '-' separators. A preceding '+' disqualifies it.
+ reESLocal = regexp.MustCompile(
+ `(?:^|[^A-Za-z0-9+])([6789]\d{2}[\s\-]?\d{3}[\s\-]?\d{3})(?:[^A-Za-z0-9]|$)`,
+ )
+
+ // reNonDigit matches every non-digit byte; extractPhoneNumbers removes
+ // those to count the digits of a candidate.
+ reNonDigit = regexp.MustCompile(`[^0-9]`)
+)
+
+// validTLDs is the same static set as the Python implementation.
+// It is the allow-list consulted by extractDomains: a domain candidate is
+// kept only when its last dot-separated label, lowercased, is a key here.
+// The map is used as a set; every stored value is true.
+var validTLDs = map[string]bool{
+ // original gTLD
+ "com": true, "org": true, "net": true, "edu": true, "gov": true, "mil": true, "int": true,
+ // common gTLD
+ "info": true, "biz": true, "name": true, "pro": true, "mobi": true, "asia": true,
+ "jobs": true, "tel": true, "travel": true, "xxx": true, "post": true,
+ // popular new gTLD
+ "app": true, "dev": true, "io": true, "ai": true, "tech": true, "cloud": true,
+ "online": true, "site": true, "store": true, "xyz": true, "top": true, "shop": true,
+ "club": true, "fun": true, "live": true, "blog": true, "page": true, "news": true,
+ "media": true, "design": true, "studio": true, "agency": true, "co": true, "me": true, "tv": true,
+ // ccTLD
+ "us": true, "uk": true, "de": true, "fr": true, "es": true, "it": true, "nl": true,
+ "be": true, "se": true, "no": true, "fi": true, "dk": true, "ru": true, "ua": true,
+ "pl": true, "cz": true, "ch": true, "at": true, "pt": true, "gr": true, "ie": true,
+ "tr": true, "ca": true, "mx": true, "br": true, "ar": true, "cl": true, "pe": true,
+ "ve": true, "uy": true, "cn": true, "jp": true, "kr": true, "in": true, "id": true,
+ "th": true, "vn": true, "my": true, "sg": true, "ph": true, "tw": true, "hk": true,
+ "au": true, "nz": true, "za": true, "eg": true, "ma": true, "ng": true, "ke": true,
+ "il": true, "ae": true, "sa": true, "qa": true, "eu": true,
+}
+
+// hashLengths maps valid hash lengths to algorithm names. Each entry is
+// {hex length as a decimal string, algorithm name}.
+//
+// extractFileHashes compares the candidate's length against the first element
+// for exact equality, so a given length can match at most one entry and the
+// order of the entries (longest first) does not influence the result.
+var hashLengths = [][2]string{
+ {"128", "sha512"},
+ {"64", "sha256"},
+ {"40", "sha1"},
+ {"32", "md5"},
+}
+
+// --- helper: find submatch positions accounting for leading/trailing context chars ---
+
+// findAll returns the span of capturing group 1 for every match of re in
+// text, in left-to-right order. Each element is {start, end, 0}: start and end
+// are byte offsets into text (end exclusive); the third slot is always 0.
+//
+// re must follow this file's "context" convention: an optional one-character
+// non-capturing context (or ^ / $) on each side of group 1. Because RE2 has no
+// lookaround, the trailing context character is consumed by a match. A plain
+// FindAll would then resume *after* that character and miss a value that is
+// separated from the previous one by a single character (e.g. "a@b.c d@e.f").
+// To avoid that, scanning resumes at the end of group 1, so the consumed
+// trailing character can act as the leading context of the next match.
+//
+// Matches in which group 1 did not participate are skipped.
+func findAll(re *regexp.Regexp, text string) [][3]int {
+	out := make([][3]int, 0)
+
+	// ctxAt is the offset of the byte the previous match consumed as trailing
+	// context, or -1 when there is none.
+	ctxAt := -1
+
+	for pos := 0; pos <= len(text); {
+		m := re.FindStringSubmatchIndex(text[pos:])
+		if m == nil {
+			break
+		}
+
+		if len(m) < 4 || m[2] < 0 {
+			// Group 1 is absent: step over the whole match (at least one byte
+			// so an empty match cannot stall the loop).
+			step := m[1]
+			if step == 0 {
+				step = 1
+			}
+			pos += step
+			ctxAt = -1
+			continue
+		}
+
+		if m[2] == 0 && pos == ctxAt {
+			// The pattern anchored on the artificial start of the substring and
+			// wants to begin a value on the byte that was only trailing context.
+			// The byte before it belongs to the previous value, so this is not
+			// a real boundary; move on by one byte.
+			pos++
+			ctxAt = -1
+			continue
+		}
+
+		start, end := pos+m[2], pos+m[3]
+		out = append(out, [3]int{start, end, 0})
+
+		// Remember whether a trailing context byte was consumed.
+		if m[1] > m[3] {
+			ctxAt = end
+		} else {
+			ctxAt = -1
+		}
+
+		// Resume right after the captured value; always make progress.
+		if end > pos {
+			pos = end
+		} else {
+			pos++
+		}
+	}
+	return out
+}
+
+// --- individual extractors ---
+
+// extractEmails reports every email address matched by reEmail in text.
+// The returned slice is never nil and follows the order of appearance.
+func extractEmails(text string) []IoC {
+	hits := findAll(reEmail, text)
+	found := make([]IoC, len(hits))
+	for i := range hits {
+		from, to := hits[i][0], hits[i][1]
+		found[i] = IoC{Type: "email", Value: text[from:to], Start: from, End: to}
+	}
+	return found
+}
+
+// extractIPAddresses reports IPv4 and IPv6 addresses found in text, ordered by
+// their start offset. Candidates are located with reIPv4 / reIPv6 and then
+// validated with net.ParseIP.
+func extractIPAddresses(text string) []IoC {
+	var found []IoC
+
+	// IPv4: keep only dotted quads that net.ParseIP accepts as 4-byte addresses.
+	for _, loc := range reIPv4.FindAllStringIndex(text, -1) {
+		dotted := text[loc[0]:loc[1]]
+		if parsed := net.ParseIP(dotted); parsed != nil && parsed.To4() != nil {
+			found = append(found, IoC{
+				Type:  "ip_address",
+				Value: dotted,
+				Start: loc[0],
+				End:   loc[1],
+			})
+		}
+	}
+
+	// IPv6: the reported value keeps any %zone suffix, but validation is done
+	// on the address with the zone removed.
+	for _, span := range findAll(reIPv6, text) {
+		raw := text[span[0]:span[1]]
+
+		addr := raw
+		if pct := indexOf(raw, '%'); pct >= 0 {
+			addr = raw[:pct]
+		}
+		if countRune(addr, ':') < 2 {
+			continue
+		}
+
+		parsed := net.ParseIP(addr)
+		// A nil result is not an address; a non-nil To4 means it is an
+		// IPv4-in-IPv6 form, which this loop does not report.
+		if parsed == nil || parsed.To4() != nil {
+			continue
+		}
+
+		found = append(found, IoC{
+			Type:  "ip_address",
+			Value: raw,
+			Start: span[0],
+			End:   span[1],
+		})
+	}
+
+	sort.Slice(found, func(a, b int) bool { return found[a].Start < found[b].Start })
+	return found
+}
+
+// extractDomains reports domain names matched by reDomain whose final label,
+// compared case-insensitively, is listed in validTLDs.
+func extractDomains(text string) []IoC {
+	var found []IoC
+	for _, span := range findAll(reDomain, text) {
+		from, to := span[0], span[1]
+		name := text[from:to]
+
+		// Only accept names ending in a known top-level domain.
+		if suffix := toLower(lastPart(name, '.')); validTLDs[suffix] {
+			found = append(found, IoC{Type: "domain", Value: name, Start: from, End: to})
+		}
+	}
+	return found
+}
+
+// extractFileHashes reports hex strings whose length equals one of the
+// lengths listed in hashLengths. The matching algorithm name is stored under
+// Extra["algorithm"]; hex runs of any other length are ignored.
+func extractFileHashes(text string) []IoC {
+	var found []IoC
+	for _, loc := range reHash.FindAllStringSubmatchIndex(text, -1) {
+		if len(loc) < 4 || loc[2] < 0 {
+			continue
+		}
+		from, to := loc[2], loc[3]
+		hexRun := text[from:to]
+
+		// hashLengths stores lengths as decimal strings, so compare in that form.
+		wantLen := itoa(len(hexRun))
+		var name string
+		for i := range hashLengths {
+			if hashLengths[i][0] == wantLen {
+				name = hashLengths[i][1]
+				break
+			}
+		}
+		if name == "" {
+			continue
+		}
+
+		found = append(found, IoC{
+			Type:  "file_hash",
+			Value: hexRun,
+			Start: from,
+			End:   to,
+			Extra: map[string]string{"algorithm": name},
+		})
+	}
+	return found
+}
+
+// extractCryptoWallets reports wallet addresses matched by the BTC (legacy and
+// bech32) and ETH patterns, ordered by start offset. The asset name is stored
+// under Extra["asset"].
+func extractCryptoWallets(text string) []IoC {
+	type walletPattern struct {
+		pattern *regexp.Regexp
+		asset   string
+	}
+	patterns := []walletPattern{
+		{pattern: reBTCLegacy, asset: "btc"},
+		{pattern: reBTCBech32, asset: "btc"},
+		{pattern: reETH, asset: "eth"},
+	}
+
+	var found []IoC
+	for _, wp := range patterns {
+		for _, span := range findAll(wp.pattern, text) {
+			from, to := span[0], span[1]
+			found = append(found, IoC{
+				Type:  "crypto_wallet",
+				Value: text[from:to],
+				Start: from,
+				End:   to,
+				Extra: map[string]string{"asset": wp.asset},
+			})
+		}
+	}
+
+	sort.Slice(found, func(a, b int) bool { return found[a].Start < found[b].Start })
+	return found
+}
+
+// extractCVEIDs reports every CVE identifier matched by reCVE in text.
+// The returned slice is never nil and follows the order of appearance.
+func extractCVEIDs(text string) []IoC {
+	hits := findAll(reCVE, text)
+	found := make([]IoC, len(hits))
+	for i := range hits {
+		from, to := hits[i][0], hits[i][1]
+		found[i] = IoC{Type: "cve_id", Value: text[from:to], Start: from, End: to}
+	}
+	return found
+}
+
+// extractMACAddresses reports MAC addresses matched by reMAC. A candidate
+// that uses both ':' and '-' as separators is discarded.
+func extractMACAddresses(text string) []IoC {
+	var found []IoC
+	for _, span := range findAll(reMAC, text) {
+		from, to := span[0], span[1]
+		mac := text[from:to]
+
+		// The pattern accepts either separator at each position, so a
+		// consistent style has to be enforced here.
+		if contains(mac, ':') && contains(mac, '-') {
+			continue
+		}
+		found = append(found, IoC{Type: "mac_address", Value: mac, Start: from, End: to})
+	}
+	return found
+}
+
+// extractPhoneNumbers reports phone numbers in text, ordered by start offset.
+// Two passes are made: reE164 (kept when the candidate has 8 to 15 digits) and
+// reESLocal (kept when it has exactly 9 digits). A span already reported by an
+// earlier match is not reported again.
+func extractPhoneNumbers(text string) []IoC {
+	var found []IoC
+	taken := map[[2]int]bool{}
+
+	// collect runs one pattern and keeps the candidates whose digit count
+	// satisfies digitsOK.
+	collect := func(pattern *regexp.Regexp, digitsOK func(n int) bool) {
+		for _, span := range findAll(pattern, text) {
+			from, to := span[0], span[1]
+			number := text[from:to]
+
+			digitCount := len(reNonDigit.ReplaceAllString(number, ""))
+			if !digitsOK(digitCount) {
+				continue
+			}
+
+			id := [2]int{from, to}
+			if taken[id] {
+				continue
+			}
+			taken[id] = true
+
+			found = append(found, IoC{Type: "phone_number", Value: number, Start: from, End: to})
+		}
+	}
+
+	collect(reE164, func(n int) bool { return n >= 8 && n <= 15 })
+	collect(reESLocal, func(n int) bool { return n == 9 })
+
+	sort.Slice(found, func(a, b int) bool { return found[a].Start < found[b].Start })
+	return found
+}
+
+// --- pipeline ---
+
+// extractorOrder defines the canonical order for running extractors,
+// matching the Python _EXTRACTORS map order.
+//
+// ExtractIocs uses this slice when its types argument is nil. The order is
+// significant: results are concatenated in this order and then sorted with a
+// stable sort, so when two extractors report the exact same span the one
+// listed earlier here is the one that survives.
+var extractorOrder = []string{
+ "email",
+ "ip_address",
+ "crypto_wallet",
+ "cve_id",
+ "mac_address",
+ "file_hash",
+ "phone_number",
+ "domain",
+}
+
+// extractorFuncs maps each IoC type name to the function that extracts it.
+// ExtractIocs looks requested type names up here and skips any name that has
+// no entry.
+var extractorFuncs = map[string]func(string) []IoC{
+ "email": extractEmails,
+ "ip_address": extractIPAddresses,
+ "crypto_wallet": extractCryptoWallets,
+ "cve_id": extractCVEIDs,
+ "mac_address": extractMACAddresses,
+ "file_hash": extractFileHashes,
+ "phone_number": extractPhoneNumbers,
+ "domain": extractDomains,
+}
+
+// ExtractIocs extracts all IoCs from text and returns a deduplicated,
+// offset-sorted slice. If types is nil, all extractor types are run in
+// extractorOrder. Unknown type strings are silently ignored. A non-nil empty
+// types slice runs no extractor and yields no results.
+//
+// Deduplication happens on two levels:
+//
+//   - Span coverage: a span that lies fully inside (or exactly on) a span seen
+//     earlier in the sorted order is discarded, e.g. a domain inside an email.
+//     Exact-span ties keep the first match in extractor order. Coverage takes
+//     every span into account, including ones that are themselves dropped as
+//     value duplicates, so the domain inside a *repeated* email is discarded
+//     too instead of leaking out as a standalone domain.
+//   - (Type, Value): the same value of the same type is reported once; the
+//     first occurrence wins.
+//
+// The result is ordered by ascending Start.
+func ExtractIocs(text string, types []string) []IoC {
+	if types == nil {
+		types = extractorOrder
+	}
+
+	var raw []IoC
+	for _, t := range types {
+		if fn, ok := extractorFuncs[t]; ok {
+			raw = append(raw, fn(text)...)
+		}
+	}
+
+	// Sort: ascending start, then wider span first. The sort is stable so that
+	// identical spans keep the order in which the extractors produced them.
+	sort.SliceStable(raw, func(i, j int) bool {
+		if raw[i].Start != raw[j].Start {
+			return raw[i].Start < raw[j].Start
+		}
+		return raw[i].End > raw[j].End
+	})
+
+	seen := make(map[[2]string]bool, len(raw))
+	var deduped []IoC
+
+	// Every span processed so far starts at or before the current one, so the
+	// current span is contained in (or equal to) an earlier span exactly when
+	// its End does not exceed the largest End seen so far.
+	coveredEnd := -1
+	for _, m := range raw {
+		if m.End <= coveredEnd {
+			continue
+		}
+		coveredEnd = m.End
+
+		key := [2]string{m.Type, m.Value}
+		if seen[key] {
+			continue
+		}
+		seen[key] = true
+		deduped = append(deduped, m)
+	}
+
+	return deduped
+}
+
+// --- small string helpers (avoid importing strings to keep package lean) ---
+
+// indexOf returns the offset of the first byte equal to b in s, or -1 when s
+// does not contain b.
+func indexOf(s string, b byte) int {
+	for at, c := range []byte(s) {
+		if c == b {
+			return at
+		}
+	}
+	return -1
+}
+
+// countRune returns how many bytes of s are equal to b. Despite the name, the
+// comparison is done byte by byte, not rune by rune.
+func countRune(s string, b byte) int {
+	total := 0
+	for _, c := range []byte(s) {
+		if c == b {
+			total++
+		}
+	}
+	return total
+}
+
+// contains reports whether at least one byte of s equals b.
+func contains(s string, b byte) bool {
+	for _, c := range []byte(s) {
+		if c == b {
+			return true
+		}
+	}
+	return false
+}
+
+// lastPart returns the portion of s that follows the last occurrence of sep.
+// When s does not contain sep, s is returned unchanged.
+func lastPart(s string, sep byte) string {
+	cut := len(s)
+	// Walk backwards until the byte just before cut is the separator.
+	for cut > 0 && s[cut-1] != sep {
+		cut--
+	}
+	if cut == 0 {
+		return s
+	}
+	return s[cut:]
+}
+
+// toLower returns s with the ASCII letters A-Z replaced by a-z. All other
+// bytes, including non-ASCII ones, are copied unchanged.
+func toLower(s string) string {
+	lowered := []byte(s)
+	for i, c := range lowered {
+		if 'A' <= c && c <= 'Z' {
+			lowered[i] = c + ('a' - 'A')
+		}
+	}
+	return string(lowered)
+}
+
+func itoa(n int) string {
+ if n == 0 {
+ return "0"
+ }
+ buf := [20]byte{}
+ pos := 19
+ for n > 0 {
+ buf[pos] = byte('0' + n%10)
+ pos--
+ n /= 10
+ }
+ return string(buf[pos+1:])
+}
diff --git a/functions/cybersecurity/extract_iocs.md b/functions/cybersecurity/extract_iocs.md
new file mode 100644
index 00000000..1221b0a7
--- /dev/null
+++ b/functions/cybersecurity/extract_iocs.md
@@ -0,0 +1,75 @@
+---
+name: extract_iocs
+kind: function
+lang: go
+domain: cybersecurity
+version: "1.0.0"
+purity: pure
+signature: "func ExtractIocs(text string, types []string) []IoC"
+description: "Port a Go de extract_iocs_py_cybersecurity. Extrae todos los IoCs (email, ip_address, domain, file_hash, crypto_wallet, cve_id, mac_address, phone_number) de un texto usando regex puro. Si types es nil corre todos los extractores; si tiene valores filtra solo esos tipos. Deduplica por (Type, Value) y elimina spans contenidos (ej. dominio dentro de un email). Retorna slice ordenada por offset."
+tags: [ioc, cybersecurity, regex, threat-intel, email, ip, domain, hash, wallet, cve, mac, phone]
+uses_functions: []
+uses_types: [ioc_go_cybersecurity]
+returns: []
+returns_optional: false
+error_type: ""
+imports: ["net", "regexp", "sort"]
+params:
+ - name: text
+ desc: "Texto plano o markdown del que extraer IoCs. Puede contener cualquier contenido — se aplican todos los extractores sobre el texto completo."
+ - name: types
+ desc: "Slice de tipos a extraer. Valores validos: email, ip_address, domain, file_hash, crypto_wallet, cve_id, mac_address, phone_number. Si es nil, se corren todos los extractores. Tipos desconocidos se ignoran silenciosamente."
+output: "Slice de IoC ordenada por offset Start ascendente. Cada IoC tiene Type, Value, Start (byte offset inicio), End (byte offset fin) y opcionalmente Extra (algorithm para file_hash, asset para crypto_wallet). Sin duplicados: mismo (Type, Value) aparece una sola vez, y spans completamente contenidos dentro de otro span se descartan."
+tested: true
+tests:
+ - "texto sin IoCs retorna slice vacia"
+ - "un IoC de cada tipo detectado"
+ - "filtro por types=[email] retorna solo emails"
+ - "dedup mismo email aparece dos veces solo una entrada"
+ - "IPv4 valida detectada"
+ - "numero con octeto 999 no es IPv4"
+ - "numero con octeto 256 no es IPv4"
+ - "hash MD5 exactamente 32 hex chars detectado"
+ - "hash SHA1 exactamente 40 hex chars detectado"
+ - "hash SHA256 exactamente 64 hex chars detectado"
+ - "hash SHA512 exactamente 128 hex chars detectado"
+ - "longitud intermedia 60 hex chars ignorada"
+ - "dominio contenido en email span se descarta"
+ - "tipos desconocidos se ignoran sin error"
+ - "CVE-2014-0160 extraido"
+ - "multiples CVEs en mismo texto"
+ - "MAC con dos puntos extraida"
+ - "separadores mezclados rechazados"
+ - "E.164 con prefijo pais extraido"
+ - "formato ES 9 digitos extraido"
+ - "offsets Start/End cubren el valor exacto en el texto"
+ - "pipeline completo detecta email ip cve mac wallet"
+test_file_path: "functions/cybersecurity/extract_iocs_test.go"
+file_path: "functions/cybersecurity/extract_iocs.go"
+---
+
+## Ejemplo
+
+```go
+iocs := ExtractIocs("Contact alice@example.com, vuln CVE-2023-1234, ip 192.0.2.5", nil)
+// iocs[0] = IoC{Type:"email", Value:"alice@example.com", Start:8, End:25}
+// iocs[1] = IoC{Type:"cve_id", Value:"CVE-2023-1234", ...}
+// iocs[2] = IoC{Type:"ip_address", Value:"192.0.2.5", ...}
+
+// Solo emails:
+emails := ExtractIocs(text, []string{"email"})
+
+// Campo extra para hashes:
+// ioc.Extra["algorithm"] == "sha256"
+
+// Campo extra para wallets:
+// ioc.Extra["asset"] == "btc" | "eth"
+```
+
+## Notas
+
+Port directo de `extract_iocs_py_cybersecurity`. La validacion de IPv4/IPv6 usa `net.ParseIP` de stdlib, equivalente al modulo `ipaddress` de Python. Los regex son equivalentes a los Python — Go usa RE2 (sin lookahead ni lookbehind) por lo que los patrones de contexto se implementan con grupos capturadores y ajuste de offsets al grupo 1.
+
+La deduplicacion opera en dos niveles:
+1. (Type, Value) — el mismo valor del mismo tipo solo aparece una vez (primer match gana).
+2. Contencion de spans — si el span de un IoC queda completamente dentro del span de otro ya aceptado, se descarta (ej. "example.com" dentro de "alice@example.com").
diff --git a/functions/cybersecurity/extract_iocs_test.go b/functions/cybersecurity/extract_iocs_test.go
new file mode 100644
index 00000000..403d50bf
--- /dev/null
+++ b/functions/cybersecurity/extract_iocs_test.go
@@ -0,0 +1,292 @@
+package cybersecurity
+
+import (
+ "testing"
+)
+
+// ---- helpers ----
+
+// iocTypes returns the Type of each IoC, in the same order as the input.
+func iocTypes(iocs []IoC) []string {
+	kinds := make([]string, 0, len(iocs))
+	for i := range iocs {
+		kinds = append(kinds, iocs[i].Type)
+	}
+	return kinds
+}
+
+// iocValues returns the Value of each IoC, in the same order as the input.
+func iocValues(iocs []IoC) []string {
+	values := make([]string, 0, len(iocs))
+	for i := range iocs {
+		values = append(values, iocs[i].Value)
+	}
+	return values
+}
+
+// containsType reports whether at least one IoC in iocs has Type equal to t.
+func containsType(iocs []IoC, t string) bool {
+	for i := range iocs {
+		if iocs[i].Type == t {
+			return true
+		}
+	}
+	return false
+}
+
+// countType returns the number of IoCs in iocs whose Type equals t.
+func countType(iocs []IoC, t string) int {
+	total := 0
+	for i := range iocs {
+		if iocs[i].Type == t {
+			total++
+		}
+	}
+	return total
+}
+
+// ---- text sin IoCs ----
+
+func TestExtractIocs_texto_sin_iocs_retorna_slice_vacia(t *testing.T) {
+ t.Run("texto sin IoCs retorna slice vacia", func(t *testing.T) {
+ got := ExtractIocs("nothing interesting here, just plain words.", nil)
+ if len(got) != 0 {
+ t.Errorf("expected empty slice, got %v", got)
+ }
+ })
+}
+
+// ---- un IoC de cada tipo ----
+
+// A text holding one sample of every supported type must yield every type.
+func TestExtractIocs_un_ioc_de_cada_tipo(t *testing.T) {
+	t.Run("un IoC de cada tipo detectado", func(t *testing.T) {
+		sample := "email alice@example.com " +
+			"ip 192.0.2.1 " +
+			"hash 5d41402abc4b2a76b9719d911017c592 " +
+			"wallet 0x742d35Cc6634C0532925a3b844Bc9e7595f0bEb1 " +
+			"cve CVE-2023-1234 " +
+			"mac 00:1A:2B:3C:4D:5E " +
+			"phone +34612345678 " +
+			"domain api.example.com"
+
+		results := ExtractIocs(sample, nil)
+
+		expected := []string{
+			"email", "ip_address", "file_hash", "crypto_wallet",
+			"cve_id", "mac_address", "phone_number", "domain",
+		}
+		for _, want := range expected {
+			if containsType(results, want) {
+				continue
+			}
+			t.Errorf("expected type %q in results, got types: %v", want, iocTypes(results))
+		}
+	})
+}
+
+// ---- filtro por types ----
+
+// Restricting types to "email" must hide every other kind of IoC.
+func TestExtractIocs_filtro_por_types_solo_emails(t *testing.T) {
+	t.Run("filtro por types=[email] retorna solo emails", func(t *testing.T) {
+		results := ExtractIocs("alice@example.com 192.0.2.1", []string{"email"})
+
+		// Nothing but emails may come back...
+		for i := range results {
+			if kind := results[i].Type; kind != "email" {
+				t.Errorf("expected only email type, got %q", kind)
+			}
+		}
+		// ...and the email itself must be there.
+		if !containsType(results, "email") {
+			t.Errorf("expected at least one email IoC")
+		}
+	})
+}
+
+// ---- dedup por (Type, Value) ----
+
+// The same email written twice must be reported a single time.
+func TestExtractIocs_dedup_mismo_email_dos_veces_una_entrada(t *testing.T) {
+	t.Run("dedup mismo email aparece dos veces solo una entrada", func(t *testing.T) {
+		results := ExtractIocs("alice@example.com and alice@example.com again", []string{"email"})
+
+		if emails := countType(results, "email"); emails != 1 {
+			t.Errorf("expected 1 email after dedup, got %d: %v", emails, iocValues(results))
+		}
+	})
+}
+
+// ---- IPv4 valida vs numero que parece IP ----
+
+// Dotted quads are only IPv4 addresses when every octet is in range.
+func TestExtractIocs_ipv4_valida_vs_octeto_invalido(t *testing.T) {
+	onlyIPs := []string{"ip_address"}
+
+	t.Run("IPv4 valida detectada", func(t *testing.T) {
+		results := ExtractIocs("addr 10.0.0.1 end", onlyIPs)
+		if !containsType(results, "ip_address") {
+			t.Errorf("expected ip_address IoC for valid IPv4")
+		}
+	})
+
+	t.Run("numero con octeto 999 no es IPv4", func(t *testing.T) {
+		results := ExtractIocs("bad 999.999.999.999 end", onlyIPs)
+		if containsType(results, "ip_address") {
+			t.Errorf("expected no ip_address IoC for 999.999.999.999, got %v", results)
+		}
+	})
+
+	t.Run("numero con octeto 256 no es IPv4", func(t *testing.T) {
+		results := ExtractIocs("bad 256.0.0.1 end", onlyIPs)
+		if containsType(results, "ip_address") {
+			t.Errorf("expected no ip_address IoC for 256.0.0.1, got %v", results)
+		}
+	})
+}
+
+// ---- hashes exactamente 32/40/64 chars hex ----
+
+func TestExtractIocs_hashes_por_longitud(t *testing.T) {
+ md5val := "5d41402abc4b2a76b9719d911017c592" // 32
+ sha1val := "aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d" // 40
+ sha256val := "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" // 64
+ // 128 hex chars — SHA512 of empty string
+ sha512val := "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e"
+
+ t.Run("hash MD5 exactamente 32 hex chars detectado", func(t *testing.T) {
+ got := ExtractIocs(md5val, []string{"file_hash"})
+ if len(got) != 1 || got[0].Extra["algorithm"] != "md5" {
+ t.Errorf("expected md5 hash, got %v", got)
+ }
+ })
+
+ t.Run("hash SHA1 exactamente 40 hex chars detectado", func(t *testing.T) {
+ got := ExtractIocs(sha1val, []string{"file_hash"})
+ if len(got) != 1 || got[0].Extra["algorithm"] != "sha1" {
+ t.Errorf("expected sha1 hash, got %v", got)
+ }
+ })
+
+ t.Run("hash SHA256 exactamente 64 hex chars detectado", func(t *testing.T) {
+ got := ExtractIocs(sha256val, []string{"file_hash"})
+ if len(got) != 1 || got[0].Extra["algorithm"] != "sha256" {
+ t.Errorf("expected sha256 hash, got %v", got)
+ }
+ })
+
+ t.Run("hash SHA512 exactamente 128 hex chars detectado", func(t *testing.T) {
+ got := ExtractIocs(sha512val, []string{"file_hash"})
+ if len(got) != 1 || got[0].Extra["algorithm"] != "sha512" {
+ t.Errorf("expected sha512 hash, got %v", got)
+ }
+ })
+
+ t.Run("longitud intermedia 60 hex chars ignorada", func(t *testing.T) {
+ hex60 := "abcdef" + "abcdef" + "abcdef" + "abcdef" + "abcdef" + "abcdef" + "abcdef" + "abcdef" + "abcdef" + "abcdef" // 60 chars
+ got := ExtractIocs(hex60, []string{"file_hash"})
+ if len(got) != 0 {
+ t.Errorf("expected no hash for 60-char hex, got %v", got)
+ }
+ })
+}
+
+// ---- contenido: dominio dentro de email se descarta ----
+
+// The host part of an email must not be reported again as a domain.
+func TestExtractIocs_dominio_dentro_de_email_se_descarta(t *testing.T) {
+	t.Run("dominio contenido en email span se descarta", func(t *testing.T) {
+		results := ExtractIocs("Email: alice@example.com nothing else", nil)
+
+		hasDomain := containsType(results, "domain")
+		hasEmail := containsType(results, "email")
+
+		if hasDomain {
+			t.Errorf("expected domain to be deduplicated as contained by email span, got %v", results)
+		}
+		if !hasEmail {
+			t.Errorf("expected email IoC to be present")
+		}
+	})
+}
+
+// ---- tipos desconocidos se ignoran ----
+
+// Type names without an extractor are skipped; known ones still run.
+func TestExtractIocs_tipos_desconocidos_se_ignoran(t *testing.T) {
+	t.Run("tipos desconocidos se ignoran sin error", func(t *testing.T) {
+		requested := []string{"nonexistent", "email"}
+		results := ExtractIocs("alice@example.com", requested)
+
+		if !(len(results) == 1 && results[0].Type == "email") {
+			t.Errorf("expected exactly 1 email IoC, got %v", results)
+		}
+	})
+}
+
+// ---- CVE ----
+
+// CVE identifiers are extracted individually and in bulk.
+func TestExtractIocs_cve_ids(t *testing.T) {
+	onlyCVEs := []string{"cve_id"}
+
+	t.Run("CVE-2014-0160 extraido", func(t *testing.T) {
+		results := ExtractIocs("Patch CVE-2014-0160 immediately", onlyCVEs)
+		if !(len(results) == 1 && results[0].Value == "CVE-2014-0160") {
+			t.Errorf("expected CVE-2014-0160, got %v", results)
+		}
+	})
+
+	t.Run("multiples CVEs en mismo texto", func(t *testing.T) {
+		results := ExtractIocs("Affected: CVE-2021-44228, CVE-2021-45046, CVE-2021-45105", onlyCVEs)
+		if n := len(results); n != 3 {
+			t.Errorf("expected 3 CVEs, got %d: %v", n, iocValues(results))
+		}
+	})
+}
+
+// ---- MAC addresses ----
+
+// MAC addresses need a single, consistent separator.
+func TestExtractIocs_mac_addresses(t *testing.T) {
+	onlyMACs := []string{"mac_address"}
+
+	t.Run("MAC con dos puntos extraida", func(t *testing.T) {
+		results := ExtractIocs("iface 00:1A:2B:3C:4D:5E up", onlyMACs)
+		if !(len(results) == 1 && results[0].Value == "00:1A:2B:3C:4D:5E") {
+			t.Errorf("expected MAC 00:1A:2B:3C:4D:5E, got %v", results)
+		}
+	})
+
+	t.Run("separadores mezclados rechazados", func(t *testing.T) {
+		if results := ExtractIocs("00:1A-2B:3C-4D:5E", onlyMACs); len(results) != 0 {
+			t.Errorf("expected no MAC for mixed separators, got %v", results)
+		}
+	})
+}
+
+// ---- telefono ----
+
+// Both the international and the local 9-digit formats are recognised.
+func TestExtractIocs_phone_numbers(t *testing.T) {
+	onlyPhones := []string{"phone_number"}
+
+	t.Run("E.164 con prefijo pais extraido", func(t *testing.T) {
+		if results := ExtractIocs("call +34 612 345 678 now", onlyPhones); len(results) == 0 {
+			t.Errorf("expected phone_number IoC for +34 612 345 678")
+		}
+	})
+
+	t.Run("formato ES 9 digitos extraido", func(t *testing.T) {
+		if results := ExtractIocs("directo 612345678 fijo", onlyPhones); len(results) == 0 {
+			t.Errorf("expected phone_number IoC for 612345678")
+		}
+	})
+}
+
+// ---- offsets son correctos ----
+
+func TestExtractIocs_offsets_correctos(t *testing.T) {
+ t.Run("offsets Start/End cubren el valor exacto en el texto", func(t *testing.T) {
+ text := "contact alice@example.com for info"
+ got := ExtractIocs(text, []string{"email"})
+ if len(got) == 0 {
+ t.Fatal("expected at least one email IoC")
+ }
+ ioc := got[0]
+ extracted := text[ioc.Start:ioc.End]
+ if extracted != ioc.Value {
+ t.Errorf("text[%d:%d] = %q, want %q", ioc.Start, ioc.End, extracted, ioc.Value)
+ }
+ })
+}
+
+// ---- pipeline completo (equivalente al test Python) ----
+
+// End-to-end run over a sentence that mixes several indicator types.
+func TestExtractIocs_pipeline_completo(t *testing.T) {
+	t.Run("pipeline completo detecta email ip cve mac wallet", func(t *testing.T) {
+		sample := "Reach alice@example.com from 10.0.0.5; " +
+			"CVE-2023-1234 vendor 00:1A:2B:3C:4D:5E " +
+			"wallet 0x742d35Cc6634C0532925a3b844Bc9e7595f0bEb1"
+
+		results := ExtractIocs(sample, nil)
+
+		expected := []string{"email", "ip_address", "cve_id", "mac_address", "crypto_wallet"}
+		for _, want := range expected {
+			if containsType(results, want) {
+				continue
+			}
+			t.Errorf("expected type %q in full pipeline results, types present: %v", want, iocTypes(results))
+		}
+	})
+}
diff --git a/types/cybersecurity/ioc.md b/types/cybersecurity/ioc.md
new file mode 100644
index 00000000..4f3173ff
--- /dev/null
+++ b/types/cybersecurity/ioc.md
@@ -0,0 +1,25 @@
+---
+name: ioc
+lang: go
+domain: cybersecurity
+version: "1.0.0"
+algebraic: product
+definition: |
+ type IoC struct {
+ Type string
+ Value string
+ Start int
+ End int
+ Extra map[string]string
+ }
+description: "Indicador de Compromiso extraido de texto. Type es uno de: email, ip_address, domain, file_hash, crypto_wallet, cve_id, mac_address, phone_number. Start y End son byte offsets en el texto original. Extra contiene campos adicionales dependientes del tipo (algorithm para file_hash, asset para crypto_wallet)."
+tags: [ioc, cybersecurity, indicator, threat-intel]
+uses_types: []
+file_path: "functions/cybersecurity/extract_iocs.go"
+---
+
+## Notas
+
+El struct IoC es el tipo de retorno de `ExtractIocs`. El campo `Extra` es nil para la mayoria de tipos; solo se puebla para:
+- `file_hash`: `Extra["algorithm"]` = "md5" | "sha1" | "sha256" | "sha512"
+- `crypto_wallet`: `Extra["asset"]` = "btc" | "eth"