feat(go): html_to_markdown + extract_iocs

functions/core/html_to_markdown: convierte HTML a Markdown limpio (golang-only, sin dependencias externas). Útil como preparación para LLMs y para indexar contenido web. functions/cybersecurity/extract_iocs + types/cybersecurity/ioc: extrae indicators of compromise (IPs, domains, URLs, hashes, emails, CVEs, crypto wallets) de texto libre. Devuelve []IoC tipado. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-04 11:51:51 +02:00
parent 3de82c53c1
commit b04bb846c7
7 changed files with 1471 additions and 0 deletions
@@ -0,0 +1,494 @@
+package cybersecurity
+
+import (
+	"net"
+	"regexp"
+	"sort"
+)
+
+// IoC represents a single Indicator of Compromise extracted from text.
+//
+// Type is one of: "email", "ip_address", "domain", "file_hash",
+// "crypto_wallet", "cve_id", "mac_address", "phone_number".
+// Value is the matched text exactly as it appears in the input.
+// Start and End are byte offsets into the original text; the extractors in
+// this file always set them so that text[Start:End] == Value.
+// Extra holds type-specific fields ("algorithm" for file_hash, "asset" for
+// crypto_wallet); the other extractors in this file leave it nil.
+type IoC struct {
+	Type  string
+	Value string
+	Start int
+	End   int
+	Extra map[string]string // optional: algorithm, asset, etc.
+}
+
+// --- compiled regexes (module-level, compiled once) ---
+
+// Most patterns below wrap the value in capture group 1 and surround it with
+// one-character guards of the form (?:^|[^...]) and (?:[^...]|$). RE2 has no
+// lookaround, so the guards are part of the overall match and callers must
+// read the offsets of group 1 rather than of the whole match.
+var (
+	// IPv4 candidate: four dot-separated runs of 1-3 digits. Octet ranges are
+	// not checked here.
+	reIPv4 = regexp.MustCompile(`\b\d{1,3}(?:\.\d{1,3}){3}\b`)
+
+	// IPv6 candidate: up to four hex digits followed by 2-7 further
+	// ":"-prefixed groups (each possibly empty, so "::" is allowed), plus an
+	// optional "%zone" suffix.
+	reIPv6 = regexp.MustCompile(
+		`(?:^|[^0-9A-Fa-f:])` +
+			`([0-9A-Fa-f]{0,4}(?::[0-9A-Fa-f]{0,4}){2,7}(?:%[0-9A-Za-z]+)?)` +
+			`(?:[^0-9A-Fa-f:]|$)`,
+	)
+
+	// E-mail: local part, "@", then at least two dot-separated host labels
+	// that start and end with an alphanumeric character.
+	reEmail = regexp.MustCompile(
+		`(?:^|[^A-Za-z0-9._%+\-])` +
+			`([A-Za-z0-9._%+\-]+@[A-Za-z0-9](?:[A-Za-z0-9\-]*[A-Za-z0-9])?` +
+			`(?:\.[A-Za-z0-9](?:[A-Za-z0-9\-]*[A-Za-z0-9])?)+)` +
+			`(?:[^A-Za-z0-9._%+\-]|$)`,
+	)
+
+	// Domain label: 1-63 characters, starts and ends with an alphanumeric
+	// character, hyphens allowed in between.
+	domainLabel = `[A-Za-z0-9](?:[A-Za-z0-9\-]{0,61}[A-Za-z0-9])?`
+	// Domain: one or more labels, each followed by a dot, then an alphabetic
+	// final label of 2-63 letters.
+	reDomain = regexp.MustCompile(
+		`(?:^|[^A-Za-z0-9.\-])` +
+			`((?:` + domainLabel + `\.)+[A-Za-z]{2,63})` +
+			`(?:[^A-Za-z0-9.\-]|$)`,
+	)
+
+	// Hex run of 32 to 128 characters delimited by word boundaries. The
+	// pattern accepts every length in that range, not only 32/40/64/128.
+	reHash = regexp.MustCompile(`\b([A-Fa-f0-9]{32,128})\b`)
+
+	// Crypto wallets.
+	// Bitcoin legacy: "1" or "3" followed by 25-34 characters from the class
+	// below (digits and letters without 0, O, I and l).
+	reBTCLegacy = regexp.MustCompile(`(?:^|[^A-Za-z0-9])([13][1-9A-HJ-NP-Za-km-z]{25,34})(?:[^A-Za-z0-9]|$)`)
+	// Bitcoin bech32: "bc1" followed by 6-87 characters from the class below.
+	reBTCBech32 = regexp.MustCompile(`(?:^|[^A-Za-z0-9])(bc1[02-9ac-hj-np-z]{6,87})(?:[^A-Za-z0-9]|$)`)
+	// Ethereum: "0x" followed by exactly 40 hex digits.
+	reETH = regexp.MustCompile(`(?:^|[^A-Za-z0-9])(0x[a-fA-F0-9]{40})(?:[^A-Za-z0-9]|$)`)
+
+	// CVE identifier: upper-case "CVE-", a 4-digit year, "-", 4-7 digits.
+	reCVE = regexp.MustCompile(`(?:^|[^A-Za-z0-9])(CVE-\d{4}-\d{4,7})(?:[^A-Za-z0-9]|$)`)
+
+	// MAC address: six hex pairs separated by ":" or "-". The pattern itself
+	// allows the two separators to be mixed.
+	reMAC = regexp.MustCompile(
+		`(?:^|[^A-Fa-f0-9:\-])([A-Fa-f0-9]{2}[:\-](?:[A-Fa-f0-9]{2}[:\-]){4}[A-Fa-f0-9]{2})(?:[^A-Fa-f0-9:\-]|$)`,
+	)
+
+	// International phone number: "+", 1-3 digits, then 2-5 groups of 1-4
+	// digits, each optionally preceded by one whitespace character or hyphen.
+	reE164 = regexp.MustCompile(
+		`(?:^|[^A-Za-z0-9])(\+\d{1,3}[\s\-]?\d{1,4}(?:[\s\-]?\d{1,4}){1,4})(?:[^A-Za-z0-9]|$)`,
+	)
+	// Local 9-digit number starting with 6, 7, 8 or 9, grouped 3-3-3 with an
+	// optional whitespace character or hyphen between groups.
+	reESLocal = regexp.MustCompile(
+		`(?:^|[^A-Za-z0-9+])([6789]\d{2}[\s\-]?\d{3}[\s\-]?\d{3})(?:[^A-Za-z0-9]|$)`,
+	)
+
+	// Any single character that is not an ASCII digit.
+	reNonDigit = regexp.MustCompile(`[^0-9]`)
+)
+
+// validTLDs is the static allow-list of top-level domains accepted by
+// extractDomains; keys are lower-case. A candidate whose final label is not
+// in this set is discarded. Per the original note it mirrors the set used by
+// the Python implementation (not verifiable from this file).
+var validTLDs = map[string]bool{
+	// original gTLD
+	"com": true, "org": true, "net": true, "edu": true, "gov": true, "mil": true, "int": true,
+	// common gTLD
+	"info": true, "biz": true, "name": true, "pro": true, "mobi": true, "asia": true,
+	"jobs": true, "tel": true, "travel": true, "xxx": true, "post": true,
+	// popular new gTLD, plus ccTLDs that are commonly used generically
+	// (io, ai, co, me, tv)
+	"app": true, "dev": true, "io": true, "ai": true, "tech": true, "cloud": true,
+	"online": true, "site": true, "store": true, "xyz": true, "top": true, "shop": true,
+	"club": true, "fun": true, "live": true, "blog": true, "page": true, "news": true,
+	"media": true, "design": true, "studio": true, "agency": true, "co": true, "me": true, "tv": true,
+	// ccTLD
+	"us": true, "uk": true, "de": true, "fr": true, "es": true, "it": true, "nl": true,
+	"be": true, "se": true, "no": true, "fi": true, "dk": true, "ru": true, "ua": true,
+	"pl": true, "cz": true, "ch": true, "at": true, "pt": true, "gr": true, "ie": true,
+	"tr": true, "ca": true, "mx": true, "br": true, "ar": true, "cl": true, "pe": true,
+	"ve": true, "uy": true, "cn": true, "jp": true, "kr": true, "in": true, "id": true,
+	"th": true, "vn": true, "my": true, "sg": true, "ph": true, "tw": true, "hk": true,
+	"au": true, "nz": true, "za": true, "eg": true, "ma": true, "ng": true, "ke": true,
+	"il": true, "ae": true, "sa": true, "qa": true, "eu": true,
+}
+
+// hashLengths pairs each accepted hex-digest length (as a decimal string)
+// with the algorithm name reported for it. extractFileHashes compares the
+// candidate's exact length against the first element of each pair, so a hex
+// run of any other length is rejected and the order of the entries does not
+// influence which algorithm is chosen.
+var hashLengths = [][2]string{
+	{"128", "sha512"},
+	{"64", "sha256"},
+	{"40", "sha1"},
+	{"32", "md5"},
+}
+
+// --- helper: find submatch positions accounting for leading/trailing context chars ---
+
+// findAll returns the byte span of capture group 1 for every match of re in
+// text, in ascending order. Each element is {start, end, 0}; the third slot is
+// always zero. Matches in which group 1 did not participate are skipped.
+//
+// re is expected to wrap the value in group 1 and to guard it with consuming
+// one-character context on both sides, e.g. (?:^|[^a-z])(value)(?:[^a-z]|$).
+// Because those guards are part of the overall match, a plain
+// FindAllStringSubmatchIndex would let the trailing guard of one match eat the
+// only delimiter before the next value and silently drop it ("a@b.co c@d.co"
+// would yield a single e-mail). To avoid that, scanning resumes at the end of
+// group 1 rather than at the end of the whole match.
+func findAll(re *regexp.Regexp, text string) [][3]int {
+	out := [][3]int{}
+	from := 0 // absolute offset at which the next captured value may begin
+	for from <= len(text) {
+		// Start one character before `from` so that the pattern's leading
+		// guard inspects the real predecessor instead of "^" matching at an
+		// artificial start of input.
+		base := from
+		if base > 0 {
+			base--
+			for base > 0 && text[base]&0xC0 == 0x80 {
+				base-- // do not cut a multi-byte UTF-8 sequence in half
+			}
+		}
+
+		m := re.FindStringSubmatchIndex(text[base:])
+		if m == nil {
+			break
+		}
+
+		if len(m) < 4 || m[2] < 0 {
+			// Group 1 did not take part in this match: skip past it.
+			next := base + m[1]
+			if next <= from {
+				next = from + 1
+			}
+			from = next
+			continue
+		}
+
+		start, end := base+m[2], base+m[3]
+		if start < from {
+			// The value would begin inside the context character, i.e. it
+			// overlaps the previous span (only possible via "^" at the slice
+			// start). Discard it and move on.
+			from++
+			continue
+		}
+
+		out = append(out, [3]int{start, end, 0})
+		if end <= from {
+			end = from + 1 // guarantee progress on an empty capture
+		}
+		from = end
+	}
+	return out
+}
+
+// --- individual extractors ---
+
+// extractEmails returns one "email" IoC per span reported by findAll for
+// reEmail, in the order findAll returns them.
+func extractEmails(text string) []IoC {
+	matches := findAll(reEmail, text)
+	emails := make([]IoC, len(matches))
+	for i, span := range matches {
+		begin, end := span[0], span[1]
+		emails[i] = IoC{
+			Type:  "email",
+			Value: text[begin:end],
+			Start: begin,
+			End:   end,
+		}
+	}
+	return emails
+}
+
+// extractIPAddresses returns every IPv4 and IPv6 address in text as an
+// "ip_address" IoC, sorted by Start. Regex candidates are only kept when
+// net.ParseIP accepts them.
+func extractIPAddresses(text string) []IoC {
+	var found []IoC
+	record := func(begin, end int) {
+		found = append(found, IoC{
+			Type:  "ip_address",
+			Value: text[begin:end],
+			Start: begin,
+			End:   end,
+		})
+	}
+
+	// IPv4: the regex only checks the dotted shape; ParseIP rejects
+	// out-of-range octets.
+	for _, loc := range reIPv4.FindAllStringIndex(text, -1) {
+		parsed := net.ParseIP(text[loc[0]:loc[1]])
+		if parsed != nil && parsed.To4() != nil {
+			record(loc[0], loc[1])
+		}
+	}
+
+	// IPv6: candidates come from capture group 1 of reIPv6.
+	for _, span := range findAll(reIPv6, text) {
+		// The reported Value keeps any "%zone" suffix, but only the part
+		// before it is validated.
+		addr := text[span[0]:span[1]]
+		if pct := indexOf(addr, '%'); pct >= 0 {
+			addr = addr[:pct]
+		}
+		if countRune(addr, ':') < 2 {
+			continue
+		}
+		parsed := net.ParseIP(addr)
+		// To4() != nil means the value is representable as IPv4; those are
+		// left to the IPv4 pass above.
+		if parsed == nil || parsed.To4() != nil {
+			continue
+		}
+		record(span[0], span[1])
+	}
+
+	sort.Slice(found, func(a, b int) bool { return found[a].Start < found[b].Start })
+	return found
+}
+
+// extractDomains returns a "domain" IoC for every reDomain candidate whose
+// final label, lower-cased, is present in validTLDs.
+func extractDomains(text string) []IoC {
+	var domains []IoC
+	for _, span := range findAll(reDomain, text) {
+		begin, end := span[0], span[1]
+		name := text[begin:end]
+		// Filter on the allow-list to drop look-alikes such as file names.
+		if suffix := toLower(lastPart(name, '.')); validTLDs[suffix] {
+			domains = append(domains, IoC{
+				Type:  "domain",
+				Value: name,
+				Start: begin,
+				End:   end,
+			})
+		}
+	}
+	return domains
+}
+
+// extractFileHashes returns a "file_hash" IoC for every hex run matched by
+// reHash whose length is listed in hashLengths. The algorithm name from that
+// table is stored under Extra["algorithm"].
+func extractFileHashes(text string) []IoC {
+	var hashes []IoC
+	for _, loc := range reHash.FindAllStringSubmatchIndex(text, -1) {
+		if len(loc) < 4 || loc[2] < 0 {
+			continue
+		}
+		begin, end := loc[2], loc[3]
+
+		// hashLengths stores lengths as decimal strings, so compare in that form.
+		size := itoa(end - begin)
+		name := ""
+		for _, entry := range hashLengths {
+			if entry[0] == size {
+				name = entry[1]
+				break
+			}
+		}
+		if name == "" {
+			continue // length does not correspond to a known digest
+		}
+
+		hashes = append(hashes, IoC{
+			Type:  "file_hash",
+			Value: text[begin:end],
+			Start: begin,
+			End:   end,
+			Extra: map[string]string{"algorithm": name},
+		})
+	}
+	return hashes
+}
+
+// extractCryptoWallets returns "crypto_wallet" IoCs for Bitcoin (legacy and
+// bech32 patterns) and Ethereum addresses, sorted by Start. The asset ticker
+// ("btc" or "eth") is stored under Extra["asset"].
+func extractCryptoWallets(text string) []IoC {
+	// patterns[i] yields wallets of asset assets[i].
+	patterns := []*regexp.Regexp{reBTCLegacy, reBTCBech32, reETH}
+	assets := []string{"btc", "btc", "eth"}
+
+	var wallets []IoC
+	for i, re := range patterns {
+		for _, span := range findAll(re, text) {
+			begin, end := span[0], span[1]
+			wallets = append(wallets, IoC{
+				Type:  "crypto_wallet",
+				Value: text[begin:end],
+				Start: begin,
+				End:   end,
+				Extra: map[string]string{"asset": assets[i]},
+			})
+		}
+	}
+
+	sort.Slice(wallets, func(a, b int) bool { return wallets[a].Start < wallets[b].Start })
+	return wallets
+}
+
+// extractCVEIDs returns one "cve_id" IoC per span reported by findAll for
+// reCVE, in the order findAll returns them.
+func extractCVEIDs(text string) []IoC {
+	matches := findAll(reCVE, text)
+	ids := make([]IoC, 0, len(matches))
+	for i := range matches {
+		begin, end := matches[i][0], matches[i][1]
+		ids = append(ids, IoC{Type: "cve_id", Value: text[begin:end], Start: begin, End: end})
+	}
+	return ids
+}
+
+// extractMACAddresses returns a "mac_address" IoC for every reMAC candidate
+// that uses a single separator style. The pattern allows ":" and "-" to be
+// mixed, so candidates containing both are dropped here.
+func extractMACAddresses(text string) []IoC {
+	var macs []IoC
+	for _, span := range findAll(reMAC, text) {
+		begin, end := span[0], span[1]
+		mac := text[begin:end]
+		if contains(mac, ':') && contains(mac, '-') {
+			continue // mixed separators
+		}
+		macs = append(macs, IoC{
+			Type:  "mac_address",
+			Value: mac,
+			Start: begin,
+			End:   end,
+		})
+	}
+	return macs
+}
+
+// extractPhoneNumbers returns "phone_number" IoCs sorted by Start. Two
+// patterns are applied in turn: reE164 (kept when the value contains 8 to 15
+// digits) and reESLocal (kept when it contains exactly 9 digits). A span is
+// reported at most once.
+func extractPhoneNumbers(text string) []IoC {
+	taken := map[[2]int]bool{} // spans already reported
+	var phones []IoC
+
+	// collect appends every match of re whose digit count satisfies accept.
+	collect := func(re *regexp.Regexp, accept func(digits int) bool) {
+		for _, span := range findAll(re, text) {
+			begin, end := span[0], span[1]
+			number := text[begin:end]
+			// Separators and the leading "+" do not count towards the length.
+			if !accept(len(reNonDigit.ReplaceAllString(number, ""))) {
+				continue
+			}
+			where := [2]int{begin, end}
+			if taken[where] {
+				continue
+			}
+			taken[where] = true
+			phones = append(phones, IoC{
+				Type:  "phone_number",
+				Value: number,
+				Start: begin,
+				End:   end,
+			})
+		}
+	}
+
+	collect(reE164, func(digits int) bool { return digits >= 8 && digits <= 15 })
+	collect(reESLocal, func(digits int) bool { return digits == 9 })
+
+	sort.Slice(phones, func(a, b int) bool { return phones[a].Start < phones[b].Start })
+	return phones
+}
+
+// --- pipeline ---
+
+// extractorOrder is the list of extractor type names that ExtractIocs runs,
+// in this order, when the caller passes a nil types slice. The order matters:
+// ExtractIocs sorts results stably, so when two extractors report the exact
+// same span the type listed earlier here is the one that survives.
+// Per the original note it follows the Python _EXTRACTORS map order (not
+// verifiable from this file).
+var extractorOrder = []string{
+	"email",
+	"ip_address",
+	"crypto_wallet",
+	"cve_id",
+	"mac_address",
+	"file_hash",
+	"phone_number",
+	"domain",
+}
+
+// extractorFuncs maps each supported IoC type name to the function that
+// extracts values of that type from a text. ExtractIocs looks requested type
+// names up here and ignores names that are not present.
+var extractorFuncs = map[string]func(string) []IoC{
+	"email":         extractEmails,
+	"ip_address":    extractIPAddresses,
+	"crypto_wallet": extractCryptoWallets,
+	"cve_id":        extractCVEIDs,
+	"mac_address":   extractMACAddresses,
+	"file_hash":     extractFileHashes,
+	"phone_number":  extractPhoneNumbers,
+	"domain":        extractDomains,
+}
+
+// ExtractIocs extracts all IoCs from text and returns a deduplicated,
+// offset-sorted slice.
+//
+// types selects the extractors to run, by type name. If types is nil, every
+// extractor in extractorOrder is run; a non-nil empty slice runs none.
+// Unknown type strings are silently ignored and a type listed more than once
+// is run only once.
+//
+// Results are ordered by ascending Start and, for equal Start, by descending
+// length. Deduplication then applies three rules while walking that order:
+//   - a (Type, Value) pair that has already been accepted is dropped, so a
+//     value repeated later in the text is reported once, at its first
+//     accepted position;
+//   - a span fully contained within an already-accepted span is dropped
+//     (e.g. the domain inside an e-mail address);
+//   - on an exact-span tie the IoC whose type comes first in types wins.
+//
+// The result is nil when nothing is found.
+func ExtractIocs(text string, types []string) []IoC {
+	if types == nil {
+		types = extractorOrder
+	}
+
+	// Run each requested extractor once, in the requested order. The order is
+	// significant because the stable sort below preserves it for equal spans.
+	var raw []IoC
+	ran := make(map[string]bool, len(types))
+	for _, t := range types {
+		fn, ok := extractorFuncs[t]
+		if !ok || ran[t] {
+			continue
+		}
+		ran[t] = true
+		raw = append(raw, fn(text)...)
+	}
+
+	// Sort: ascending start, then descending length (wider span first).
+	sort.SliceStable(raw, func(i, j int) bool {
+		si, sj := raw[i], raw[j]
+		if si.Start != sj.Start {
+			return si.Start < sj.Start
+		}
+		return (si.End - si.Start) > (sj.End - sj.Start)
+	})
+
+	seen := make(map[[2]string]bool, len(raw))
+	var deduped []IoC
+	// maxEnd is the furthest End among accepted spans. Every accepted span
+	// starts at or before the current one (input is sorted by Start), so the
+	// current span is contained in, or identical to, an accepted span exactly
+	// when its End does not exceed maxEnd. This replaces a scan over all
+	// accepted spans.
+	maxEnd := -1
+	for _, m := range raw {
+		key := [2]string{m.Type, m.Value}
+		if seen[key] {
+			continue
+		}
+		if m.End <= maxEnd {
+			continue // contained in, or same span as, an accepted IoC
+		}
+		seen[key] = true
+		deduped = append(deduped, m)
+		maxEnd = m.End
+	}
+
+	return deduped
+}
+
+// --- small string helpers (avoid importing strings to keep package lean) ---
+
+// indexOf returns the index of the first byte in s equal to b, or -1 when b
+// does not occur.
+func indexOf(s string, b byte) int {
+	for i, c := range []byte(s) {
+		if c == b {
+			return i
+		}
+	}
+	return -1
+}
+
+// countRune returns how many bytes of s are equal to b. Despite the name it
+// compares bytes, not runes.
+func countRune(s string, b byte) int {
+	total := 0
+	for _, c := range []byte(s) {
+		if c == b {
+			total++
+		}
+	}
+	return total
+}
+
+// contains reports whether the byte b occurs anywhere in s.
+func contains(s string, b byte) bool {
+	return indexOf(s, b) != -1
+}
+
+// lastPart returns the portion of s after the last occurrence of sep, or s
+// itself when sep does not occur.
+func lastPart(s string, sep byte) string {
+	// Walk back from the end until the byte just before the cut is sep.
+	cut := len(s)
+	for cut > 0 && s[cut-1] != sep {
+		cut--
+	}
+	if cut == 0 {
+		return s // no separator found
+	}
+	return s[cut:]
+}
+
+// toLower returns a copy of s with the ASCII letters A-Z replaced by a-z.
+// All other bytes, including non-ASCII ones, are left untouched.
+func toLower(s string) string {
+	out := []byte(s)
+	for i, c := range out {
+		if 'A' <= c && c <= 'Z' {
+			out[i] = c + ('a' - 'A')
+		}
+	}
+	return string(out)
+}
+
+func itoa(n int) string {
+	if n == 0 {
+		return "0"
+	}
+	buf := [20]byte{}
+	pos := 19
+	for n > 0 {
+		buf[pos] = byte('0' + n%10)
+		pos--
+		n /= 10
+	}
+	return string(buf[pos+1:])
+}
@@ -0,0 +1,75 @@
+---
+name: extract_iocs
+kind: function
+lang: go
+domain: cybersecurity
+version: "1.0.0"
+purity: pure
+signature: "func ExtractIocs(text string, types []string) []IoC"
+description: "Port a Go de extract_iocs_py_cybersecurity. Extrae todos los IoCs (email, ip_address, domain, file_hash, crypto_wallet, cve_id, mac_address, phone_number) de un texto usando regex puro. Si types es nil corre todos los extractores; si tiene valores filtra solo esos tipos. Deduplica por (Type, Value) y elimina spans contenidos (ej. dominio dentro de un email). Retorna slice ordenada por offset."
+tags: [ioc, cybersecurity, regex, threat-intel, email, ip, domain, hash, wallet, cve, mac, phone]
+uses_functions: []
+uses_types: [ioc_go_cybersecurity]
+returns: []
+returns_optional: false
+error_type: ""
+imports: ["net", "regexp", "sort"]
+params:
+  - name: text
+    desc: "Texto plano o markdown del que extraer IoCs. Puede contener cualquier contenido — se aplican todos los extractores sobre el texto completo."
+  - name: types
+    desc: "Slice de tipos a extraer. Valores validos: email, ip_address, domain, file_hash, crypto_wallet, cve_id, mac_address, phone_number. Si es nil, se corren todos los extractores. Tipos desconocidos se ignoran silenciosamente."
+output: "Slice de IoC ordenada por offset Start ascendente. Cada IoC tiene Type, Value, Start (byte offset inicio), End (byte offset fin) y opcionalmente Extra (algorithm para file_hash, asset para crypto_wallet). Sin duplicados: mismo (Type, Value) aparece una sola vez, y spans completamente contenidos dentro de otro span se descartan."
+tested: true
+tests:
+  - "texto sin IoCs retorna slice vacia"
+  - "un IoC de cada tipo detectado"
+  - "filtro por types=[email] retorna solo emails"
+  - "dedup mismo email aparece dos veces solo una entrada"
+  - "IPv4 valida detectada"
+  - "numero con octeto 999 no es IPv4"
+  - "numero con octeto 256 no es IPv4"
+  - "hash MD5 exactamente 32 hex chars detectado"
+  - "hash SHA1 exactamente 40 hex chars detectado"
+  - "hash SHA256 exactamente 64 hex chars detectado"
+  - "hash SHA512 exactamente 128 hex chars detectado"
+  - "longitud intermedia 60 hex chars ignorada"
+  - "dominio contenido en email span se descarta"
+  - "tipos desconocidos se ignoran sin error"
+  - "CVE-2014-0160 extraido"
+  - "multiples CVEs en mismo texto"
+  - "MAC con dos puntos extraida"
+  - "separadores mezclados rechazados"
+  - "E.164 con prefijo pais extraido"
+  - "formato ES 9 digitos extraido"
+  - "offsets Start/End cubren el valor exacto en el texto"
+  - "pipeline completo detecta email ip cve mac wallet"
+test_file_path: "functions/cybersecurity/extract_iocs_test.go"
+file_path: "functions/cybersecurity/extract_iocs.go"
+---
+
+## Ejemplo
+
+```go
+iocs := ExtractIocs("Contact alice@example.com, vuln CVE-2023-1234, ip 192.0.2.5", nil)
+// iocs[0] = IoC{Type:"email", Value:"alice@example.com", Start:8, End:25}
+// iocs[1] = IoC{Type:"cve_id", Value:"CVE-2023-1234", ...}
+// iocs[2] = IoC{Type:"ip_address", Value:"192.0.2.5", ...}
+
+// Solo emails:
+emails := ExtractIocs(text, []string{"email"})
+
+// Campo extra para hashes:
+// ioc.Extra["algorithm"] == "sha256"
+
+// Campo extra para wallets:
+// ioc.Extra["asset"] == "btc" | "eth"
+```
+
+## Notas
+
+Port directo de `extract_iocs_py_cybersecurity`. La validacion de IPv4/IPv6 usa `net.ParseIP` de stdlib, equivalente al modulo `ipaddress` de Python. Los regex son equivalentes a los Python — Go usa RE2, que no soporta lookahead ni lookbehind, por lo que los patrones de contexto se implementan con grupos capturadores y ajuste de offsets al grupo 1.
+
+La deduplicacion opera en dos niveles:
+1. (Type, Value) — el mismo valor del mismo tipo solo aparece una vez (primer match gana).
+2. Contencion de spans — si el span de un IoC queda completamente dentro del span de otro ya aceptado, se descarta (ej. "example.com" dentro de "alice@example.com").
@@ -0,0 +1,292 @@
+package cybersecurity
+
+import (
+	"testing"
+)
+
+// ---- helpers ----
+
+// iocTypes returns the Type of every IoC, in input order.
+func iocTypes(iocs []IoC) []string {
+	kinds := make([]string, 0, len(iocs))
+	for i := range iocs {
+		kinds = append(kinds, iocs[i].Type)
+	}
+	return kinds
+}
+
+// iocValues returns the Value of every IoC, in input order.
+func iocValues(iocs []IoC) []string {
+	values := make([]string, 0, len(iocs))
+	for i := range iocs {
+		values = append(values, iocs[i].Value)
+	}
+	return values
+}
+
+// containsType reports whether at least one IoC has Type t.
+func containsType(iocs []IoC, t string) bool {
+	for i := 0; i < len(iocs); i++ {
+		if iocs[i].Type == t {
+			return true
+		}
+	}
+	return false
+}
+
+// countType returns how many IoCs have Type t.
+func countType(iocs []IoC, t string) int {
+	total := 0
+	for i := range iocs {
+		if iocs[i].Type != t {
+			continue
+		}
+		total++
+	}
+	return total
+}
+
+// ---- text sin IoCs ----
+
+func TestExtractIocs_texto_sin_iocs_retorna_slice_vacia(t *testing.T) {
+	t.Run("texto sin IoCs retorna slice vacia", func(t *testing.T) {
+		got := ExtractIocs("nothing interesting here, just plain words.", nil)
+		if len(got) != 0 {
+			t.Errorf("expected empty slice, got %v", got)
+		}
+	})
+}
+
+// ---- un IoC de cada tipo ----
+
+func TestExtractIocs_un_ioc_de_cada_tipo(t *testing.T) {
+	t.Run("un IoC de cada tipo detectado", func(t *testing.T) {
+		text := "email alice@example.com " +
+			"ip 192.0.2.1 " +
+			"hash 5d41402abc4b2a76b9719d911017c592 " +
+			"wallet 0x742d35Cc6634C0532925a3b844Bc9e7595f0bEb1 " +
+			"cve CVE-2023-1234 " +
+			"mac 00:1A:2B:3C:4D:5E " +
+			"phone +34612345678 " +
+			"domain api.example.com"
+
+		got := ExtractIocs(text, nil)
+		wantTypes := []string{"email", "ip_address", "file_hash", "crypto_wallet", "cve_id", "mac_address", "phone_number", "domain"}
+		for _, wt := range wantTypes {
+			if !containsType(got, wt) {
+				t.Errorf("expected type %q in results, got types: %v", wt, iocTypes(got))
+			}
+		}
+	})
+}
+
+// ---- filtro por types ----
+
+// Restricting types to "email" must hide the IP address in the same text.
+func TestExtractIocs_filtro_por_types_solo_emails(t *testing.T) {
+	t.Run("filtro por types=[email] retorna solo emails", func(t *testing.T) {
+		found := ExtractIocs("alice@example.com 192.0.2.1", []string{"email"})
+
+		sawEmail := false
+		for _, item := range found {
+			if item.Type == "email" {
+				sawEmail = true
+				continue
+			}
+			t.Errorf("expected only email type, got %q", item.Type)
+		}
+		if !sawEmail {
+			t.Errorf("expected at least one email IoC")
+		}
+	})
+}
+
+// ---- dedup por (Type, Value) ----
+
+func TestExtractIocs_dedup_mismo_email_dos_veces_una_entrada(t *testing.T) {
+	t.Run("dedup mismo email aparece dos veces solo una entrada", func(t *testing.T) {
+		text := "alice@example.com and alice@example.com again"
+		got := ExtractIocs(text, []string{"email"})
+		n := countType(got, "email")
+		if n != 1 {
+			t.Errorf("expected 1 email after dedup, got %d: %v", n, iocValues(got))
+		}
+	})
+}
+
+// ---- IPv4 valida vs numero que parece IP ----
+
+// Dotted quads are only accepted when every octet is in range.
+func TestExtractIocs_ipv4_valida_vs_octeto_invalido(t *testing.T) {
+	onlyIPs := []string{"ip_address"}
+
+	t.Run("IPv4 valida detectada", func(t *testing.T) {
+		found := ExtractIocs("addr 10.0.0.1 end", onlyIPs)
+		if !containsType(found, "ip_address") {
+			t.Errorf("expected ip_address IoC for valid IPv4")
+		}
+	})
+
+	rejected := []struct {
+		name string
+		addr string
+	}{
+		{"numero con octeto 999 no es IPv4", "999.999.999.999"},
+		{"numero con octeto 256 no es IPv4", "256.0.0.1"},
+	}
+	for _, tc := range rejected {
+		tc := tc
+		t.Run(tc.name, func(t *testing.T) {
+			found := ExtractIocs("bad "+tc.addr+" end", onlyIPs)
+			if containsType(found, "ip_address") {
+				t.Errorf("expected no ip_address IoC for %s, got %v", tc.addr, found)
+			}
+		})
+	}
+}
+
+// ---- hashes exactamente 32/40/64 chars hex ----
+
+// Hex runs are classified purely by length: 32, 40, 64 and 128 characters map
+// to md5, sha1, sha256 and sha512; any other length is ignored.
+func TestExtractIocs_hashes_por_longitud(t *testing.T) {
+	onlyHashes := []string{"file_hash"}
+
+	accepted := []struct {
+		name   string
+		digest string
+		algo   string
+	}{
+		{
+			name:   "hash MD5 exactamente 32 hex chars detectado",
+			digest: "5d41402abc4b2a76b9719d911017c592",
+			algo:   "md5",
+		},
+		{
+			name:   "hash SHA1 exactamente 40 hex chars detectado",
+			digest: "aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d",
+			algo:   "sha1",
+		},
+		{
+			name:   "hash SHA256 exactamente 64 hex chars detectado",
+			digest: "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
+			algo:   "sha256",
+		},
+		{
+			// 128 hex chars — SHA512 of empty string
+			name:   "hash SHA512 exactamente 128 hex chars detectado",
+			digest: "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e",
+			algo:   "sha512",
+		},
+	}
+	for _, tc := range accepted {
+		tc := tc
+		t.Run(tc.name, func(t *testing.T) {
+			found := ExtractIocs(tc.digest, onlyHashes)
+			if len(found) != 1 || found[0].Extra["algorithm"] != tc.algo {
+				t.Errorf("expected %s hash, got %v", tc.algo, found)
+			}
+		})
+	}
+
+	t.Run("longitud intermedia 60 hex chars ignorada", func(t *testing.T) {
+		// Ten repetitions of a 6-character group give 60 hex characters.
+		hex60 := ""
+		for i := 0; i < 10; i++ {
+			hex60 += "abcdef"
+		}
+		if found := ExtractIocs(hex60, onlyHashes); len(found) != 0 {
+			t.Errorf("expected no hash for 60-char hex, got %v", found)
+		}
+	})
+}
+
+// ---- contenido: dominio dentro de email se descarta ----
+
+func TestExtractIocs_dominio_dentro_de_email_se_descarta(t *testing.T) {
+	t.Run("dominio contenido en email span se descarta", func(t *testing.T) {
+		text := "Email: alice@example.com nothing else"
+		got := ExtractIocs(text, nil)
+		if containsType(got, "domain") {
+			t.Errorf("expected domain to be deduplicated as contained by email span, got %v", got)
+		}
+		if !containsType(got, "email") {
+			t.Errorf("expected email IoC to be present")
+		}
+	})
+}
+
+// ---- tipos desconocidos se ignoran ----
+
+// An unknown type name in the filter is skipped without affecting the rest.
+func TestExtractIocs_tipos_desconocidos_se_ignoran(t *testing.T) {
+	requested := []string{"nonexistent", "email"}
+
+	t.Run("tipos desconocidos se ignoran sin error", func(t *testing.T) {
+		found := ExtractIocs("alice@example.com", requested)
+		ok := len(found) == 1 && found[0].Type == "email"
+		if !ok {
+			t.Errorf("expected exactly 1 email IoC, got %v", found)
+		}
+	})
+}
+
+// ---- CVE ----
+
+func TestExtractIocs_cve_ids(t *testing.T) {
+	t.Run("CVE-2014-0160 extraido", func(t *testing.T) {
+		got := ExtractIocs("Patch CVE-2014-0160 immediately", []string{"cve_id"})
+		if len(got) != 1 || got[0].Value != "CVE-2014-0160" {
+			t.Errorf("expected CVE-2014-0160, got %v", got)
+		}
+	})
+
+	t.Run("multiples CVEs en mismo texto", func(t *testing.T) {
+		text := "Affected: CVE-2021-44228, CVE-2021-45046, CVE-2021-45105"
+		got := ExtractIocs(text, []string{"cve_id"})
+		if len(got) != 3 {
+			t.Errorf("expected 3 CVEs, got %d: %v", len(got), iocValues(got))
+		}
+	})
+}
+
+// ---- MAC addresses ----
+
+// Colon-separated MACs are accepted; mixing ":" and "-" is rejected.
+func TestExtractIocs_mac_addresses(t *testing.T) {
+	onlyMACs := []string{"mac_address"}
+
+	t.Run("MAC con dos puntos extraida", func(t *testing.T) {
+		found := ExtractIocs("iface 00:1A:2B:3C:4D:5E up", onlyMACs)
+		if len(found) != 1 || found[0].Value != "00:1A:2B:3C:4D:5E" {
+			t.Errorf("expected MAC 00:1A:2B:3C:4D:5E, got %v", found)
+		}
+	})
+
+	t.Run("separadores mezclados rechazados", func(t *testing.T) {
+		if found := ExtractIocs("00:1A-2B:3C-4D:5E", onlyMACs); len(found) != 0 {
+			t.Errorf("expected no MAC for mixed separators, got %v", found)
+		}
+	})
+}
+
+// ---- telefono ----
+
+// Both the international "+NN ..." form and the local 9-digit form are found.
+func TestExtractIocs_phone_numbers(t *testing.T) {
+	onlyPhones := []string{"phone_number"}
+
+	t.Run("E.164 con prefijo pais extraido", func(t *testing.T) {
+		if found := ExtractIocs("call +34 612 345 678 now", onlyPhones); len(found) == 0 {
+			t.Errorf("expected phone_number IoC for +34 612 345 678")
+		}
+	})
+
+	t.Run("formato ES 9 digitos extraido", func(t *testing.T) {
+		if found := ExtractIocs("directo 612345678 fijo", onlyPhones); len(found) == 0 {
+			t.Errorf("expected phone_number IoC for 612345678")
+		}
+	})
+}
+
+// ---- offsets son correctos ----
+
+func TestExtractIocs_offsets_correctos(t *testing.T) {
+	t.Run("offsets Start/End cubren el valor exacto en el texto", func(t *testing.T) {
+		text := "contact alice@example.com for info"
+		got := ExtractIocs(text, []string{"email"})
+		if len(got) == 0 {
+			t.Fatal("expected at least one email IoC")
+		}
+		ioc := got[0]
+		extracted := text[ioc.Start:ioc.End]
+		if extracted != ioc.Value {
+			t.Errorf("text[%d:%d] = %q, want %q", ioc.Start, ioc.End, extracted, ioc.Value)
+		}
+	})
+}
+
+// ---- pipeline completo (equivalente al test Python) ----
+
+func TestExtractIocs_pipeline_completo(t *testing.T) {
+	t.Run("pipeline completo detecta email ip cve mac wallet", func(t *testing.T) {
+		text := "Reach alice@example.com from 10.0.0.5; " +
+			"CVE-2023-1234 vendor 00:1A:2B:3C:4D:5E " +
+			"wallet 0x742d35Cc6634C0532925a3b844Bc9e7595f0bEb1"
+		got := ExtractIocs(text, nil)
+		for _, wt := range []string{"email", "ip_address", "cve_id", "mac_address", "crypto_wallet"} {
+			if !containsType(got, wt) {
+				t.Errorf("expected type %q in full pipeline results, types present: %v", wt, iocTypes(got))
+			}
+		}
+	})
+}