92297e02c5
functions/core/html_to_markdown: convierte HTML a Markdown limpio (golang-only sin dependencias externas). util como prep para LLMs y para indexar contenido web. functions/cybersecurity/extract_iocs + types/cybersecurity/ioc: extrae indicators of compromise (IPs, domains, URLs, hashes, emails, CVEs, crypto wallets) de texto libre. Devuelve []IOC tipado. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
293 lines
8.8 KiB
Go
293 lines
8.8 KiB
Go
package cybersecurity
|
|
|
|
import (
|
|
"testing"
|
|
)
|
|
|
|
// ---- helpers ----
|
|
|
|
func iocTypes(iocs []IoC) []string {
|
|
out := make([]string, len(iocs))
|
|
for i, ioc := range iocs {
|
|
out[i] = ioc.Type
|
|
}
|
|
return out
|
|
}
|
|
|
|
func iocValues(iocs []IoC) []string {
|
|
out := make([]string, len(iocs))
|
|
for i, ioc := range iocs {
|
|
out[i] = ioc.Value
|
|
}
|
|
return out
|
|
}
|
|
|
|
func containsType(iocs []IoC, t string) bool {
|
|
for _, ioc := range iocs {
|
|
if ioc.Type == t {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
func countType(iocs []IoC, t string) int {
|
|
n := 0
|
|
for _, ioc := range iocs {
|
|
if ioc.Type == t {
|
|
n++
|
|
}
|
|
}
|
|
return n
|
|
}
|
|
|
|
// ---- text sin IoCs ----
|
|
|
|
func TestExtractIocs_texto_sin_iocs_retorna_slice_vacia(t *testing.T) {
|
|
t.Run("texto sin IoCs retorna slice vacia", func(t *testing.T) {
|
|
got := ExtractIocs("nothing interesting here, just plain words.", nil)
|
|
if len(got) != 0 {
|
|
t.Errorf("expected empty slice, got %v", got)
|
|
}
|
|
})
|
|
}
|
|
|
|
// ---- un IoC de cada tipo ----
|
|
|
|
func TestExtractIocs_un_ioc_de_cada_tipo(t *testing.T) {
|
|
t.Run("un IoC de cada tipo detectado", func(t *testing.T) {
|
|
text := "email alice@example.com " +
|
|
"ip 192.0.2.1 " +
|
|
"hash 5d41402abc4b2a76b9719d911017c592 " +
|
|
"wallet 0x742d35Cc6634C0532925a3b844Bc9e7595f0bEb1 " +
|
|
"cve CVE-2023-1234 " +
|
|
"mac 00:1A:2B:3C:4D:5E " +
|
|
"phone +34612345678 " +
|
|
"domain api.example.com"
|
|
|
|
got := ExtractIocs(text, nil)
|
|
wantTypes := []string{"email", "ip_address", "file_hash", "crypto_wallet", "cve_id", "mac_address", "phone_number", "domain"}
|
|
for _, wt := range wantTypes {
|
|
if !containsType(got, wt) {
|
|
t.Errorf("expected type %q in results, got types: %v", wt, iocTypes(got))
|
|
}
|
|
}
|
|
})
|
|
}
|
|
|
|
// ---- filtro por types ----
|
|
|
|
func TestExtractIocs_filtro_por_types_solo_emails(t *testing.T) {
|
|
t.Run("filtro por types=[email] retorna solo emails", func(t *testing.T) {
|
|
text := "alice@example.com 192.0.2.1"
|
|
got := ExtractIocs(text, []string{"email"})
|
|
for _, ioc := range got {
|
|
if ioc.Type != "email" {
|
|
t.Errorf("expected only email type, got %q", ioc.Type)
|
|
}
|
|
}
|
|
if !containsType(got, "email") {
|
|
t.Errorf("expected at least one email IoC")
|
|
}
|
|
})
|
|
}
|
|
|
|
// ---- dedup por (Type, Value) ----
|
|
|
|
func TestExtractIocs_dedup_mismo_email_dos_veces_una_entrada(t *testing.T) {
|
|
t.Run("dedup mismo email aparece dos veces solo una entrada", func(t *testing.T) {
|
|
text := "alice@example.com and alice@example.com again"
|
|
got := ExtractIocs(text, []string{"email"})
|
|
n := countType(got, "email")
|
|
if n != 1 {
|
|
t.Errorf("expected 1 email after dedup, got %d: %v", n, iocValues(got))
|
|
}
|
|
})
|
|
}
|
|
|
|
// ---- IPv4 valida vs numero que parece IP ----
|
|
|
|
func TestExtractIocs_ipv4_valida_vs_octeto_invalido(t *testing.T) {
|
|
t.Run("IPv4 valida detectada", func(t *testing.T) {
|
|
got := ExtractIocs("addr 10.0.0.1 end", []string{"ip_address"})
|
|
if !containsType(got, "ip_address") {
|
|
t.Errorf("expected ip_address IoC for valid IPv4")
|
|
}
|
|
})
|
|
|
|
t.Run("numero con octeto 999 no es IPv4", func(t *testing.T) {
|
|
got := ExtractIocs("bad 999.999.999.999 end", []string{"ip_address"})
|
|
if containsType(got, "ip_address") {
|
|
t.Errorf("expected no ip_address IoC for 999.999.999.999, got %v", got)
|
|
}
|
|
})
|
|
|
|
t.Run("numero con octeto 256 no es IPv4", func(t *testing.T) {
|
|
got := ExtractIocs("bad 256.0.0.1 end", []string{"ip_address"})
|
|
if containsType(got, "ip_address") {
|
|
t.Errorf("expected no ip_address IoC for 256.0.0.1, got %v", got)
|
|
}
|
|
})
|
|
}
|
|
|
|
// ---- hashes exactamente 32/40/64 chars hex ----
|
|
|
|
func TestExtractIocs_hashes_por_longitud(t *testing.T) {
|
|
md5val := "5d41402abc4b2a76b9719d911017c592" // 32
|
|
sha1val := "aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d" // 40
|
|
sha256val := "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" // 64
|
|
// 128 hex chars — SHA512 of empty string
|
|
sha512val := "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e"
|
|
|
|
t.Run("hash MD5 exactamente 32 hex chars detectado", func(t *testing.T) {
|
|
got := ExtractIocs(md5val, []string{"file_hash"})
|
|
if len(got) != 1 || got[0].Extra["algorithm"] != "md5" {
|
|
t.Errorf("expected md5 hash, got %v", got)
|
|
}
|
|
})
|
|
|
|
t.Run("hash SHA1 exactamente 40 hex chars detectado", func(t *testing.T) {
|
|
got := ExtractIocs(sha1val, []string{"file_hash"})
|
|
if len(got) != 1 || got[0].Extra["algorithm"] != "sha1" {
|
|
t.Errorf("expected sha1 hash, got %v", got)
|
|
}
|
|
})
|
|
|
|
t.Run("hash SHA256 exactamente 64 hex chars detectado", func(t *testing.T) {
|
|
got := ExtractIocs(sha256val, []string{"file_hash"})
|
|
if len(got) != 1 || got[0].Extra["algorithm"] != "sha256" {
|
|
t.Errorf("expected sha256 hash, got %v", got)
|
|
}
|
|
})
|
|
|
|
t.Run("hash SHA512 exactamente 128 hex chars detectado", func(t *testing.T) {
|
|
got := ExtractIocs(sha512val, []string{"file_hash"})
|
|
if len(got) != 1 || got[0].Extra["algorithm"] != "sha512" {
|
|
t.Errorf("expected sha512 hash, got %v", got)
|
|
}
|
|
})
|
|
|
|
t.Run("longitud intermedia 60 hex chars ignorada", func(t *testing.T) {
|
|
hex60 := "abcdef" + "abcdef" + "abcdef" + "abcdef" + "abcdef" + "abcdef" + "abcdef" + "abcdef" + "abcdef" + "abcdef" // 60 chars
|
|
got := ExtractIocs(hex60, []string{"file_hash"})
|
|
if len(got) != 0 {
|
|
t.Errorf("expected no hash for 60-char hex, got %v", got)
|
|
}
|
|
})
|
|
}
|
|
|
|
// ---- contenido: dominio dentro de email se descarta ----
|
|
|
|
func TestExtractIocs_dominio_dentro_de_email_se_descarta(t *testing.T) {
|
|
t.Run("dominio contenido en email span se descarta", func(t *testing.T) {
|
|
text := "Email: alice@example.com nothing else"
|
|
got := ExtractIocs(text, nil)
|
|
if containsType(got, "domain") {
|
|
t.Errorf("expected domain to be deduplicated as contained by email span, got %v", got)
|
|
}
|
|
if !containsType(got, "email") {
|
|
t.Errorf("expected email IoC to be present")
|
|
}
|
|
})
|
|
}
|
|
|
|
// ---- tipos desconocidos se ignoran ----
|
|
|
|
func TestExtractIocs_tipos_desconocidos_se_ignoran(t *testing.T) {
|
|
t.Run("tipos desconocidos se ignoran sin error", func(t *testing.T) {
|
|
text := "alice@example.com"
|
|
got := ExtractIocs(text, []string{"nonexistent", "email"})
|
|
if len(got) != 1 || got[0].Type != "email" {
|
|
t.Errorf("expected exactly 1 email IoC, got %v", got)
|
|
}
|
|
})
|
|
}
|
|
|
|
// ---- CVE ----
|
|
|
|
func TestExtractIocs_cve_ids(t *testing.T) {
|
|
t.Run("CVE-2014-0160 extraido", func(t *testing.T) {
|
|
got := ExtractIocs("Patch CVE-2014-0160 immediately", []string{"cve_id"})
|
|
if len(got) != 1 || got[0].Value != "CVE-2014-0160" {
|
|
t.Errorf("expected CVE-2014-0160, got %v", got)
|
|
}
|
|
})
|
|
|
|
t.Run("multiples CVEs en mismo texto", func(t *testing.T) {
|
|
text := "Affected: CVE-2021-44228, CVE-2021-45046, CVE-2021-45105"
|
|
got := ExtractIocs(text, []string{"cve_id"})
|
|
if len(got) != 3 {
|
|
t.Errorf("expected 3 CVEs, got %d: %v", len(got), iocValues(got))
|
|
}
|
|
})
|
|
}
|
|
|
|
// ---- MAC addresses ----
|
|
|
|
func TestExtractIocs_mac_addresses(t *testing.T) {
|
|
t.Run("MAC con dos puntos extraida", func(t *testing.T) {
|
|
got := ExtractIocs("iface 00:1A:2B:3C:4D:5E up", []string{"mac_address"})
|
|
if len(got) != 1 || got[0].Value != "00:1A:2B:3C:4D:5E" {
|
|
t.Errorf("expected MAC 00:1A:2B:3C:4D:5E, got %v", got)
|
|
}
|
|
})
|
|
|
|
t.Run("separadores mezclados rechazados", func(t *testing.T) {
|
|
got := ExtractIocs("00:1A-2B:3C-4D:5E", []string{"mac_address"})
|
|
if len(got) != 0 {
|
|
t.Errorf("expected no MAC for mixed separators, got %v", got)
|
|
}
|
|
})
|
|
}
|
|
|
|
// ---- telefono ----
|
|
|
|
func TestExtractIocs_phone_numbers(t *testing.T) {
|
|
t.Run("E.164 con prefijo pais extraido", func(t *testing.T) {
|
|
got := ExtractIocs("call +34 612 345 678 now", []string{"phone_number"})
|
|
if len(got) == 0 {
|
|
t.Errorf("expected phone_number IoC for +34 612 345 678")
|
|
}
|
|
})
|
|
|
|
t.Run("formato ES 9 digitos extraido", func(t *testing.T) {
|
|
got := ExtractIocs("directo 612345678 fijo", []string{"phone_number"})
|
|
if len(got) == 0 {
|
|
t.Errorf("expected phone_number IoC for 612345678")
|
|
}
|
|
})
|
|
}
|
|
|
|
// ---- offsets son correctos ----
|
|
|
|
func TestExtractIocs_offsets_correctos(t *testing.T) {
|
|
t.Run("offsets Start/End cubren el valor exacto en el texto", func(t *testing.T) {
|
|
text := "contact alice@example.com for info"
|
|
got := ExtractIocs(text, []string{"email"})
|
|
if len(got) == 0 {
|
|
t.Fatal("expected at least one email IoC")
|
|
}
|
|
ioc := got[0]
|
|
extracted := text[ioc.Start:ioc.End]
|
|
if extracted != ioc.Value {
|
|
t.Errorf("text[%d:%d] = %q, want %q", ioc.Start, ioc.End, extracted, ioc.Value)
|
|
}
|
|
})
|
|
}
|
|
|
|
// ---- pipeline completo (equivalente al test Python) ----
|
|
|
|
func TestExtractIocs_pipeline_completo(t *testing.T) {
|
|
t.Run("pipeline completo detecta email ip cve mac wallet", func(t *testing.T) {
|
|
text := "Reach alice@example.com from 10.0.0.5; " +
|
|
"CVE-2023-1234 vendor 00:1A:2B:3C:4D:5E " +
|
|
"wallet 0x742d35Cc6634C0532925a3b844Bc9e7595f0bEb1"
|
|
got := ExtractIocs(text, nil)
|
|
for _, wt := range []string{"email", "ip_address", "cve_id", "mac_address", "crypto_wallet"} {
|
|
if !containsType(got, wt) {
|
|
t.Errorf("expected type %q in full pipeline results, types present: %v", wt, iocTypes(got))
|
|
}
|
|
}
|
|
})
|
|
}
|