92297e02c5
functions/core/html_to_markdown: convierte HTML a Markdown limpio (golang-only sin dependencias externas). util como prep para LLMs y para indexar contenido web. functions/cybersecurity/extract_iocs + types/cybersecurity/ioc: extrae indicators of compromise (IPs, domains, URLs, hashes, emails, CVEs, crypto wallets) de texto libre. Devuelve []IOC tipado. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
495 lines
12 KiB
Go
495 lines
12 KiB
Go
package cybersecurity
|
|
|
|
import (
|
|
"net"
|
|
"regexp"
|
|
"sort"
|
|
)
|
|
|
|
// IoC is one Indicator of Compromise located in a piece of text.
//
// Type names the indicator category. The extractors in this file emit
// "email", "ip_address", "domain", "file_hash", "crypto_wallet", "cve_id",
// "mac_address" and "phone_number".
//
// Value is the matched substring. Start and End are the byte offsets of
// that substring in the scanned text, with End exclusive.
//
// Extra carries optional per-type details: the file-hash extractor stores
// the key "algorithm" and the crypto-wallet extractor stores the key
// "asset". The other extractors leave it nil.
type IoC struct {
	Type, Value string
	Start, End  int
	Extra       map[string]string
}
|
|
|
|
// --- compiled regexes (module-level, compiled once) ---
|
|
|
|
// _label is the pattern fragment for a single domain label: it starts and
// ends with an alphanumeric character, may contain hyphens in between, and
// is at most 63 characters long. It is a constant because it is only ever
// spliced into reDomain.
const _label = `[A-Za-z0-9](?:[A-Za-z0-9\-]{0,61}[A-Za-z0-9])?`

// Patterns are compiled once, at package initialisation.
//
// Go's regexp syntax has no look-behind or look-ahead, so most patterns
// below emulate them: a leading `(?:^|[^...])` and a trailing `(?:[^...]|$)`
// each consume one context character, and capture group 1 holds the actual
// value. reIPv4 and reHash use `\b` instead.
var (
	// reIPv4 matches four dot-separated groups of one to three digits. Octet
	// ranges are not checked by the pattern itself.
	reIPv4 = regexp.MustCompile(`\b\d{1,3}(?:\.\d{1,3}){3}\b`)

	// reIPv6 matches an optional hex group followed by two to seven
	// ":"-prefixed hex groups (each possibly empty), with an optional
	// "%zone" suffix. It is deliberately loose; callers must validate.
	reIPv6 = regexp.MustCompile(
		`(?:^|[^0-9A-Fa-f:])` +
			`([0-9A-Fa-f]{0,4}(?::[0-9A-Fa-f]{0,4}){2,7}(?:%[0-9A-Za-z]+)?)` +
			`(?:[^0-9A-Fa-f:]|$)`,
	)

	// reEmail matches local-part "@" host, where the host has at least two
	// dot-separated labels that start and end with an alphanumeric character.
	reEmail = regexp.MustCompile(
		`(?:^|[^A-Za-z0-9._%+\-])` +
			`([A-Za-z0-9._%+\-]+@[A-Za-z0-9](?:[A-Za-z0-9\-]*[A-Za-z0-9])?` +
			`(?:\.[A-Za-z0-9](?:[A-Za-z0-9\-]*[A-Za-z0-9])?)+)` +
			`(?:[^A-Za-z0-9._%+\-]|$)`,
	)

	// reDomain matches one or more "label." groups followed by an alphabetic
	// final label of 2-63 letters.
	reDomain = regexp.MustCompile(
		`(?:^|[^A-Za-z0-9.\-])` +
			`((?:` + _label + `\.)+[A-Za-z]{2,63})` +
			`(?:[^A-Za-z0-9.\-]|$)`,
	)

	// reHash matches any word-bounded run of 32 to 128 hex characters. The
	// pattern does NOT restrict the run to the digest lengths 32, 40, 64 and
	// 128; that filtering is left to the code that consumes the matches.
	reHash = regexp.MustCompile(`\b([A-Fa-f0-9]{32,128})\b`)

	// Crypto wallets.
	// reBTCLegacy: "1" or "3" followed by 25-34 characters from the class
	// [1-9A-HJ-NP-Za-km-z].
	reBTCLegacy = regexp.MustCompile(`(?:^|[^A-Za-z0-9])([13][1-9A-HJ-NP-Za-km-z]{25,34})(?:[^A-Za-z0-9]|$)`)
	// reBTCBech32: the lowercase prefix "bc1" followed by 6-87 characters
	// from the class [02-9ac-hj-np-z].
	reBTCBech32 = regexp.MustCompile(`(?:^|[^A-Za-z0-9])(bc1[02-9ac-hj-np-z]{6,87})(?:[^A-Za-z0-9]|$)`)
	// reETH: "0x" followed by exactly 40 hex characters.
	reETH = regexp.MustCompile(`(?:^|[^A-Za-z0-9])(0x[a-fA-F0-9]{40})(?:[^A-Za-z0-9]|$)`)

	// reCVE matches "CVE-" + four digits + "-" + four to seven digits
	// (upper-case prefix only).
	reCVE = regexp.MustCompile(`(?:^|[^A-Za-z0-9])(CVE-\d{4}-\d{4,7})(?:[^A-Za-z0-9]|$)`)

	// reMAC matches six hex pairs separated by ":" or "-". The pattern
	// allows the two separators to be mixed within one match.
	reMAC = regexp.MustCompile(
		`(?:^|[^A-Fa-f0-9:\-])([A-Fa-f0-9]{2}[:\-](?:[A-Fa-f0-9]{2}[:\-]){4}[A-Fa-f0-9]{2})(?:[^A-Fa-f0-9:\-]|$)`,
	)

	// reE164 matches "+" followed by digit groups that may be separated by
	// single whitespace characters or hyphens.
	reE164 = regexp.MustCompile(
		`(?:^|[^A-Za-z0-9])(\+\d{1,3}[\s\-]?\d{1,4}(?:[\s\-]?\d{1,4}){1,4})(?:[^A-Za-z0-9]|$)`,
	)
	// reESLocal matches a 3-3-3 digit number whose first digit is 6-9, with
	// optional whitespace or hyphen separators, not preceded by "+".
	reESLocal = regexp.MustCompile(
		`(?:^|[^A-Za-z0-9+])([6789]\d{2}[\s\-]?\d{3}[\s\-]?\d{3})(?:[^A-Za-z0-9]|$)`,
	)

	// reNonDigit matches every character that is not an ASCII digit.
	reNonDigit = regexp.MustCompile(`[^0-9]`)
)
|
|
|
|
// validTLDs is the fixed allow-list of top-level domains, keyed by the
// lower-case TLD without its leading dot. Every listed key maps to true; any
// other key reads as false. The list is static (the original comment notes it
// mirrors the Python implementation).
var validTLDs = func() map[string]bool {
	groups := [][]string{
		// original gTLD
		{"com", "org", "net", "edu", "gov", "mil", "int"},
		// common gTLD
		{"info", "biz", "name", "pro", "mobi", "asia"},
		{"jobs", "tel", "travel", "xxx", "post"},
		// popular new gTLD
		{"app", "dev", "io", "ai", "tech", "cloud"},
		{"online", "site", "store", "xyz", "top", "shop"},
		{"club", "fun", "live", "blog", "page", "news"},
		{"media", "design", "studio", "agency", "co", "me", "tv"},
		// ccTLD
		{"us", "uk", "de", "fr", "es", "it", "nl"},
		{"be", "se", "no", "fi", "dk", "ru", "ua"},
		{"pl", "cz", "ch", "at", "pt", "gr", "ie"},
		{"tr", "ca", "mx", "br", "ar", "cl", "pe"},
		{"ve", "uy", "cn", "jp", "kr", "in", "id"},
		{"th", "vn", "my", "sg", "ph", "tw", "hk"},
		{"au", "nz", "za", "eg", "ma", "ng", "ke"},
		{"il", "ae", "sa", "qa", "eu"},
	}
	set := make(map[string]bool)
	for _, group := range groups {
		for _, tld := range group {
			set[tld] = true
		}
	}
	return set
}()
|
|
|
|
// hashLengths pairs each supported hex-digest length, written as a decimal
// string, with the algorithm name reported for a digest of that length.
// Entries are listed from the longest digest to the shortest.
var hashLengths = [][2]string{
	[2]string{"128", "sha512"},
	[2]string{"64", "sha256"},
	[2]string{"40", "sha1"},
	[2]string{"32", "md5"},
}
|
|
|
|
// --- helper: find submatch positions accounting for leading/trailing context chars ---
|
|
|
|
// findAll returns, for every match of re in text, the byte offsets of
// capture group 1 as {start, end, 0} (end exclusive; the third element is
// always 0). Matches are reported in order of appearance and never overlap.
//
// re is expected to follow the convention used by the patterns in this file:
// group 1 holds the value and is wrapped in a leading `(?:^|[^...])` and a
// trailing `(?:[^...]|$)` that each consume one context character, because
// Go's regexp has no look-behind or look-ahead.
//
// A plain FindAll resumes after the WHOLE match, so the delimiter consumed as
// trailing context is no longer available as leading context for the next
// value; two values separated by a single character (e.g. "a.com b.org")
// would then yield only the first. To avoid that, scanning resumes at the end
// of group 1 instead, which lets one delimiter serve both roles.
//
// Each search runs on a suffix of text, where `^` also matches at the start
// of the suffix. A match whose group begins exactly at a resume point other
// than offset 0 can only have been admitted by that spurious `^` (the
// preceding character belongs to the previous value), so it is rejected and
// the scan advances by one byte.
func findAll(re *regexp.Regexp, text string) [][3]int {
	out := make([][3]int, 0, 8)
	for pos := 0; pos <= len(text); {
		m := re.FindStringSubmatchIndex(text[pos:])
		if m == nil {
			break
		}
		if len(m) < 4 || m[2] < 0 {
			// Group 1 did not participate: skip this match, always moving
			// forward so an empty match cannot stall the loop.
			if m[1] > 0 {
				pos += m[1]
			} else {
				pos++
			}
			continue
		}
		if pos > 0 && m[2] == 0 {
			// Group starts at the slice boundary: admitted only by the
			// suffix-relative `^`, not by real leading context.
			pos++
			continue
		}
		start, end := pos+m[2], pos+m[3]
		out = append(out, [3]int{start, end, 0})
		if end > pos {
			pos = end // resume at the trailing context character
		} else {
			pos++
		}
	}
	return out
}
|
|
|
|
// --- individual extractors ---
|
|
|
|
func extractEmails(text string) []IoC {
|
|
spans := findAll(reEmail, text)
|
|
out := make([]IoC, 0, len(spans))
|
|
for _, s := range spans {
|
|
out = append(out, IoC{
|
|
Type: "email",
|
|
Value: text[s[0]:s[1]],
|
|
Start: s[0],
|
|
End: s[1],
|
|
})
|
|
}
|
|
return out
|
|
}
|
|
|
|
func extractIPAddresses(text string) []IoC {
|
|
var out []IoC
|
|
|
|
// IPv4 — validate with net.ParseIP
|
|
for _, m := range reIPv4.FindAllStringIndex(text, -1) {
|
|
candidate := text[m[0]:m[1]]
|
|
ip := net.ParseIP(candidate)
|
|
if ip == nil || ip.To4() == nil {
|
|
continue
|
|
}
|
|
out = append(out, IoC{
|
|
Type: "ip_address",
|
|
Value: candidate,
|
|
Start: m[0],
|
|
End: m[1],
|
|
})
|
|
}
|
|
|
|
// IPv6 — use capturing group regex
|
|
for _, s := range findAll(reIPv6, text) {
|
|
candidate := text[s[0]:s[1]]
|
|
// Strip zone ID before parsing
|
|
zone := candidate
|
|
if idx := indexOf(candidate, '%'); idx >= 0 {
|
|
zone = candidate[:idx]
|
|
}
|
|
if countRune(zone, ':') < 2 {
|
|
continue
|
|
}
|
|
ip := net.ParseIP(zone)
|
|
if ip == nil {
|
|
continue
|
|
}
|
|
if ip.To4() != nil {
|
|
// IPv4-in-IPv6 — skip, already captured above
|
|
continue
|
|
}
|
|
out = append(out, IoC{
|
|
Type: "ip_address",
|
|
Value: candidate,
|
|
Start: s[0],
|
|
End: s[1],
|
|
})
|
|
}
|
|
|
|
sort.Slice(out, func(i, j int) bool { return out[i].Start < out[j].Start })
|
|
return out
|
|
}
|
|
|
|
func extractDomains(text string) []IoC {
|
|
spans := findAll(reDomain, text)
|
|
var out []IoC
|
|
for _, s := range spans {
|
|
candidate := text[s[0]:s[1]]
|
|
tld := lastPart(candidate, '.')
|
|
if !validTLDs[toLower(tld)] {
|
|
continue
|
|
}
|
|
out = append(out, IoC{
|
|
Type: "domain",
|
|
Value: candidate,
|
|
Start: s[0],
|
|
End: s[1],
|
|
})
|
|
}
|
|
return out
|
|
}
|
|
|
|
func extractFileHashes(text string) []IoC {
|
|
var out []IoC
|
|
for _, m := range reHash.FindAllStringSubmatchIndex(text, -1) {
|
|
if len(m) < 4 || m[2] < 0 {
|
|
continue
|
|
}
|
|
candidate := text[m[2]:m[3]]
|
|
length := len(candidate)
|
|
algo := ""
|
|
for _, pair := range hashLengths {
|
|
if itoa(length) == pair[0] {
|
|
algo = pair[1]
|
|
break
|
|
}
|
|
}
|
|
if algo == "" {
|
|
continue
|
|
}
|
|
out = append(out, IoC{
|
|
Type: "file_hash",
|
|
Value: candidate,
|
|
Start: m[2],
|
|
End: m[3],
|
|
Extra: map[string]string{"algorithm": algo},
|
|
})
|
|
}
|
|
return out
|
|
}
|
|
|
|
func extractCryptoWallets(text string) []IoC {
|
|
var out []IoC
|
|
for _, pair := range []struct {
|
|
re *regexp.Regexp
|
|
asset string
|
|
}{
|
|
{reBTCLegacy, "btc"},
|
|
{reBTCBech32, "btc"},
|
|
{reETH, "eth"},
|
|
} {
|
|
for _, s := range findAll(pair.re, text) {
|
|
out = append(out, IoC{
|
|
Type: "crypto_wallet",
|
|
Value: text[s[0]:s[1]],
|
|
Start: s[0],
|
|
End: s[1],
|
|
Extra: map[string]string{"asset": pair.asset},
|
|
})
|
|
}
|
|
}
|
|
sort.Slice(out, func(i, j int) bool { return out[i].Start < out[j].Start })
|
|
return out
|
|
}
|
|
|
|
func extractCVEIDs(text string) []IoC {
|
|
spans := findAll(reCVE, text)
|
|
out := make([]IoC, 0, len(spans))
|
|
for _, s := range spans {
|
|
out = append(out, IoC{
|
|
Type: "cve_id",
|
|
Value: text[s[0]:s[1]],
|
|
Start: s[0],
|
|
End: s[1],
|
|
})
|
|
}
|
|
return out
|
|
}
|
|
|
|
func extractMACAddresses(text string) []IoC {
|
|
var out []IoC
|
|
for _, s := range findAll(reMAC, text) {
|
|
candidate := text[s[0]:s[1]]
|
|
// Reject mixed separators
|
|
hasColon := contains(candidate, ':')
|
|
hasDash := contains(candidate, '-')
|
|
if hasColon && hasDash {
|
|
continue
|
|
}
|
|
out = append(out, IoC{
|
|
Type: "mac_address",
|
|
Value: candidate,
|
|
Start: s[0],
|
|
End: s[1],
|
|
})
|
|
}
|
|
return out
|
|
}
|
|
|
|
func extractPhoneNumbers(text string) []IoC {
|
|
seen := map[[2]int]bool{}
|
|
var out []IoC
|
|
|
|
for _, s := range findAll(reE164, text) {
|
|
candidate := text[s[0]:s[1]]
|
|
digits := reNonDigit.ReplaceAllString(candidate, "")
|
|
if len(digits) < 8 || len(digits) > 15 {
|
|
continue
|
|
}
|
|
key := [2]int{s[0], s[1]}
|
|
if seen[key] {
|
|
continue
|
|
}
|
|
seen[key] = true
|
|
out = append(out, IoC{
|
|
Type: "phone_number",
|
|
Value: candidate,
|
|
Start: s[0],
|
|
End: s[1],
|
|
})
|
|
}
|
|
|
|
for _, s := range findAll(reESLocal, text) {
|
|
candidate := text[s[0]:s[1]]
|
|
digits := reNonDigit.ReplaceAllString(candidate, "")
|
|
if len(digits) != 9 {
|
|
continue
|
|
}
|
|
key := [2]int{s[0], s[1]}
|
|
if seen[key] {
|
|
continue
|
|
}
|
|
seen[key] = true
|
|
out = append(out, IoC{
|
|
Type: "phone_number",
|
|
Value: candidate,
|
|
Start: s[0],
|
|
End: s[1],
|
|
})
|
|
}
|
|
|
|
sort.Slice(out, func(i, j int) bool { return out[i].Start < out[j].Start })
|
|
return out
|
|
}
|
|
|
|
// --- pipeline ---
|
|
|
|
// extractorOrder lists the IoC type names in the order their extractors are
// run when the caller does not supply an explicit type list. The original
// comment states this mirrors the order of the Python _EXTRACTORS map.
var extractorOrder = []string{
	"email", "ip_address", "crypto_wallet", "cve_id",
	"mac_address", "file_hash", "phone_number", "domain",
}
|
|
|
|
var extractorFuncs = map[string]func(string) []IoC{
|
|
"email": extractEmails,
|
|
"ip_address": extractIPAddresses,
|
|
"crypto_wallet": extractCryptoWallets,
|
|
"cve_id": extractCVEIDs,
|
|
"mac_address": extractMACAddresses,
|
|
"file_hash": extractFileHashes,
|
|
"phone_number": extractPhoneNumbers,
|
|
"domain": extractDomains,
|
|
}
|
|
|
|
// ExtractIocs extracts all IoCs from text and returns a deduplicated,
|
|
// offset-sorted slice. If types is nil, all extractor types are run.
|
|
// Unknown type strings are silently ignored.
|
|
//
|
|
// Deduplication: if a span is fully contained within another already-accepted
|
|
// span, it is discarded (e.g. a domain inside an email). Exact-span ties keep
|
|
// the first match in extractor order.
|
|
func ExtractIocs(text string, types []string) []IoC {
|
|
if types == nil {
|
|
types = extractorOrder
|
|
}
|
|
|
|
var raw []IoC
|
|
for _, t := range types {
|
|
fn, ok := extractorFuncs[t]
|
|
if !ok {
|
|
continue
|
|
}
|
|
raw = append(raw, fn(text)...)
|
|
}
|
|
|
|
// Sort: ascending start, then descending length (wider span first).
|
|
sort.SliceStable(raw, func(i, j int) bool {
|
|
si, sj := raw[i], raw[j]
|
|
if si.Start != sj.Start {
|
|
return si.Start < sj.Start
|
|
}
|
|
return (si.End - si.Start) > (sj.End - sj.Start)
|
|
})
|
|
|
|
// Dedup by (Type, Value) and by containment.
|
|
seen := map[[2]string]bool{}
|
|
var deduped []IoC
|
|
for _, m := range raw {
|
|
key := [2]string{m.Type, m.Value}
|
|
if seen[key] {
|
|
continue
|
|
}
|
|
// Check if fully contained within an already-accepted span.
|
|
contained := false
|
|
for _, d := range deduped {
|
|
if d.Start <= m.Start && d.End >= m.End &&
|
|
!(d.Start == m.Start && d.End == m.End) {
|
|
contained = true
|
|
break
|
|
}
|
|
}
|
|
if contained {
|
|
continue
|
|
}
|
|
// Exact-span tie: first in order wins.
|
|
exactTie := false
|
|
for _, d := range deduped {
|
|
if d.Start == m.Start && d.End == m.End {
|
|
exactTie = true
|
|
break
|
|
}
|
|
}
|
|
if exactTie {
|
|
continue
|
|
}
|
|
seen[key] = true
|
|
deduped = append(deduped, m)
|
|
}
|
|
|
|
return deduped
|
|
}
|
|
|
|
// --- small string helpers (avoid importing strings to keep package lean) ---
|
|
|
|
// indexOf returns the index of the first byte in s equal to b, or -1 when s
// does not contain b.
func indexOf(s string, b byte) int {
	for i, c := range []byte(s) {
		if c == b {
			return i
		}
	}
	return -1
}
|
|
|
|
// countRune returns how many bytes of s are equal to b. Despite the name it
// compares bytes, not decoded runes.
func countRune(s string, b byte) int {
	total := 0
	for _, c := range []byte(s) {
		if c == b {
			total++
		}
	}
	return total
}
|
|
|
|
// contains reports whether at least one byte of s equals b.
func contains(s string, b byte) bool {
	for _, c := range []byte(s) {
		if c == b {
			return true
		}
	}
	return false
}
|
|
|
|
// lastPart returns the portion of s that follows the last occurrence of sep.
// When s does not contain sep, s is returned unchanged; when s ends with sep
// the result is the empty string.
func lastPart(s string, sep byte) string {
	cut := len(s)
	// Walk backwards until the byte just before cut is the separator.
	for cut > 0 && s[cut-1] != sep {
		cut--
	}
	return s[cut:]
}
|
|
|
|
// toLower returns a copy of s in which the ASCII letters 'A'-'Z' are
// replaced by 'a'-'z'. All other bytes, including non-ASCII ones, are copied
// through unchanged.
func toLower(s string) string {
	lowered := []byte(s)
	for i, c := range lowered {
		if 'A' <= c && c <= 'Z' {
			lowered[i] = c + ('a' - 'A')
		}
	}
	return string(lowered)
}
|
|
|
|
// itoa returns the base-10 representation of n.
//
// Negative values are rendered with a leading "-". Previously the digit loop
// only ran for n > 0, so any negative input silently produced the empty
// string.
func itoa(n int) string {
	if n == 0 {
		return "0"
	}
	// Work on the unsigned magnitude so the most negative int is handled
	// without overflow: unsigned negation wraps to the correct absolute value.
	mag := uint64(n)
	if n < 0 {
		mag = -mag
	}
	// 19 digits cover the largest 64-bit magnitude; one more byte for the sign.
	var buf [20]byte
	pos := len(buf)
	for mag > 0 {
		pos--
		buf[pos] = byte('0' + mag%10)
		mag /= 10
	}
	if n < 0 {
		pos--
		buf[pos] = '-'
	}
	return string(buf[pos:])
}
|