package cybersecurity

import (
	"net"
	"regexp"
	"sort"
)

// IoC represents a single Indicator of Compromise extracted from text.
// Type is one of: "email", "ip_address", "domain", "file_hash",
// "crypto_wallet", "cve_id", "mac_address", "phone_number".
// Start and End are byte offsets into the original text.
// Extra holds type-specific fields (e.g. "algorithm" for file_hash,
// "asset" for crypto_wallet).
type IoC struct {
	Type  string
	Value string
	Start int
	End   int
	Extra map[string]string // optional: algorithm, asset, etc.
}

// --- compiled regexes (package-level, compiled once) ---
//
// RE2 has no lookaround, so most patterns spell out their delimiters as
// non-capturing context around capture group 1, which holds the actual value.
// Those patterns must be run through findAll, which reports group-1 offsets.

var (
	reIPv4 = regexp.MustCompile(`\b\d{1,3}(?:\.\d{1,3}){3}\b`)

	// IPv6: at least two colon-separated groups of hex digits, with an
	// optional %zone suffix. Candidates are validated with net.ParseIP.
	reIPv6 = regexp.MustCompile(
		`(?:^|[^0-9A-Fa-f:])` +
			`([0-9A-Fa-f]{0,4}(?::[0-9A-Fa-f]{0,4}){2,7}(?:%[0-9A-Za-z]+)?)` +
			`(?:[^0-9A-Fa-f:]|$)`,
	)

	reEmail = regexp.MustCompile(
		`(?:^|[^A-Za-z0-9._%+\-])` +
			`([A-Za-z0-9._%+\-]+@[A-Za-z0-9](?:[A-Za-z0-9\-]*[A-Za-z0-9])?` +
			`(?:\.[A-Za-z0-9](?:[A-Za-z0-9\-]*[A-Za-z0-9])?)+)` +
			`(?:[^A-Za-z0-9._%+\-]|$)`,
	)

	// Domain label: starts and ends with alnum, internal can have hyphens.
	_label = `[A-Za-z0-9](?:[A-Za-z0-9\-]{0,61}[A-Za-z0-9])?`

	reDomain = regexp.MustCompile(
		`(?:^|[^A-Za-z0-9.\-])` +
			`((?:` + _label + `\.)+[A-Za-z]{2,63})` +
			`(?:[^A-Za-z0-9.\-]|$)`,
	)

	// Hex hash candidates: any run of 32 to 128 hex chars. extractFileHashes
	// keeps only runs whose length is exactly 32, 40, 64, or 128.
	reHash = regexp.MustCompile(`\b([A-Fa-f0-9]{32,128})\b`)

	// Crypto wallets.
	reBTCLegacy = regexp.MustCompile(`(?:^|[^A-Za-z0-9])([13][1-9A-HJ-NP-Za-km-z]{25,34})(?:[^A-Za-z0-9]|$)`)
	reBTCBech32 = regexp.MustCompile(`(?:^|[^A-Za-z0-9])(bc1[02-9ac-hj-np-z]{6,87})(?:[^A-Za-z0-9]|$)`)
	reETH       = regexp.MustCompile(`(?:^|[^A-Za-z0-9])(0x[a-fA-F0-9]{40})(?:[^A-Za-z0-9]|$)`)

	reCVE = regexp.MustCompile(`(?:^|[^A-Za-z0-9])(CVE-\d{4}-\d{4,7})(?:[^A-Za-z0-9]|$)`)

	reMAC = regexp.MustCompile(
		`(?:^|[^A-Fa-f0-9:\-])([A-Fa-f0-9]{2}[:\-](?:[A-Fa-f0-9]{2}[:\-]){4}[A-Fa-f0-9]{2})(?:[^A-Fa-f0-9:\-]|$)`,
	)

	reE164 = regexp.MustCompile(
		`(?:^|[^A-Za-z0-9])(\+\d{1,3}[\s\-]?\d{1,4}(?:[\s\-]?\d{1,4}){1,4})(?:[^A-Za-z0-9]|$)`,
	)

	reESLocal = regexp.MustCompile(
		`(?:^|[^A-Za-z0-9+])([6789]\d{2}[\s\-]?\d{3}[\s\-]?\d{3})(?:[^A-Za-z0-9]|$)`,
	)

	reNonDigit = regexp.MustCompile(`[^0-9]`)
)

// validTLDs is the same static set as the Python implementation.
// Keys are lower-case; callers lower-case the candidate TLD before lookup.
var validTLDs = map[string]bool{
	// original gTLD
	"com": true, "org": true, "net": true, "edu": true, "gov": true,
	"mil": true, "int": true,
	// common gTLD
	"info": true, "biz": true, "name": true, "pro": true, "mobi": true,
	"asia": true, "jobs": true, "tel": true, "travel": true, "xxx": true,
	"post": true,
	// popular new gTLD
	"app": true, "dev": true, "io": true, "ai": true, "tech": true,
	"cloud": true, "online": true, "site": true, "store": true, "xyz": true,
	"top": true, "shop": true, "club": true, "fun": true, "live": true,
	"blog": true, "page": true, "news": true, "media": true, "design": true,
	"studio": true, "agency": true, "co": true, "me": true, "tv": true,
	// ccTLD
	"us": true, "uk": true, "de": true, "fr": true, "es": true, "it": true,
	"nl": true, "be": true, "se": true, "no": true, "fi": true, "dk": true,
	"ru": true, "ua": true, "pl": true, "cz": true, "ch": true, "at": true,
	"pt": true, "gr": true, "ie": true, "tr": true, "ca": true, "mx": true,
	"br": true, "ar": true, "cl": true, "pe": true, "ve": true, "uy": true,
	"cn": true, "jp": true, "kr": true, "in": true, "id": true, "th": true,
	"vn": true, "my": true, "sg": true, "ph": true, "tw": true, "hk": true,
	"au": true, "nz": true, "za": true, "eg": true, "ma": true, "ng": true,
	"ke": true, "il": true, "ae": true, "sa": true, "qa": true, "eu": true,
}

// hashLengths pairs each accepted hex-digest length (as a decimal string)
// with its algorithm name. Lookup is by exact length, so the order of the
// entries does not affect which algorithm is reported.
var hashLengths = [][2]string{
	{"128", "sha512"},
	{"64", "sha256"},
	{"40", "sha1"},
	{"32", "md5"},
}

// --- helper: find submatch positions accounting for leading/trailing context chars ---

// findAll returns the byte spans of capture group 1 for every match of re in
// text, in left-to-right order. Each element is {start, end, 0}; the third
// slot is always zero.
//
// The patterns in this file consume one delimiter character after the value
// because RE2 has no lookahead. FindAllStringSubmatchIndex would resume after
// that delimiter, so it could no longer serve as the leading delimiter of the
// next value, and every second value in a run such as "a.com b.com c.com"
// would be lost. findAll therefore searches one match at a time and resumes
// at the end of the captured group, leaving the delimiter available.
func findAll(re *regexp.Regexp, text string) [][3]int {
	var out [][3]int
	for offset := 0; offset <= len(text); {
		m := re.FindStringSubmatchIndex(text[offset:])
		if m == nil {
			break
		}
		if len(m) < 4 || m[2] < 0 {
			// Group 1 did not participate: step past the whole match,
			// always advancing by at least one byte.
			if m[1] > 0 {
				offset += m[1]
			} else {
				offset++
			}
			continue
		}
		if offset > 0 && m[2] == 0 {
			// The group starts exactly at the resume point, so the pattern's
			// `^` alternative matched at the slice boundary rather than at
			// the true start of text. Treat it as unanchored and move on.
			offset++
			continue
		}
		start, end := offset+m[2], offset+m[3]
		out = append(out, [3]int{start, end, 0})
		if end > offset {
			offset = end
		} else {
			offset++ // empty group: still guarantee progress
		}
	}
	return out
}

// --- individual extractors ---

// extractEmails returns every e-mail address matched by reEmail.
func extractEmails(text string) []IoC {
	spans := findAll(reEmail, text)
	out := make([]IoC, 0, len(spans))
	for _, s := range spans {
		out = append(out, IoC{
			Type:  "email",
			Value: text[s[0]:s[1]],
			Start: s[0],
			End:   s[1],
		})
	}
	return out
}

// extractIPAddresses returns IPv4 and IPv6 addresses sorted by start offset.
// Every regex candidate must also be accepted by net.ParseIP.
func extractIPAddresses(text string) []IoC {
	var out []IoC

	// IPv4 — validate with net.ParseIP.
	for _, m := range reIPv4.FindAllStringIndex(text, -1) {
		candidate := text[m[0]:m[1]]
		ip := net.ParseIP(candidate)
		if ip == nil || ip.To4() == nil {
			continue
		}
		out = append(out, IoC{
			Type:  "ip_address",
			Value: candidate,
			Start: m[0],
			End:   m[1],
		})
	}

	// IPv6 — use the capturing-group regex.
	for _, s := range findAll(reIPv6, text) {
		candidate := text[s[0]:s[1]]

		// net.ParseIP does not accept a zone ID, so strip it before parsing.
		// The reported Value keeps the zone.
		addr := candidate
		if idx := indexOf(candidate, '%'); idx >= 0 {
			addr = candidate[:idx]
		}
		if countRune(addr, ':') < 2 {
			continue
		}
		ip := net.ParseIP(addr)
		if ip == nil {
			continue
		}
		if ip.To4() != nil {
			// Address has a 4-byte representation (IPv4-mapped form) — skipped.
			continue
		}
		out = append(out, IoC{
			Type:  "ip_address",
			Value: candidate,
			Start: s[0],
			End:   s[1],
		})
	}

	sort.Slice(out, func(i, j int) bool { return out[i].Start < out[j].Start })
	return out
}

// extractDomains returns domain names whose final label is in validTLDs
// (compared case-insensitively).
func extractDomains(text string) []IoC {
	var out []IoC
	for _, s := range findAll(reDomain, text) {
		candidate := text[s[0]:s[1]]
		if !validTLDs[toLower(lastPart(candidate, '.'))] {
			continue
		}
		out = append(out, IoC{
			Type:  "domain",
			Value: candidate,
			Start: s[0],
			End:   s[1],
		})
	}
	return out
}

// extractFileHashes returns hex strings whose length exactly matches an entry
// in hashLengths, with Extra["algorithm"] set to that entry's name.
func extractFileHashes(text string) []IoC {
	var out []IoC
	for _, m := range reHash.FindAllStringSubmatchIndex(text, -1) {
		if len(m) < 4 || m[2] < 0 {
			continue
		}
		candidate := text[m[2]:m[3]]

		// hashLengths stores lengths as strings; convert once per candidate.
		length := itoa(len(candidate))
		algo := ""
		for _, pair := range hashLengths {
			if pair[0] == length {
				algo = pair[1]
				break
			}
		}
		if algo == "" {
			continue
		}
		out = append(out, IoC{
			Type:  "file_hash",
			Value: candidate,
			Start: m[2],
			End:   m[3],
			Extra: map[string]string{"algorithm": algo},
		})
	}
	return out
}

// extractCryptoWallets returns BTC (legacy and bech32) and ETH addresses
// sorted by start offset, with Extra["asset"] set to "btc" or "eth".
func extractCryptoWallets(text string) []IoC {
	patterns := []struct {
		re    *regexp.Regexp
		asset string
	}{
		{reBTCLegacy, "btc"},
		{reBTCBech32, "btc"},
		{reETH, "eth"},
	}

	var out []IoC
	for _, p := range patterns {
		for _, s := range findAll(p.re, text) {
			out = append(out, IoC{
				Type:  "crypto_wallet",
				Value: text[s[0]:s[1]],
				Start: s[0],
				End:   s[1],
				Extra: map[string]string{"asset": p.asset},
			})
		}
	}
	sort.Slice(out, func(i, j int) bool { return out[i].Start < out[j].Start })
	return out
}

// extractCVEIDs returns every identifier matched by reCVE.
func extractCVEIDs(text string) []IoC {
	spans := findAll(reCVE, text)
	out := make([]IoC, 0, len(spans))
	for _, s := range spans {
		out = append(out, IoC{
			Type:  "cve_id",
			Value: text[s[0]:s[1]],
			Start: s[0],
			End:   s[1],
		})
	}
	return out
}

// extractMACAddresses returns MAC addresses that use a single separator
// style: all ':' or all '-'.
func extractMACAddresses(text string) []IoC {
	var out []IoC
	for _, s := range findAll(reMAC, text) {
		candidate := text[s[0]:s[1]]
		// Reject mixed separators.
		if contains(candidate, ':') && contains(candidate, '-') {
			continue
		}
		out = append(out, IoC{
			Type:  "mac_address",
			Value: candidate,
			Start: s[0],
			End:   s[1],
		})
	}
	return out
}

// extractPhoneNumbers returns phone numbers sorted by start offset. E.164
// style candidates need 8 to 15 digits; local candidates matched by reESLocal
// need exactly 9. A span reported by the first pattern is not reported again
// by the second.
func extractPhoneNumbers(text string) []IoC {
	seen := map[[2]int]bool{}
	var out []IoC

	collect := func(re *regexp.Regexp, validDigits func(n int) bool) {
		for _, s := range findAll(re, text) {
			candidate := text[s[0]:s[1]]
			digits := reNonDigit.ReplaceAllString(candidate, "")
			if !validDigits(len(digits)) {
				continue
			}
			key := [2]int{s[0], s[1]}
			if seen[key] {
				continue
			}
			seen[key] = true
			out = append(out, IoC{
				Type:  "phone_number",
				Value: candidate,
				Start: s[0],
				End:   s[1],
			})
		}
	}

	collect(reE164, func(n int) bool { return n >= 8 && n <= 15 })
	collect(reESLocal, func(n int) bool { return n == 9 })

	sort.Slice(out, func(i, j int) bool { return out[i].Start < out[j].Start })
	return out
}

// --- pipeline ---

// extractorOrder defines the canonical order for running extractors,
// matching the Python _EXTRACTORS map order.
var extractorOrder = []string{
	"email",
	"ip_address",
	"crypto_wallet",
	"cve_id",
	"mac_address",
	"file_hash",
	"phone_number",
	"domain",
}

// extractorFuncs maps each IoC type name to the function that extracts it.
var extractorFuncs = map[string]func(string) []IoC{
	"email":         extractEmails,
	"ip_address":    extractIPAddresses,
	"crypto_wallet": extractCryptoWallets,
	"cve_id":        extractCVEIDs,
	"mac_address":   extractMACAddresses,
	"file_hash":     extractFileHashes,
	"phone_number":  extractPhoneNumbers,
	"domain":        extractDomains,
}

// ExtractIocs extracts all IoCs from text and returns a deduplicated,
// offset-sorted slice. If types is nil, all extractor types are run.
// Unknown type strings are silently ignored.
//
// Deduplication: each (Type, Value) pair is reported once, at its first
// accepted occurrence. If a span is fully contained within another
// already-accepted span, it is discarded (e.g. a domain inside an email).
// Exact-span ties keep the first match in extractor order.
func ExtractIocs(text string, types []string) []IoC { if types == nil { types = extractorOrder } var raw []IoC for _, t := range types { fn, ok := extractorFuncs[t] if !ok { continue } raw = append(raw, fn(text)...) } // Sort: ascending start, then descending length (wider span first). sort.SliceStable(raw, func(i, j int) bool { si, sj := raw[i], raw[j] if si.Start != sj.Start { return si.Start < sj.Start } return (si.End - si.Start) > (sj.End - sj.Start) }) // Dedup by (Type, Value) and by containment. seen := map[[2]string]bool{} var deduped []IoC for _, m := range raw { key := [2]string{m.Type, m.Value} if seen[key] { continue } // Check if fully contained within an already-accepted span. contained := false for _, d := range deduped { if d.Start <= m.Start && d.End >= m.End && !(d.Start == m.Start && d.End == m.End) { contained = true break } } if contained { continue } // Exact-span tie: first in order wins. exactTie := false for _, d := range deduped { if d.Start == m.Start && d.End == m.End { exactTie = true break } } if exactTie { continue } seen[key] = true deduped = append(deduped, m) } return deduped } // --- small string helpers (avoid importing strings to keep package lean) --- func indexOf(s string, b byte) int { for i := 0; i < len(s); i++ { if s[i] == b { return i } } return -1 } func countRune(s string, b byte) int { n := 0 for i := 0; i < len(s); i++ { if s[i] == b { n++ } } return n } func contains(s string, b byte) bool { return indexOf(s, b) >= 0 } func lastPart(s string, sep byte) string { for i := len(s) - 1; i >= 0; i-- { if s[i] == sep { return s[i+1:] } } return s } func toLower(s string) string { b := make([]byte, len(s)) for i := 0; i < len(s); i++ { c := s[i] if c >= 'A' && c <= 'Z' { c += 32 } b[i] = c } return string(b) } func itoa(n int) string { if n == 0 { return "0" } buf := [20]byte{} pos := 19 for n > 0 { buf[pos] = byte('0' + n%10) pos-- n /= 10 } return string(buf[pos+1:]) }