// fn_registry/functions/cybersecurity/extract_iocs.go

package cybersecurity

import (
	"net"
	"regexp"
	"sort"
)

// IoC represents a single Indicator of Compromise extracted from text.
//
// Type is one of: "email", "ip_address", "domain", "file_hash",
// "crypto_wallet", "cve_id", "mac_address", "phone_number".
//
// Value is the matched text exactly as it appears in the input. Start and
// End are byte offsets into the original text forming the half-open range
// [Start, End); the extractors in this file always set
// Value == text[Start:End].
//
// Extra holds type-specific fields (e.g. "algorithm" for file_hash,
// "asset" for crypto_wallet). Extractors that have nothing to add leave it
// nil.
type IoC struct {
	Type  string
	Value string
	Start int
	End   int
	Extra map[string]string // optional: algorithm, asset, etc.
}

// --- compiled regexes (module-level, compiled once) ---

// Most patterns below share one shape: capture group 1 is the value, and the
// non-capturing groups on either side match one character of surrounding
// context (or the start/end of the text). Those context characters are part
// of the overall match, which is why callers read the group-1 offsets rather
// than the full-match offsets.
var (
	// reIPv4 matches four dot-separated runs of 1-3 digits between word
	// boundaries. Octet ranges are not checked here; extractIPAddresses
	// validates each candidate with net.ParseIP.
	reIPv4 = regexp.MustCompile(`\b\d{1,3}(?:\.\d{1,3}){3}\b`)

	// IPv6: an optional leading run of hex digits followed by 2-7
	// colon-prefixed groups of 0-4 hex digits, with an optional "%zone"
	// suffix. The pattern is loose (it also matches strings that are not
	// addresses); extractIPAddresses validates candidates with net.ParseIP.
	reIPv6 = regexp.MustCompile(
		`(?:^|[^0-9A-Fa-f:])` +
			`([0-9A-Fa-f]{0,4}(?::[0-9A-Fa-f]{0,4}){2,7}(?:%[0-9A-Za-z]+)?)` +
			`(?:[^0-9A-Fa-f:]|$)`,
	)

	// reEmail matches a local part of letters, digits and "._%+-", an "@",
	// and a host of at least two dot-separated labels. Each label starts and
	// ends with a letter or digit and may contain hyphens in between.
	reEmail = regexp.MustCompile(
		`(?:^|[^A-Za-z0-9._%+\-])` +
			`([A-Za-z0-9._%+\-]+@[A-Za-z0-9](?:[A-Za-z0-9\-]*[A-Za-z0-9])?` +
			`(?:\.[A-Za-z0-9](?:[A-Za-z0-9\-]*[A-Za-z0-9])?)+)` +
			`(?:[^A-Za-z0-9._%+\-]|$)`,
	)

	// Domain label: starts and ends with alnum, internal can have hyphens;
	// 1 to 63 characters in total.
	// reDomain matches one or more such labels, each followed by a dot, and
	// then a purely alphabetic final label of 2-63 letters. Whether that
	// final label is an accepted TLD is decided by extractDomains.
	_label   = `[A-Za-z0-9](?:[A-Za-z0-9\-]{0,61}[A-Za-z0-9])?`
	reDomain = regexp.MustCompile(
		`(?:^|[^A-Za-z0-9.\-])` +
			`((?:` + _label + `\.)+[A-Za-z]{2,63})` +
			`(?:[^A-Za-z0-9.\-]|$)`,
	)

	// Hex hashes: any run of 32 to 128 hex characters between word
	// boundaries. The pattern does not restrict the length further;
	// extractFileHashes keeps only the lengths listed in hashLengths
	// (32, 40, 64, 128).
	reHash = regexp.MustCompile(`\b([A-Fa-f0-9]{32,128})\b`)

	// Crypto wallets.
	// reBTCLegacy: "1" or "3" followed by 25-34 characters from an alphabet
	// that omits 0, I, O and l.
	// reBTCBech32: "bc1" followed by 6-87 characters from [02-9ac-hj-np-z].
	// reETH: "0x" followed by exactly 40 hex characters.
	// None of these patterns verifies a checksum.
	reBTCLegacy = regexp.MustCompile(`(?:^|[^A-Za-z0-9])([13][1-9A-HJ-NP-Za-km-z]{25,34})(?:[^A-Za-z0-9]|$)`)
	reBTCBech32 = regexp.MustCompile(`(?:^|[^A-Za-z0-9])(bc1[02-9ac-hj-np-z]{6,87})(?:[^A-Za-z0-9]|$)`)
	reETH       = regexp.MustCompile(`(?:^|[^A-Za-z0-9])(0x[a-fA-F0-9]{40})(?:[^A-Za-z0-9]|$)`)

	// reCVE matches "CVE-" + four digits + "-" + 4-7 digits. The match is
	// case-sensitive, so lower-case "cve-..." is not recognised.
	reCVE = regexp.MustCompile(`(?:^|[^A-Za-z0-9])(CVE-\d{4}-\d{4,7})(?:[^A-Za-z0-9]|$)`)

	// reMAC matches six hex pairs separated by ':' or '-'. The pattern
	// itself allows the two separators to be mixed; extractMACAddresses
	// rejects such candidates.
	reMAC = regexp.MustCompile(
		`(?:^|[^A-Fa-f0-9:\-])([A-Fa-f0-9]{2}[:\-](?:[A-Fa-f0-9]{2}[:\-]){4}[A-Fa-f0-9]{2})(?:[^A-Fa-f0-9:\-]|$)`,
	)

	// reE164 matches "+", 1-3 digits, and then 2-5 further groups of 1-4
	// digits, each optionally preceded by one whitespace character or
	// hyphen. The total digit count is checked in extractPhoneNumbers.
	reE164 = regexp.MustCompile(
		`(?:^|[^A-Za-z0-9])(\+\d{1,3}[\s\-]?\d{1,4}(?:[\s\-]?\d{1,4}){1,4})(?:[^A-Za-z0-9]|$)`,
	)
	// reESLocal matches nine digits starting with 6, 7, 8 or 9, written as
	// three groups of three with an optional whitespace character or hyphen
	// between groups. The leading context also excludes '+'.
	// NOTE(review): the name suggests Spanish national numbers — confirm.
	reESLocal = regexp.MustCompile(
		`(?:^|[^A-Za-z0-9+])([6789]\d{2}[\s\-]?\d{3}[\s\-]?\d{3})(?:[^A-Za-z0-9]|$)`,
	)

	// reNonDigit matches any single character that is not an ASCII digit;
	// extractPhoneNumbers uses it to strip separators before counting digits.
	reNonDigit = regexp.MustCompile(`[^0-9]`)
)

// validTLDs is the same static set as the Python implementation.
//
// Keys are lower-case; extractDomains lower-cases the last label of each
// candidate before looking it up. This is a fixed allow-list, not the full
// IANA registry: a domain whose final label is missing here is not reported.
var validTLDs = map[string]bool{
	// original gTLD
	"com": true, "org": true, "net": true, "edu": true, "gov": true, "mil": true, "int": true,
	// common gTLD
	"info": true, "biz": true, "name": true, "pro": true, "mobi": true, "asia": true,
	"jobs": true, "tel": true, "travel": true, "xxx": true, "post": true,
	// popular new gTLDs, plus ccTLDs that are widely used generically
	// (io, ai, co, me, tv)
	"app": true, "dev": true, "io": true, "ai": true, "tech": true, "cloud": true,
	"online": true, "site": true, "store": true, "xyz": true, "top": true, "shop": true,
	"club": true, "fun": true, "live": true, "blog": true, "page": true, "news": true,
	"media": true, "design": true, "studio": true, "agency": true, "co": true, "me": true, "tv": true,
	// ccTLD
	"us": true, "uk": true, "de": true, "fr": true, "es": true, "it": true, "nl": true,
	"be": true, "se": true, "no": true, "fi": true, "dk": true, "ru": true, "ua": true,
	"pl": true, "cz": true, "ch": true, "at": true, "pt": true, "gr": true, "ie": true,
	"tr": true, "ca": true, "mx": true, "br": true, "ar": true, "cl": true, "pe": true,
	"ve": true, "uy": true, "cn": true, "jp": true, "kr": true, "in": true, "id": true,
	"th": true, "vn": true, "my": true, "sg": true, "ph": true, "tw": true, "hk": true,
	"au": true, "nz": true, "za": true, "eg": true, "ma": true, "ng": true, "ke": true,
	"il": true, "ae": true, "sa": true, "qa": true, "eu": true,
}

// hashLengths pairs each accepted hex-digest length (written as a decimal
// string) with the algorithm name reported in IoC.Extra["algorithm"].
//
// extractFileHashes compares the length of a candidate for exact equality
// with the first element of each pair, so the order of the entries does not
// change the result; they are simply listed longest first.
var hashLengths = [][2]string{
	{"128", "sha512"},
	{"64", "sha256"},
	{"40", "sha1"},
	{"32", "md5"},
}

// --- helper: find submatch positions accounting for leading/trailing context chars ---

// findAll returns the byte spans of capture group 1 for every match of re in
// text, in ascending order. Each element is {start, end, 0}: start and end
// are offsets into text of the captured value (not of the full match), and
// the third slot is always 0.
//
// re is expected to have the shape used throughout this file:
//
//	(?:^|<delimiter>)(<value>)(?:<delimiter>|$)
//
// Go's regexp package has no lookaround, so those delimiter groups consume
// one character each. A plain FindAll therefore loses every value that is
// separated from the previous one by a single delimiter ("a@b.com c@d.com"):
// the delimiter is swallowed as trailing context of the first match and is no
// longer available as leading context of the second. To avoid that, each
// search resumes at the end of the previous *value*, so the delimiter that
// followed it can serve as leading context again.
//
// Searching a suffix of text makes `^` match at the start of that suffix. A
// match whose value begins exactly there (while not at the real start of
// text) has no delimiter in front of it and is rejected. As a consequence,
// patterns without leading context do not report values that directly abut
// the previous one.
//
// The result is never nil. A regexp without any capture group yields an
// empty result; matches in which group 1 did not participate are skipped.
func findAll(re *regexp.Regexp, text string) [][3]int {
	out := make([][3]int, 0)
	if re.NumSubexp() < 1 {
		return out
	}

	for pos := 0; pos <= len(text); {
		// m holds offsets relative to text[pos:]:
		// m[0],m[1] = full match, m[2],m[3] = group 1.
		m := re.FindStringSubmatchIndex(text[pos:])
		if m == nil {
			break
		}

		var next int
		switch {
		case m[2] < 0:
			// Group 1 took no part in this match: nothing to report.
			next = pos + m[1]
		case pos > 0 && m[2] == 0:
			// `^` matched only because text was sliced here; in the real
			// text no delimiter precedes this value. Step forward and retry.
			next = pos + 1
		default:
			out = append(out, [3]int{pos + m[2], pos + m[3], 0})
			// Resume at the end of the value, not of the full match, so the
			// trailing delimiter stays available to the next match.
			next = pos + m[3]
		}

		// Always make progress, even on an empty match.
		if next <= pos {
			next = pos + 1
		}
		pos = next
	}
	return out
}

// --- individual extractors ---

// extractEmails reports every e-mail address that reEmail finds in text, in
// the order findAll returns them. The result is an empty, non-nil slice when
// nothing matches.
func extractEmails(text string) []IoC {
	matches := findAll(reEmail, text)
	found := make([]IoC, len(matches))
	for i, span := range matches {
		lo, hi := span[0], span[1]
		found[i] = IoC{Type: "email", Value: text[lo:hi], Start: lo, End: hi}
	}
	return found
}

// extractIPAddresses reports IPv4 and IPv6 addresses found in text, ordered
// by start offset. Candidates come from reIPv4 and reIPv6 and are kept only
// if net.ParseIP accepts them. For IPv6, a "%zone" suffix is removed before
// parsing but stays part of the reported Value and span. The result is nil
// when nothing is found.
func extractIPAddresses(text string) []IoC {
	var found []IoC
	record := func(lo, hi int) {
		found = append(found, IoC{
			Type:  "ip_address",
			Value: text[lo:hi],
			Start: lo,
			End:   hi,
		})
	}

	// IPv4: the pattern only checks the shape; ParseIP checks the octets.
	for _, loc := range reIPv4.FindAllStringIndex(text, -1) {
		if parsed := net.ParseIP(text[loc[0]:loc[1]]); parsed != nil && parsed.To4() != nil {
			record(loc[0], loc[1])
		}
	}

	// IPv6: the address is capture group 1 of reIPv6.
	for _, span := range findAll(reIPv6, text) {
		addr := text[span[0]:span[1]]
		// Drop the zone ID, if any, before validating.
		if pct := indexOf(addr, '%'); pct >= 0 {
			addr = addr[:pct]
		}
		if countRune(addr, ':') < 2 {
			continue
		}
		parsed := net.ParseIP(addr)
		// Skip unparsable candidates and addresses that have an IPv4 form.
		if parsed == nil || parsed.To4() != nil {
			continue
		}
		record(span[0], span[1])
	}

	sort.Slice(found, func(a, b int) bool { return found[a].Start < found[b].Start })
	return found
}

// extractDomains reports every reDomain match whose final label, compared
// case-insensitively, is a key of validTLDs. The reported Value keeps the
// original casing. The result is nil when nothing qualifies.
func extractDomains(text string) []IoC {
	var found []IoC
	for _, span := range findAll(reDomain, text) {
		lo, hi := span[0], span[1]
		name := text[lo:hi]
		if known := validTLDs[toLower(lastPart(name, '.'))]; known {
			found = append(found, IoC{Type: "domain", Value: name, Start: lo, End: hi})
		}
	}
	return found
}

// extractFileHashes reports hex runs matched by reHash whose length is one
// of those listed in hashLengths. The matching algorithm name is stored in
// Extra["algorithm"]. Runs of any other length are ignored. The result is
// nil when nothing qualifies.
func extractFileHashes(text string) []IoC {
	var found []IoC
	for _, loc := range reHash.FindAllStringSubmatchIndex(text, -1) {
		if len(loc) < 4 || loc[2] < 0 {
			continue
		}
		lo, hi := loc[2], loc[3]
		digest := text[lo:hi]

		// hashLengths stores lengths as decimal strings, so compare in that form.
		size := itoa(len(digest))
		name := ""
		for _, entry := range hashLengths {
			if entry[0] == size {
				name = entry[1]
				break
			}
		}
		if name == "" {
			continue
		}

		found = append(found, IoC{
			Type:  "file_hash",
			Value: digest,
			Start: lo,
			End:   hi,
			Extra: map[string]string{"algorithm": name},
		})
	}
	return found
}

// extractCryptoWallets reports wallet addresses matched by reBTCLegacy,
// reBTCBech32 and reETH, ordered by start offset. Extra["asset"] is "btc"
// for both Bitcoin patterns and "eth" for the Ethereum one. The result is
// nil when nothing matches.
func extractCryptoWallets(text string) []IoC {
	type walletPattern struct {
		asset   string
		pattern *regexp.Regexp
	}
	patterns := [...]walletPattern{
		{asset: "btc", pattern: reBTCLegacy},
		{asset: "btc", pattern: reBTCBech32},
		{asset: "eth", pattern: reETH},
	}

	var found []IoC
	for i := range patterns {
		wp := &patterns[i]
		for _, span := range findAll(wp.pattern, text) {
			lo, hi := span[0], span[1]
			found = append(found, IoC{
				Type:  "crypto_wallet",
				Value: text[lo:hi],
				Start: lo,
				End:   hi,
				Extra: map[string]string{"asset": wp.asset},
			})
		}
	}

	sort.Slice(found, func(a, b int) bool { return found[a].Start < found[b].Start })
	return found
}

// extractCVEIDs reports every CVE identifier that reCVE finds in text, in
// the order findAll returns them. The result is an empty, non-nil slice when
// nothing matches.
func extractCVEIDs(text string) []IoC {
	matches := findAll(reCVE, text)
	found := make([]IoC, len(matches))
	for i, span := range matches {
		lo, hi := span[0], span[1]
		found[i] = IoC{Type: "cve_id", Value: text[lo:hi], Start: lo, End: hi}
	}
	return found
}

// extractMACAddresses reports every reMAC match that uses a single separator
// style. Candidates containing both ':' and '-' are dropped. The result is
// nil when nothing qualifies.
func extractMACAddresses(text string) []IoC {
	var found []IoC
	for _, span := range findAll(reMAC, text) {
		lo, hi := span[0], span[1]
		mac := text[lo:hi]
		// reMAC allows either separator at each position; require one style.
		if mixed := contains(mac, ':') && contains(mac, '-'); mixed {
			continue
		}
		found = append(found, IoC{Type: "mac_address", Value: mac, Start: lo, End: hi})
	}
	return found
}

// extractPhoneNumbers reports phone numbers found in text, ordered by start
// offset. Two patterns are applied in turn: reE164, accepted when the
// candidate holds 8 to 15 digits, and reESLocal, accepted when it holds
// exactly 9. A span already reported by the first pattern is not reported
// again by the second. The result is nil when nothing qualifies.
func extractPhoneNumbers(text string) []IoC {
	reported := map[[2]int]bool{}
	var found []IoC

	// collect appends each span whose digit count satisfies accept and whose
	// exact offsets have not been reported yet.
	collect := func(spans [][3]int, accept func(digits int) bool) {
		for _, span := range spans {
			lo, hi := span[0], span[1]
			number := text[lo:hi]
			if !accept(len(reNonDigit.ReplaceAllString(number, ""))) {
				continue
			}
			at := [2]int{lo, hi}
			if reported[at] {
				continue
			}
			reported[at] = true
			found = append(found, IoC{
				Type:  "phone_number",
				Value: number,
				Start: lo,
				End:   hi,
			})
		}
	}

	collect(findAll(reE164, text), func(digits int) bool { return digits >= 8 && digits <= 15 })
	collect(findAll(reESLocal, text), func(digits int) bool { return digits == 9 })

	sort.Slice(found, func(a, b int) bool { return found[a].Start < found[b].Start })
	return found
}

// --- pipeline ---

// extractorOrder defines the canonical order for running extractors,
// matching the Python _EXTRACTORS map order.
//
// ExtractIocs uses this list when it is called with nil types. Results are
// appended in this order and then sorted stably, so when two extractors
// report exactly the same span the type listed earlier here is the one kept.
var extractorOrder = []string{
	"email",
	"ip_address",
	"crypto_wallet",
	"cve_id",
	"mac_address",
	"file_hash",
	"phone_number",
	"domain",
}

// extractorFuncs maps each IoC type name to the function that extracts it.
// ExtractIocs looks requested type names up here and skips names that have
// no entry. The keys are the same strings as in extractorOrder.
var extractorFuncs = map[string]func(string) []IoC{
	"email":        extractEmails,
	"ip_address":   extractIPAddresses,
	"crypto_wallet": extractCryptoWallets,
	"cve_id":       extractCVEIDs,
	"mac_address":  extractMACAddresses,
	"file_hash":    extractFileHashes,
	"phone_number": extractPhoneNumbers,
	"domain":       extractDomains,
}

// ExtractIocs extracts all IoCs from text and returns a deduplicated,
// offset-sorted slice.
//
// types selects which extractors run, in the given order. If types is nil,
// all extractors run in extractorOrder. A non-nil empty slice runs none.
// Unknown type strings are silently ignored.
//
// Candidates are sorted by ascending Start and, for equal Start, by
// descending length. They are then filtered in that order:
//
//   - Repeated value: a candidate whose (Type, Value) pair was already
//     accepted is discarded, so each distinct indicator is reported once, at
//     its first accepted position.
//   - Containment: a candidate whose span lies entirely inside an
//     already-accepted span is discarded (e.g. a domain inside an email).
//   - Exact-span tie: when two candidates cover the same span, the one that
//     came first in extractor order is kept.
//
// Spans that only partially overlap are all kept. The result is nil when
// nothing is found.
func ExtractIocs(text string, types []string) []IoC {
	if types == nil {
		types = extractorOrder
	}

	var raw []IoC
	for _, t := range types {
		if fn, ok := extractorFuncs[t]; ok {
			raw = append(raw, fn(text)...)
		}
	}

	// Ascending start, then wider span first. The sort is stable so that
	// candidates with identical spans keep their extractor order.
	sort.SliceStable(raw, func(i, j int) bool {
		a, b := raw[i], raw[j]
		if a.Start != b.Start {
			return a.Start < b.Start
		}
		return (a.End - a.Start) > (b.End - b.Start)
	})

	type typeValue struct{ typ, value string }
	seen := make(map[typeValue]struct{}, len(raw))
	var deduped []IoC

	// Every accepted span starts at or before the current candidate, so the
	// candidate is contained in (or identical to) an accepted span exactly
	// when some accepted End reaches its End. Tracking the largest accepted
	// End therefore replaces a scan over all accepted spans.
	maxEnd := -1
	for _, m := range raw {
		key := typeValue{m.Type, m.Value}
		if _, dup := seen[key]; dup {
			continue
		}
		if m.End <= maxEnd {
			continue
		}
		seen[key] = struct{}{}
		deduped = append(deduped, m)
		maxEnd = m.End
	}

	return deduped
}

// --- small string helpers (avoid importing strings to keep package lean) ---

// indexOf returns the offset of the first byte in s equal to b, or -1 when
// b does not occur.
func indexOf(s string, b byte) int {
	for pos, c := range []byte(s) {
		if c == b {
			return pos
		}
	}
	return -1
}

// countRune returns how many bytes of s equal b. Despite the name, the
// comparison is byte-wise, not rune-wise.
func countRune(s string, b byte) int {
	total := 0
	for _, c := range []byte(s) {
		if c == b {
			total++
		}
	}
	return total
}

// contains reports whether the byte b occurs anywhere in s.
func contains(s string, b byte) bool {
	for _, c := range []byte(s) {
		if c == b {
			return true
		}
	}
	return false
}

// lastPart returns the portion of s after the final occurrence of sep, or
// all of s when sep does not occur. If s ends with sep the result is empty.
func lastPart(s string, sep byte) string {
	cut := -1
	for pos, c := range []byte(s) {
		if c == sep {
			cut = pos
		}
	}
	return s[cut+1:]
}

// toLower returns a copy of s in which the ASCII letters 'A'-'Z' are
// replaced by 'a'-'z'. All other bytes, including non-ASCII ones, are left
// unchanged.
func toLower(s string) string {
	lowered := []byte(s)
	for i, c := range lowered {
		if 'A' <= c && c <= 'Z' {
			lowered[i] = c + ('a' - 'A')
		}
	}
	return string(lowered)
}

// itoa returns the base-10 representation of n, with a leading '-' for
// negative values.
func itoa(n int) string {
	if n == 0 {
		return "0"
	}

	// Work on the magnitude as an unsigned value so that the most negative
	// int can be negated without overflow.
	negative := n < 0
	mag := uint64(n)
	if negative {
		mag = -mag
	}

	// 20 bytes hold the 19 digits of the largest 64-bit magnitude plus a sign.
	var buf [20]byte
	pos := len(buf)
	for mag > 0 {
		pos--
		buf[pos] = byte('0' + mag%10)
		mag /= 10
	}
	if negative {
		pos--
		buf[pos] = '-'
	}
	return string(buf[pos:])
}