chore: initial sync
This commit is contained in:
@@ -0,0 +1,43 @@
|
||||
"""Genera la seccion del system prompt que describe los entity types disponibles para extraccion."""
|
||||
|
||||
|
||||
def build_entity_schema_prompt(entity_presets: list[dict]) -> str:
    """Render the system-prompt section listing the extractable entity types.

    Each preset becomes a numbered entry showing its label and ``type_ref``.
    Presets with a non-empty ``metadata_fields`` get one extra line listing
    those attributes. Entries are separated by a single blank line.

    Args:
        entity_presets: Preset dicts. The keys read are 'label' (defaults to
            "Unknown"), 'type_ref' (defaults to "") and, optionally,
            'metadata_fields'. Example:
            [{"type_ref": "osint_person_go_cybersecurity",
              "label": "Person",
              "metadata_fields": ["full_name", "alias"]}]

    Returns:
        The formatted prompt section, or an empty string when the preset list
        is empty.
    """
    if not entity_presets:
        return ""

    entries = []
    for position, preset in enumerate(entity_presets, start=1):
        label = preset.get("label", "Unknown")
        type_ref = preset.get("type_ref", "")
        fields = preset.get("metadata_fields", [])

        entry = f"{position}. {label} (type_ref: {type_ref})"
        if fields:
            entry += "\n" + f" Attributes: {', '.join(fields)}"
        entries.append(entry)

    # One blank line after the title and between consecutive entries; no
    # trailing blank line.
    return "Entity types available for extraction:\n\n" + "\n\n".join(entries)
|
||||
@@ -0,0 +1,22 @@
|
||||
"""Genera la seccion del system prompt con los tipos de relacion permitidos."""
|
||||
|
||||
|
||||
def build_relation_schema_prompt(relation_types: list[str]) -> str:
    """Render the system-prompt section listing the allowed relation types.

    Args:
        relation_types: Names of the relation types the LLM may extract.
            Example: ["funds", "employs", "communicates_with"]

    Returns:
        A header line followed by the comma-separated relation types, or an
        empty string when the list is empty.
    """
    if relation_types:
        return "Allowed relation types:\n" + ", ".join(relation_types)
    return ""
|
||||
@@ -0,0 +1,814 @@
|
||||
"""Core functional programming utilities — pure functions for list/collection operations."""
|
||||
|
||||
import hashlib
|
||||
import re
|
||||
from functools import reduce as _reduce
|
||||
from typing import Any, Callable, Dict, List, Optional, Tuple
|
||||
|
||||
|
||||
def filter_list(xs: list, pred: Callable) -> list:
    """Return a new list with the elements of ``xs`` for which ``pred`` is truthy."""
    kept = []
    for item in xs:
        if pred(item):
            kept.append(item)
    return kept
|
||||
|
||||
|
||||
def map_list(xs: list, fn: Callable) -> list:
    """Return a new list holding ``fn(x)`` for every ``x`` in ``xs``, in order."""
    mapped = []
    for item in xs:
        mapped.append(fn(item))
    return mapped
|
||||
|
||||
|
||||
def reduce_list(xs: list, initial: Any, fn: Callable) -> Any:
    """Fold ``xs`` left-to-right into one value, starting from ``initial``.

    ``fn`` is called as ``fn(acc, x)`` and must return the next accumulator.
    Returns ``initial`` unchanged when ``xs`` is empty.
    """
    acc = initial
    for item in xs:
        acc = fn(acc, item)
    return acc
|
||||
|
||||
|
||||
def flat_map(xs: list, fn: Callable) -> list:
    """Apply ``fn`` to each element and concatenate the resulting iterables."""
    return [produced for item in xs for produced in fn(item)]
|
||||
|
||||
|
||||
def flatten(xss: list) -> list:
    """Concatenate the inner iterables of ``xss`` into one list (one level only)."""
    return [item for inner in xss for item in inner]
|
||||
|
||||
|
||||
def chunk(xs: list, size: int) -> list:
    """Split ``xs`` into consecutive slices of at most ``size`` elements.

    The final slice may be shorter. A non-positive ``size`` yields an empty
    list.
    """
    if size <= 0:
        return []
    pieces = []
    start = 0
    while start < len(xs):
        pieces.append(xs[start:start + size])
        start += size
    return pieces
|
||||
|
||||
|
||||
def take(xs: list, n: int) -> list:
    """Return the leading slice ``xs[:n]`` (standard slice semantics apply)."""
    head = slice(None, n)
    return xs[head]
|
||||
|
||||
|
||||
def drop(xs: list, n: int) -> list:
    """Return the trailing slice ``xs[n:]`` (standard slice semantics apply)."""
    tail = slice(n, None)
    return xs[tail]
|
||||
|
||||
|
||||
def unique(xs: list) -> list:
    """Remove duplicates while preserving first-occurrence order.

    Elements are compared by equality. Hashable elements are tracked in a set;
    unhashable elements (e.g. dicts or lists) fall back to a linear equality
    scan instead of raising ``TypeError``.

    Args:
        xs: Input list; it is not mutated.

    Returns:
        A new list containing the first occurrence of each distinct element.
    """
    seen_hashable = set()
    seen_unhashable = []
    result = []
    for x in xs:
        try:
            if x in seen_hashable:
                continue
            seen_hashable.add(x)
        except TypeError:
            # Unhashable element: membership in a set is impossible, so
            # compare against the previously seen unhashable values.
            if x in seen_unhashable:
                continue
            seen_unhashable.append(x)
        result.append(x)
    return result
|
||||
|
||||
|
||||
def group_by(xs: list, key_fn: Callable) -> Dict:
    """Bucket the elements of ``xs`` by ``key_fn(x)``.

    Returns a dict mapping each key to the list of elements that produced it.
    Keys appear in first-seen order and elements keep their input order.
    """
    buckets: Dict = {}
    for item in xs:
        buckets.setdefault(key_fn(item), []).append(item)
    return buckets
|
||||
|
||||
|
||||
def partition(xs: list, pred: Callable) -> Tuple[list, list]:
    """Split ``xs`` into ``(matches, non_matches)`` according to ``pred``.

    Relative order is preserved inside each of the two lists.
    """
    accepted: list = []
    rejected: list = []
    for item in xs:
        target = accepted if pred(item) else rejected
        target.append(item)
    return accepted, rejected
|
||||
|
||||
|
||||
def find(xs: list, pred: Callable) -> Any:
    """Return the first element for which ``pred`` is truthy, or None if none match."""
    return next((item for item in xs if pred(item)), None)
|
||||
|
||||
|
||||
def find_index(xs: list, pred: Callable) -> int:
    """Return the index of the first element matching ``pred``, or -1 if none match."""
    return next((idx for idx, item in enumerate(xs) if pred(item)), -1)
|
||||
|
||||
|
||||
def zip_with(xs: list, ys: list, fn: Callable) -> list:
    """Combine ``xs`` and ``ys`` pairwise with ``fn(x, y)``.

    The result has the length of the shorter input.
    """
    return list(map(fn, xs, ys))
|
||||
|
||||
|
||||
def all_of(xs: list, pred: Callable) -> bool:
    """Return True when ``pred`` holds for every element (True for an empty list)."""
    for item in xs:
        if not pred(item):
            return False
    return True
|
||||
|
||||
|
||||
def any_of(xs: list, pred: Callable) -> bool:
    """Return True when ``pred`` holds for at least one element (False for an empty list)."""
    for item in xs:
        if pred(item):
            return True
    return False
|
||||
|
||||
|
||||
def pipe(value: Any, *fns: Callable) -> Any:
    """Thread ``value`` through ``fns`` from left to right.

    ``pipe(x, f, g)`` is ``g(f(x))``; with no functions the value is returned
    unchanged.
    """
    return _reduce(lambda acc, step: step(acc), fns, value)
|
||||
|
||||
|
||||
def compose(*fns: Callable) -> Callable:
    """Build a function that applies ``fns`` from right to left.

    ``compose(f, g)(x) == f(g(x))``. With no functions the result is the
    identity function.
    """
    application_order = fns[::-1]

    def composed(x: Any) -> Any:
        for step in application_order:
            x = step(x)
        return x

    return composed
|
||||
|
||||
|
||||
# ── Tree manipulation ────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def flatten_tree(structure: Any) -> List[Dict]:
    """Flatten a hierarchical tree into a pre-order list of child-less node copies.

    Each dict node is copied (deeply) without its 'nodes' key, and every value
    stored under a key whose name contains the substring 'nodes' is traversed
    recursively. Lists are traversed element by element. Anything else
    contributes nothing.

    Only the node's own fields are deep-copied; the 'nodes' subtree is not
    copied just to be discarded, which avoids re-copying every descendant once
    per ancestor level.

    Args:
        structure: A node dict, a list of nodes, or any other value.

    Returns:
        List of independent node dicts without the 'nodes' key.
    """
    import copy

    if isinstance(structure, dict):
        # A shared memo keeps aliasing between a node's own values intact,
        # as a single deepcopy of the node would.
        memo: Dict = {}
        node = {
            key: copy.deepcopy(value, memo)
            for key, value in structure.items()
            if key != 'nodes'
        }
        flat = [node]
        for key, value in structure.items():
            if 'nodes' in key:
                flat.extend(flatten_tree(value))
        return flat
    if isinstance(structure, list):
        flat = []
        for item in structure:
            flat.extend(flatten_tree(item))
        return flat
    return []
|
||||
|
||||
|
||||
def tree_to_flat_list(structure: Any) -> List[Dict]:
    """List every node of a tree in depth-first, parent-before-children order.

    Nodes are returned as the original dict objects (no copies) and keep their
    'nodes' key. Only the 'nodes' key is followed. Values that are neither
    dict nor list yield an empty list.
    """
    if isinstance(structure, list):
        return [node for item in structure for node in tree_to_flat_list(item)]
    if isinstance(structure, dict):
        descendants = tree_to_flat_list(structure['nodes']) if 'nodes' in structure else []
        return [structure] + descendants
    return []
|
||||
|
||||
|
||||
def get_leaf_nodes(structure: Any) -> List[Dict]:
    """Collect the leaf nodes of a hierarchical tree.

    A dict is a leaf when its 'nodes' entry is missing or empty; leaves are
    returned as deep copies with the 'nodes' key removed. For non-leaf dicts,
    every value under a key whose name contains 'nodes' is searched. Lists are
    searched element by element; other values yield nothing.
    """
    import copy

    if isinstance(structure, list):
        return [leaf for item in structure for leaf in get_leaf_nodes(item)]
    if not isinstance(structure, dict):
        return []

    if structure.get('nodes'):
        leaves = []
        for key in list(structure.keys()):
            if 'nodes' in key:
                leaves.extend(get_leaf_nodes(structure[key]))
        return leaves

    leaf = copy.deepcopy(structure)
    leaf.pop('nodes', None)
    return [leaf]
|
||||
|
||||
|
||||
def write_node_ids(data: Any, node_id: int = 0) -> int:
    """Stamp every dict node of a tree with a sequential zero-padded 'node_id'.

    Numbering starts at ``node_id`` (so "0000" by default) and proceeds in
    depth-first order. The tree is modified in place. For dicts, every value
    under a key whose name contains 'nodes' is visited; lists are visited
    element by element; other values are ignored.

    Args:
        data: Node dict, list of nodes, or any other value.
        node_id: First number to assign.

    Returns:
        The next unused counter value.
    """
    if isinstance(data, list):
        for item in data:
            node_id = write_node_ids(item, node_id)
        return node_id
    if not isinstance(data, dict):
        return node_id

    data['node_id'] = str(node_id).zfill(4)
    next_id = node_id + 1
    # Iterate over a snapshot of the keys, taken after 'node_id' was written.
    for key in tuple(data.keys()):
        if 'nodes' in key:
            next_id = write_node_ids(data[key], next_id)
    return next_id
|
||||
|
||||
|
||||
def list_to_tree(data: List[Dict]) -> List[Dict]:
    """Nest a flat list of items using their dotted 'structure' codes.

    An item whose code is '1.2.3' becomes a child of the item with code
    '1.2' if that parent appeared earlier in ``data``; otherwise it becomes a
    root. Each output node carries 'title', 'start_index' and 'end_index'
    (None when absent from the item) plus a 'nodes' list that is removed when
    it is empty.

    Args:
        data: Items read via ``.get`` for 'structure', 'title',
            'start_index' and 'end_index'.

    Returns:
        The list of root nodes.
    """
    by_code = {}
    roots = []

    for item in data:
        code = item.get('structure')
        node = {
            'title': item.get('title'),
            'start_index': item.get('start_index'),
            'end_index': item.get('end_index'),
            'nodes': [],
        }
        by_code[code] = node

        # The parent code is everything before the last dot, if there is one.
        parent_code = None
        if code:
            head, dot, _ = str(code).rpartition('.')
            if dot:
                parent_code = head

        if parent_code and parent_code in by_code:
            by_code[parent_code]['nodes'].append(node)
        else:
            roots.append(node)

    # Drop empty 'nodes' lists everywhere in the tree.
    pending = list(roots)
    while pending:
        node = pending.pop()
        if node['nodes']:
            pending.extend(node['nodes'])
        else:
            del node['nodes']

    return roots
|
||||
|
||||
|
||||
def remove_tree_fields(data: Any, fields: List[str] = None) -> Any:
    """Return a copy of a dict/list tree with the given keys removed at every depth.

    Args:
        data: Nested dicts and lists; other values are returned as-is.
        fields: Keys to drop. Defaults to ['text'] when None.

    Returns:
        New dicts and lists mirroring ``data`` without the listed keys. The
        input is not modified.
    """
    dropped = ['text'] if fields is None else fields

    if isinstance(data, list):
        return [remove_tree_fields(item, dropped) for item in data]
    if isinstance(data, dict):
        pruned = {}
        for key, value in data.items():
            if key in dropped:
                continue
            pruned[key] = remove_tree_fields(value, dropped)
        return pruned
    return data
|
||||
|
||||
|
||||
def format_tree_structure(structure: Any, order: List[str] = None) -> Any:
    """Rebuild each node of a tree with its keys in the given order.

    Keys that are not listed in ``order`` are dropped from the returned
    nodes. Children under 'nodes' are formatted recursively, and an empty or
    missing 'nodes' entry is removed.

    Note: the input dicts are modified in place ('nodes' is reassigned or
    popped) in addition to new dicts being returned.

    Args:
        structure: Node dict, list of nodes, or any other value.
        order: Key order to apply. When empty or None, ``structure`` is
            returned untouched.

    Returns:
        The reordered node(s), or ``structure`` itself when it is neither a
        dict nor a list.
    """
    if not order:
        return structure
    if isinstance(structure, list):
        return [format_tree_structure(item, order) for item in structure]
    if not isinstance(structure, dict):
        return structure

    if 'nodes' in structure:
        structure['nodes'] = format_tree_structure(structure['nodes'], order)
    if not structure.get('nodes'):
        structure.pop('nodes', None)

    reordered = {}
    for key in order:
        if key in structure:
            reordered[key] = structure[key]
    return reordered
|
||||
|
||||
|
||||
def create_node_mapping(tree: List[Dict]) -> Dict[str, Dict]:
    """Index every node of a tree by its 'node_id'.

    Nodes are visited depth-first, parents before children. Nodes without a
    truthy 'node_id' are skipped but their children are still visited. If an
    id occurs more than once, the node visited last wins. The mapping holds
    the original node objects, not copies.
    """
    mapping: Dict[str, Dict] = {}
    # Explicit stack; children are pushed reversed so they pop in list order.
    pending = list(tree)[::-1]
    while pending:
        node = pending.pop()
        if node.get('node_id'):
            mapping[node['node_id']] = node
        children = node.get('nodes')
        if children:
            pending.extend(list(children)[::-1])
    return mapping
|
||||
|
||||
|
||||
# ── Text / JSON extraction ───────────────────────────────────────────────────
|
||||
|
||||
|
||||
def extract_json_from_llm(content: str) -> Dict:
    """Extract and parse JSON from an LLM response.

    If the response contains a ```json fenced block, only the text between
    the opening fence and the last ``` is parsed (or everything after the
    opening fence when the block is never closed). Otherwise the whole
    response is parsed.

    Parsing is attempted in stages, each stage only if the previous one
    failed, so that valid JSON is never altered:
      1. the payload as-is;
      2. with the bare word ``None`` replaced by ``null`` and all whitespace
         runs collapsed to single spaces;
      3. additionally with trailing commas before ``]`` or ``}`` removed.

    Args:
        content: Raw LLM response text.

    Returns:
        The parsed JSON value, or an empty dict if nothing could be parsed
        (including when ``content`` is not a string).
    """
    import json
    import re

    if not isinstance(content, str):
        return {}

    fence = "```json"
    start_idx = content.find(fence)
    if start_idx != -1:
        start_idx += len(fence)
        end_idx = content.rfind("```")
        # With no closing fence, rfind lands on the opening fence itself
        # (before start_idx); take the rest of the text in that case.
        payload = content[start_idx:end_idx] if end_idx >= start_idx else content[start_idx:]
    else:
        payload = content
    payload = payload.strip()

    def _parse(text: str):
        try:
            return True, json.loads(text)
        except (ValueError, RecursionError):
            return False, None

    ok, value = _parse(payload)
    if ok:
        return value

    # Repairs may touch text inside string values, hence only after a failure.
    repaired = ' '.join(re.sub(r'\bNone\b', 'null', payload).split())
    ok, value = _parse(repaired)
    if ok:
        return value

    repaired = re.sub(r',\s*([\]}])', r'\1', repaired)
    ok, value = _parse(repaired)
    return value if ok else {}
|
||||
|
||||
|
||||
def parse_page_range(pages: str) -> List[int]:
    """Parse a page specification such as '5-7', '3,8' or '12'.

    Comma-separated parts are either single page numbers or inclusive
    'start-end' ranges. Empty parts (from an empty string or stray/trailing
    commas) are ignored.

    Args:
        pages: Page specification string.

    Returns:
        Sorted list of unique page numbers.

    Raises:
        ValueError: If a part is not an integer, or a range has start > end.
    """
    result = set()
    for part in pages.split(','):
        part = part.strip()
        if not part:
            continue
        if '-' in part:
            start_text, _, end_text = part.partition('-')
            start, end = int(start_text.strip()), int(end_text.strip())
            if start > end:
                raise ValueError(f"Invalid range '{part}': start must be <= end")
            result.update(range(start, end + 1))
        else:
            result.add(int(part))
    return sorted(result)
|
||||
|
||||
|
||||
# ── Markdown parsing ─────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def extract_markdown_headers(markdown_content: str) -> Tuple[List[Dict], List[str]]:
    """Collect the h1-h6 headers of a markdown document with their line numbers.

    Lines are stripped before matching, so indented headers are recognised.
    A line beginning with ``` (after stripping) toggles code-block mode, and
    headers inside a code block are ignored.

    Args:
        markdown_content: Markdown text.

    Returns:
        Tuple ``(headers, lines)``. ``headers`` is a list of dicts with
        'title', 'level' (1-6) and 'line_num' (1-based). ``lines`` is the
        content split on '\\n'.
    """
    import re

    heading_re = re.compile(r'^(#{1,6})\s+(.+)$')
    lines = markdown_content.split('\n')
    headers = []
    inside_fence = False

    for number, raw_line in enumerate(lines, 1):
        text = raw_line.strip()
        if text.startswith('```'):
            inside_fence = not inside_fence
            continue
        if inside_fence or not text:
            continue
        found = heading_re.match(text)
        if found is None:
            continue
        hashes, title = found.groups()
        headers.append({'title': title.strip(), 'level': len(hashes), 'line_num': number})

    return headers, lines
|
||||
|
||||
|
||||
def build_tree_from_headers(node_list: List[Dict]) -> List[Dict]:
    """Nest a flat, ordered list of headers by their levels.

    Each header becomes a child of the closest preceding header with a
    strictly smaller level, or a root if there is none. Nodes get sequential
    zero-padded ids starting at "0001" in input order. Empty 'nodes' lists
    are removed from the result.

    Args:
        node_list: Dicts with 'title', 'level' and 'line_num'.

    Returns:
        List of root nodes with keys 'title', 'node_id', 'line_num' and,
        when there are children, 'nodes'.
    """
    if not node_list:
        return []

    roots = []
    ancestors = []  # (level, node) pairs for the currently open branch

    for sequence, header in enumerate(node_list, start=1):
        level = header['level']
        entry = {
            'title': header['title'],
            'node_id': str(sequence).zfill(4),
            'line_num': header['line_num'],
            'nodes': [],
        }

        # Close every open header that is not a strict ancestor of this one.
        while ancestors and ancestors[-1][0] >= level:
            ancestors.pop()

        siblings = ancestors[-1][1]['nodes'] if ancestors else roots
        siblings.append(entry)
        ancestors.append((level, entry))

    # Strip the empty 'nodes' placeholders from leaves.
    pending = list(roots)
    while pending:
        node = pending.pop()
        if node['nodes']:
            pending.extend(node['nodes'])
        else:
            del node['nodes']

    return roots
|
||||
|
||||
|
||||
# ── Pagination / chunking ────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def page_list_to_groups(page_contents: List[str], token_lengths: List[int],
                        max_tokens: int = 20000, overlap_pages: int = 1) -> List[str]:
    """Group consecutive pages into text chunks of bounded token size.

    If all pages together fit in ``max_tokens`` they are returned as one
    chunk. Otherwise pages are accumulated until adding the next page would
    exceed a target size (the mean of ``max_tokens`` and the even-split size),
    at which point the chunk is emitted and the next one starts with the last
    ``overlap_pages`` pages repeated.

    A chunk is only emitted when it holds at least one page, so a first page
    that alone exceeds the target no longer produces a leading empty chunk.

    Args:
        page_contents: Text of each page, in order.
        token_lengths: Token count of each page, parallel to ``page_contents``.
        max_tokens: Token budget used to derive the per-chunk target.
        overlap_pages: Number of trailing pages repeated at the start of the
            next chunk.

    Returns:
        List of chunk strings (pages concatenated without a separator).
    """
    import math

    num_tokens = sum(token_lengths)
    if num_tokens <= max_tokens:
        return ["".join(page_contents)]

    subsets = []
    current_subset = []
    current_token_count = 0

    expected_parts = math.ceil(num_tokens / max_tokens)
    avg_tokens = math.ceil(((num_tokens / expected_parts) + max_tokens) / 2)

    for i, (page_content, page_tokens) in enumerate(zip(page_contents, token_lengths)):
        # Never flush an empty chunk (happens when page 0 alone is too big).
        if current_subset and current_token_count + page_tokens > avg_tokens:
            subsets.append(''.join(current_subset))
            overlap_start = max(i - overlap_pages, 0)
            current_subset = list(page_contents[overlap_start:i])
            current_token_count = sum(token_lengths[overlap_start:i])

        current_subset.append(page_content)
        current_token_count += page_tokens

    if current_subset:
        subsets.append(''.join(current_subset))

    return subsets
|
||||
|
||||
|
||||
def calculate_page_offset(pairs: List[Dict]) -> int:
    """Return the most frequent ``physical_index - page`` difference among the pairs.

    Pairs that lack either key, or whose values cannot be subtracted, are
    ignored. On a tie, the difference that was seen first wins.

    Args:
        pairs: Dicts with 'physical_index' and 'page'.

    Returns:
        The dominant offset, or 0 when no pair is usable.
    """
    tally: Dict[int, int] = {}
    for pair in pairs:
        try:
            delta = pair['physical_index'] - pair['page']
        except (KeyError, TypeError):
            continue
        tally[delta] = tally.get(delta, 0) + 1

    if not tally:
        return 0
    # max() keeps the first maximal key in insertion order.
    return max(tally, key=tally.get)
|
||||
|
||||
|
||||
# ── Text preprocessing ───────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def preprocess_text(text: str) -> str:
    """Normalize whitespace and newlines in raw text.

    Lines are stripped before blank-line runs are collapsed, so lines that
    contain only spaces or tabs count as blank and cannot leave more than one
    empty line behind.

    Args:
        text: Raw text to normalize.

    Returns:
        Text with '\\n' line endings, every line stripped, at most one
        consecutive empty line, and no leading or trailing whitespace.
    """
    # Normalize line endings: \r\n and \r -> \n
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    # Strip each line first so whitespace-only lines become truly empty
    text = '\n'.join(line.strip() for line in text.split('\n'))
    # Reduce 3+ consecutive newlines to exactly 2
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()
|
||||
|
||||
|
||||
def get_text_stats(text: str) -> dict:
    """Compute character, line and word counts for a text.

    Lines are counted as the number of '\\n' characters plus one, so an empty
    string counts as one line. Words are whitespace-separated tokens.

    Args:
        text: Input text to analyze.

    Returns:
        Dict with integer values under 'total_chars', 'total_lines' and
        'total_words'.
    """
    line_count = len(text.split('\n'))
    word_count = len(text.split())
    return {
        'total_chars': len(text),
        'total_lines': line_count,
        'total_words': word_count,
    }
|
||||
|
||||
|
||||
# ── Git URL parsing ──────────────────────────────────────────────────────────
|
||||
|
||||
# Hostnames accepted by parse_git_url / is_git_repo_url when the caller does
# not pass an explicit ``known_hosts`` list.
_DEFAULT_GIT_HOSTS = ["github.com", "gitlab.com"]
|
||||
|
||||
|
||||
def _sanitize_git_segment(segment: str) -> str:
|
||||
"""Strip .git suffix then keep only [a-zA-Z0-9_-] chars."""
|
||||
if segment.endswith(".git"):
|
||||
segment = segment[:-4]
|
||||
return re.sub(r"[^a-zA-Z0-9_\-]", "", segment)
|
||||
|
||||
|
||||
def parse_git_url(url: str, known_hosts: Optional[List[str]] = None) -> Optional[str]:
    """Parse a code-hosting URL and return the 'org/repo' path component.

    Supports HTTPS, HTTP, git://, ssh:// and SSH shorthand (git@host:path).
    Returns None if the URL does not match any known host or is malformed,
    including URLs that ``urlparse`` rejects with ``ValueError``.

    The first two non-empty path segments are taken as org and repo; each is
    sanitized with ``_sanitize_git_segment``.

    Args:
        url: Repository URL in any supported format.
        known_hosts: List of accepted hostnames. Defaults to github.com and gitlab.com.

    Returns:
        'org/repo' string or None.
    """
    from urllib.parse import urlparse

    hosts = known_hosts if known_hosts is not None else _DEFAULT_GIT_HOSTS
    url = url.strip()

    if url.startswith("git@"):
        # git@github.com:org/repo.git
        host, colon, path = url[len("git@"):].partition(":")
        if not colon:
            return None
        # urlparse lowercases hostnames for the URL forms; accept the
        # lowercase spelling here too so both forms treat case alike.
        if host not in hosts and host.lower() not in hosts:
            return None
    elif url.startswith(("http://", "https://", "git://", "ssh://")):
        try:
            parsed = urlparse(url)
            host = parsed.hostname or ""
        except ValueError:
            # e.g. unbalanced '[' in the netloc ("Invalid IPv6 URL")
            return None
        if host not in hosts:
            return None
        path = parsed.path
    else:
        return None

    segments = [s for s in path.split("/") if s]
    if len(segments) < 2:
        return None
    org = _sanitize_git_segment(segments[0])
    repo = _sanitize_git_segment(segments[1])
    if not org or not repo:
        return None
    return f"{org}/{repo}"
|
||||
|
||||
|
||||
def is_git_repo_url(url: str, known_hosts: Optional[List[str]] = None) -> bool:
    """Return True only if url points to a clonable git repository.

    For http(s) URLs, accepts org/repo and org/repo/tree/<ref> paths and
    rejects paths that navigate to sub-resources (issues, blobs, PRs, etc.).
    SSH shorthand, ssh:// and git:// URLs are accepted whenever the host is
    known. URLs that ``urlparse`` rejects with ``ValueError`` yield False
    instead of raising.

    Args:
        url: URL to verify.
        known_hosts: Accepted hostnames. Defaults to github.com and gitlab.com.

    Returns:
        True if url is a clonable repository URL.
    """
    from urllib.parse import urlparse

    hosts = known_hosts if known_hosts is not None else _DEFAULT_GIT_HOSTS
    url = url.strip()

    # SSH shorthand: always repo-level if host matches
    if url.startswith("git@"):
        host, colon, _ = url[len("git@"):].partition(":")
        if not colon:
            return False
        # urlparse lowercases hostnames for the URL forms; accept the
        # lowercase spelling here too so both forms treat case alike.
        return host in hosts or host.lower() in hosts

    if not url.startswith(("ssh://", "git://", "http://", "https://")):
        return False

    try:
        parsed = urlparse(url)
        host = parsed.hostname or ""
    except ValueError:
        # e.g. unbalanced '[' in the netloc ("Invalid IPv6 URL")
        return False
    if host not in hosts:
        return False

    # git:// and ssh://: always repo-level if host matches
    if url.startswith(("ssh://", "git://")):
        return True

    # http:// and https://: must have exactly org/repo or org/repo/tree/<ref>
    segments = [s for s in parsed.path.split("/") if s]
    if len(segments) == 2:
        return True
    return len(segments) == 4 and segments[2] == "tree"
|
||||
|
||||
|
||||
def validate_git_ssh_uri(url: str) -> None:
    """Check that ``url`` has the git SSH shorthand shape ``git@host:path``.

    Three things are verified: the 'git@' prefix, the presence of a ':' after
    it, and a non-empty path after the first ':'. The host part itself is not
    inspected.

    Args:
        url: URI string to validate.

    Raises:
        ValueError: With a message naming the first failed check.
    """
    prefix = "git@"
    if not url.startswith(prefix):
        raise ValueError(f"git SSH URI must start with 'git@', got: {url!r}")

    _host, colon, path = url[len(prefix):].partition(":")
    if not colon:
        raise ValueError(f"git SSH URI must contain ':', got: {url!r}")
    if not path:
        raise ValueError(f"git SSH URI must have a non-empty path after ':', got: {url!r}")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Markdown parsing utilities
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def extract_frontmatter(content: str) -> Tuple[str, Optional[Dict]]:
    """Extract YAML frontmatter delimited by '---' from the start of a markdown string.

    The opening '---' must be the very first line. The closing '---' must sit
    on its own line and may be followed by a newline or by end of input.
    Both LF and CRLF line endings are accepted, trailing spaces/tabs after a
    delimiter are tolerated, and the frontmatter body may be empty.

    Parsing uses PyYAML when it is importable. If the import or the YAML
    parse fails, a minimal 'key: value' per-line parser is used instead (all
    values are then plain strings).

    Args:
        content: Raw markdown string, optionally starting with YAML frontmatter.

    Returns:
        Tuple of (content_without_frontmatter, frontmatter_dict).
        frontmatter_dict is None when no frontmatter is found, or when YAML
        parsing succeeds but does not produce a mapping.
    """
    pattern = re.compile(
        # The lazy optional group lets an immediately following '---' close
        # an empty frontmatter block.
        r'\A---[ \t]*\r?\n(?:(.*?)\r?\n)??---[ \t]*(?:\r?\n|\Z)',
        re.DOTALL,
    )
    match = pattern.match(content)
    if not match:
        return content, None

    raw = match.group(1) or ''
    remaining = content[match.end():]

    try:
        import yaml  # type: ignore
        data = yaml.safe_load(raw)
        if not isinstance(data, dict):
            data = None
    except Exception:
        # Deliberate best-effort: covers both a missing yaml package and
        # malformed YAML. Fallback: simple key: value parser.
        data = {}
        for line in raw.splitlines():
            if ':' in line:
                key, _, value = line.partition(':')
                data[key.strip()] = value.strip()

    return remaining, data
|
||||
|
||||
|
||||
def find_headings(content: str) -> List[Tuple[int, int, str, int]]:
    """Find all markdown headings (# to ######) outside code blocks and HTML comments.

    A heading is a line that starts at column 0 with one to six '#'
    characters, followed by at least one space or tab and a non-empty title
    on the same line. Because the '#' must be the first character of the
    line, indented lines and backslash-escaped '\\#' lines can never match and
    need no separate handling.

    The separator is restricted to spaces/tabs so that a bare '##' line can
    no longer absorb the following line as its title.

    Args:
        content: Markdown text to search.

    Returns:
        List of (start_pos, end_pos, title, level) for each heading found,
        with character offsets into ``content``.
    """
    excluded: List[Tuple[int, int]] = []
    # Fenced code blocks (triple backtick) and HTML comments may span lines.
    for pattern in (r'```.*?```', r'<!--.*?-->'):
        for m in re.finditer(pattern, content, re.DOTALL):
            excluded.append((m.start(), m.end()))

    results: List[Tuple[int, int, str, int]] = []
    for m in re.finditer(r'^(#{1,6})[ \t]+(.+)$', content, re.MULTILINE):
        pos = m.start()
        if any(start <= pos < end for start, end in excluded):
            continue
        title = m.group(2).strip()
        if not title:
            # Only whitespace followed the hashes: not a real heading.
            continue
        results.append((pos, m.end(), title, len(m.group(1))))

    return results
|
||||
|
||||
|
||||
def estimate_token_count(content: str) -> int:
    """Roughly estimate the token count of a text without a tokenizer.

    Characters in the CJK ranges matched by the pattern below are weighted
    0.7 each; every other non-whitespace character is weighted 0.3. The
    weighted sum is truncated to an int.

    Args:
        content: Text to estimate.

    Returns:
        Estimated integer token count.
    """
    cjk_count = len(re.findall(r'[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af]', content))
    # Every character in those ranges is non-whitespace, so subtracting them
    # leaves the count of the remaining non-whitespace characters.
    other_count = len(re.findall(r'\S', content)) - cjk_count
    return int(cjk_count * 0.7 + other_count * 0.3)
|
||||
|
||||
|
||||
def smart_split_content(
    content: str,
    max_tokens: int = 1024,
    max_chars: int = 8000,
) -> List[str]:
    """Split content into parts that stay within token and character limits.

    The text is split on blank lines (double newline) and paragraphs are
    packed greedily: a part is closed as soon as adding the next paragraph
    would exceed either limit. Paragraphs in one part are re-joined with a
    double newline.

    A single paragraph that exceeds either limit on its own is emitted
    separately, cut into consecutive slices of ``max_chars`` characters. The
    cut is by characters only, so such a slice is not re-checked against
    ``max_tokens``.

    Args:
        content: Text to split.
        max_tokens: Maximum estimated tokens per part.
        max_chars: Maximum characters per part.

    Returns:
        List of string parts; ``[content]`` if no part was produced.
    """
    parts: List[str] = []
    pending: List[str] = []
    pending_tokens = 0
    pending_chars = 0

    for para in content.split('\n\n'):
        tokens = estimate_token_count(para)
        size = len(para)

        oversized = tokens > max_tokens or size > max_chars
        overflow = (pending_tokens + tokens > max_tokens
                    or pending_chars + size > max_chars)

        if oversized or overflow:
            # Close the part being built before handling this paragraph.
            if pending:
                parts.append('\n\n'.join(pending))
                pending = []
            pending_tokens = 0
            pending_chars = 0

        if oversized:
            for offset in range(0, size, max_chars):
                parts.append(para[offset:offset + max_chars])
            continue

        pending.append(para)
        pending_tokens += tokens
        pending_chars += size

    if pending:
        parts.append('\n\n'.join(pending))
    return parts if parts else [content]
|
||||
|
||||
|
||||
def sanitize_for_path(text: str, max_length: int = 50) -> str:
    """Convert text to a safe string for use in file paths.

    Keeps word characters, CJK characters, spaces and hyphens; spaces become
    underscores and leading/trailing underscores are removed. If the result
    is longer than ``max_length`` it is truncated and suffixed with '_' plus
    the first 8 hex digits of the SHA-256 of the original text.

    When ``max_length`` is too small to hold that 9-character suffix, a
    prefix of the hash digits is returned instead (at least one character),
    so the result no longer grows beyond ``max_length``.

    Args:
        text: Input text to sanitize.
        max_length: Maximum length of the returned string.

    Returns:
        Safe path-friendly string; 'section' if nothing survives cleaning.
    """
    cleaned = re.sub(
        r'[^\w\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af \-]',
        '',
        text,
    )
    cleaned = cleaned.replace(' ', '_').strip('_')

    if not cleaned:
        return 'section'

    if len(cleaned) <= max_length:
        return cleaned

    digest = hashlib.sha256(text.encode()).hexdigest()[:8]
    suffix = '_' + digest
    keep = max_length - len(suffix)
    if keep < 0:
        # A negative slice bound would count from the end and overshoot
        # max_length; fall back to as many hash digits as fit.
        return digest[:max(max_length, 1)]
    return cleaned[:keep] + suffix
|
||||
@@ -0,0 +1,283 @@
|
||||
"""Deduplica entidades candidatas usando fuzzy matching de nombres."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
import os
|
||||
import uuid
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))
|
||||
|
||||
from entity_candidate import EntityCandidate
|
||||
from deduplication_result import DeduplicationResult
|
||||
from normalize_entity_name import normalize_entity_name
|
||||
from merge_entity_attributes import merge_entity_attributes
|
||||
|
||||
|
||||
# ── Similarity helpers ─────────────────────────────────────────────────────────
|
||||
|
||||
def _levenshtein(a: str, b: str) -> int:
|
||||
"""Distancia de edicion Levenshtein entre dos strings."""
|
||||
if a == b:
|
||||
return 0
|
||||
if not a:
|
||||
return len(b)
|
||||
if not b:
|
||||
return len(a)
|
||||
prev = list(range(len(b) + 1))
|
||||
for i, ca in enumerate(a, 1):
|
||||
curr = [i]
|
||||
for j, cb in enumerate(b, 1):
|
||||
cost = 0 if ca == cb else 1
|
||||
curr.append(min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + cost))
|
||||
prev = curr
|
||||
return prev[-1]
|
||||
|
||||
|
||||
def _jaccard(tokens_a: list[str], tokens_b: list[str]) -> float:
|
||||
"""Similitud de Jaccard entre dos conjuntos de tokens."""
|
||||
set_a = set(tokens_a)
|
||||
set_b = set(tokens_b)
|
||||
if not set_a and not set_b:
|
||||
return 1.0
|
||||
inter = len(set_a & set_b)
|
||||
union = len(set_a | set_b)
|
||||
return inter / union if union else 0.0
|
||||
|
||||
|
||||
def _name_similarity(a: str, b: str) -> float:
    """Score how similar two (already normalized) names are, in [0, 1].

    The base score is the larger of the Levenshtein similarity
    (1 - distance / longer length) and the Jaccard similarity of the
    whitespace-separated tokens. Two bonuses of +0.3 each are then applied,
    each capped at 1.0: one when either name is a substring of the other, and
    one when either name is an acronym of the other's tokens.
    """
    if a == b:
        return 1.0

    # Levenshtein similarity
    longest = max(len(a), len(b))
    lev_sim = 1.0 - (_levenshtein(a, b) / longest) if longest else 1.0

    # Jaccard similarity over tokens
    words_a = a.split()
    words_b = b.split()
    score = max(lev_sim, _jaccard(words_a, words_b))

    # Containment: one name contains the other.
    contained = a in b or b in a
    # Acronym: "FBI" ~ "Federal Bureau of Investigation"
    acronym = _is_acronym_of(a, words_b) or _is_acronym_of(b, words_a)

    for bonus_applies in (contained, acronym):
        if bonus_applies:
            score = min(1.0, score + 0.3)

    return score
|
||||
|
||||
|
||||
def _is_acronym_of(candidate: str, tokens: list[str]) -> bool:
|
||||
"""Comprueba si candidate es un acronimo formado por las iniciales de tokens."""
|
||||
if not candidate or not tokens:
|
||||
return False
|
||||
initials = "".join(t[0] for t in tokens if t).upper()
|
||||
return candidate.upper() == initials
|
||||
|
||||
|
||||
_EXACT_TYPES = {"ip", "email", "domain", "crypto_wallet", "phone"}
|
||||
|
||||
|
||||
def _is_exact_type(entity_type: str) -> bool:
|
||||
"""Tipos tecnicos donde solo se acepta matching exacto."""
|
||||
return entity_type.lower() in _EXACT_TYPES
|
||||
|
||||
|
||||
# ── Union-Find ─────────────────────────────────────────────────────────────────
|
||||
|
||||
class _UnionFind:
|
||||
def __init__(self, n: int) -> None:
|
||||
self._parent = list(range(n))
|
||||
self._rank = [0] * n
|
||||
|
||||
def find(self, x: int) -> int:
|
||||
while self._parent[x] != x:
|
||||
self._parent[x] = self._parent[self._parent[x]]
|
||||
x = self._parent[x]
|
||||
return x
|
||||
|
||||
def union(self, x: int, y: int) -> None:
|
||||
rx, ry = self.find(x), self.find(y)
|
||||
if rx == ry:
|
||||
return
|
||||
if self._rank[rx] < self._rank[ry]:
|
||||
rx, ry = ry, rx
|
||||
self._parent[ry] = rx
|
||||
if self._rank[rx] == self._rank[ry]:
|
||||
self._rank[rx] += 1
|
||||
|
||||
|
||||
# ── Implementacion principal ────────────────────────────────────────────────────
|
||||
|
||||
def deduplicate_entities(
    candidates: list[EntityCandidate],
    name_threshold: float = 0.85,
    same_type_only: bool = True,
) -> DeduplicationResult:
    """Group candidate entities that refer to the same real-world entity.

    Names are compared with fuzzy matching (Levenshtein + Jaccard) and the
    matching pairs are joined with a Union-Find structure, so clusters are
    transitive. Each cluster yields one canonical entity that merges the
    attributes of all its members; the member with the highest confidence
    provides the canonical name, type_ref and type_label.

    For technical types (ip, email, domain, crypto_wallet, phone) only an
    exact match of the normalised names is accepted and ``name_threshold`` is
    ignored. The rule applies when either candidate of a pair has such a type.

    Args:
        candidates: EntityCandidate objects to deduplicate.
        name_threshold: minimum similarity score (0-1) for two names to be
            considered the same.
        same_type_only: when True, only candidates sharing the same type_ref
            are compared.

    Returns:
        DeduplicationResult with the deduplicated entities, the resolution
        maps and the merge history.
    """
    if not candidates:
        return DeduplicationResult(
            entities=[],
            entity_id_map={},
            name_to_id={},
            merge_log=[],
            total_before=0,
            total_after=0,
        )

    n = len(candidates)

    # Step 1: per-candidate values computed once and reused by the O(n^2) loop.
    normalized: list[str] = [
        normalize_entity_name(c.name, c.type_ref) for c in candidates
    ]
    exact_only = [_is_exact_type(c.type_ref) for c in candidates]
    confidences = [c.confidence for c in candidates]

    # Step 2: Union-Find over all candidate indices.
    uf = _UnionFind(n)

    # Step 3: pairwise comparison (restricted to equal types if same_type_only).
    merge_pairs: list[tuple[int, int, float]] = []
    for i in range(n):
        type_i = candidates[i].type_ref
        for j in range(i + 1, n):
            if same_type_only and type_i != candidates[j].type_ref:
                continue

            ni, nj = normalized[i], normalized[j]

            # Technical identifiers must never be fuzzy-matched, whichever
            # side of the pair carries the technical type.
            if exact_only[i] or exact_only[j]:
                if ni == nj:
                    uf.union(i, j)
                    merge_pairs.append((i, j, 1.0))
                continue

            score = _name_similarity(ni, nj)
            if score >= name_threshold:
                uf.union(i, j)
                merge_pairs.append((i, j, score))

    # Step 4: group indices by their Union-Find root.
    clusters: dict[int, list[int]] = {}
    for i in range(n):
        clusters.setdefault(uf.find(i), []).append(i)

    # Matched pairs grouped by final root; used to report a score per merge.
    merged_pairs_by_root: dict[int, list[tuple[int, int, float]]] = {}
    for i, j, score in merge_pairs:
        merged_pairs_by_root.setdefault(uf.find(i), []).append((i, j, score))

    # Step 5: build one canonical entity per cluster.
    merged_entities: list[EntityCandidate] = []
    entity_id_map: dict[str, str] = {}
    name_to_id: dict[str, str] = {}
    merge_log: list[dict] = []

    for root, indices in clusters.items():
        members = [candidates[idx] for idx in indices]

        # Highest confidence wins; ties go to the earliest candidate.
        best_idx = max(indices, key=confidences.__getitem__)
        best = candidates[best_idx]
        canonical_name = best.name
        # Reuses the value computed in step 1 (assumes normalize_entity_name
        # is deterministic for the same name/type pair).
        canonical_norm = normalized[best_idx]

        if len(members) == 1:
            merged_attrs = best.attributes
            merged_confidence = best.confidence
            merged_chunks = list(best.source_chunk_indices)
            merged_from = list(best.merged_from) if best.merged_from else [best.name]
        else:
            merged_attrs = merge_entity_attributes([c.attributes for c in members])
            merged_confidence = best.confidence

            # dict.fromkeys de-duplicates while keeping first-seen order.
            merged_chunks = list(
                dict.fromkeys(
                    chunk_idx for c in members for chunk_idx in c.source_chunk_indices
                )
            )
            merged_from = list(
                dict.fromkeys(
                    original
                    for c in members
                    for original in (c.merged_from if c.merged_from else [c.name])
                )
            )

            # Merge log entry for this cluster.
            pairs = merged_pairs_by_root.get(root, [])
            max_score = max((s for _, _, s in pairs), default=1.0)
            merge_log.append(
                {
                    "canonical": canonical_name,
                    "merged": [c.name for c in members if c is not best],
                    "score": round(max_score, 4),
                    "reason": "fuzzy_name",
                }
            )

        ent_id = str(uuid.uuid4())
        merged_entities.append(
            EntityCandidate(
                name=canonical_name,
                name_normalized=canonical_norm,
                # Type comes from the same candidate that provides the name,
                # so the canonical entity is internally consistent even when
                # a cluster mixes types (same_type_only=False).
                type_ref=best.type_ref,
                type_label=best.type_label,
                attributes=merged_attrs,
                confidence=merged_confidence,
                source_chunk_indices=merged_chunks,
                merged_from=merged_from,
            )
        )

        # Populate the resolution maps.
        entity_id_map[canonical_norm] = ent_id
        for orig_name in merged_from:
            name_to_id[orig_name] = ent_id
        name_to_id[canonical_norm] = ent_id

    return DeduplicationResult(
        entities=merged_entities,
        entity_id_map=entity_id_map,
        name_to_id=name_to_id,
        merge_log=merge_log,
        total_before=n,
        total_after=len(merged_entities),
    )
|
||||
@@ -0,0 +1,189 @@
|
||||
"""Deduplica RelationCandidate resolviendo nombres a IDs y colapsando duplicados."""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# --- Importar levenshtein_distance desde cybersecurity ---
|
||||
# Soporta dos contextos:
|
||||
# 1. Ejecutado desde python/functions/datascience/ (pytest local)
|
||||
# 2. Ejecutado desde la raiz del registry (fn run)
|
||||
def _levenshtein_distance(a: str, b: str) -> int:
|
||||
"""Calcula la distancia de edicion de Levenshtein entre dos strings."""
|
||||
if len(a) < len(b):
|
||||
return _levenshtein_distance(b, a)
|
||||
if len(b) == 0:
|
||||
return len(a)
|
||||
prev_row = list(range(len(b) + 1))
|
||||
for i, ca in enumerate(a):
|
||||
curr_row = [i + 1]
|
||||
for j, cb in enumerate(b):
|
||||
cost = 0 if ca == cb else 1
|
||||
curr_row.append(
|
||||
min(curr_row[j] + 1, prev_row[j + 1] + 1, prev_row[j] + cost)
|
||||
)
|
||||
prev_row = curr_row
|
||||
return prev_row[-1]
|
||||
|
||||
|
||||
try:
|
||||
_here = os.path.dirname(os.path.abspath(__file__))
|
||||
_cyber_path = os.path.join(_here, "..", "cybersecurity")
|
||||
if _cyber_path not in sys.path:
|
||||
sys.path.insert(0, _cyber_path)
|
||||
from cybersecurity import levenshtein_distance as _lev
|
||||
except ImportError:
|
||||
_lev = None # type: ignore
|
||||
|
||||
levenshtein_distance = _lev if _lev is not None else _levenshtein_distance
|
||||
|
||||
|
||||
def _fuzzy_resolve(name: str, entity_id_map: dict[str, str], threshold: int = 3) -> str:
|
||||
"""Intenta resolver un nombre contra las claves del mapa por fuzzy match.
|
||||
|
||||
Recorre todas las claves de entity_id_map y busca la mas cercana segun
|
||||
distancia de Levenshtein. Retorna el entity_id si la distancia es <=
|
||||
threshold, o '' si no hay match aceptable.
|
||||
|
||||
Args:
|
||||
name: nombre a resolver (ya en lowercase strip).
|
||||
entity_id_map: mapa nombre_normalizado -> entity_id.
|
||||
threshold: distancia maxima de edicion para considerar match (default 3).
|
||||
|
||||
Returns:
|
||||
entity_id del mejor match o '' si no hay match.
|
||||
"""
|
||||
best_id = ""
|
||||
best_dist = threshold + 1
|
||||
for key, entity_id in entity_id_map.items():
|
||||
dist = levenshtein_distance(name, key)
|
||||
if dist < best_dist:
|
||||
best_dist = dist
|
||||
best_id = entity_id
|
||||
return best_id if best_dist <= threshold else ""
|
||||
|
||||
|
||||
def deduplicate_relations(
    relations: list,
    entity_id_map: dict[str, str],
) -> list:
    """Deduplicate candidate relations, resolving entity names to final entity IDs.

    Algorithm:
        1. For each RelationCandidate, resolve from_name and to_name to an
           entity_id through entity_id_map: exact lookup on the lowercased,
           stripped name first, then a fuzzy match based on
           levenshtein_distance. If a name still cannot be resolved, the
           relation is dropped with a warning.
        2. Drop self-loops (from_id == to_id).
        3. Collapse relations sharing (from_id, to_id, relation_type):
           - description: unique descriptions joined with '; '
           - confidence: maximum of the group
        4. Return the clean list of RelationCandidate with from_id and to_id
           filled in.

    Args:
        relations: RelationCandidate objects carrying the original
            from_name / to_name.
        entity_id_map: mapping normalised_name -> entity_id (output of
            deduplicate_entities); lets merged names be resolved.

    Returns:
        Deduplicated list of RelationCandidate with from_id and to_id resolved.
    """
    # Import the type lazily so this works both from datascience/ and from
    # the registry root.
    try:
        _types_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            "..", "..", "..", "python", "types", "datascience",
        )
        if _types_path not in sys.path:
            sys.path.insert(0, _types_path)
        from relation_candidate import RelationCandidate
    except ImportError:
        from relation_candidate import RelationCandidate  # type: ignore

    def _lookup(raw_name: str) -> str:
        """Exact lookup on the cleaned name, falling back to a fuzzy match."""
        key = raw_name.lower().strip()
        return entity_id_map.get(key, "") or _fuzzy_resolve(key, entity_id_map)

    # --- Steps 1 and 2: resolve both endpoints, drop failures and self-loops ---
    resolved: list = []
    for rel in relations:
        from_id = _lookup(rel.from_name)
        if not from_id:
            logger.warning(
                "deduplicate_relations: no se pudo resolver from_name=%r — descartando",
                rel.from_name,
            )
            continue

        to_id = _lookup(rel.to_name)
        if not to_id:
            logger.warning(
                "deduplicate_relations: no se pudo resolver to_name=%r — descartando",
                rel.to_name,
            )
            continue

        if from_id == to_id:
            logger.debug(
                "deduplicate_relations: self-loop descartado (from=%r, to=%r, type=%r)",
                rel.from_name,
                rel.to_name,
                rel.relation_type,
            )
            continue

        resolved.append(
            RelationCandidate(
                from_name=rel.from_name,
                to_name=rel.to_name,
                from_id=from_id,
                to_id=to_id,
                relation_type=rel.relation_type,
                description=rel.description,
                confidence=rel.confidence,
                source_chunk_index=rel.source_chunk_index,
            )
        )

    # --- Step 3: bucket by (from_id, to_id, relation_type) ---
    buckets: dict[tuple, list] = {}
    for rel in resolved:
        bucket_key = (rel.from_id, rel.to_id, rel.relation_type)
        buckets.setdefault(bucket_key, []).append(rel)

    deduplicated: list = []
    for (from_id, to_id, rel_type), members in buckets.items():
        first = members[0]
        if len(members) == 1:
            deduplicated.append(first)
            continue

        # Non-empty descriptions, de-duplicated in first-seen order.
        unique_descriptions = list(
            dict.fromkeys(m.description for m in members if m.description)
        )
        deduplicated.append(
            RelationCandidate(
                from_name=first.from_name,
                to_name=first.to_name,
                from_id=from_id,
                to_id=to_id,
                relation_type=rel_type,
                description="; ".join(unique_descriptions),
                confidence=max(m.confidence for m in members),
                source_chunk_index=first.source_chunk_index,
            )
        )

    return deduplicated
|
||||
@@ -0,0 +1,22 @@
|
||||
"""DeduplicationResult — resultado del proceso de deduplicacion de entidades."""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from entity_candidate import EntityCandidate
|
||||
|
||||
|
||||
@dataclass
class DeduplicationResult:
    """Result of an entity deduplication run.

    ``name_to_id`` maps ALL original names (including the ones that were
    merged into another entity) to their final ID, so relations that use any
    variant of a name can still be resolved.
    """

    # Entities that remain after deduplication.
    entities: list[EntityCandidate]
    # Name -> entity ID mapping.
    entity_id_map: dict[str, str]
    # Every original name -> final entity ID (see class docstring).
    name_to_id: dict[str, str]
    # One dict per merge that was performed.
    merge_log: list[dict] = field(default_factory=list)
    # Entity counts before and after the run.
    total_before: int = 0
    total_after: int = 0
|
||||
@@ -0,0 +1,34 @@
|
||||
"""EntityCandidate — candidato de entidad extraido por el LLM."""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
@dataclass
class EntityCandidate:
    """Entity candidate extracted by the LLM.

    It may come from a single chunk or be the result of merging several
    extractions. ``merged_from`` tracks the original names for debugging.
    """

    name: str
    name_normalized: str = ""
    type_ref: str = ""
    type_label: str = ""
    attributes: dict = field(default_factory=dict)
    confidence: float = 0.0
    source_chunk_indices: list[int] = field(default_factory=list)
    merged_from: list[str] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Serialise the candidate into a plain dictionary.

        Values are referenced, not copied: the returned dict shares its
        mutable values (attributes, lists) with the candidate.
        """
        exported = (
            "name",
            "name_normalized",
            "type_ref",
            "type_label",
            "attributes",
            "confidence",
            "source_chunk_indices",
            "merged_from",
        )
        return {key: getattr(self, key) for key in exported}
|
||||
@@ -0,0 +1,145 @@
|
||||
"""Extrae entidades de un chunk de texto usando un LLM inyectado."""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import warnings
|
||||
from typing import Callable
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))
|
||||
|
||||
from entity_candidate import EntityCandidate
|
||||
|
||||
|
||||
def _build_system_prompt(entity_schema: list[dict], language_instruction: str) -> str:
|
||||
"""Construye el system prompt para extraccion de entidades."""
|
||||
lines = [
|
||||
"You are an entity extraction expert. Given text, extract all entities",
|
||||
"matching these types. For each entity, provide: name, type_ref,",
|
||||
"attributes (matching the metadata_fields for that type), and a",
|
||||
"confidence score (0.0-1.0).",
|
||||
"",
|
||||
"Entity types:",
|
||||
]
|
||||
|
||||
for schema_entry in entity_schema:
|
||||
label = schema_entry.get("label", "Unknown")
|
||||
type_ref = schema_entry.get("type_ref", "")
|
||||
metadata_fields = schema_entry.get("metadata_fields", [])
|
||||
lines.append(f"- {label} (type_ref: {type_ref})")
|
||||
if metadata_fields:
|
||||
lines.append(f" fields: {', '.join(metadata_fields)}")
|
||||
|
||||
lines += [
|
||||
"",
|
||||
'Output JSON: {"entities": [{"name": "...", "type_ref": "...", "attributes": {...}, "confidence": 0.9}]}',
|
||||
"",
|
||||
"Rules:",
|
||||
"- Only extract entities explicitly mentioned in the text",
|
||||
"- Use the exact type_ref from the schema",
|
||||
"- Leave unknown attributes as null",
|
||||
"- Confidence: 1.0 = explicitly named, 0.7 = strongly implied, 0.5 = weakly implied",
|
||||
f"- {language_instruction}",
|
||||
]
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def extract_entities_llm(
    text: str,
    entity_schema: list[dict],
    llm_chat_json: Callable[[list[dict]], dict],
    language_instruction: str = "Respond in English.",
) -> list[EntityCandidate]:
    """Extract entities from a text chunk using an injected LLM.

    Builds a system prompt from the entity-type schema, calls the LLM and
    validates the response, returning a list of EntityCandidate. Malformed
    responses never raise: anything that cannot be validated is skipped,
    with a warning where the original code already emitted one.

    Args:
        text: text chunk to analyse.
        entity_schema: list of entity types. Each entry is a dict with the
            keys 'type_ref', 'label' and optionally 'metadata_fields', e.g.
            [{"type_ref": "osint_person_go_cybersecurity", "label": "Person",
              "metadata_fields": ["full_name", "alias"]}]
        llm_chat_json: callable that receives a list of OpenAI-style messages
            and returns the LLM's JSON response as a dict:
            llm_chat_json([{"role": "system", "content": "..."}, ...]) -> dict
        language_instruction: language instruction for the LLM. Defaults to
            "Respond in English."

    Returns:
        List of extracted EntityCandidate. Empty when the LLM call fails,
        the response is not a JSON object with an 'entities' list, or no
        valid entity is found.

    Raises:
        ValueError: if entity_schema is empty.
    """
    if not entity_schema:
        raise ValueError("entity_schema no puede estar vacio")

    # The keys of this mapping double as the set of valid type_refs.
    type_ref_to_label = {
        entry.get("type_ref", ""): entry.get("label", "") for entry in entity_schema
    }

    messages = [
        {"role": "system", "content": _build_system_prompt(entity_schema, language_instruction)},
        {"role": "user", "content": text},
    ]

    try:
        response = llm_chat_json(messages)
    except Exception as exc:
        # Best effort by design: a failed LLM call yields no entities.
        warnings.warn(f"extract_entities_llm: error llamando al LLM: {exc}", stacklevel=2)
        return []

    # The callable is typed as returning a dict, but LLM output is untrusted:
    # a list, string or None here must not crash on .get().
    if not isinstance(response, dict):
        warnings.warn(
            "extract_entities_llm: la respuesta del LLM no es un objeto JSON",
            stacklevel=2,
        )
        return []

    raw_entities = response.get("entities", [])
    if not isinstance(raw_entities, list):
        warnings.warn(
            "extract_entities_llm: la respuesta del LLM no contiene 'entities' como lista",
            stacklevel=2,
        )
        return []

    candidates: list[EntityCandidate] = []
    for item in raw_entities:
        if not isinstance(item, dict):
            continue

        # Names must be non-blank strings; later stages call str methods on them.
        name = item.get("name", "")
        if not isinstance(name, str) or not name.strip():
            continue

        # The isinstance check also protects the dict lookup from unhashable
        # values such as lists.
        type_ref = item.get("type_ref", "")
        if not isinstance(type_ref, str) or type_ref not in type_ref_to_label:
            warnings.warn(
                f"extract_entities_llm: type_ref '{type_ref}' no esta en el schema, descartando entidad '{name}'",
                stacklevel=2,
            )
            continue

        attributes = item.get("attributes", {})
        if not isinstance(attributes, dict):
            attributes = {}
        # Drop attributes the LLM left as null.
        attributes = {k: v for k, v in attributes.items() if v is not None}

        confidence = item.get("confidence", 0.0)
        # `confidence != confidence` is only true for NaN, which would
        # otherwise be clamped to 1.0 by the min/max below.
        if not isinstance(confidence, (int, float)) or confidence != confidence:
            confidence = 0.0
        confidence = float(max(0.0, min(1.0, confidence)))

        candidates.append(
            EntityCandidate(
                name=name,
                type_ref=type_ref,
                type_label=type_ref_to_label.get(type_ref, ""),
                attributes=attributes,
                confidence=confidence,
            )
        )

    return candidates
|
||||
@@ -0,0 +1,141 @@
|
||||
"""extract_relations_llm — extrae relaciones entre entidades usando un LLM."""
|
||||
|
||||
import logging
|
||||
import sys
|
||||
import os
|
||||
from typing import Callable
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", ""))
|
||||
|
||||
from entity_candidate import EntityCandidate
|
||||
from relation_candidate import RelationCandidate
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def extract_relations_llm(
    text: str,
    entities: list[EntityCandidate],
    relation_types: list[str],
    llm_chat_json: Callable[[list[dict]], dict],
    language_instruction: str = "Respond in English.",
) -> list[RelationCandidate]:
    """Extract relations between entities of a text chunk using an LLM.

    Given the original text and the entities already extracted from it, asks
    the LLM to identify relations between pairs of entities. Relations whose
    from_name or to_name does not match an existing entity name exactly are
    discarded. Relation types that are not allowed are replaced by
    "related_to". Malformed responses never raise.

    Args:
        text: text chunk (the same one the entities were extracted from).
        entities: entities already extracted from the chunk.
        relation_types: allowed relation types, e.g. ["funds", "employs",
            "communicates_with", "owns", "related_to"].
        llm_chat_json: injected callable that receives a list of messages
            (dicts with "role" and "content") and returns the LLM's JSON
            response as a dict.
        language_instruction: language instruction for the LLM.

    Returns:
        List of validated RelationCandidate. Empty when there are fewer than
        two entities, the LLM call fails, the response is malformed, or no
        relation is found.
    """
    if len(entities) < 2:
        return []

    entity_names = {e.name for e in entities}
    relation_types_set = set(relation_types)

    # Entity list shown to the LLM.
    entity_lines = "\n".join(
        f'- "{e.name}" ({e.type_label or e.type_ref or "Entity"})' for e in entities
    )

    # Allowed relation types shown to the LLM.
    relation_types_str = ", ".join(relation_types)

    system_prompt = f"""\
You are a relation extraction expert. Given text and a list of entities already \
extracted, identify relationships between them.

Entities found in this text:
{entity_lines}

Allowed relation types: {relation_types_str}

Output JSON: {{"relations": [
{{"from_name": "Entity A", "to_name": "Entity B",
"relation_type": "employs", "description": "...", "confidence": 0.8}}
]}}

Rules:
- Only extract relations explicitly stated or strongly implied in the text
- from_name and to_name must match entity names exactly as listed above
- relation_type must be one of the allowed types
- Confidence: 1.0 = explicitly stated, 0.7 = strongly implied, 0.5 = weakly implied
- Do not invent entities not in the list above
- {language_instruction}"""

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]

    try:
        response = llm_chat_json(messages)
    except Exception as exc:
        # Best effort by design: a failed LLM call yields no relations.
        logger.warning("extract_relations_llm: LLM call failed: %s", exc)
        return []

    # LLM output is untrusted: a list, string or None must not crash on .get().
    if not isinstance(response, dict):
        logger.warning("extract_relations_llm: LLM response is not a JSON object")
        return []

    raw_relations = response.get("relations", [])
    if not isinstance(raw_relations, list):
        logger.warning("extract_relations_llm: 'relations' is not a list in LLM response")
        return []

    results: list[RelationCandidate] = []
    for item in raw_relations:
        if not isinstance(item, dict):
            continue

        from_name = item.get("from_name", "")
        to_name = item.get("to_name", "")

        # Both endpoints must be existing entity names. The isinstance checks
        # keep unhashable values (e.g. lists) away from the set lookup.
        if not isinstance(from_name, str) or from_name not in entity_names:
            logger.debug(
                "extract_relations_llm: from_name '%s' no coincide con ninguna entidad — descartando",
                from_name,
            )
            continue
        if not isinstance(to_name, str) or to_name not in entity_names:
            logger.debug(
                "extract_relations_llm: to_name '%s' no coincide con ninguna entidad — descartando",
                to_name,
            )
            continue

        relation_type = item.get("relation_type", "")
        if not isinstance(relation_type, str) or relation_type not in relation_types_set:
            logger.debug(
                "extract_relations_llm: tipo '%s' no permitido — usando 'related_to'",
                relation_type,
            )
            relation_type = "related_to"

        confidence = item.get("confidence", 0.0)
        # `confidence != confidence` is only true for NaN, which would
        # otherwise be clamped to 1.0 by the min/max below.
        if not isinstance(confidence, (int, float)) or confidence != confidence:
            confidence = 0.0
        confidence = float(max(0.0, min(1.0, confidence)))

        # Downstream deduplication joins and hashes descriptions, so only
        # strings are kept.
        description = item.get("description", "")
        if not isinstance(description, str):
            description = ""

        results.append(
            RelationCandidate(
                from_name=from_name,
                to_name=to_name,
                relation_type=relation_type,
                description=description,
                confidence=confidence,
            )
        )

    return results
|
||||
@@ -0,0 +1,92 @@
|
||||
"""Extract plain text from PDF, Markdown, or TXT files."""
|
||||
|
||||
|
||||
SUPPORTED_EXTENSIONS = {".pdf", ".md", ".markdown", ".txt"}
|
||||
|
||||
|
||||
def _detect_encoding(data: bytes) -> str:
|
||||
"""Detect encoding of raw bytes using multiple fallback strategies."""
|
||||
# Strategy 1: UTF-8
|
||||
try:
|
||||
data.decode("utf-8")
|
||||
return "utf-8"
|
||||
except UnicodeDecodeError:
|
||||
pass
|
||||
|
||||
# Strategy 2: charset_normalizer
|
||||
try:
|
||||
from charset_normalizer import from_bytes
|
||||
|
||||
result = from_bytes(data).best()
|
||||
if result is not None and result.encoding:
|
||||
return result.encoding
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
# Strategy 3: chardet
|
||||
try:
|
||||
import chardet
|
||||
|
||||
detected = chardet.detect(data)
|
||||
if detected and detected.get("encoding"):
|
||||
return detected["encoding"]
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
# Last resort: UTF-8 with replacement
|
||||
return "utf-8"
|
||||
|
||||
|
||||
def extract_text_from_file(file_path: str) -> str:
    """Extract plain text from a file. Supports PDF, Markdown and TXT.

    For PDF files uses PyMuPDF (fitz) to extract the text of each page,
    joining pages with double newlines; the document is always closed, even
    if a page fails to extract. For text-based files (.md, .markdown, .txt)
    reads the bytes and decodes them with automatic encoding detection,
    falling back to UTF-8 with replacement characters.

    Args:
        file_path: Absolute or relative path to the file. The extension is
            matched case-insensitively.

    Returns:
        str: Extracted plain text content.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If the file extension is not supported.
        ImportError: If PyMuPDF is not installed and a PDF is provided.
    """
    import os

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    _, ext = os.path.splitext(file_path.lower())

    if ext == ".pdf":
        try:
            import fitz  # PyMuPDF
        except ImportError as e:
            raise ImportError(
                "PyMuPDF is required for PDF extraction. "
                "Install it with: pip install PyMuPDF"
            ) from e

        doc = fitz.open(file_path)
        try:
            pages = [page.get_text() for page in doc]
        finally:
            # Release the file handle and native resources held by PyMuPDF.
            doc.close()
        return "\n\n".join(pages)

    if ext in {".md", ".markdown", ".txt"}:
        with open(file_path, "rb") as f:
            raw = f.read()

        encoding = _detect_encoding(raw)
        try:
            return raw.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # Wrong guess or unknown codec name: never fail on text files.
            return raw.decode("utf-8", errors="replace")

    raise ValueError(
        f"Unsupported file extension: '{ext}'. "
        f"Supported: {', '.join(sorted(SUPPORTED_EXTENSIONS))}"
    )
|
||||
@@ -0,0 +1,208 @@
|
||||
"""Pipeline de extraccion de entidades y relaciones desde un documento."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
import os
|
||||
import time
|
||||
import warnings
|
||||
from typing import Callable
|
||||
|
||||
# Soporte para ejecucion desde la raiz del registry o desde el directorio del archivo
|
||||
|
||||
from extract_text_from_file import extract_text_from_file
|
||||
from core_functions import preprocess_text
|
||||
from split_text_into_chunks import split_text_into_chunks
|
||||
from build_entity_schema_prompt import build_entity_schema_prompt
|
||||
from build_relation_schema_prompt import build_relation_schema_prompt
|
||||
from extract_entities_llm import extract_entities_llm
|
||||
from extract_relations_llm import extract_relations_llm
|
||||
from deduplicate_entities import deduplicate_entities
|
||||
from deduplicate_relations import deduplicate_relations
|
||||
from entity_candidate import EntityCandidate
|
||||
from extraction_result import ExtractionResult
|
||||
from extraction_stats import ExtractionStats
|
||||
|
||||
|
||||
def extraction_pipeline(
|
||||
file_path: str,
|
||||
entity_presets: list[dict],
|
||||
relation_types: list[str],
|
||||
llm_chat_json: Callable[[list[dict]], dict],
|
||||
chunk_size: int = 500,
|
||||
chunk_overlap: int = 50,
|
||||
confidence_threshold: float = 0.5,
|
||||
dedup_threshold: float = 0.85,
|
||||
on_progress: Callable[[str, float], None] | None = None,
|
||||
) -> ExtractionResult:
|
||||
"""Pipeline completa de extraccion de entidades y relaciones desde un documento.
|
||||
|
||||
Orquesta extract_text_from_file -> preprocess_text -> split_text_into_chunks
|
||||
-> extract_entities_llm por chunk -> deduplicate_entities ->
|
||||
extract_relations_llm por chunk -> deduplicate_relations.
|
||||
|
||||
Args:
|
||||
file_path: ruta al archivo a procesar (PDF, Markdown, TXT).
|
||||
entity_presets: lista de dicts con type_ref, label y metadata_fields.
|
||||
Ejemplo: [{"type_ref": "osint_person_go_cybersecurity",
|
||||
"label": "Person",
|
||||
"metadata_fields": ["full_name", "nationality"]}]
|
||||
relation_types: tipos de relacion permitidos para extraccion.
|
||||
Ejemplo: ["funds", "employs", "communicates_with", "owns"]
|
||||
llm_chat_json: funcion inyectada que recibe messages OpenAI y retorna dict
|
||||
con la respuesta JSON ya parseada. Sin acoplamiento a ningun proveedor.
|
||||
chunk_size: numero de caracteres por chunk (default 500).
|
||||
chunk_overlap: overlap entre chunks consecutivos (default 50).
|
||||
confidence_threshold: umbral minimo de confidence para aceptar entidades
|
||||
candidatas antes de deduplicar (default 0.5).
|
||||
dedup_threshold: score minimo de similitud para mergear entidades (default 0.85).
|
||||
on_progress: callback opcional de progreso (message: str, pct: float 0-1).
|
||||
0-40%: extraccion de entidades, 40-80%: extraccion de relaciones,
|
||||
80-100%: deduplicacion.
|
||||
|
||||
Returns:
|
||||
ExtractionResult con entidades y relaciones deduplicadas y stats del proceso.
|
||||
|
||||
Raises:
|
||||
FileNotFoundError: si file_path no existe.
|
||||
ValueError: si entity_presets esta vacio.
|
||||
"""
|
||||
if not entity_presets:
|
||||
raise ValueError("entity_presets no puede estar vacio")
|
||||
|
||||
if not os.path.exists(file_path):
|
||||
raise FileNotFoundError(f"Archivo no encontrado: {file_path}")
|
||||
|
||||
def _progress(msg: str, pct: float) -> None:
|
||||
if on_progress is not None:
|
||||
try:
|
||||
on_progress(msg, pct)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
start_time = time.monotonic()
|
||||
stats = ExtractionStats()
|
||||
|
||||
# ── Paso 1: Extraer texto ──────────────────────────────────────────────────
|
||||
_progress("Extracting text from file...", 0.0)
|
||||
try:
|
||||
raw_text = extract_text_from_file(file_path)
|
||||
except Exception as exc:
|
||||
warnings.warn(f"extraction_pipeline: error al extraer texto: {exc}")
|
||||
raw_text = ""
|
||||
|
||||
# ── Paso 2: Preprocesar ────────────────────────────────────────────────────
|
||||
clean_text = preprocess_text(raw_text)
|
||||
stats.total_chars = len(clean_text)
|
||||
|
||||
# ── Paso 3: Dividir en chunks ──────────────────────────────────────────────
|
||||
chunks = split_text_into_chunks(clean_text, chunk_size=chunk_size, overlap=chunk_overlap)
|
||||
n = len(chunks)
|
||||
stats.total_chunks = n
|
||||
|
||||
if n == 0:
|
||||
stats.processing_time_seconds = time.monotonic() - start_time
|
||||
return ExtractionResult(entities=[], relations=[], stats=stats)
|
||||
|
||||
# ── Paso 4: Extraer entidades por chunk ────────────────────────────────────
|
||||
all_raw_entities: list[EntityCandidate] = []
|
||||
|
||||
for i, chunk in enumerate(chunks):
|
||||
_progress(f"Extracting entities from chunk {i + 1}/{n}", (i / n) * 0.4)
|
||||
try:
|
||||
candidates = extract_entities_llm(
|
||||
text=chunk,
|
||||
entity_schema=entity_presets,
|
||||
llm_chat_json=llm_chat_json,
|
||||
)
|
||||
except Exception as exc:
|
||||
warnings.warn(
|
||||
f"extraction_pipeline: error en extract_entities_llm chunk {i}: {exc}"
|
||||
)
|
||||
candidates = []
|
||||
|
||||
for candidate in candidates:
|
||||
# Anotar el chunk de origen
|
||||
if i not in candidate.source_chunk_indices:
|
||||
candidate.source_chunk_indices.append(i)
|
||||
all_raw_entities.append(candidate)
|
||||
|
||||
# ── Paso 5: Filtrar por confidence ─────────────────────────────────────────
|
||||
filtered_entities = [
|
||||
e for e in all_raw_entities if e.confidence >= confidence_threshold
|
||||
]
|
||||
stats.raw_entities_count = len(filtered_entities)
|
||||
|
||||
# Actualizar stats de tipos
|
||||
for ent in filtered_entities:
|
||||
stats.entity_types_found[ent.type_ref] = (
|
||||
stats.entity_types_found.get(ent.type_ref, 0) + 1
|
||||
)
|
||||
|
||||
# ── Paso 6: Deduplicar entidades ───────────────────────────────────────────
|
||||
_progress("Deduplicating entities...", 0.4)
|
||||
dedup_result = deduplicate_entities(filtered_entities, name_threshold=dedup_threshold)
|
||||
|
||||
stats.final_entities_count = dedup_result.total_after
|
||||
stats.entities_merged = dedup_result.total_before - dedup_result.total_after
|
||||
|
||||
final_entities = dedup_result.entities
|
||||
entity_id_map = dedup_result.name_to_id # nombre_original -> entity_id
|
||||
|
||||
# ── Paso 7: Extraer relaciones por chunk ───────────────────────────────────
|
||||
all_raw_relations = []
|
||||
|
||||
for i, chunk in enumerate(chunks):
|
||||
_progress(f"Extracting relations...", 0.4 + (i / n) * 0.4)
|
||||
|
||||
# Obtener entidades relevantes de este chunk
|
||||
chunk_entities = [
|
||||
e for e in final_entities if i in e.source_chunk_indices
|
||||
]
|
||||
# Si no hay entidades en este chunk especifico, usar todas
|
||||
if not chunk_entities:
|
||||
chunk_entities = final_entities
|
||||
|
||||
if len(chunk_entities) < 2:
|
||||
continue
|
||||
|
||||
try:
|
||||
chunk_relations = extract_relations_llm(
|
||||
text=chunk,
|
||||
entities=chunk_entities,
|
||||
relation_types=relation_types,
|
||||
llm_chat_json=llm_chat_json,
|
||||
)
|
||||
except Exception as exc:
|
||||
warnings.warn(
|
||||
f"extraction_pipeline: error en extract_relations_llm chunk {i}: {exc}"
|
||||
)
|
||||
chunk_relations = []
|
||||
|
||||
for rel in chunk_relations:
|
||||
rel.source_chunk_index = i
|
||||
all_raw_relations.extend(chunk_relations)
|
||||
|
||||
stats.raw_relations_count = len(all_raw_relations)
|
||||
|
||||
# Actualizar stats de tipos de relacion
|
||||
for rel in all_raw_relations:
|
||||
stats.relation_types_found[rel.relation_type] = (
|
||||
stats.relation_types_found.get(rel.relation_type, 0) + 1
|
||||
)
|
||||
|
||||
# ── Paso 8: Deduplicar relaciones ──────────────────────────────────────────
|
||||
_progress("Deduplicating relations...", 0.8)
|
||||
final_relations = deduplicate_relations(all_raw_relations, entity_id_map)
|
||||
|
||||
stats.final_relations_count = len(final_relations)
|
||||
stats.relations_merged = stats.raw_relations_count - len(final_relations)
|
||||
stats.processing_time_seconds = time.monotonic() - start_time
|
||||
|
||||
_progress("Done", 1.0)
|
||||
|
||||
return ExtractionResult(
|
||||
entities=final_entities,
|
||||
relations=final_relations,
|
||||
stats=stats,
|
||||
)
|
||||
@@ -0,0 +1,20 @@
|
||||
"""ExtractionResult — resultado final del pipeline de extraccion."""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from entity_candidate import EntityCandidate
|
||||
from extraction_stats import ExtractionStats
|
||||
from relation_candidate import RelationCandidate
|
||||
|
||||
|
||||
@dataclass
class ExtractionResult:
    """Final result of the entity and relation extraction pipeline.

    Holds the deduplicated entity and relation lists together with the
    statistics of the whole run.
    """

    # Entities that survived deduplication.
    entities: list[EntityCandidate]
    # Relations that survived deduplication.
    relations: list[RelationCandidate]
    # Run statistics; a fresh, zeroed ExtractionStats is created when omitted.
    stats: ExtractionStats = field(default_factory=ExtractionStats)
|
||||
@@ -0,0 +1,25 @@
|
||||
"""ExtractionStats — estadisticas del proceso de extraccion."""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
@dataclass
class ExtractionStats:
    """Statistics of an extraction run.

    Useful for reporting and debugging. Records counts before and after
    deduplication, the processing time and the distribution of the types
    that were found.
    """

    # Number of chunks the document text was split into.
    total_chunks: int = 0
    # Length in characters of the text; the extraction pipeline measures it
    # after preprocessing.
    total_chars: int = 0
    # Entity candidates before deduplication; the extraction pipeline counts
    # only those that passed its confidence filter.
    raw_entities_count: int = 0
    # Entities remaining after deduplication.
    final_entities_count: int = 0
    # Difference between the entity counts before and after deduplication.
    entities_merged: int = 0
    # Relation candidates collected before deduplication.
    raw_relations_count: int = 0
    # Relations remaining after deduplication.
    final_relations_count: int = 0
    # Difference between the raw and the final relation counts.
    relations_merged: int = 0
    # NOTE(review): presumably relations dropped rather than merged; no
    # visible pipeline code writes this field — confirm who sets it.
    relations_discarded: int = 0
    # Occurrences per entity type_ref.
    entity_types_found: dict[str, int] = field(default_factory=dict)
    # Occurrences per relation_type.
    relation_types_found: dict[str, int] = field(default_factory=dict)
    # Duration of the run in seconds.
    processing_time_seconds: float = 0.0
|
||||
@@ -0,0 +1,78 @@
|
||||
"""Combina atributos de multiples candidatos de la misma entidad."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
# Field names that get a dedicated conflict-resolution rule.
_NUMERIC_FIELDS = {"risk_score", "balance", "cvss"}
_DATE_MIN_FIELDS = {"first_seen", "created_date"}
_DATE_MAX_FIELDS = {"last_seen", "expires_date"}
_BOOL_FIELDS = {"verified", "exploited"}


def _longest_by_text(values: list):
    """Return the value whose ``str()`` form is longest (first one wins ties)."""
    return max(values, key=lambda value: len(str(value)))


def _union_preserving_order(values: list) -> list:
    """Merge list values into one duplicate-free list, keeping first-seen order."""
    union: list = []
    for value in values:
        # A scalar mixed in with lists is treated as a one-element list so a
        # string is not exploded into characters and an int does not crash.
        items = value if isinstance(value, list) else [value]
        for item in items:
            if item not in union:
                union.append(item)
    return union


def _resolve_conflict(key: str, values: list):
    """Pick one value for ``key`` out of several differing non-null values."""
    try:
        if key in _NUMERIC_FIELDS or key in _DATE_MAX_FIELDS:
            return max(values)
        if key in _DATE_MIN_FIELDS:
            return min(values)
    except TypeError:
        # Candidates disagree on the type (e.g. 7.5 vs "9.8"), so they cannot
        # be ordered; fall back to the generic rule instead of aborting.
        return _longest_by_text(values)

    if key in _BOOL_FIELDS:
        return any(values)
    if any(isinstance(value, list) for value in values):
        return _union_preserving_order(values)
    return _longest_by_text(values)


def merge_entity_attributes(attr_list: list[dict]) -> dict:
    """Combine the attributes of several candidates of the same entity.

    For every key present in any candidate, the non-null values are collected
    and reduced to a single value:

    - no non-null value: ``None``
    - a single value, or all values equal: that value
    - numeric fields (risk_score, balance, cvss): max
    - earliest-date fields (first_seen, created_date): min
    - latest-date fields (last_seen, expires_date): max
    - boolean fields (verified, exploited): logical OR
    - any list value: union without duplicates, in order of appearance
    - anything else: the value with the longest text form, returned with its
      original type

    Values of a numeric/date field that cannot be compared with each other
    fall back to the "longest text form" rule.

    Args:
        attr_list: List of attribute dicts, one per candidate.

    Returns:
        Dict with the merged attributes. Keys appear in the order in which
        they were first seen across ``attr_list``.
    """
    if not attr_list:
        return {}

    # dict.fromkeys de-duplicates while keeping first-seen order, which makes
    # the output order reproducible (a set would not).
    ordered_keys = dict.fromkeys(key for attrs in attr_list for key in attrs)

    merged: dict = {}
    for key in ordered_keys:
        values = [attrs[key] for attrs in attr_list if attrs.get(key) is not None]

        if not values:
            merged[key] = None
        elif all(value == values[0] for value in values):
            # Also covers the single-value case.
            merged[key] = values[0]
        else:
            merged[key] = _resolve_conflict(key, values)

    return merged
|
||||
@@ -0,0 +1,81 @@
|
||||
"""Normaliza el nombre de una entidad para comparacion y deduplicacion."""
|
||||
|
||||
import re
|
||||
|
||||
|
||||
# Honorific or rank at the very start of a person name, followed by whitespace.
_TITLES = re.compile(
    r"^\b(?:Dr|Mr|Mrs|Ms|Miss|Prof|Sr|Jr|Ing|Lic|Gen|Col|Maj|Capt|Sgt|Rev|Hon)\.?\s+",
    re.IGNORECASE,
)

# A single legal-form / generic corporate word at the very end of a name.
_LEGAL_SUFFIXES = re.compile(
    r"\b(?:Inc|LLC|Ltd|Corp|Co|S\.?A|GmbH|B\.?V|N\.?V|PLC|AG|SRL|S\.?L|Pty|"
    r"LP|LLP|LLLP|PC|PA|PLLC|Foundation|Group|Holdings|Enterprises?|"
    r"International|Industries|Services?|Solutions?|Systems?|Technologies?)\.?\s*$",
    re.IGNORECASE,
)

_MULTI_SPACE = re.compile(r"\s+")


def normalize_entity_name(name: str, entity_type: str = "") -> str:
    """Normalize an entity name for comparison and deduplication.

    The rule depends on ``entity_type`` (matched case-insensitively):

    - ip / email: lower-cased
    - domain: lower-cased, trailing dots and a leading ``www.`` removed
    - crypto_wallet: only surrounding whitespace removed (case is significant)
    - phone: everything except digits and ``+`` removed
    - person: leading title removed, "Last, First" reordered to "First Last",
      whitespace collapsed, title-cased
    - organization: one trailing legal suffix (and a comma left dangling
      before it) removed, whitespace collapsed, title-cased. If removing the
      suffix would leave nothing (the name *is* a suffix word such as
      "Group"), the name is kept so distinct organizations do not all
      collapse to the empty string.
    - anything else: lower-cased with whitespace collapsed

    Args:
        name: Entity name to normalize.
        entity_type: Entity type (ip, email, domain, crypto_wallet, phone,
            person, organization). Empty selects the default rule.

    Returns:
        The normalized name.
    """
    name = name.strip()
    kind = entity_type.lower().strip()

    if kind in ("ip", "email"):
        return name.lower()

    if kind == "domain":
        domain = name.lower().rstrip(".")
        return domain[4:] if domain.startswith("www.") else domain

    if kind == "crypto_wallet":
        # Bitcoin addresses are case-sensitive, so only the strip applies.
        return name

    if kind == "phone":
        return re.sub(r"[^\d+]", "", name)

    if kind == "person":
        person = _TITLES.sub("", name).strip()
        if "," in person:
            # "Last, First" -> "First Last"
            last, first = (part.strip() for part in person.split(",", 1))
            person = f"{first} {last}"
        return _MULTI_SPACE.sub(" ", person).strip().title()

    if kind == "organization":
        # "Acme, Inc." -> "Acme," -> "Acme": drop the comma the suffix leaves behind.
        core = _LEGAL_SUFFIXES.sub("", name).strip().rstrip(",").strip()
        # Never normalize a non-empty name down to "".
        return _MULTI_SPACE.sub(" ", core or name).strip().title()

    return _MULTI_SPACE.sub(" ", name.lower()).strip()
|
||||
@@ -0,0 +1,35 @@
|
||||
"""RelationCandidate — candidato de relacion extraido por el LLM."""
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
class RelationCandidate:
    """A relation between two entities as proposed by the LLM.

    ``from_name`` / ``to_name`` carry the raw names found in the text, while
    ``from_id`` / ``to_id`` stay empty until the deduplication phase resolves
    them against the final EntityCandidate objects.
    """

    from_name: str
    to_name: str
    from_id: str = ""
    to_id: str = ""
    relation_type: str = ""
    description: str = ""
    confidence: float = 0.0
    source_chunk_index: int = -1

    def to_dict(self) -> dict:
        """Return the candidate as a plain dictionary."""
        exported = (
            "from_name",
            "to_name",
            "from_id",
            "to_id",
            "relation_type",
            "description",
            "confidence",
            "source_chunk_index",
        )
        return {attr: getattr(self, attr) for attr in exported}
|
||||
@@ -0,0 +1,234 @@
|
||||
"""Renderiza un grafo sigma.js como HTML standalone con dark theme y layout ForceAtlas2."""
|
||||
|
||||
import json
|
||||
import os
|
||||
|
||||
|
||||
# Standalone page template. It is filled with str.format, so literal braces in
# the CSS/JS are doubled; the only placeholders are {title} and {json_data}.
_HTML_TEMPLATE = """\
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>{title}</title>
<script src="https://cdn.jsdelivr.net/npm/graphology@0.25.4/dist/graphology.umd.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/graphology-library@0.8.0/dist/graphology-library.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/sigma@2.4.0/build/sigma.min.js"></script>
<style>
  * {{ box-sizing: border-box; margin: 0; padding: 0; }}
  body {{ background: #1a1a2e; color: #eee; font-family: 'Segoe UI', system-ui, sans-serif; overflow: hidden; }}
  #container {{ width: 100vw; height: 100vh; }}
  #panel {{
    position: absolute; top: 12px; right: 12px;
    background: rgba(10, 10, 30, 0.88);
    border: 1px solid rgba(255,255,255,0.12);
    padding: 16px; border-radius: 10px;
    z-index: 10; min-width: 200px; max-width: 260px;
    backdrop-filter: blur(6px);
  }}
  #panel h3 {{ font-size: 14px; font-weight: 600; margin-bottom: 12px; color: #a0c4ff; letter-spacing: 0.5px; }}
  #stats {{ font-size: 11px; color: #888; margin-bottom: 12px; }}
  #filters {{ display: flex; flex-direction: column; gap: 6px; }}
  .filter-item {{ display: flex; align-items: center; gap: 8px; font-size: 12px; cursor: pointer; }}
  .filter-item input {{ cursor: pointer; accent-color: #a0c4ff; }}
  .color-dot {{ width: 10px; height: 10px; border-radius: 50%; flex-shrink: 0; }}
  #tooltip {{
    position: absolute; display: none;
    background: rgba(5, 5, 20, 0.95);
    border: 1px solid rgba(255,255,255,0.15);
    padding: 10px 14px; border-radius: 8px;
    pointer-events: none; z-index: 20;
    max-width: 300px; font-size: 12px; line-height: 1.6;
  }}
  #tooltip .tt-title {{ font-weight: 600; color: #a0c4ff; margin-bottom: 6px; font-size: 13px; }}
  #tooltip .tt-row {{ display: flex; gap: 6px; }}
  #tooltip .tt-key {{ color: #888; min-width: 80px; }}
  #tooltip .tt-val {{ color: #eee; word-break: break-all; }}
</style>
</head>
<body>
<div id="container"></div>
<div id="panel">
  <h3>{title}</h3>
  <div id="stats"></div>
  <div id="filters"></div>
</div>
<div id="tooltip"></div>

<script>
(function () {{
  const graphData = {json_data};

  // ── Build graphology graph ──────────────────────────────────────────────
  const Graph = graphology.Graph || graphology;
  const g = new Graph({{ multi: true, type: 'directed' }});

  // Assign random initial positions
  graphData.nodes.forEach(function (n) {{
    g.addNode(n.key, Object.assign({{
      x: (Math.random() - 0.5) * 10,
      y: (Math.random() - 0.5) * 10,
    }}, n.attributes));
  }});

  graphData.edges.forEach(function (e) {{
    try {{
      g.addEdgeWithKey(e.key, e.source, e.target, e.attributes || {{}});
    }} catch (err) {{
      // skip duplicate edge keys gracefully
    }}
  }});

  // ── ForceAtlas2 layout (synchronous, 500 iterations) ───────────────────
  const FA2 = graphologyLibrary.layoutForceAtlas2;
  FA2.assign(g, {{
    iterations: 500,
    settings: {{
      gravity: 1,
      scalingRatio: 2,
      slowDown: 5,
      barnesHutOptimize: g.order > 300,
    }},
  }});

  // ── Sigma renderer ──────────────────────────────────────────────────────
  const renderer = new Sigma(g, document.getElementById('container'), {{
    renderEdgeLabels: false,
    defaultEdgeColor: '#444',
    defaultNodeColor: '#95a5a6',
    labelColor: {{ color: '#ccc' }},
    labelSize: 11,
    edgeReducer: function (edge, data) {{
      return Object.assign({{}}, data, {{ size: Math.max(1, (data.weight || 1) * 0.8) }});
    }},
  }});

  // ── Stats panel ─────────────────────────────────────────────────────────
  document.getElementById('stats').textContent =
    graphData.nodes.length + ' nodes · ' + graphData.edges.length + ' edges';

  // ── Filter panel by node type ───────────────────────────────────────────
  const typeColors = {{}};
  graphData.nodes.forEach(function (n) {{
    // Nodes may come without an attributes object (addNode above allows it).
    const attrs = n.attributes || {{}};
    const t = attrs.entity_type || 'unknown';
    typeColors[t] = attrs.color || '#95a5a6';
  }});

  const hiddenTypes = new Set();
  const filtersDiv = document.getElementById('filters');

  Object.keys(typeColors).sort().forEach(function (type) {{
    const color = typeColors[type];
    const label = document.createElement('label');
    label.className = 'filter-item';

    const cb = document.createElement('input');
    cb.type = 'checkbox';
    cb.checked = true;
    cb.addEventListener('change', function () {{
      if (cb.checked) hiddenTypes.delete(type);
      else hiddenTypes.add(type);
      renderer.refresh();
    }});

    const dot = document.createElement('span');
    dot.className = 'color-dot';
    dot.style.background = color;

    label.appendChild(cb);
    label.appendChild(dot);
    label.appendChild(document.createTextNode(type));
    filtersDiv.appendChild(label);
  }});

  // Node reducer applies type filter. Untyped nodes are listed as 'unknown'
  // in the panel, so they must be matched under the same name here.
  renderer.setSetting('nodeReducer', function (node, data) {{
    if (hiddenTypes.has(data.entity_type || 'unknown')) return Object.assign({{}}, data, {{ hidden: true }});
    return data;
  }});

  // ── Tooltip on hover ────────────────────────────────────────────────────
  const tooltip = document.getElementById('tooltip');

  renderer.on('enterNode', function (ref) {{
    const nodeAttrs = g.getNodeAttributes(ref.node);
    const reserved = new Set(['x', 'y', 'size', 'color', 'label', 'type', 'hidden']);

    let html = '<div class="tt-title">' + escHtml(nodeAttrs.label || ref.node) + '</div>';
    html += '<div class="tt-row"><span class="tt-key">type</span><span class="tt-val">' + escHtml(nodeAttrs.entity_type || '') + '</span></div>';
    html += '<div class="tt-row"><span class="tt-key">status</span><span class="tt-val">' + escHtml(nodeAttrs.status || '') + '</span></div>';
    html += '<div class="tt-row"><span class="tt-key">domain</span><span class="tt-val">' + escHtml(nodeAttrs.domain || '') + '</span></div>';

    Object.keys(nodeAttrs).sort().forEach(function (k) {{
      if (!reserved.has(k) && !['status', 'domain', 'type', 'label'].includes(k)) {{
        html += '<div class="tt-row"><span class="tt-key">' + escHtml(k) + '</span><span class="tt-val">' + escHtml(String(nodeAttrs[k])) + '</span></div>';
      }}
    }});

    tooltip.innerHTML = html;
    tooltip.style.display = 'block';
  }});

  renderer.on('leaveNode', function () {{
    tooltip.style.display = 'none';
  }});

  document.getElementById('container').addEventListener('mousemove', function (e) {{
    tooltip.style.left = (e.clientX + 16) + 'px';
    tooltip.style.top = (e.clientY + 16) + 'px';
  }});

  // Escape text before it is concatenated into tooltip.innerHTML.
  function escHtml(str) {{
    return String(str)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;');
  }}
}})();
</script>
</body>
</html>
"""

# Characters that must not appear raw inside an inline <script>: "<" could
# start "</script>" or "<!--" and end the script early. In JSON they can only
# occur inside strings, where the \\uXXXX escape is an equivalent spelling.
_SCRIPT_UNSAFE = (
    ("<", "\\u003c"),
    (">", "\\u003e"),
    ("&", "\\u0026"),
    ("\u2028", "\\u2028"),
    ("\u2029", "\\u2029"),
)


def render_sigma_html(
    graph_data: dict,
    output_path: str,
    title: str = "OSINT Graph",
) -> str:
    """Write a standalone sigma.js HTML page that visualizes the graph.

    The graph dict is embedded in the page as JSON; in the browser the page
    runs ForceAtlas2 (500 synchronous iterations) and renders with sigma.js
    v2.4, with a dark theme, a per-node-type filter panel and a hover tooltip
    showing node attributes.

    Graph content and the title are treated as untrusted: the title is
    HTML-escaped and the embedded JSON is escaped so that no string inside it
    can terminate the surrounding ``<script>`` element.

    Args:
        graph_data: Dict with 'nodes' and 'edges' keys in graphology/sigma format.
        output_path: Path of the HTML file to write; missing parent
            directories are created.
        title: Graph title shown in the panel and in the browser tab.

    Returns:
        Absolute path of the written HTML file.

    Raises:
        Exception: If the target directory or file cannot be written.
    """
    # Function-scope import: ``html`` is not among the module-level imports.
    from html import escape

    json_data = json.dumps(graph_data, ensure_ascii=False)
    for raw, safe in _SCRIPT_UNSAFE:
        json_data = json_data.replace(raw, safe)

    page = _HTML_TEMPLATE.format(title=escape(title), json_data=json_data)

    abs_path = os.path.abspath(output_path)

    try:
        # Creating the directory is part of "writing the file": its failures
        # are reported through the same wrapped error.
        os.makedirs(os.path.dirname(abs_path) or ".", exist_ok=True)
        with open(abs_path, "w", encoding="utf-8") as f:
            f.write(page)
    except OSError as exc:
        raise Exception(f"render_sigma_html: no se pudo escribir '{abs_path}': {exc}") from exc

    return abs_path
|
||||
@@ -0,0 +1,66 @@
|
||||
"""Split text into overlapping chunks with sentence-boundary awareness."""
|
||||
|
||||
|
||||
def split_text_into_chunks(
    text: str, chunk_size: int = 500, overlap: int = 50
) -> list[str]:
    """Split text into overlapping chunks, preferring sentence boundaries.

    Each window spans at most ``chunk_size`` characters. When the window does
    not reach the end of the text, it is cut after the last sentence separator
    it contains, provided that cut lies beyond the first 30% of the window.
    The next window starts ``overlap`` characters before the previous cut.
    Chunks are stripped of surrounding whitespace and empty ones are dropped.

    The loop always moves forward: if the overlap would place the next window
    at or before the current start (overlap not smaller than the chunk just
    emitted), the next window starts at the cut instead. Once a window reaches
    the end of the text no further chunk is produced, so the tail is never
    repeated as a chunk consisting only of overlap.

    Args:
        text: Text to split.
        chunk_size: Maximum size of each chunk in characters; must be positive.
        overlap: Number of characters shared by consecutive chunks. Negative
            values are treated as 0.

    Returns:
        List of chunks. Empty if the text is empty or only whitespace.

    Raises:
        ValueError: If ``chunk_size`` is not positive and ``text`` is non-empty.
    """
    if not text:
        return []
    if chunk_size <= 0:
        # A non-positive window can never consume the text.
        raise ValueError("chunk_size must be a positive integer")

    if len(text) <= chunk_size:
        stripped = text.strip()
        return [stripped] if stripped else []

    separators = ("。", "!", "?", ".\n", "!\n", "?\n", "\n\n", ". ", "! ", "? ")
    overlap = max(overlap, 0)
    text_len = len(text)

    chunks: list[str] = []
    start = 0

    while start < text_len:
        end = start + chunk_size

        if end < text_len:
            # Latest position right after any separator inside text[start:end].
            cut = -1
            for sep in separators:
                found = text.rfind(sep, start, end)
                if found != -1:
                    cut = max(cut, found + len(sep))
            # Reject cuts in the first 30% of the window: they would produce
            # a needlessly short chunk.
            if cut > start + int(chunk_size * 0.30):
                end = cut

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        if end >= text_len:
            # The window covered the rest of the text; stepping back by the
            # overlap would only re-emit characters already in this chunk.
            break

        next_start = end - overlap
        # Guarantee forward progress; otherwise a large overlap loops forever.
        start = next_start if next_start > start else end

    return chunks
|
||||
Reference in New Issue
Block a user