chore: initial sync

2026-04-28 22:13:08 +02:00
commit 40bea81603
30 changed files with 6675 additions and 0 deletions
@@ -0,0 +1,43 @@
+"""Genera la seccion del system prompt que describe los entity types disponibles para extraccion."""
+
+
+def build_entity_schema_prompt(entity_presets: list[dict]) -> str:
+    """Genera texto legible para el LLM describiendo los entity types disponibles.
+
+    Formatea los presets del registry en una seccion del system prompt que indica
+    al LLM que tipos de entidades puede extraer y que atributos tiene cada uno.
+
+    Args:
+        entity_presets: Lista de presets con campos 'label', 'type_ref' y
+                        opcionalmente 'metadata_fields'. Ejemplo:
+                        [{"type_ref": "osint_person_go_cybersecurity",
+                          "label": "Person",
+                          "metadata_fields": ["full_name", "alias"]}]
+
+    Returns:
+        String formateado con la seccion del prompt. Retorna string vacio si
+        la lista de presets esta vacia.
+    """
+    if not entity_presets:
+        return ""
+
+    lines = ["Entity types available for extraction:", ""]
+
+    for i, preset in enumerate(entity_presets, start=1):
+        label = preset.get("label", "Unknown")
+        type_ref = preset.get("type_ref", "")
+        metadata_fields = preset.get("metadata_fields", [])
+
+        lines.append(f"{i}. {label} (type_ref: {type_ref})")
+
+        if metadata_fields:
+            attrs = ", ".join(metadata_fields)
+            lines.append(f"   Attributes: {attrs}")
+
+        lines.append("")
+
+    # Remove trailing blank line
+    if lines and lines[-1] == "":
+        lines.pop()
+
+    return "\n".join(lines)
@@ -0,0 +1,22 @@
+"""Genera la seccion del system prompt con los tipos de relacion permitidos."""
+
+
+def build_relation_schema_prompt(relation_types: list[str]) -> str:
+    """Genera texto legible para el LLM describiendo los tipos de relacion permitidos.
+
+    Formatea la lista de tipos de relacion en una seccion del system prompt que
+    indica al LLM que relaciones puede extraer entre entidades.
+
+    Args:
+        relation_types: Lista de strings con los tipos de relacion permitidos.
+                        Ejemplo: ["funds", "employs", "communicates_with"]
+
+    Returns:
+        String formateado con la seccion del prompt. Retorna string vacio si
+        la lista esta vacia.
+    """
+    if not relation_types:
+        return ""
+
+    joined = ", ".join(relation_types)
+    return f"Allowed relation types:\n{joined}"
@@ -0,0 +1,814 @@
+"""Core functional programming utilities — pure functions for list/collection operations."""
+
+import hashlib
+import re
+from functools import reduce as _reduce
+from typing import Any, Callable, Dict, List, Optional, Tuple
+
+
+def filter_list(xs: list, pred: Callable) -> list:
+    """Filter list by predicate. Does not mutate the original."""
+    return [x for x in xs if pred(x)]
+
+
+def map_list(xs: list, fn: Callable) -> list:
+    """Map function over list. Does not mutate the original."""
+    return [fn(x) for x in xs]
+
+
+def reduce_list(xs: list, initial: Any, fn: Callable) -> Any:
+    """Reduce list with accumulator. fn(acc, x) -> acc."""
+    return _reduce(fn, xs, initial)
+
+
+def flat_map(xs: list, fn: Callable) -> list:
+    """Map function over list then flatten one level."""
+    result = []
+    for x in xs:
+        result.extend(fn(x))
+    return result
+
+
+def flatten(xss: list) -> list:
+    """Flatten a list of lists one level."""
+    result = []
+    for xs in xss:
+        result.extend(xs)
+    return result
+
+
+def chunk(xs: list, size: int) -> list:
+    """Split list into chunks of given size. Last chunk may be smaller."""
+    if size <= 0:
+        return []
+    return [xs[i : i + size] for i in range(0, len(xs), size)]
+
+
+def take(xs: list, n: int) -> list:
+    """Take first n elements from list."""
+    return xs[:n]
+
+
+def drop(xs: list, n: int) -> list:
+    """Drop first n elements from list."""
+    return xs[n:]
+
+
+def unique(xs: list) -> list:
+    """Remove duplicates preserving order. Uses identity for hashable elements."""
+    seen = set()
+    result = []
+    for x in xs:
+        if x not in seen:
+            seen.add(x)
+            result.append(x)
+    return result
+
+
+def group_by(xs: list, key_fn: Callable) -> Dict:
+    """Group elements by key function. Returns dict of key -> list."""
+    groups: Dict = {}
+    for x in xs:
+        k = key_fn(x)
+        if k not in groups:
+            groups[k] = []
+        groups[k].append(x)
+    return groups
+
+
+def partition(xs: list, pred: Callable) -> Tuple[list, list]:
+    """Split list into (matches, non_matches) based on predicate."""
+    matches = []
+    non_matches = []
+    for x in xs:
+        if pred(x):
+            matches.append(x)
+        else:
+            non_matches.append(x)
+    return (matches, non_matches)
+
+
+def find(xs: list, pred: Callable) -> Any:
+    """Find first element matching predicate. Returns None if not found."""
+    for x in xs:
+        if pred(x):
+            return x
+    return None
+
+
+def find_index(xs: list, pred: Callable) -> int:
+    """Find index of first element matching predicate. Returns -1 if not found."""
+    for i, x in enumerate(xs):
+        if pred(x):
+            return i
+    return -1
+
+
+def zip_with(xs: list, ys: list, fn: Callable) -> list:
+    """Zip two lists with a combining function. Stops at shorter list."""
+    return [fn(x, y) for x, y in zip(xs, ys)]
+
+
+def all_of(xs: list, pred: Callable) -> bool:
+    """Return True if all elements match predicate."""
+    return all(pred(x) for x in xs)
+
+
+def any_of(xs: list, pred: Callable) -> bool:
+    """Return True if any element matches predicate."""
+    return any(pred(x) for x in xs)
+
+
+def pipe(value: Any, *fns: Callable) -> Any:
+    """Pipe a value through a sequence of functions left-to-right."""
+    result = value
+    for fn in fns:
+        result = fn(result)
+    return result
+
+
+def compose(*fns: Callable) -> Callable:
+    """Compose functions right-to-left. compose(f, g)(x) == f(g(x))."""
+    def composed(x: Any) -> Any:
+        result = x
+        for fn in reversed(fns):
+            result = fn(result)
+        return result
+    return composed
+
+
+# ── Tree manipulation ────────────────────────────────────────────────────────
+
+
+def flatten_tree(structure: Any) -> List[Dict]:
+    """Flatten a hierarchical tree (dict with 'nodes') to a list without children."""
+    import copy
+    if isinstance(structure, dict):
+        node = copy.deepcopy(structure)
+        node.pop('nodes', None)
+        nodes = [node]
+        for key in list(structure.keys()):
+            if 'nodes' in key:
+                nodes.extend(flatten_tree(structure[key]))
+        return nodes
+    elif isinstance(structure, list):
+        nodes = []
+        for item in structure:
+            nodes.extend(flatten_tree(item))
+        return nodes
+    return []
+
+
+def tree_to_flat_list(structure: Any) -> List[Dict]:
+    """Convert hierarchical tree to flat list preserving DFS order (keeps internal nodes)."""
+    if isinstance(structure, dict):
+        nodes = [structure]
+        if 'nodes' in structure:
+            nodes.extend(tree_to_flat_list(structure['nodes']))
+        return nodes
+    elif isinstance(structure, list):
+        nodes = []
+        for item in structure:
+            nodes.extend(tree_to_flat_list(item))
+        return nodes
+    return []
+
+
+def get_leaf_nodes(structure: Any) -> List[Dict]:
+    """Extract only leaf nodes (no children) from a hierarchical tree."""
+    import copy
+    if isinstance(structure, dict):
+        if not structure.get('nodes'):
+            node = copy.deepcopy(structure)
+            node.pop('nodes', None)
+            return [node]
+        leaf_nodes = []
+        for key in list(structure.keys()):
+            if 'nodes' in key:
+                leaf_nodes.extend(get_leaf_nodes(structure[key]))
+        return leaf_nodes
+    elif isinstance(structure, list):
+        leaf_nodes = []
+        for item in structure:
+            leaf_nodes.extend(get_leaf_nodes(item))
+        return leaf_nodes
+    return []
+
+
+def write_node_ids(data: Any, node_id: int = 0) -> int:
+    """Assign sequential zero-padded IDs (0001, 0002...) to all nodes in a tree. Returns next counter."""
+    if isinstance(data, dict):
+        data['node_id'] = str(node_id).zfill(4)
+        node_id += 1
+        for key in list(data.keys()):
+            if 'nodes' in key:
+                node_id = write_node_ids(data[key], node_id)
+    elif isinstance(data, list):
+        for item in data:
+            node_id = write_node_ids(item, node_id)
+    return node_id
+
+
+def list_to_tree(data: List[Dict]) -> List[Dict]:
+    """Convert flat list with structure codes ('1.2.3') to nested tree."""
+    def get_parent_structure(structure):
+        if not structure:
+            return None
+        parts = str(structure).split('.')
+        return '.'.join(parts[:-1]) if len(parts) > 1 else None
+
+    nodes = {}
+    root_nodes = []
+
+    for item in data:
+        structure = item.get('structure')
+        node = {
+            'title': item.get('title'),
+            'start_index': item.get('start_index'),
+            'end_index': item.get('end_index'),
+            'nodes': []
+        }
+        nodes[structure] = node
+        parent_structure = get_parent_structure(structure)
+
+        if parent_structure and parent_structure in nodes:
+            nodes[parent_structure]['nodes'].append(node)
+        else:
+            root_nodes.append(node)
+
+    def clean_node(node):
+        if not node['nodes']:
+            del node['nodes']
+        else:
+            for child in node['nodes']:
+                clean_node(child)
+        return node
+
+    return [clean_node(node) for node in root_nodes]
+
+
+def remove_tree_fields(data: Any, fields: List[str] = None) -> Any:
+    """Recursively remove specified fields from a tree (dict/list)."""
+    if fields is None:
+        fields = ['text']
+    if isinstance(data, dict):
+        return {k: remove_tree_fields(v, fields) for k, v in data.items() if k not in fields}
+    elif isinstance(data, list):
+        return [remove_tree_fields(item, fields) for item in data]
+    return data
+
+
+def format_tree_structure(structure: Any, order: List[str] = None) -> Any:
+    """Reorder fields of each node in a tree according to specified key order."""
+    if not order:
+        return structure
+    if isinstance(structure, dict):
+        if 'nodes' in structure:
+            structure['nodes'] = format_tree_structure(structure['nodes'], order)
+        if not structure.get('nodes'):
+            structure.pop('nodes', None)
+        return {key: structure[key] for key in order if key in structure}
+    elif isinstance(structure, list):
+        return [format_tree_structure(item, order) for item in structure]
+    return structure
+
+
+def create_node_mapping(tree: List[Dict]) -> Dict[str, Dict]:
+    """Create flat dict mapping node_id to node for O(1) lookup."""
+    mapping = {}
+    def _traverse(nodes):
+        for node in nodes:
+            if node.get('node_id'):
+                mapping[node['node_id']] = node
+            if node.get('nodes'):
+                _traverse(node['nodes'])
+    _traverse(tree)
+    return mapping
+
+
+# ── Text / JSON extraction ───────────────────────────────────────────────────
+
+
+def extract_json_from_llm(content: str) -> Dict:
+    """Extract and parse JSON from LLM responses. Handles ```json blocks, trailing commas, None->null."""
+    import json
+    try:
+        start_idx = content.find("```json")
+        if start_idx != -1:
+            start_idx += 7
+            end_idx = content.rfind("```")
+            json_content = content[start_idx:end_idx].strip()
+        else:
+            json_content = content.strip()
+
+        json_content = json_content.replace('None', 'null')
+        json_content = json_content.replace('\n', ' ').replace('\r', ' ')
+        json_content = ' '.join(json_content.split())
+
+        return json.loads(json_content)
+    except (json.JSONDecodeError, Exception):
+        try:
+            json_content = json_content.replace(',]', ']').replace(',}', '}')
+            return json.loads(json_content)
+        except Exception:
+            return {}
+
+
+def parse_page_range(pages: str) -> List[int]:
+    """Parse page range string ('5-7', '3,8', '12') into sorted list of unique ints."""
+    result = []
+    for part in pages.split(','):
+        part = part.strip()
+        if '-' in part:
+            start, end = int(part.split('-', 1)[0].strip()), int(part.split('-', 1)[1].strip())
+            if start > end:
+                raise ValueError(f"Invalid range '{part}': start must be <= end")
+            result.extend(range(start, end + 1))
+        else:
+            result.append(int(part))
+    return sorted(set(result))
+
+
+# ── Markdown parsing ─────────────────────────────────────────────────────────
+
+
+def extract_markdown_headers(markdown_content: str) -> Tuple[List[Dict], List[str]]:
+    """Extract all headers (h1-h6) from markdown with line numbers, skipping code blocks."""
+    import re
+    header_pattern = r'^(#{1,6})\s+(.+)$'
+    code_block_pattern = r'^```'
+    node_list = []
+    lines = markdown_content.split('\n')
+    in_code_block = False
+
+    for line_num, line in enumerate(lines, 1):
+        stripped_line = line.strip()
+        if re.match(code_block_pattern, stripped_line):
+            in_code_block = not in_code_block
+            continue
+        if not stripped_line:
+            continue
+        if not in_code_block:
+            match = re.match(header_pattern, stripped_line)
+            if match:
+                level = len(match.group(1))
+                title = match.group(2).strip()
+                node_list.append({'title': title, 'level': level, 'line_num': line_num})
+
+    return node_list, lines
+
+
+def build_tree_from_headers(node_list: List[Dict]) -> List[Dict]:
+    """Build nested tree from flat list of headers with levels (h1>h2>h3)."""
+    if not node_list:
+        return []
+
+    stack = []
+    root_nodes = []
+    node_counter = 1
+
+    for node in node_list:
+        current_level = node['level']
+        tree_node = {
+            'title': node['title'],
+            'node_id': str(node_counter).zfill(4),
+            'line_num': node['line_num'],
+            'nodes': []
+        }
+        node_counter += 1
+
+        while stack and stack[-1][1] >= current_level:
+            stack.pop()
+
+        if not stack:
+            root_nodes.append(tree_node)
+        else:
+            parent_node, _ = stack[-1]
+            parent_node['nodes'].append(tree_node)
+
+        stack.append((tree_node, current_level))
+
+    def clean_empty_nodes(nodes):
+        for n in nodes:
+            if n['nodes']:
+                clean_empty_nodes(n['nodes'])
+            else:
+                del n['nodes']
+        return nodes
+
+    return clean_empty_nodes(root_nodes)
+
+
+# ── Pagination / chunking ────────────────────────────────────────────────────
+
+
+def page_list_to_groups(page_contents: List[str], token_lengths: List[int],
+                        max_tokens: int = 20000, overlap_pages: int = 1) -> List[str]:
+    """Group pages into text chunks respecting token limit with configurable overlap."""
+    import math
+    num_tokens = sum(token_lengths)
+
+    if num_tokens <= max_tokens:
+        return ["".join(page_contents)]
+
+    subsets = []
+    current_subset = []
+    current_token_count = 0
+
+    expected_parts = math.ceil(num_tokens / max_tokens)
+    avg_tokens = math.ceil(((num_tokens / expected_parts) + max_tokens) / 2)
+
+    for i, (page_content, page_tokens) in enumerate(zip(page_contents, token_lengths)):
+        if current_token_count + page_tokens > avg_tokens:
+            subsets.append(''.join(current_subset))
+            overlap_start = max(i - overlap_pages, 0)
+            current_subset = list(page_contents[overlap_start:i])
+            current_token_count = sum(token_lengths[overlap_start:i])
+
+        current_subset.append(page_content)
+        current_token_count += page_tokens
+
+    if current_subset:
+        subsets.append(''.join(current_subset))
+
+    return subsets
+
+
+def calculate_page_offset(pairs: List[Dict]) -> int:
+    """Calculate offset between logical page numbers and physical indices using reference pairs."""
+    differences = []
+    for pair in pairs:
+        try:
+            difference = pair['physical_index'] - pair['page']
+            differences.append(difference)
+        except (KeyError, TypeError):
+            continue
+
+    if not differences:
+        return 0
+
+    counts: Dict[int, int] = {}
+    for diff in differences:
+        counts[diff] = counts.get(diff, 0) + 1
+
+    return max(counts.items(), key=lambda x: x[1])[0]
+
+
+# ── Text preprocessing ───────────────────────────────────────────────────────
+
+
+def preprocess_text(text: str) -> str:
+    """Normalize whitespace and newlines in raw text.
+
+    Args:
+        text: Raw text to normalize.
+
+    Returns:
+        Normalized text with consistent newlines, stripped lines, and no
+        excessive blank lines.
+    """
+    # Normalize line endings: \r\n and \r -> \n
+    text = text.replace('\r\n', '\n').replace('\r', '\n')
+    # Reduce 3+ consecutive newlines to at most 2
+    text = re.sub(r'\n{3,}', '\n\n', text)
+    # Strip whitespace from each line
+    text = '\n'.join(line.strip() for line in text.split('\n'))
+    # Strip globally
+    return text.strip()
+
+
+def get_text_stats(text: str) -> dict:
+    """Compute basic statistics of a text: characters, lines, words.
+
+    Args:
+        text: Input text to analyze.
+
+    Returns:
+        Dict with keys total_chars (int), total_lines (int), total_words (int).
+    """
+    return {
+        'total_chars': len(text),
+        'total_lines': text.count('\n') + 1,
+        'total_words': len(text.split()),
+    }
+
+
+# ── Git URL parsing ──────────────────────────────────────────────────────────
+
+# Hostnames accepted by the git URL helpers below when the caller does not
+# pass an explicit `known_hosts` list.
+_DEFAULT_GIT_HOSTS = ["github.com", "gitlab.com"]
+
+
+def _sanitize_git_segment(segment: str) -> str:
+    """Strip .git suffix then keep only [a-zA-Z0-9_-] chars."""
+    if segment.endswith(".git"):
+        segment = segment[:-4]
+    return re.sub(r"[^a-zA-Z0-9_\-]", "", segment)
+
+
+def parse_git_url(url: str, known_hosts: Optional[List[str]] = None) -> Optional[str]:
+    """Parse a code-hosting URL and return the 'org/repo' path component.
+
+    Supports HTTPS, HTTP, git://, ssh:// and SSH shorthand (git@host:path).
+    Returns None if the URL does not match any known host or is malformed.
+
+    Args:
+        url: Repository URL in any supported format.
+        known_hosts: List of accepted hostnames. Defaults to github.com and gitlab.com.
+
+    Returns:
+        'org/repo' string or None.
+    """
+    from urllib.parse import urlparse
+
+    hosts = known_hosts if known_hosts is not None else _DEFAULT_GIT_HOSTS
+    url = url.strip()
+
+    if url.startswith("git@"):
+        # git@github.com:org/repo.git
+        rest = url[len("git@"):]
+        if ":" not in rest:
+            return None
+        host, path = rest.split(":", 1)
+        if host not in hosts:
+            return None
+        segments = [s for s in path.split("/") if s]
+        if len(segments) < 2:
+            return None
+        org = _sanitize_git_segment(segments[0])
+        repo = _sanitize_git_segment(segments[1])
+        if not org or not repo:
+            return None
+        return f"{org}/{repo}"
+
+    for prefix in ("http://", "https://", "git://", "ssh://"):
+        if url.startswith(prefix):
+            parsed = urlparse(url)
+            netloc = parsed.hostname or ""
+            if netloc not in hosts:
+                return None
+            segments = [s for s in parsed.path.split("/") if s]
+            if len(segments) < 2:
+                return None
+            org = _sanitize_git_segment(segments[0])
+            repo = _sanitize_git_segment(segments[1])
+            if not org or not repo:
+                return None
+            return f"{org}/{repo}"
+
+    return None
+
+
+def is_git_repo_url(url: str, known_hosts: Optional[List[str]] = None) -> bool:
+    """Return True only if url points to a clonable git repository.
+
+    Accepts org/repo and org/repo/tree/<ref> paths.
+    Rejects paths that navigate to sub-resources (issues, blobs, PRs, etc.).
+
+    Args:
+        url: URL to verify.
+        known_hosts: Accepted hostnames. Defaults to github.com and gitlab.com.
+
+    Returns:
+        True if url is a clonable repository URL.
+    """
+    from urllib.parse import urlparse
+
+    hosts = known_hosts if known_hosts is not None else _DEFAULT_GIT_HOSTS
+    url = url.strip()
+
+    # SSH shorthand — always repo-level if host matches
+    if url.startswith("git@"):
+        rest = url[len("git@"):]
+        if ":" not in rest:
+            return False
+        host, _ = rest.split(":", 1)
+        return host in hosts
+
+    # git:// and ssh:// — always repo-level if host matches
+    for prefix in ("ssh://", "git://"):
+        if url.startswith(prefix):
+            parsed = urlparse(url)
+            return (parsed.hostname or "") in hosts
+
+    # http:// and https:// — must have exactly org/repo or org/repo/tree/<ref>
+    for prefix in ("http://", "https://"):
+        if url.startswith(prefix):
+            parsed = urlparse(url)
+            if (parsed.hostname or "") not in hosts:
+                return False
+            segments = [s for s in parsed.path.split("/") if s]
+            if len(segments) == 2:
+                return True
+            if len(segments) == 4 and segments[2] == "tree":
+                return True
+            return False
+
+    return False
+
+
+def validate_git_ssh_uri(url: str) -> None:
+    """Validate a git SSH URI of the form git@host:path.
+
+    Raises ValueError with a descriptive message if the URI is malformed.
+
+    Args:
+        url: URI string to validate.
+
+    Raises:
+        ValueError: If the URI does not conform to git SSH format.
+    """
+    if not url.startswith("git@"):
+        raise ValueError(f"git SSH URI must start with 'git@', got: {url!r}")
+    rest = url[len("git@"):]
+    if ":" not in rest:
+        raise ValueError(f"git SSH URI must contain ':', got: {url!r}")
+    _, path = rest.split(":", 1)
+    if not path:
+        raise ValueError(f"git SSH URI must have a non-empty path after ':', got: {url!r}")
+
+
+# ---------------------------------------------------------------------------
+# Markdown parsing utilities
+# ---------------------------------------------------------------------------
+
+
+def extract_frontmatter(content: str) -> Tuple[str, Optional[Dict]]:
+    """Extract YAML frontmatter delimited by '---' from the start of a markdown string.
+
+    Args:
+        content: Raw markdown string, optionally starting with YAML frontmatter.
+
+    Returns:
+        Tuple of (content_without_frontmatter, frontmatter_dict).
+        frontmatter_dict is None when no frontmatter is found.
+    """
+    pattern = re.compile(r'^---\n(.*?)\n---\n', re.DOTALL)
+    match = pattern.match(content)
+    if not match:
+        return content, None
+
+    raw = match.group(1)
+    remaining = content[match.end():]
+
+    try:
+        import yaml  # type: ignore
+        data = yaml.safe_load(raw)
+        if not isinstance(data, dict):
+            data = None
+    except Exception:
+        # Fallback: simple key: value parser (no yaml dependency)
+        data = {}
+        for line in raw.splitlines():
+            if ':' in line:
+                key, _, value = line.partition(':')
+                data[key.strip()] = value.strip()
+
+    return remaining, data
+
+
+def find_headings(content: str) -> List[Tuple[int, int, str, int]]:
+    """Find all markdown headings (# to ######), excluding those inside code blocks,
+    HTML comments, and indented blocks.
+
+    Args:
+        content: Markdown text to search.
+
+    Returns:
+        List of (start_pos, end_pos, title, level) for each heading found.
+    """
+    excluded: List[Tuple[int, int]] = []
+
+    # Code blocks (triple backtick)
+    for m in re.finditer(r'```.*?```', content, re.DOTALL):
+        excluded.append((m.start(), m.end()))
+
+    # HTML comments
+    for m in re.finditer(r'<!--.*?-->', content, re.DOTALL):
+        excluded.append((m.start(), m.end()))
+
+    # Indented blocks (lines starting with 4 spaces or a tab)
+    for m in re.finditer(r'^(    |\t).+$', content, re.MULTILINE):
+        excluded.append((m.start(), m.end()))
+
+    def is_excluded(pos: int) -> bool:
+        return any(start <= pos < end for start, end in excluded)
+
+    results: List[Tuple[int, int, str, int]] = []
+    for m in re.finditer(r'^(#{1,6})\s+(.+)$', content, re.MULTILINE):
+        # Skip escaped headings (\#)
+        before = content[m.start() - 1] if m.start() > 0 else ''
+        if before == '\\':
+            continue
+        if is_excluded(m.start()):
+            continue
+        level = len(m.group(1))
+        title = m.group(2).strip()
+        results.append((m.start(), m.end(), title, level))
+
+    return results
+
+
+def estimate_token_count(content: str) -> int:
+    """Estimate token count without a tokenizer.
+
+    CJK characters count as ~0.7 tokens each; other non-whitespace characters
+    count as ~0.3 tokens each.
+
+    Args:
+        content: Text to estimate.
+
+    Returns:
+        Estimated integer token count.
+    """
+    cjk = re.findall(r'[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af]', content)
+    without_cjk = re.sub(r'[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af]', '', content)
+    others = re.findall(r'\S', without_cjk)
+    return int(len(cjk) * 0.7 + len(others) * 0.3)
+
+
+def smart_split_content(
+    content: str,
+    max_tokens: int = 1024,
+    max_chars: int = 8000,
+) -> List[str]:
+    """Split large content into parts respecting token and character limits.
+
+    Splits by paragraphs (double newline). If a single paragraph exceeds the
+    limit it is force-cut into chunks of max_chars.
+
+    Args:
+        content: Text to split.
+        max_tokens: Maximum estimated tokens per part.
+        max_chars: Maximum characters per part.
+
+    Returns:
+        List of string parts.
+    """
+    paragraphs = content.split('\n\n')
+    parts: List[str] = []
+    current_parts: List[str] = []
+    current_tokens = 0
+    current_chars = 0
+
+    def flush() -> None:
+        if current_parts:
+            parts.append('\n\n'.join(current_parts))
+            current_parts.clear()
+
+    for para in paragraphs:
+        para_tokens = estimate_token_count(para)
+        para_chars = len(para)
+
+        # Single paragraph exceeds limits — force-cut it
+        if para_tokens > max_tokens or para_chars > max_chars:
+            flush()
+            current_tokens = 0
+            current_chars = 0
+            for i in range(0, len(para), max_chars):
+                parts.append(para[i:i + max_chars])
+            continue
+
+        # Would exceed limits if added — flush first
+        if (current_tokens + para_tokens > max_tokens or
+                current_chars + para_chars > max_chars):
+            flush()
+            current_tokens = 0
+            current_chars = 0
+
+        current_parts.append(para)
+        current_tokens += para_tokens
+        current_chars += para_chars
+
+    flush()
+    return parts if parts else [content]
+
+
+def sanitize_for_path(text: str, max_length: int = 50) -> str:
+    """Convert text to a safe string for use in file paths.
+
+    Keeps word characters, CJK characters, spaces and hyphens. Replaces spaces
+    with underscores. Truncates with a sha256 suffix if the result exceeds
+    max_length.
+
+    Args:
+        text: Input text to sanitize.
+        max_length: Maximum length of the returned string.
+
+    Returns:
+        Safe path-friendly string.
+    """
+    cleaned = re.sub(
+        r'[^\w\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af \-]',
+        '',
+        text,
+    )
+    cleaned = cleaned.replace(' ', '_').strip('_')
+
+    if not cleaned:
+        return 'section'
+
+    if len(cleaned) <= max_length:
+        return cleaned
+
+    suffix = '_' + hashlib.sha256(text.encode()).hexdigest()[:8]
+    return cleaned[:max_length - len(suffix)] + suffix
@@ -0,0 +1,283 @@
+"""Deduplica entidades candidatas usando fuzzy matching de nombres."""
+
+from __future__ import annotations
+
+import sys
+import os
+import uuid
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))
+
+from entity_candidate import EntityCandidate
+from deduplication_result import DeduplicationResult
+from normalize_entity_name import normalize_entity_name
+from merge_entity_attributes import merge_entity_attributes
+
+
+# ── Similitud helpers ──────────────────────────────────────────────────────────
+
+def _levenshtein(a: str, b: str) -> int:
+    """Return the Levenshtein edit distance between two strings.
+
+    Uses the classic two-row dynamic programme: only the previous and the
+    current row of the edit matrix are kept in memory.
+    """
+    if a == b:
+        return 0
+    if not (a and b):
+        # Exactly one side is empty here: the distance is the other's length.
+        return len(a) or len(b)
+
+    previous = list(range(len(b) + 1))
+    for row, ch_a in enumerate(a, start=1):
+        current = [row]
+        for col, ch_b in enumerate(b):
+            deletion = previous[col + 1] + 1
+            insertion = current[col] + 1
+            substitution = previous[col] + (0 if ch_a == ch_b else 1)
+            current.append(min(deletion, insertion, substitution))
+        previous = current
+    return previous[-1]
+
+
+def _jaccard(tokens_a: list[str], tokens_b: list[str]) -> float:
+    """Return the Jaccard similarity of two token lists treated as sets.
+
+    Two empty inputs are considered identical (1.0).
+    """
+    left, right = set(tokens_a), set(tokens_b)
+    combined = left | right
+    if not combined:
+        return 1.0
+    return len(left & right) / len(combined)
+
+
+def _name_similarity(a: str, b: str) -> float:
+    """Score how similar two normalized names are, in the range 0-1.
+
+    The base score is the larger of a length-normalized Levenshtein
+    similarity and a Jaccard similarity over whitespace-separated tokens.
+    A bonus of 0.3 (capped at 1.0) is then added when one name contains the
+    other, and another 0.3 when one name is an acronym of the other.
+    """
+    if a == b:
+        return 1.0
+
+    words_a = a.split()
+    words_b = b.split()
+
+    # Character-level similarity.
+    longest = max(len(a), len(b))
+    edit_sim = 1.0 - (_levenshtein(a, b) / longest) if longest else 1.0
+
+    # Token-level similarity.
+    token_sim = _jaccard(words_a, words_b)
+
+    score = max(edit_sim, token_sim)
+
+    one_contains_other = a in b or b in a
+    acronym_match = _is_acronym_of(a, words_b) or _is_acronym_of(b, words_a)
+
+    # Each bonus is applied and capped separately, containment first.
+    for bonus_applies in (one_contains_other, acronym_match):
+        if bonus_applies:
+            score = min(1.0, score + 0.3)
+
+    return score
+
+
+def _is_acronym_of(candidate: str, tokens: list[str]) -> bool:
+    """Check whether ``candidate`` is an acronym built from ``tokens``.
+
+    The comparison is case-insensitive. ``candidate`` matches when it equals
+    the initials of all tokens, or the initials of the tokens left after
+    dropping short connector words, so that "FBI" matches
+    "Federal Bureau of Investigation" (whose full initials are "FBOI").
+
+    Args:
+        candidate: possible acronym.
+        tokens: words of the longer name.
+
+    Returns:
+        True when candidate matches either set of initials; False when
+        candidate or tokens is empty.
+    """
+    if not candidate or not tokens:
+        return False
+
+    # Connector words usually omitted from acronyms (English and Spanish).
+    connectors = {"of", "the", "and", "for", "de", "del", "la", "el", "y"}
+
+    target = candidate.upper()
+    words = [t for t in tokens if t]
+
+    all_initials = "".join(t[0] for t in words).upper()
+    if target == all_initials:
+        return True
+
+    core_initials = "".join(
+        t[0] for t in words if t.lower() not in connectors
+    ).upper()
+    # target is non-empty, so an empty core_initials can never match.
+    return target == core_initials
+
+
+# Entity types whose names are identifiers rather than free text.
+_EXACT_TYPES = {
+    "ip",
+    "email",
+    "domain",
+    "crypto_wallet",
+    "phone",
+}
+
+
+def _is_exact_type(entity_type: str) -> bool:
+    """Tell whether the type only accepts exact name matches (case-insensitive)."""
+    lowered = entity_type.lower()
+    return lowered in _EXACT_TYPES
+
+
+# ── Union-Find ─────────────────────────────────────────────────────────────────
+
+class _UnionFind:
+    """Disjoint-set forest over the integers ``0..n-1`` with union by rank."""
+
+    def __init__(self, n: int) -> None:
+        # Every element starts as the root of its own singleton set.
+        self._parent = [i for i in range(n)]
+        self._rank = [0 for _ in range(n)]
+
+    def find(self, x: int) -> int:
+        """Return the root of the set containing ``x``, shortening the path."""
+        parent = self._parent
+        node = x
+        while parent[node] != node:
+            # Point the node at its grandparent, then hop there.
+            grandparent = parent[parent[node]]
+            parent[node] = grandparent
+            node = grandparent
+        return node
+
+    def union(self, x: int, y: int) -> None:
+        """Merge the sets containing ``x`` and ``y`` (no-op if already joined)."""
+        root_x = self.find(x)
+        root_y = self.find(y)
+        if root_x == root_y:
+            return
+
+        rank = self._rank
+        if rank[root_x] < rank[root_y]:
+            winner, loser = root_y, root_x
+        else:
+            winner, loser = root_x, root_y
+
+        self._parent[loser] = winner
+        if rank[winner] == rank[loser]:
+            rank[winner] += 1
+
+
+# ── Implementacion principal ────────────────────────────────────────────────────
+
+def deduplicate_entities(
+    candidates: list[EntityCandidate],
+    name_threshold: float = 0.85,
+    same_type_only: bool = True,
+) -> DeduplicationResult:
+    """Group candidate entities that refer to the same real-world entity.
+
+    Names are normalized, compared pairwise with fuzzy matching
+    (Levenshtein + Jaccard) and joined with a Union-Find so that matches are
+    transitive. Each resulting cluster yields one canonical entity whose
+    name, normalized name, type_ref and type_label come from the member
+    with the highest confidence, and whose attributes, source chunks and
+    ``merged_from`` names are merged across all members.
+
+    When either candidate of a pair has a technical type (ip, email, domain,
+    crypto_wallet, phone) only an exact match of the normalized names is
+    accepted and ``name_threshold`` is ignored.
+
+    NOTE(review): the technical-type check compares the lower-cased
+    ``type_ref`` against short labels such as "ip"; confirm that real
+    type_ref values use those labels.
+
+    Args:
+        candidates: EntityCandidate items to deduplicate.
+        name_threshold: minimum similarity score (0-1) for two names to be
+            considered the same.
+        same_type_only: when True, only candidates with identical type_ref
+            are compared.
+
+    Returns:
+        DeduplicationResult with the deduplicated entities, the resolution
+        maps and the merge log.
+    """
+    if not candidates:
+        return DeduplicationResult(
+            entities=[],
+            entity_id_map={},
+            name_to_id={},
+            merge_log=[],
+            total_before=0,
+            total_after=0,
+        )
+
+    n = len(candidates)
+
+    # Step 1: normalize every name once.
+    normalized: list[str] = [
+        normalize_entity_name(c.name, c.type_ref) for c in candidates
+    ]
+
+    # Step 2: pairwise comparison feeding a Union-Find over all indices.
+    uf = _UnionFind(n)
+    merge_pairs: list[tuple[int, int, float]] = []
+
+    for i in range(n):
+        for j in range(i + 1, n):
+            first, second = candidates[i], candidates[j]
+            if same_type_only and first.type_ref != second.type_ref:
+                continue
+
+            ni, nj = normalized[i], normalized[j]
+
+            # Identifier-like types must never be fuzzy-merged, whichever
+            # side of the pair carries the technical type.
+            if _is_exact_type(first.type_ref) or _is_exact_type(second.type_ref):
+                if ni == nj:
+                    uf.union(i, j)
+                    merge_pairs.append((i, j, 1.0))
+                continue
+
+            score = _name_similarity(ni, nj)
+            if score >= name_threshold:
+                uf.union(i, j)
+                merge_pairs.append((i, j, score))
+
+    # Step 3: group indices by their Union-Find root.
+    clusters: dict[int, list[int]] = {}
+    for i in range(n):
+        clusters.setdefault(uf.find(i), []).append(i)
+
+    # Matched pairs per root, used to report a score in the merge log.
+    merged_pairs_by_root: dict[int, list[tuple[int, int, float]]] = {}
+    for i, j, score in merge_pairs:
+        merged_pairs_by_root.setdefault(uf.find(i), []).append((i, j, score))
+
+    # Step 4: build one canonical entity per cluster.
+    merged_entities: list[EntityCandidate] = []
+    entity_id_map: dict[str, str] = {}
+    name_to_id: dict[str, str] = {}
+    merge_log: list[dict] = []
+
+    for root, indices in clusters.items():
+        cluster_candidates = [candidates[idx] for idx in indices]
+
+        if len(indices) == 1:
+            best_idx = indices[0]
+            best = candidates[best_idx]
+            merged_attrs = best.attributes
+            merged_chunks = list(best.source_chunk_indices)
+            merged_from = list(best.merged_from) if best.merged_from else [best.name]
+        else:
+            # The highest-confidence member is canonical (first one on ties).
+            best_idx = max(indices, key=lambda idx: candidates[idx].confidence)
+            best = candidates[best_idx]
+
+            merged_attrs = merge_entity_attributes(
+                [c.attributes for c in cluster_candidates]
+            )
+
+            # dict.fromkeys de-duplicates while keeping first-seen order.
+            merged_chunks = list(
+                dict.fromkeys(
+                    chunk
+                    for c in cluster_candidates
+                    for chunk in c.source_chunk_indices
+                )
+            )
+            merged_from = list(
+                dict.fromkeys(
+                    nm
+                    for c in cluster_candidates
+                    for nm in (c.merged_from or [c.name])
+                )
+            )
+
+            pairs = merged_pairs_by_root.get(root, [])
+            max_score = max((s for _, _, s in pairs), default=1.0)
+            merge_log.append(
+                {
+                    "canonical": best.name,
+                    "merged": [c.name for c in cluster_candidates if c is not best],
+                    "score": round(max_score, 4),
+                    "reason": "fuzzy_name",
+                }
+            )
+
+        canonical_norm = normalized[best_idx]
+
+        ent_id = str(uuid.uuid4())
+        merged_entities.append(
+            EntityCandidate(
+                name=best.name,
+                name_normalized=canonical_norm,
+                # Take the type from the same candidate as the name so a
+                # mixed-type cluster stays internally consistent.
+                type_ref=best.type_ref,
+                type_label=best.type_label,
+                attributes=merged_attrs,
+                confidence=best.confidence,
+                source_chunk_indices=merged_chunks,
+                merged_from=merged_from,
+            )
+        )
+
+        # Populate the resolution maps.
+        entity_id_map[canonical_norm] = ent_id
+        for orig_name in merged_from:
+            name_to_id[orig_name] = ent_id
+        name_to_id[canonical_norm] = ent_id
+
+    return DeduplicationResult(
+        entities=merged_entities,
+        entity_id_map=entity_id_map,
+        name_to_id=name_to_id,
+        merge_log=merge_log,
+        total_before=n,
+        total_after=len(merged_entities),
+    )
@@ -0,0 +1,189 @@
+"""Deduplica RelationCandidate resolviendo nombres a IDs y colapsando duplicados."""
+
+import logging
+import os
+import sys
+
+logger = logging.getLogger(__name__)
+
+# --- Import levenshtein_distance from cybersecurity ---
+# Supports two execution contexts:
+#   1. Run from python/functions/datascience/ (local pytest)
+#   2. Run from the registry root (fn run)
+# A local fallback implementation is defined first, in case the import fails.
+def _levenshtein_distance(a: str, b: str) -> int:
+    """Compute the Levenshtein edit distance between two strings.
+
+    The longer string drives the outer loop so each row of the edit matrix
+    has the length of the shorter one.
+    """
+    if len(a) >= len(b):
+        longer, shorter = a, b
+    else:
+        longer, shorter = b, a
+
+    if not shorter:
+        return len(longer)
+
+    previous = list(range(len(shorter) + 1))
+    for row, ch_long in enumerate(longer, start=1):
+        current = [row]
+        for col, ch_short in enumerate(shorter):
+            insertion = current[col] + 1
+            deletion = previous[col + 1] + 1
+            substitution = previous[col] + (0 if ch_long == ch_short else 1)
+            current.append(min(insertion, deletion, substitution))
+        previous = current
+    return previous[-1]
+
+
+# Prefer a shared levenshtein_distance when it can be imported; otherwise use
+# the local _levenshtein_distance defined above.
+try:
+    _here = os.path.dirname(os.path.abspath(__file__))
+    # Sibling "cybersecurity" directory, resolved relative to this file.
+    # NOTE(review): the directory itself is put on sys.path while the import
+    # below names `cybersecurity` — presumably a module or package of that
+    # name lives inside it; confirm.
+    _cyber_path = os.path.join(_here, "..", "cybersecurity")
+    if _cyber_path not in sys.path:
+        sys.path.insert(0, _cyber_path)
+    from cybersecurity import levenshtein_distance as _lev
+except ImportError:
+    _lev = None  # type: ignore
+
+# Name used by the rest of this module: the imported function when it is
+# available, the local fallback otherwise.
+levenshtein_distance = _lev if _lev is not None else _levenshtein_distance
+
+
+def _fuzzy_resolve(name: str, entity_id_map: dict[str, str], threshold: int = 3) -> str:
+    """Resolve a name against the map keys by fuzzy (edit-distance) match.
+
+    Scans every key of ``entity_id_map`` and keeps the one with the smallest
+    Levenshtein distance to ``name`` (the first one on ties). A key is only
+    eligible when the distance is strictly smaller than the length of the
+    shorter of the two strings; otherwise an empty or very short name would
+    "match" any short key (e.g. "" vs "ab", or "abc" vs "xyz").
+
+    Args:
+        name: name to resolve (already lower-cased and stripped).
+        entity_id_map: map normalized_name -> entity_id.
+        threshold: maximum edit distance accepted as a match (default 3).
+
+    Returns:
+        entity_id of the best match, or '' when no key is close enough.
+    """
+    best_id = ""
+    best_dist = threshold + 1
+    name_len = len(name)
+
+    for key, entity_id in entity_id_map.items():
+        key_len = len(key)
+
+        # The distance is at least the length difference, so such keys can
+        # never beat the threshold; skip the expensive computation.
+        if abs(name_len - key_len) > threshold:
+            continue
+
+        dist = levenshtein_distance(name, key)
+
+        # Reject matches where the shorter string could have been rewritten
+        # entirely: they carry no evidence of being the same name.
+        if dist >= min(name_len, key_len):
+            continue
+
+        if dist < best_dist:
+            best_dist = dist
+            best_id = entity_id
+
+    return best_id if best_dist <= threshold else ""
+
+
+def deduplicate_relations(
+    relations: list,
+    entity_id_map: dict[str, str],
+) -> list:
+    """Deduplicate candidate relations after resolving names to entity IDs.
+
+    Steps:
+    1. For each RelationCandidate, resolve from_name and to_name through
+       entity_id_map: exact lookup on the lower-cased, stripped name first,
+       then a fuzzy match via levenshtein_distance. Relations with an
+       unresolved endpoint are dropped with a warning.
+    2. Drop self-loops (from_id == to_id).
+    3. Collapse relations sharing (from_id, to_id, relation_type):
+       - description: unique descriptions joined with '; '
+       - confidence: maximum of the group
+    4. Return the cleaned list with from_id and to_id filled in.
+
+    Args:
+        relations: RelationCandidate items carrying the original
+            from_name/to_name.
+        entity_id_map: map normalized_name -> entity_id (output of
+            deduplicate_entities).
+
+    Returns:
+        Deduplicated list of RelationCandidate with resolved IDs.
+    """
+    # Import the type lazily; works from datascience/ and from the registry root.
+    try:
+        _types_path = os.path.join(
+            os.path.dirname(os.path.abspath(__file__)),
+            "..", "..", "..", "python", "types", "datascience",
+        )
+        if _types_path not in sys.path:
+            sys.path.insert(0, _types_path)
+        from relation_candidate import RelationCandidate
+    except ImportError:
+        from relation_candidate import RelationCandidate  # type: ignore
+
+    # --- Resolve endpoints, dropping unresolved relations and self-loops ---
+    resolved: list = []
+    for candidate in relations:
+        source_key = candidate.from_name.lower().strip()
+        source_id = entity_id_map.get(source_key, "") or _fuzzy_resolve(
+            source_key, entity_id_map
+        )
+        if not source_id:
+            logger.warning(
+                "deduplicate_relations: no se pudo resolver from_name=%r — descartando",
+                candidate.from_name,
+            )
+            continue
+
+        target_key = candidate.to_name.lower().strip()
+        target_id = entity_id_map.get(target_key, "") or _fuzzy_resolve(
+            target_key, entity_id_map
+        )
+        if not target_id:
+            logger.warning(
+                "deduplicate_relations: no se pudo resolver to_name=%r — descartando",
+                candidate.to_name,
+            )
+            continue
+
+        if source_id == target_id:
+            logger.debug(
+                "deduplicate_relations: self-loop descartado (from=%r, to=%r, type=%r)",
+                candidate.from_name,
+                candidate.to_name,
+                candidate.relation_type,
+            )
+            continue
+
+        resolved.append(
+            RelationCandidate(
+                from_name=candidate.from_name,
+                to_name=candidate.to_name,
+                from_id=source_id,
+                to_id=target_id,
+                relation_type=candidate.relation_type,
+                description=candidate.description,
+                confidence=candidate.confidence,
+                source_chunk_index=candidate.source_chunk_index,
+            )
+        )
+
+    # --- Bucket by (from_id, to_id, relation_type), keeping first-seen order ---
+    buckets: dict[tuple, list] = {}
+    for item in resolved:
+        bucket_key = (item.from_id, item.to_id, item.relation_type)
+        buckets.setdefault(bucket_key, []).append(item)
+
+    # --- Collapse each bucket into a single relation ---
+    result: list = []
+    for (from_id, to_id, rel_type), members in buckets.items():
+        first = members[0]
+        if len(members) == 1:
+            result.append(first)
+            continue
+
+        # Unique, non-empty descriptions in first-seen order.
+        unique_descriptions = list(
+            dict.fromkeys(m.description for m in members if m.description)
+        )
+
+        result.append(
+            RelationCandidate(
+                from_name=first.from_name,
+                to_name=first.to_name,
+                from_id=from_id,
+                to_id=to_id,
+                relation_type=rel_type,
+                description="; ".join(unique_descriptions),
+                confidence=max(m.confidence for m in members),
+                source_chunk_index=first.source_chunk_index,
+            )
+        )
+
+    return result
@@ -0,0 +1,22 @@
+"""DeduplicationResult — resultado del proceso de deduplicacion de entidades."""
+
+from dataclasses import dataclass, field
+
+from entity_candidate import EntityCandidate
+
+
+@dataclass
+class DeduplicationResult:
+    """Result of the entity deduplication process.
+
+    `name_to_id` maps ALL original names (including the merged ones) to
+    their final ID, which makes it possible to resolve relations that use
+    any variant of a name.
+    """
+
+    # Deduplicated entities.
+    entities: list[EntityCandidate]
+    # Name -> entity ID map.
+    entity_id_map: dict[str, str]
+    # Name -> final entity ID map covering every original name (see above).
+    name_to_id: dict[str, str]
+    # One dict per merge performed; empty by default.
+    merge_log: list[dict] = field(default_factory=list)
+    # Number of candidates before deduplication.
+    total_before: int = 0
+    # Number of entities after deduplication.
+    total_after: int = 0
@@ -0,0 +1,34 @@
+"""EntityCandidate — candidato de entidad extraido por el LLM."""
+
+from dataclasses import dataclass, field
+
+
+@dataclass
+class EntityCandidate:
+    """Entity candidate extracted by the LLM.
+
+    It may come from a single chunk or be the result of merging several
+    extractions. `merged_from` tracks the original names for debugging.
+    """
+
+    name: str
+    name_normalized: str = ""
+    type_ref: str = ""
+    type_label: str = ""
+    attributes: dict = field(default_factory=dict)
+    confidence: float = 0.0
+    source_chunk_indices: list[int] = field(default_factory=list)
+    merged_from: list[str] = field(default_factory=list)
+
+    def to_dict(self) -> dict:
+        """Serialize the candidate to a dict (values are not copied)."""
+        exported = (
+            "name",
+            "name_normalized",
+            "type_ref",
+            "type_label",
+            "attributes",
+            "confidence",
+            "source_chunk_indices",
+            "merged_from",
+        )
+        return {key: getattr(self, key) for key in exported}
@@ -0,0 +1,145 @@
+"""Extrae entidades de un chunk de texto usando un LLM inyectado."""
+
+import sys
+import os
+import warnings
+from typing import Callable
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))
+
+from entity_candidate import EntityCandidate
+
+
+def _build_system_prompt(entity_schema: list[dict], language_instruction: str) -> str:
+    """Build the system prompt used for entity extraction.
+
+    The prompt has three parts: a fixed task description, one bullet per
+    entity type (with its metadata fields when present) and a fixed block
+    with the output format and rules, ending with ``language_instruction``.
+    """
+    header = [
+        "You are an entity extraction expert. Given text, extract all entities",
+        "matching these types. For each entity, provide: name, type_ref,",
+        "attributes (matching the metadata_fields for that type), and a",
+        "confidence score (0.0-1.0).",
+        "",
+        "Entity types:",
+    ]
+
+    type_lines: list[str] = []
+    for entry in entity_schema:
+        label = entry.get("label", "Unknown")
+        type_ref = entry.get("type_ref", "")
+        type_lines.append(f"- {label} (type_ref: {type_ref})")
+
+        fields = entry.get("metadata_fields", [])
+        if fields:
+            type_lines.append("  fields: " + ", ".join(fields))
+
+    footer = [
+        "",
+        'Output JSON: {"entities": [{"name": "...", "type_ref": "...", "attributes": {...}, "confidence": 0.9}]}',
+        "",
+        "Rules:",
+        "- Only extract entities explicitly mentioned in the text",
+        "- Use the exact type_ref from the schema",
+        "- Leave unknown attributes as null",
+        "- Confidence: 1.0 = explicitly named, 0.7 = strongly implied, 0.5 = weakly implied",
+        f"- {language_instruction}",
+    ]
+
+    return "\n".join([*header, *type_lines, *footer])
+
+
+def extract_entities_llm(
+    text: str,
+    entity_schema: list[dict],
+    llm_chat_json: Callable[[list[dict]], dict],
+    language_instruction: str = "Respond in English.",
+) -> list[EntityCandidate]:
+    """Extract entities from a text chunk using an injected LLM.
+
+    Builds a system prompt from the entity-type schema, calls the LLM and
+    validates its answer, returning a list of EntityCandidate. Malformed
+    items (non-dict entries, missing or non-string names, type_ref values
+    not present in the schema) are skipped instead of raising.
+
+    Args:
+        text: Text chunk to analyse.
+        entity_schema: List of entity types. Each entry is a dict with the
+            keys 'type_ref', 'label' and optionally 'metadata_fields'.
+            Example:
+            [{"type_ref": "osint_person_go_cybersecurity", "label": "Person",
+              "metadata_fields": ["full_name", "alias"]}]
+        llm_chat_json: Function that receives a list of OpenAI-style
+            messages and returns a dict with the LLM's JSON answer:
+            llm_chat_json([{"role": "system", "content": "..."}, ...]) -> dict
+        language_instruction: Language instruction appended to the prompt.
+            Defaults to "Respond in English."
+
+    Returns:
+        List of extracted EntityCandidate. Empty when the LLM call fails,
+        when the answer is not a dict holding an 'entities' list, or when no
+        entity passes validation.
+
+    Raises:
+        ValueError: If entity_schema is empty.
+    """
+    if not entity_schema:
+        raise ValueError("entity_schema no puede estar vacio")
+
+    valid_type_refs = {entry.get("type_ref", "") for entry in entity_schema}
+    type_ref_to_label = {
+        entry.get("type_ref", ""): entry.get("label", "") for entry in entity_schema
+    }
+
+    system_prompt = _build_system_prompt(entity_schema, language_instruction)
+
+    messages = [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": text},
+    ]
+
+    try:
+        response = llm_chat_json(messages)
+    except Exception as exc:
+        # Best effort by design: a failing LLM call yields no entities.
+        warnings.warn(f"extract_entities_llm: error llamando al LLM: {exc}", stacklevel=2)
+        return []
+
+    # The LLM may answer with something other than a JSON object (a list,
+    # None, ...); treat that like a missing 'entities' list.
+    raw_entities = response.get("entities", []) if isinstance(response, dict) else None
+    if not isinstance(raw_entities, list):
+        warnings.warn(
+            "extract_entities_llm: la respuesta del LLM no contiene 'entities' como lista",
+            stacklevel=2,
+        )
+        return []
+
+    candidates: list[EntityCandidate] = []
+    for item in raw_entities:
+        if not isinstance(item, dict):
+            continue
+
+        name = item.get("name", "")
+        # Later stages call string methods on the name, so reject anything
+        # that is not a non-blank string.
+        if not isinstance(name, str) or not name.strip():
+            continue
+
+        type_ref = item.get("type_ref", "")
+        # The isinstance check also keeps unhashable values (lists, dicts)
+        # away from the set membership test.
+        if not isinstance(type_ref, str) or type_ref not in valid_type_refs:
+            warnings.warn(
+                f"extract_entities_llm: type_ref '{type_ref}' no esta en el schema, descartando entidad '{name}'",
+                stacklevel=2,
+            )
+            continue
+
+        attributes = item.get("attributes", {})
+        if not isinstance(attributes, dict):
+            attributes = {}
+        # Drop attributes the LLM left as null.
+        attributes = {k: v for k, v in attributes.items() if v is not None}
+
+        confidence = item.get("confidence", 0.0)
+        # `confidence != confidence` is only true for NaN, which would
+        # otherwise be clamped to 1.0 by the min/max below.
+        if not isinstance(confidence, (int, float)) or confidence != confidence:
+            confidence = 0.0
+        confidence = float(max(0.0, min(1.0, confidence)))
+
+        candidates.append(
+            EntityCandidate(
+                name=name,
+                type_ref=type_ref,
+                type_label=type_ref_to_label.get(type_ref, ""),
+                attributes=attributes,
+                confidence=confidence,
+            )
+        )
+
+    return candidates
@@ -0,0 +1,141 @@
+"""extract_relations_llm — extrae relaciones entre entidades usando un LLM."""
+
+import logging
+import sys
+import os
+from typing import Callable
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", ""))
+
+from entity_candidate import EntityCandidate
+from relation_candidate import RelationCandidate
+
+logger = logging.getLogger(__name__)
+
+
+def extract_relations_llm(
+    text: str,
+    entities: list[EntityCandidate],
+    relation_types: list[str],
+    llm_chat_json: Callable[[list[dict]], dict],
+    language_instruction: str = "Respond in English.",
+) -> list[RelationCandidate]:
+    """Extract relations between entities of a text chunk using an LLM.
+
+    Given the original text and the entities already extracted from it, asks
+    the LLM to identify relations between pairs of entities. Relations whose
+    from_name or to_name does not match an existing entity name exactly are
+    discarded. Relation types outside ``relation_types`` are replaced by
+    "related_to".
+
+    Args:
+        text: text chunk (the same one used to extract the entities).
+        entities: entities already extracted from the chunk.
+        relation_types: allowed relation types, e.g. ["funds", "employs",
+            "communicates_with", "owns", "related_to"].
+        llm_chat_json: injected function that receives a list of messages
+            (dicts with "role" and "content") and returns a dict with the
+            LLM's JSON answer.
+        language_instruction: language instruction appended to the prompt.
+
+    Returns:
+        List of validated RelationCandidate. Empty when there are fewer than
+        two entities, when the LLM call fails, when the answer is not a dict
+        holding a 'relations' list, or when no relation passes validation.
+    """
+    if len(entities) < 2:
+        return []
+
+    entity_names = {e.name for e in entities}
+    relation_types_set = set(relation_types)
+
+    # Entity list shown to the LLM.
+    entity_lines = "\n".join(
+        f'- "{e.name}" ({e.type_label or e.type_ref or "Entity"})' for e in entities
+    )
+
+    # Allowed relation types shown to the LLM.
+    relation_types_str = ", ".join(relation_types)
+
+    system_prompt = f"""\
+You are a relation extraction expert. Given text and a list of entities already \
+extracted, identify relationships between them.
+
+Entities found in this text:
+{entity_lines}
+
+Allowed relation types: {relation_types_str}
+
+Output JSON: {{"relations": [
+  {{"from_name": "Entity A", "to_name": "Entity B",
+   "relation_type": "employs", "description": "...", "confidence": 0.8}}
+]}}
+
+Rules:
+- Only extract relations explicitly stated or strongly implied in the text
+- from_name and to_name must match entity names exactly as listed above
+- relation_type must be one of the allowed types
+- Confidence: 1.0 = explicitly stated, 0.7 = strongly implied, 0.5 = weakly implied
+- Do not invent entities not in the list above
+- {language_instruction}"""
+
+    messages = [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": text},
+    ]
+
+    try:
+        response = llm_chat_json(messages)
+    except Exception as exc:
+        # Best effort by design: a failing LLM call yields no relations.
+        logger.warning("extract_relations_llm: LLM call failed: %s", exc)
+        return []
+
+    # The LLM may answer with something other than a JSON object (a list,
+    # None, ...); treat that like a missing 'relations' list.
+    raw_relations = response.get("relations", []) if isinstance(response, dict) else None
+    if not isinstance(raw_relations, list):
+        logger.warning("extract_relations_llm: 'relations' is not a list in LLM response")
+        return []
+
+    results: list[RelationCandidate] = []
+    for item in raw_relations:
+        if not isinstance(item, dict):
+            continue
+
+        from_name = item.get("from_name", "")
+        to_name = item.get("to_name", "")
+
+        # Both endpoints must be known entity names. The isinstance checks
+        # keep unhashable values (lists, dicts) away from the set lookup.
+        if not isinstance(from_name, str) or from_name not in entity_names:
+            logger.debug(
+                "extract_relations_llm: from_name '%s' no coincide con ninguna entidad — descartando",
+                from_name,
+            )
+            continue
+        if not isinstance(to_name, str) or to_name not in entity_names:
+            logger.debug(
+                "extract_relations_llm: to_name '%s' no coincide con ninguna entidad — descartando",
+                to_name,
+            )
+            continue
+
+        relation_type = item.get("relation_type", "")
+        if not isinstance(relation_type, str) or relation_type not in relation_types_set:
+            logger.debug(
+                "extract_relations_llm: tipo '%s' no permitido — usando 'related_to'",
+                relation_type,
+            )
+            relation_type = "related_to"
+
+        confidence = item.get("confidence", 0.0)
+        # `confidence != confidence` is only true for NaN, which would
+        # otherwise be clamped to 1.0 by the min/max below.
+        if not isinstance(confidence, (int, float)) or confidence != confidence:
+            confidence = 0.0
+        confidence = float(max(0.0, min(1.0, confidence)))
+
+        # Always hand a string downstream: null becomes '', other JSON
+        # values are stringified.
+        description = item.get("description", "")
+        if description is None:
+            description = ""
+        elif not isinstance(description, str):
+            description = str(description)
+
+        results.append(
+            RelationCandidate(
+                from_name=from_name,
+                to_name=to_name,
+                relation_type=relation_type,
+                description=description,
+                confidence=confidence,
+            )
+        )
+
+    return results
@@ -0,0 +1,92 @@
+"""Extract plain text from PDF, Markdown, or TXT files."""
+
+
+# File extensions (lower-case, with leading dot) listed in the "unsupported
+# extension" error raised by extract_text_from_file.
+SUPPORTED_EXTENSIONS = {".pdf", ".md", ".markdown", ".txt"}
+
+
+def _detect_encoding(data: bytes) -> str:
+    """Detect the encoding of raw bytes using several fallback strategies.
+
+    Order of strategies:
+      0. Byte-order mark: a BOM identifies UTF-8 / UTF-16 / UTF-32
+         unambiguously. The returned codec names ("utf-8-sig", "utf-16",
+         "utf-32") consume the BOM on decode, so it does not leak into the
+         text as U+FEFF.
+      1. Strict UTF-8 decoding.
+      2. ``charset_normalizer``, if installed.
+      3. ``chardet``, if installed.
+
+    Args:
+        data: Raw file content.
+
+    Returns:
+        Name of a codec to decode ``data`` with. Falls back to "utf-8" when
+        nothing else matched; callers should then decode with replacement.
+    """
+    import codecs
+
+    # Strategy 0: byte-order marks. UTF-32 is tested before UTF-16 because
+    # the UTF-32-LE BOM starts with the UTF-16-LE BOM.
+    bom_codecs = (
+        (codecs.BOM_UTF8, "utf-8-sig"),
+        (codecs.BOM_UTF32_LE, "utf-32"),
+        (codecs.BOM_UTF32_BE, "utf-32"),
+        (codecs.BOM_UTF16_LE, "utf-16"),
+        (codecs.BOM_UTF16_BE, "utf-16"),
+    )
+    for bom, codec_name in bom_codecs:
+        if not data.startswith(bom):
+            continue
+        try:
+            data.decode(codec_name)
+        except UnicodeDecodeError:
+            # The BOM-looking prefix was a coincidence; keep looking.
+            continue
+        else:
+            return codec_name
+
+    # Strategy 1: UTF-8
+    try:
+        data.decode("utf-8")
+        return "utf-8"
+    except UnicodeDecodeError:
+        pass
+
+    # Strategy 2: charset_normalizer
+    try:
+        from charset_normalizer import from_bytes
+
+        result = from_bytes(data).best()
+        if result is not None and result.encoding:
+            return result.encoding
+    except ImportError:
+        pass
+
+    # Strategy 3: chardet
+    try:
+        import chardet
+
+        detected = chardet.detect(data)
+        if detected and detected.get("encoding"):
+            return detected["encoding"]
+    except ImportError:
+        pass
+
+    # Last resort: UTF-8 with replacement
+    return "utf-8"
+
+
+def extract_text_from_file(file_path: str) -> str:
+    """Extract plain text from a file. Supports PDF, Markdown and TXT.
+
+    For PDF files uses PyMuPDF (fitz) to extract the text of each page,
+    joining pages with double newlines; the document handle is closed
+    before returning. For text-based files (.md, .markdown, .txt) reads the
+    raw bytes and decodes them with automatic encoding detection, falling
+    back to UTF-8 with replacement characters if the detected codec fails.
+
+    Args:
+        file_path: Absolute or relative path to the file.
+
+    Returns:
+        str: Extracted plain text content.
+
+    Raises:
+        FileNotFoundError: If the file does not exist.
+        ValueError: If the file extension is not supported.
+        ImportError: If PyMuPDF is not installed and a PDF is provided.
+    """
+    import os
+
+    if not os.path.exists(file_path):
+        raise FileNotFoundError(f"File not found: {file_path}")
+
+    # Lower-case first so the extension check is case-insensitive.
+    _, ext = os.path.splitext(file_path.lower())
+
+    if ext == ".pdf":
+        try:
+            import fitz  # PyMuPDF
+        except ImportError as e:
+            raise ImportError(
+                "PyMuPDF is required for PDF extraction. "
+                "Install it with: pip install PyMuPDF"
+            ) from e
+
+        doc = fitz.open(file_path)
+        try:
+            pages = [page.get_text() for page in doc]
+        finally:
+            # Release the file handle even if a page fails to extract.
+            doc.close()
+        return "\n\n".join(pages)
+
+    elif ext in {".md", ".markdown", ".txt"}:
+        with open(file_path, "rb") as f:
+            raw = f.read()
+
+        encoding = _detect_encoding(raw)
+        try:
+            return raw.decode(encoding)
+        except (UnicodeDecodeError, LookupError):
+            # Detection was wrong or named an unknown codec.
+            return raw.decode("utf-8", errors="replace")
+
+    else:
+        raise ValueError(
+            f"Unsupported file extension: '{ext}'. "
+            f"Supported: {', '.join(sorted(SUPPORTED_EXTENSIONS))}"
+        )
@@ -0,0 +1,208 @@
+"""Pipeline de extraccion de entidades y relaciones desde un documento."""
+
+from __future__ import annotations
+
+import sys
+import os
+import time
+import warnings
+from typing import Callable
+
+# NOTE: flat sibling-module imports; this file's directory must be on sys.path (registry root or this directory).
+
+from extract_text_from_file import extract_text_from_file
+from core_functions import preprocess_text
+from split_text_into_chunks import split_text_into_chunks
+from build_entity_schema_prompt import build_entity_schema_prompt
+from build_relation_schema_prompt import build_relation_schema_prompt
+from extract_entities_llm import extract_entities_llm
+from extract_relations_llm import extract_relations_llm
+from deduplicate_entities import deduplicate_entities
+from deduplicate_relations import deduplicate_relations
+from entity_candidate import EntityCandidate
+from extraction_result import ExtractionResult
+from extraction_stats import ExtractionStats
+
+
+def extraction_pipeline(
+    file_path: str,
+    entity_presets: list[dict],
+    relation_types: list[str],
+    llm_chat_json: Callable[[list[dict]], dict],
+    chunk_size: int = 500,
+    chunk_overlap: int = 50,
+    confidence_threshold: float = 0.5,
+    dedup_threshold: float = 0.85,
+    on_progress: Callable[[str, float], None] | None = None,
+) -> ExtractionResult:
+    """Run the full entity and relation extraction pipeline over one document.
+
+    Orchestrates extract_text_from_file -> preprocess_text -> split_text_into_chunks
+    -> extract_entities_llm per chunk -> deduplicate_entities ->
+    extract_relations_llm per chunk -> deduplicate_relations.
+
+    Args:
+        file_path: path of the file to process (PDF, Markdown, TXT).
+        entity_presets: list of dicts with type_ref, label and metadata_fields.
+            Example: [{"type_ref": "osint_person_go_cybersecurity",
+                        "label": "Person",
+                        "metadata_fields": ["full_name", "nationality"]}]
+        relation_types: relation types allowed for extraction.
+            Example: ["funds", "employs", "communicates_with", "owns"]
+        llm_chat_json: injected callable that takes OpenAI-style messages and
+            returns the already-parsed JSON response as a dict.
+        chunk_size: chunk size passed to split_text_into_chunks (default 500).
+        chunk_overlap: overlap passed to split_text_into_chunks (default 50).
+        confidence_threshold: minimum confidence a candidate entity needs to
+            be kept before deduplication (default 0.5).
+        dedup_threshold: forwarded to deduplicate_entities as name_threshold (default 0.85).
+        on_progress: optional callback (message: str, pct: float 0-1); its
+            exceptions are ignored. 0-0.4: entity extraction, 0.4: entity
+            dedup, 0.4-0.8: relation extraction, 0.8: relation dedup, 1.0: done.
+
+    Returns:
+        ExtractionResult with the deduplicated entities and relations plus run stats.
+
+    Raises:
+        FileNotFoundError: if file_path does not exist.
+        ValueError: if entity_presets is empty.
+    """
+    if not entity_presets:
+        raise ValueError("entity_presets no puede estar vacio")
+
+    if not os.path.exists(file_path):
+        raise FileNotFoundError(f"Archivo no encontrado: {file_path}")
+
+    def _progress(msg: str, pct: float) -> None:
+        if on_progress is not None:
+            try:
+                on_progress(msg, pct)
+            except Exception:
+                pass  # progress reporting is best-effort; it never aborts the run
+
+    start_time = time.monotonic()
+    stats = ExtractionStats()
+
+    # ── Step 1: Extract text ───────────────────────────────────────────────────
+    _progress("Extracting text from file...", 0.0)
+    try:
+        raw_text = extract_text_from_file(file_path)
+    except Exception as exc:
+        warnings.warn(f"extraction_pipeline: error al extraer texto: {exc}")
+        raw_text = ""  # extraction failures degrade to an empty document
+
+    # ── Step 2: Preprocess ─────────────────────────────────────────────────────
+    clean_text = preprocess_text(raw_text)
+    stats.total_chars = len(clean_text)
+
+    # ── Step 3: Split into chunks ──────────────────────────────────────────────
+    chunks = split_text_into_chunks(clean_text, chunk_size=chunk_size, overlap=chunk_overlap)
+    n = len(chunks)
+    stats.total_chunks = n
+
+    if n == 0:
+        stats.processing_time_seconds = time.monotonic() - start_time
+        return ExtractionResult(entities=[], relations=[], stats=stats)
+
+    # ── Step 4: Extract entities per chunk ─────────────────────────────────────
+    all_raw_entities: list[EntityCandidate] = []
+
+    for i, chunk in enumerate(chunks):
+        _progress(f"Extracting entities from chunk {i + 1}/{n}", (i / n) * 0.4)
+        try:
+            candidates = extract_entities_llm(
+                text=chunk,
+                entity_schema=entity_presets,
+                llm_chat_json=llm_chat_json,
+            )
+        except Exception as exc:
+            warnings.warn(
+                f"extraction_pipeline: error en extract_entities_llm chunk {i}: {exc}"
+            )
+            candidates = []
+
+        for candidate in candidates:
+            # Record the index of the chunk this candidate came from
+            if i not in candidate.source_chunk_indices:
+                candidate.source_chunk_indices.append(i)
+            all_raw_entities.append(candidate)
+
+    # ── Step 5: Filter by confidence ───────────────────────────────────────────
+    filtered_entities = [
+        e for e in all_raw_entities if e.confidence >= confidence_threshold
+    ]
+    stats.raw_entities_count = len(filtered_entities)  # counted after the filter
+
+    # Tally the filtered candidates per type_ref
+    for ent in filtered_entities:
+        stats.entity_types_found[ent.type_ref] = (
+            stats.entity_types_found.get(ent.type_ref, 0) + 1
+        )
+
+    # ── Step 6: Deduplicate entities ───────────────────────────────────────────
+    _progress("Deduplicating entities...", 0.4)
+    dedup_result = deduplicate_entities(filtered_entities, name_threshold=dedup_threshold)
+
+    stats.final_entities_count = dedup_result.total_after
+    stats.entities_merged = dedup_result.total_before - dedup_result.total_after
+
+    final_entities = dedup_result.entities
+    entity_id_map = dedup_result.name_to_id  # presumably original name -> entity id; built by deduplicate_entities
+
+    # ── Step 7: Extract relations per chunk ────────────────────────────────────
+    all_raw_relations = []
+
+    for i, chunk in enumerate(chunks):
+        _progress(f"Extracting relations...", 0.4 + (i / n) * 0.4)
+
+        # Entities whose source_chunk_indices include this chunk
+        chunk_entities = [
+            e for e in final_entities if i in e.source_chunk_indices
+        ]
+        # No entity is tied to this chunk: fall back to the full entity list
+        if not chunk_entities:
+            chunk_entities = final_entities
+
+        if len(chunk_entities) < 2:
+            continue  # fewer than two entities: skip the relation call
+
+        try:
+            chunk_relations = extract_relations_llm(
+                text=chunk,
+                entities=chunk_entities,
+                relation_types=relation_types,
+                llm_chat_json=llm_chat_json,
+            )
+        except Exception as exc:
+            warnings.warn(
+                f"extraction_pipeline: error en extract_relations_llm chunk {i}: {exc}"
+            )
+            chunk_relations = []
+
+        for rel in chunk_relations:
+            rel.source_chunk_index = i
+        all_raw_relations.extend(chunk_relations)
+
+    stats.raw_relations_count = len(all_raw_relations)
+
+    # Tally the raw relations per relation_type
+    for rel in all_raw_relations:
+        stats.relation_types_found[rel.relation_type] = (
+            stats.relation_types_found.get(rel.relation_type, 0) + 1
+        )
+
+    # ── Step 8: Deduplicate relations ──────────────────────────────────────────
+    _progress("Deduplicating relations...", 0.8)
+    final_relations = deduplicate_relations(all_raw_relations, entity_id_map)
+
+    stats.final_relations_count = len(final_relations)
+    stats.relations_merged = stats.raw_relations_count - len(final_relations)
+    stats.processing_time_seconds = time.monotonic() - start_time
+
+    _progress("Done", 1.0)
+
+    return ExtractionResult(
+        entities=final_entities,
+        relations=final_relations,
+        stats=stats,
+    )
@@ -0,0 +1,20 @@
+"""ExtractionResult — resultado final del pipeline de extraccion."""
+
+from dataclasses import dataclass, field
+
+from entity_candidate import EntityCandidate
+from extraction_stats import ExtractionStats
+from relation_candidate import RelationCandidate
+
+
+@dataclass
+class ExtractionResult:
+    """Final result of the entity and relation extraction pipeline.
+
+    Holds the entity list and the relation list together with the
+    statistics object describing the run.
+    """
+
+    entities: list[EntityCandidate]
+    relations: list[RelationCandidate]
+    stats: ExtractionStats = field(default_factory=ExtractionStats)
@@ -0,0 +1,25 @@
+"""ExtractionStats — estadisticas del proceso de extraccion."""
+
+from dataclasses import dataclass, field
+
+
+@dataclass
+class ExtractionStats:
+    """Statistics of one extraction run.
+
+    Useful for reporting and debugging. The inline notes describe how
+    extraction_pipeline fills each field.
+    """
+
+    total_chunks: int = 0  # number of chunks the text was split into
+    total_chars: int = 0  # length of the preprocessed text
+    raw_entities_count: int = 0  # candidates kept after the confidence filter
+    final_entities_count: int = 0  # entities left after deduplication
+    entities_merged: int = 0  # entities before dedup minus entities after
+    raw_relations_count: int = 0  # relations collected over all chunks
+    final_relations_count: int = 0  # relations left after deduplication
+    relations_merged: int = 0  # raw_relations_count - final_relations_count
+    relations_discarded: int = 0  # not written by extraction_pipeline
+    entity_types_found: dict[str, int] = field(default_factory=dict)  # type_ref -> count
+    relation_types_found: dict[str, int] = field(default_factory=dict)  # relation_type -> count
+    processing_time_seconds: float = 0.0  # elapsed time.monotonic() of the run
@@ -0,0 +1,78 @@
+"""Combina atributos de multiples candidatos de la misma entidad."""
+
+from __future__ import annotations
+
+_NUMERIC_FIELDS = {"risk_score", "balance", "cvss"}
+_DATE_MIN_FIELDS = {"first_seen", "created_date"}
+_DATE_MAX_FIELDS = {"last_seen", "expires_date"}
+_BOOL_FIELDS = {"verified", "exploited"}
+
+
+def _union_preserving_order(values: list) -> list:
+    """Flatten list values (scalars count as one item) dropping duplicates."""
+    merged: list = []
+    for value in values:
+        for item in value if isinstance(value, list) else [value]:
+            # Linear membership test on purpose: items may be unhashable.
+            if item not in merged:
+                merged.append(item)
+    return merged
+
+
+def merge_entity_attributes(attr_list: list[dict]) -> dict:
+    """Merge the attribute dicts of several candidates of the same entity.
+
+    For every key present in any candidate, the non-None values are collected
+    and reduced to one value:
+    - no non-None value: None
+    - a single value, or all values equal: that value
+    - numeric fields (risk_score, balance, cvss): max
+    - earliest-date fields (first_seen, created_date): min
+    - latest-date fields (last_seen, expires_date): max
+    - boolean fields (verified, exploited): logical OR
+    - any list value: ordered union without duplicates; non-list values
+      mixed in are treated as single items instead of being iterated
+    - anything else: the value with the longest string form, returned with
+      its original type (first one wins on ties)
+
+    Args:
+        attr_list: One attribute dict per candidate.
+
+    Returns:
+        Dict with the merged attributes; keys keep first-seen order so the
+        result is deterministic across runs. Empty dict for empty input.
+
+    Raises:
+        TypeError: If min/max meets values that cannot be compared with
+            each other (e.g. a str and a float under the same key).
+    """
+    if not attr_list:
+        return {}
+
+    # dict.fromkeys de-duplicates while keeping first-seen order; a set would
+    # make the output key order depend on string hash randomization.
+    all_keys = dict.fromkeys(key for attrs in attr_list for key in attrs)
+
+    merged: dict = {}
+
+    for key in all_keys:
+        values = [attrs[key] for attrs in attr_list if attrs.get(key) is not None]
+
+        if not values:
+            merged[key] = None
+        elif len(values) == 1 or all(v == values[0] for v in values):
+            merged[key] = values[0]
+        elif key in _NUMERIC_FIELDS or key in _DATE_MAX_FIELDS:
+            merged[key] = max(values)
+        elif key in _DATE_MIN_FIELDS:
+            merged[key] = min(values)
+        elif key in _BOOL_FIELDS:
+            merged[key] = any(values)
+        elif any(isinstance(v, list) for v in values):
+            merged[key] = _union_preserving_order(values)
+        else:
+            # Rank by string length but keep the original object, so ints,
+            # floats or dicts are not silently turned into strings.
+            merged[key] = max(values, key=lambda v: len(str(v)))
+
+    return merged
@@ -0,0 +1,81 @@
+"""Normaliza el nombre de una entidad para comparacion y deduplicacion."""
+
+import re
+
+
+_TITLES = re.compile(
+    r"^\b(?:Dr|Mr|Mrs|Ms|Miss|Prof|Sr|Jr|Ing|Lic|Gen|Col|Maj|Capt|Sgt|Rev|Hon)\.?\s+",
+    re.IGNORECASE,
+)
+
+_LEGAL_SUFFIXES = re.compile(
+    r"\b(?:Inc|LLC|Ltd|Corp|Co|S\.?A|GmbH|B\.?V|N\.?V|PLC|AG|SRL|S\.?L|Pty|"
+    r"LP|LLP|LLLP|PC|PA|PLLC|Foundation|Group|Holdings|Enterprises?|"
+    r"International|Industries|Services?|Solutions?|Systems?|Technologies?)\.?\s*$",
+    re.IGNORECASE,
+)
+
+_MULTI_SPACE = re.compile(r"\s+")
+
+
+def normalize_entity_name(name: str, entity_type: str = "") -> str:
+    """Normalize an entity name for comparison and deduplication.
+
+    The rule applied depends on ``entity_type`` (matched case-insensitively):
+    - ip / email: lowercase
+    - domain: lowercase, drop trailing dots and a leading "www."
+    - crypto_wallet: surrounding whitespace stripped only
+    - phone: keep only digits and "+"
+    - person: drop one leading title, turn "Last, First" into "First Last",
+      collapse whitespace, title-case
+    - organization: drop one trailing legal suffix (and the comma left
+      before it), collapse whitespace, title-case; a name that is nothing
+      but a suffix word is kept instead of becoming empty
+    - anything else: lowercase and collapse whitespace
+
+    Args:
+        name: Entity name to normalize.
+        entity_type: Entity type (ip, email, domain, crypto_wallet, phone,
+            person, organization). Empty selects the default rule.
+
+    Returns:
+        The normalized name.
+    """
+    name = name.strip()
+    et = entity_type.lower().strip()
+
+    if et in ("ip", "email"):
+        return name.lower()
+
+    if et == "domain":
+        result = name.lower().rstrip(".")
+        return result[4:] if result.startswith("www.") else result
+
+    if et == "crypto_wallet":
+        # Addresses can be case-sensitive, so the case is left untouched.
+        return name
+
+    if et == "phone":
+        # Drops separators and letters; "+" is kept wherever it appears.
+        return re.sub(r"[^\d+]", "", name)
+
+    if et == "person":
+        result = _TITLES.sub("", name).strip()
+        if "," in result:
+            # "Last, First" -> "First Last"; only the first comma splits.
+            last, first = result.split(",", 1)
+            result = f"{first.strip()} {last.strip()}"
+        return _MULTI_SPACE.sub(" ", result).strip().title()
+
+    if et == "organization":
+        # "Acme, Inc." must end up equal to "Acme Inc", so the comma left
+        # behind by the removed suffix is trimmed as well.
+        result = _LEGAL_SUFFIXES.sub("", name).strip().rstrip(",").rstrip()
+        if not result:
+            # The whole name was a suffix word ("Group", "Solutions"): keep it
+            # rather than collapsing unrelated organizations into "".
+            result = name
+        return _MULTI_SPACE.sub(" ", result).strip().title()
+
+    # Default rule.
+    return _MULTI_SPACE.sub(" ", name.lower()).strip()
@@ -0,0 +1,35 @@
+"""RelationCandidate — candidato de relacion extraido por el LLM."""
+
+from dataclasses import dataclass
+
+
+@dataclass
+class RelationCandidate:
+    """A relation between two entities as proposed by the LLM.
+
+    ``from_name``/``to_name`` hold the raw names found in the text; ``from_id``
+    and ``to_id`` are meant to be filled in later by the deduplication phase.
+    """
+
+    from_name: str
+    to_name: str
+    from_id: str = ""
+    to_id: str = ""
+    relation_type: str = ""
+    description: str = ""
+    confidence: float = 0.0
+    source_chunk_index: int = -1
+
+    def to_dict(self) -> dict:
+        """Return the candidate's fields as a plain dictionary."""
+        exported = (
+            "from_name",
+            "to_name",
+            "from_id",
+            "to_id",
+            "relation_type",
+            "description",
+            "confidence",
+            "source_chunk_index",
+        )
+        return {attr: getattr(self, attr) for attr in exported}
@@ -0,0 +1,234 @@
+"""Renderiza un grafo sigma.js como HTML standalone con dark theme y layout ForceAtlas2."""
+
+import json
+import os
+
+
+_HTML_TEMPLATE = """\
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <title>{title}</title>
+    <script src="https://cdn.jsdelivr.net/npm/graphology@0.25.4/dist/graphology.umd.min.js"></script>
+    <script src="https://cdn.jsdelivr.net/npm/graphology-library@0.8.0/dist/graphology-library.min.js"></script>
+    <script src="https://cdn.jsdelivr.net/npm/sigma@2.4.0/build/sigma.min.js"></script>
+    <style>
+        * {{ box-sizing: border-box; margin: 0; padding: 0; }}
+        body {{ background: #1a1a2e; color: #eee; font-family: 'Segoe UI', system-ui, sans-serif; overflow: hidden; }}
+        #container {{ width: 100vw; height: 100vh; }}
+        #panel {{
+            position: absolute; top: 12px; right: 12px;
+            background: rgba(10, 10, 30, 0.88);
+            border: 1px solid rgba(255,255,255,0.12);
+            padding: 16px; border-radius: 10px;
+            z-index: 10; min-width: 200px; max-width: 260px;
+            backdrop-filter: blur(6px);
+        }}
+        #panel h3 {{ font-size: 14px; font-weight: 600; margin-bottom: 12px; color: #a0c4ff; letter-spacing: 0.5px; }}
+        #stats {{ font-size: 11px; color: #888; margin-bottom: 12px; }}
+        #filters {{ display: flex; flex-direction: column; gap: 6px; }}
+        .filter-item {{ display: flex; align-items: center; gap: 8px; font-size: 12px; cursor: pointer; }}
+        .filter-item input {{ cursor: pointer; accent-color: #a0c4ff; }}
+        .color-dot {{ width: 10px; height: 10px; border-radius: 50%; flex-shrink: 0; }}
+        #tooltip {{
+            position: absolute; display: none;
+            background: rgba(5, 5, 20, 0.95);
+            border: 1px solid rgba(255,255,255,0.15);
+            padding: 10px 14px; border-radius: 8px;
+            pointer-events: none; z-index: 20;
+            max-width: 300px; font-size: 12px; line-height: 1.6;
+        }}
+        #tooltip .tt-title {{ font-weight: 600; color: #a0c4ff; margin-bottom: 6px; font-size: 13px; }}
+        #tooltip .tt-row {{ display: flex; gap: 6px; }}
+        #tooltip .tt-key {{ color: #888; min-width: 80px; }}
+        #tooltip .tt-val {{ color: #eee; word-break: break-all; }}
+    </style>
+</head>
+<body>
+    <div id="container"></div>
+    <div id="panel">
+        <h3>{title}</h3>
+        <div id="stats"></div>
+        <div id="filters"></div>
+    </div>
+    <div id="tooltip"></div>
+
+    <script>
+    (function () {{
+        const graphData = {json_data};
+
+        // ── Build graphology graph ──────────────────────────────────────────────
+        const Graph = graphology.Graph || graphology;
+        const g = new Graph({{ multi: true, type: 'directed' }});
+
+        // Assign random initial positions
+        graphData.nodes.forEach(function (n) {{
+            g.addNode(n.key, Object.assign({{
+                x: (Math.random() - 0.5) * 10,
+                y: (Math.random() - 0.5) * 10,
+            }}, n.attributes));
+        }});
+
+        graphData.edges.forEach(function (e) {{
+            try {{
+                g.addEdgeWithKey(e.key, e.source, e.target, e.attributes || {{}});
+            }} catch (err) {{
+                // skip duplicate edge keys gracefully
+            }}
+        }});
+
+        // ── ForceAtlas2 layout (synchronous, 500 iterations) ───────────────────
+        const FA2 = graphologyLibrary.layoutForceAtlas2;
+        FA2.assign(g, {{
+            iterations: 500,
+            settings: {{
+                gravity: 1,
+                scalingRatio: 2,
+                slowDown: 5,
+                barnesHutOptimize: g.order > 300,
+            }},
+        }});
+
+        // ── Sigma renderer ──────────────────────────────────────────────────────
+        const renderer = new Sigma(g, document.getElementById('container'), {{
+            renderEdgeLabels: false,
+            defaultEdgeColor: '#444',
+            defaultNodeColor: '#95a5a6',
+            labelColor: {{ color: '#ccc' }},
+            labelSize: 11,
+            edgeReducer: function (edge, data) {{
+                return Object.assign({{}}, data, {{ size: Math.max(1, (data.weight || 1) * 0.8) }});
+            }},
+        }});
+
+        // ── Stats panel ─────────────────────────────────────────────────────────
+        document.getElementById('stats').textContent =
+            graphData.nodes.length + ' nodes · ' + graphData.edges.length + ' edges';
+
+        // ── Filter panel by node type ───────────────────────────────────────────
+        const typeColors = {{}};
+        graphData.nodes.forEach(function (n) {{
+            const t = n.attributes.entity_type || 'unknown';
+            typeColors[t] = n.attributes.color || '#95a5a6';
+        }});
+
+        const hiddenTypes = new Set();
+        const filtersDiv = document.getElementById('filters');
+
+        Object.keys(typeColors).sort().forEach(function (type) {{
+            const color = typeColors[type];
+            const label = document.createElement('label');
+            label.className = 'filter-item';
+
+            const cb = document.createElement('input');
+            cb.type = 'checkbox';
+            cb.checked = true;
+            cb.addEventListener('change', function () {{
+                if (cb.checked) hiddenTypes.delete(type);
+                else hiddenTypes.add(type);
+                renderer.refresh();
+            }});
+
+            const dot = document.createElement('span');
+            dot.className = 'color-dot';
+            dot.style.background = color;
+
+            label.appendChild(cb);
+            label.appendChild(dot);
+            label.appendChild(document.createTextNode(type));
+            filtersDiv.appendChild(label);
+        }});
+
+        // Node reducer applies the type filter; untyped nodes use the same 'unknown' bucket as the panel
+        renderer.setSetting('nodeReducer', function (node, data) {{
+            if (hiddenTypes.has(data.entity_type || 'unknown')) return Object.assign({{}}, data, {{ hidden: true }});
+            return data;
+        }});
+
+        // ── Tooltip on hover ────────────────────────────────────────────────────
+        const tooltip = document.getElementById('tooltip');
+
+        renderer.on('enterNode', function (ref) {{
+            const nodeAttrs = g.getNodeAttributes(ref.node);
+            const reserved = new Set(['x', 'y', 'size', 'color', 'label', 'type', 'hidden']);
+
+            let html = '<div class="tt-title">' + escHtml(nodeAttrs.label || ref.node) + '</div>';
+            html += '<div class="tt-row"><span class="tt-key">type</span><span class="tt-val">' + escHtml(nodeAttrs.entity_type || '') + '</span></div>';
+            html += '<div class="tt-row"><span class="tt-key">status</span><span class="tt-val">' + escHtml(nodeAttrs.status || '') + '</span></div>';
+            html += '<div class="tt-row"><span class="tt-key">domain</span><span class="tt-val">' + escHtml(nodeAttrs.domain || '') + '</span></div>';
+
+            Object.keys(nodeAttrs).sort().forEach(function (k) {{
+                if (!reserved.has(k) && !['status', 'domain', 'type', 'label', 'entity_type'].includes(k)) {{
+                    html += '<div class="tt-row"><span class="tt-key">' + escHtml(k) + '</span><span class="tt-val">' + escHtml(String(nodeAttrs[k])) + '</span></div>';
+                }}
+            }});
+
+            tooltip.innerHTML = html;
+            tooltip.style.display = 'block';
+        }});
+
+        renderer.on('leaveNode', function () {{
+            tooltip.style.display = 'none';
+        }});
+
+        document.getElementById('container').addEventListener('mousemove', function (e) {{
+            tooltip.style.left = (e.clientX + 16) + 'px';
+            tooltip.style.top = (e.clientY + 16) + 'px';
+        }});
+
+        function escHtml(str) {{
+            return String(str)
+                .replace(/&/g, '&amp;')
+                .replace(/</g, '&lt;')
+                .replace(/>/g, '&gt;')
+                .replace(/"/g, '&quot;');
+        }}
+    }})();
+    </script>
+</body>
+</html>
+"""
+
+
+def render_sigma_html(
+    graph_data: dict,
+    output_path: str,
+    title: str = "OSINT Graph",
+) -> str:
+    """Write a standalone sigma.js HTML page that visualizes the graph.
+
+    The graph dict is serialized to JSON and substituted, with the title, into
+    ``_HTML_TEMPLATE``; parent directories of ``output_path`` are created.
+
+    Graph content may come from untrusted documents, so both substitutions
+    are neutralized: the title is HTML-escaped and every "<" in the JSON is
+    emitted as ``\\u003c`` so a value like "</script>" cannot end the inline
+    script block.
+
+    Args:
+        graph_data: Dict with 'nodes' and 'edges' keys (graphology format).
+        output_path: Path of the HTML file to write.
+        title: Text shown in the page title and the side panel.
+
+    Returns:
+        Absolute path of the written HTML file.
+
+    Raises:
+        Exception: If the directory or the file cannot be written.
+    """
+    from html import escape  # local import: this module only imports json/os
+    # "<" can only occur inside JSON strings, where \u003c is equivalent.
+    json_data = json.dumps(graph_data, ensure_ascii=False).replace("<", "\\u003c")
+    page = _HTML_TEMPLATE.format(title=escape(str(title)), json_data=json_data)
+
+    abs_path = os.path.abspath(output_path)
+    try:
+        os.makedirs(os.path.dirname(abs_path) or ".", exist_ok=True)
+        with open(abs_path, "w", encoding="utf-8") as f:
+            f.write(page)
+    except OSError as exc:
+        raise Exception(f"render_sigma_html: no se pudo escribir '{abs_path}': {exc}") from exc
+
+    return abs_path
@@ -0,0 +1,66 @@
+"""Split text into overlapping chunks with sentence-boundary awareness."""
+
+
+def split_text_into_chunks(
+    text: str, chunk_size: int = 500, overlap: int = 50
+) -> list[str]:
+    """Split text into overlapping chunks, preferring sentence boundaries.
+
+    Each chunk spans at most ``chunk_size`` characters and is cut right after
+    the latest sentence separator ending past the first 30% of the window, or
+    at ``chunk_size`` when there is none. Chunks are stripped and empty ones
+    dropped. The next chunk starts ``overlap`` characters before the cut but
+    always after the previous start; nothing is emitted past the text's end.
+
+    Args:
+        text: Text to split.
+        chunk_size: Maximum chunk length in characters; must be positive.
+        overlap: Characters shared by consecutive chunks. Negative values act
+            as 0; an overlap too large to advance is skipped for that step.
+
+    Returns:
+        List of chunks; empty if the text is empty or whitespace only.
+
+    Raises:
+        ValueError: If ``chunk_size`` is not positive and text is non-empty.
+    """
+    if not text:
+        return []
+    if chunk_size <= 0:
+        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
+
+    if len(text) <= chunk_size:
+        stripped = text.strip()
+        return [stripped] if stripped else []
+
+    # The latest qualifying cut wins, whichever separator produced it.
+    separators = ["。", "！", "？", ".\n", "!\n", "?\n", "\n\n", ". ", "! ", "? "]
+
+    chunks: list[str] = []
+    start = 0
+    text_len = len(text)
+
+    while start < text_len:
+        end = start + chunk_size
+
+        if end < text_len:
+            min_pos = start + int(chunk_size * 0.30)
+            # rfind(sep, start, end) only matches separators inside the window.
+            cuts = [
+                hit + len(sep)
+                for sep in separators
+                if (hit := text.rfind(sep, start, end)) != -1
+            ]
+            end = max((cut for cut in cuts if cut > min_pos), default=end)
+
+        chunk = text[start:end].strip()
+        if chunk:
+            chunks.append(chunk)
+        if end >= text_len:
+            break  # text exhausted; stepping back would only repeat the tail
+
+        # Never step back to or before the current start (would loop forever).
+        next_start = end - max(overlap, 0)
+        start = next_start if next_start > start else end
+
+    return chunks