178 changed files with 13060 additions and 1 deletions
@@ -0,0 +1,48 @@
+---
+name: build_tree_from_headers
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def build_tree_from_headers(node_list: list[dict]) -> list[dict]"
+description: "Construye arbol jerarquico anidado desde lista plana de headers markdown con niveles (h1>h2>h3)."
+tags: [tree, markdown, headers, hierarchy]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: []
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/core.py"
+source_repo: "https://github.com/VectifyAI/PageIndex"
+source_license: "MIT"
+source_file: "pageindex/page_index_md.py"
+---
+
+## Ejemplo
+
+```python
+headers = [
+    {"title": "Intro", "level": 1, "line_num": 1},
+    {"title": "Background", "level": 2, "line_num": 5},
+    {"title": "Details", "level": 3, "line_num": 10},
+    {"title": "Methods", "level": 1, "line_num": 20},
+]
+tree = build_tree_from_headers(headers)
+# [
+#   {"title": "Intro", "node_id": "0001", "nodes": [
+#     {"title": "Background", "node_id": "0002", "nodes": [
+#       {"title": "Details", "node_id": "0003"}
+#     ]}
+#   ]},
+#   {"title": "Methods", "node_id": "0004"}
+# ]
+```
+
+## Notas
+
+Funcion pura. Asigna node_id secuencial (0001...) automaticamente. Usa stack para resolver jerarquia por nivel de header.
@@ -0,0 +1,57 @@
+---
+name: cache_decorator
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "def cache_decorator(store: Any, ttl: float = 0, key_fn: callable | None = None)"
+description: "Decorator que cachea el resultado de una funcion en cualquier store persistente compatible (CacheStore o FileCache). La key se genera hasheando (func.__name__, args, sorted(kwargs)) con SHA-256. Soporta funciones sincronas y asincronas."
+tags: [cache, decorator, memoize, persistence, async, functional]
+uses_functions: ["cache_to_sqlite_py_infra", "cache_to_file_py_infra"]
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: ["asyncio", "functools", "hashlib", "json"]
+tested: true
+tests:
+  - "Funcion llamada una vez, segunda vez desde cache"
+  - "TTL expirado → llama de nuevo"
+  - "key_fn custom"
+  - "Argumentos distintos → keys distintas"
+  - "Funciona con async"
+test_file_path: "python/functions/core/cache_decorator_test.py"
+file_path: "python/functions/core/cache_decorator.py"
+---
+
+## Ejemplo
+
+```python
+from infra.cache_to_sqlite import cache_to_sqlite
+from core.cache_decorator import cache_decorator
+
+store = cache_to_sqlite("cache.db", namespace="llm")
+
+@cache_decorator(store, ttl=3600)
+def call_llm(prompt: str) -> str:
+    # llamada costosa a LLM
+    return client.complete(prompt)
+
+result = call_llm("explain X")  # primera vez: llama LLM
+result = call_llm("explain X")  # segunda vez: desde cache
+
+# Con key_fn custom
+@cache_decorator(store, ttl=600, key_fn=lambda fn, args, kw: args[0])
+def fetch_user(user_id: str) -> dict:
+    return api.get_user(user_id)
+
+# Con async
+@cache_decorator(store, ttl=3600)
+async def async_call(prompt: str) -> str:
+    return await async_client.complete(prompt)
+```
+
+## Notas
+
+El store debe implementar `get(key: str) -> Any | None` y `set(key: str, value: Any, ttl: float) -> None`. Detecta automaticamente funciones asincronas con `asyncio.iscoroutinefunction`. La key por defecto usa `json.dumps(..., default=str)` para serializar argumentos no serializables. Si `store.get()` retorna `None`, siempre se ejecuta la funcion (no distingue entre "no en cache" y "valor None almacenado"); para valores que pueden ser None usar `get_or_set` directamente.
@@ -0,0 +1,67 @@
+"""Decorator que cachea el resultado de una funcion en un store persistente."""
+
+import asyncio
+import functools
+import hashlib
+import json
+from typing import Any, Callable
+
+
+def _default_key(func: Callable, args: tuple, kwargs: dict) -> str:
+    """Genera una cache key a partir del nombre de funcion y sus argumentos."""
+    payload = json.dumps((func.__name__, args, sorted(kwargs.items())), default=str)
+    return hashlib.sha256(payload.encode("utf-8")).hexdigest()
+
+
+def cache_decorator(store: Any, ttl: float = 0, key_fn: Callable | None = None):
+    """Retorna un decorator que cachea resultados en un store persistente.
+
+    Args:
+        store: Cualquier objeto con metodos get(key) y set(key, value, ttl).
+               Compatible con CacheStore (cache_to_sqlite) y FileCache (cache_to_file).
+        ttl: Tiempo de vida en segundos. 0 = sin expiracion.
+        key_fn: Funcion opcional para generar la key. Recibe (func, args, kwargs).
+                Si es None, se usa SHA-256 de (func.__name__, args, sorted(kwargs)).
+
+    Returns:
+        Decorator aplicable a funciones sincronas o asincronas.
+
+    Example::
+
+        store = cache_to_sqlite("cache.db")
+
+        @cache_decorator(store, ttl=3600)
+        def call_llm(prompt: str) -> str:
+            ...  # llamada costosa
+
+        result = call_llm("explain X")  # primera vez: ejecuta la funcion
+        result = call_llm("explain X")  # segunda vez: desde cache
+    """
+
+    def decorator(func: Callable) -> Callable:
+        if asyncio.iscoroutinefunction(func):
+            @functools.wraps(func)
+            async def async_wrapper(*args, **kwargs):
+                make_key = key_fn or _default_key
+                key = make_key(func, args, kwargs)
+                cached = store.get(key)
+                if cached is not None:
+                    return cached
+                result = await func(*args, **kwargs)
+                store.set(key, result, ttl)
+                return result
+            return async_wrapper
+        else:
+            @functools.wraps(func)
+            def sync_wrapper(*args, **kwargs):
+                make_key = key_fn or _default_key
+                key = make_key(func, args, kwargs)
+                cached = store.get(key)
+                if cached is not None:
+                    return cached
+                result = func(*args, **kwargs)
+                store.set(key, result, ttl)
+                return result
+            return sync_wrapper
+
+    return decorator
@@ -0,0 +1,96 @@
+"""Tests para cache_decorator."""
+
+import asyncio
+import sys
+import os
+import tempfile
+import time
+
+import pytest
+
+sys.path.insert(0, os.path.dirname(__file__))
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "infra"))
+
+from cache_decorator import cache_decorator
+from cache_to_sqlite import cache_to_sqlite
+
+
+@pytest.fixture
+def store(tmp_path):
+    return cache_to_sqlite(str(tmp_path / "test.db"))
+
+
+def test_funcion_llamada_una_vez_segunda_vez_desde_cache(store):
+    calls = []
+
+    @cache_decorator(store, ttl=60)
+    def compute(x: int) -> int:
+        calls.append(x)
+        return x * 10
+
+    assert compute(5) == 50
+    assert compute(5) == 50
+    assert len(calls) == 1
+
+
+def test_ttl_expirado_llama_de_nuevo(store):
+    calls = []
+
+    @cache_decorator(store, ttl=0.05)
+    def work(n: int) -> int:
+        calls.append(n)
+        return n + 1
+
+    work(3)
+    time.sleep(0.1)
+    work(3)
+    assert len(calls) == 2
+
+
+def test_key_fn_custom(store):
+    calls = []
+
+    def my_key_fn(func, args, kwargs):
+        return f"custom:{args[0]}"
+
+    @cache_decorator(store, ttl=60, key_fn=my_key_fn)
+    def fn(x: int) -> str:
+        calls.append(x)
+        return f"result_{x}"
+
+    fn(7)
+    fn(7)
+    assert len(calls) == 1
+
+
+def test_argumentos_distintos_keys_distintas(store):
+    calls = []
+
+    @cache_decorator(store, ttl=60)
+    def fn(x: int) -> int:
+        calls.append(x)
+        return x * 2
+
+    fn(1)
+    fn(2)
+    fn(1)
+    assert len(calls) == 2
+
+
+def test_funciona_con_async(store):
+    calls = []
+
+    @cache_decorator(store, ttl=60)
+    async def async_fn(x: int) -> int:
+        calls.append(x)
+        return x + 100
+
+    async def run():
+        r1 = await async_fn(5)
+        r2 = await async_fn(5)
+        return r1, r2
+
+    r1, r2 = asyncio.run(run())
+    assert r1 == 105
+    assert r2 == 105
+    assert len(calls) == 1
@@ -0,0 +1,48 @@
+---
+name: calculate_media_strategy
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "calculate_media_strategy(image_count: int, line_count: int) -> str"
+description: "Determina la estrategia optima de procesamiento de medios para un documento basado en la proporcion de imagenes vs texto. Retorna full_page_vlm, extract o text_only."
+tags: [media, strategy, document, vision, vlm, images, classification]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: []
+tested: true
+tests:
+  - "0 imagenes text_only"
+  - "2 imagenes 100 lineas extract"
+  - "10 imagenes 20 lineas full_page_vlm"
+  - "5 imagenes 100 lineas full_page_vlm"
+  - "0 lineas division por cero evitada"
+test_file_path: "python/functions/core/calculate_media_strategy_test.py"
+file_path: "python/functions/core/calculate_media_strategy.py"
+---
+
+## Ejemplo
+
+```python
+calculate_media_strategy(0, 50)    # "text_only"
+calculate_media_strategy(2, 100)   # "extract"  (ratio 0.02, pocas imagenes)
+calculate_media_strategy(10, 20)   # "full_page_vlm"  (ratio 0.5 > 0.3)
+calculate_media_strategy(5, 100)   # "full_page_vlm"  (>= 5 imagenes)
+calculate_media_strategy(3, 0)     # "text_only"  (sin texto, sin contexto)
+```
+
+## Notas
+
+Logica de clasificacion en tres niveles:
+
+1. `full_page_vlm` — documento dominado por imagenes: ratio imagen/linea > 0.3 o al menos 5 imagenes. Se usa un vision-language model sobre la pagina completa.
+2. `extract` — pocas imagenes en documento con texto: extraer y procesar imagenes individualmente.
+3. `text_only` — sin imagenes o sin lineas de texto: procesar solo el texto.
+
+El guard `line_count > 0` evita la division por cero y trata documentos sin lineas como `text_only` independientemente del conteo de imagenes, ya que sin texto no hay contexto suficiente para clasificar como `extract`.
+
+Funcion pura, sin dependencias externas. Reimplementada conceptualmente a partir de la logica de clasificacion de medios de OpenViking (AGPL-3.0).
@@ -0,0 +1,24 @@
+"""Determina la estrategia optima de procesamiento de medios para un documento."""
+
+
+def calculate_media_strategy(image_count: int, line_count: int) -> str:
+    """Determina la estrategia optima de procesamiento de medios.
+
+    Clasifica un documento en una de tres estrategias basandose en la
+    proporcion de imagenes respecto al texto:
+    - full_page_vlm: documento dominado por imagenes, usar vision-language model
+    - extract: pocas imagenes, extraer y procesar individualmente
+    - text_only: sin imagenes, solo texto
+
+    Args:
+        image_count: numero de imagenes en el documento.
+        line_count: numero de lineas de texto en el documento.
+
+    Returns:
+        "full_page_vlm", "extract" o "text_only".
+    """
+    if line_count > 0 and (image_count / line_count > 0.3 or image_count >= 5):
+        return "full_page_vlm"
+    if line_count > 0 and image_count > 0:
+        return "extract"
+    return "text_only"
@@ -0,0 +1,23 @@
+"""Tests para calculate_media_strategy."""
+
+from calculate_media_strategy import calculate_media_strategy
+
+
+def test_0_imagenes_text_only():
+    assert calculate_media_strategy(0, 50) == "text_only"
+
+
+def test_2_imagenes_100_lineas_extract():
+    assert calculate_media_strategy(2, 100) == "extract"
+
+
+def test_10_imagenes_20_lineas_full_page_vlm():
+    assert calculate_media_strategy(10, 20) == "full_page_vlm"
+
+
+def test_5_imagenes_100_lineas_full_page_vlm():
+    assert calculate_media_strategy(5, 100) == "full_page_vlm"
+
+
+def test_0_lineas_division_por_cero_evitada():
+    assert calculate_media_strategy(3, 0) == "text_only"
@@ -0,0 +1,40 @@
+---
+name: calculate_page_offset
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def calculate_page_offset(pairs: list[dict]) -> int"
+description: "Calcula offset entre numeros de pagina logicos y fisicos usando pares de referencia (moda de diferencias)."
+tags: [pagination, offset, calculation]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: []
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/core.py"
+source_repo: "https://github.com/VectifyAI/PageIndex"
+source_license: "MIT"
+source_file: "pageindex/page_index.py"
+---
+
+## Ejemplo
+
+```python
+pairs = [
+    {"page": 1, "physical_index": 5},
+    {"page": 2, "physical_index": 6},
+    {"page": 10, "physical_index": 14},
+]
+calculate_page_offset(pairs)
+# 4 (la moda de las diferencias physical_index - page)
+```
+
+## Notas
+
+Funcion pura. Cada par necesita campos 'page' (numero logico) y 'physical_index' (indice fisico). Retorna la diferencia mas frecuente (moda). Retorna 0 si no hay pares validos.
@@ -0,0 +1,55 @@
+---
+name: call_batch_with_retry
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "def call_batch_with_retry(items: list[T], process_func: Callable[[T], R], max_retries: int = 3, initial_delay: float = 1.0, max_delay: float = 30.0, backoff_factor: float = 2.0, exceptions: tuple[type[Exception], ...] = (Exception,), continue_on_failure: bool = True) -> tuple[list[R], list[dict]]"
+description: "Procesa una lista de items con retry individual por item y exponential backoff. Los fallos individuales no bloquean el resto del batch. Retorna (results, failures) donde failures contiene index, item y error de cada item que agoto sus reintentos."
+tags: [retry, batch, backoff, resilience, error-handling, core]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: ["time", "random", "typing.Callable", "typing.TypeVar"]
+tested: true
+tests:
+  - "todos los items exito"
+  - "item falla permanentemente, continue True"
+  - "item falla, abort continue False"
+  - "item falla luego exito retry funciona"
+  - "failures contiene index correcto"
+test_file_path: "python/functions/core/call_batch_with_retry_test.py"
+file_path: "python/functions/core/call_batch_with_retry.py"
+---
+
+## Ejemplo
+
+```python
+results, failures = call_batch_with_retry(
+    items=["url1", "url2", "url3"],
+    process_func=fetch_url,
+    max_retries=3,
+    initial_delay=1.0,
+    max_delay=30.0,
+    backoff_factor=2.0,
+    exceptions=(ConnectionError, TimeoutError),
+    continue_on_failure=True,
+)
+
+for r in results:
+    print("OK:", r)
+
+for f in failures:
+    print(f"FAIL index={f['index']} item={f['item']} error={f['error']}")
+```
+
+## Notas
+
+Diferencia con `retry_sync_py_core`: ese reintenta una sola llamada. Este maneja listas completas donde cada item se reintenta independientemente — los fallos individuales quedan registrados en `failures` sin interrumpir el procesamiento del batch (cuando `continue_on_failure=True`).
+
+El backoff usa la formula `min(initial_delay * backoff_factor^attempt, max_delay)` con jitter de hasta el 10% del delay calculado para evitar thundering herd. El primer intento es siempre inmediato — el delay se aplica antes del primer retry (attempt=0).
+
+Cuando `continue_on_failure=False`, el primer item que agota sus reintentos re-lanza la excepcion inmediatamente, abortando el batch.
@@ -0,0 +1,81 @@
+"""Process a batch of items with per-item exponential backoff retry."""
+
+import time
+import random
+from typing import Callable, TypeVar
+
+T = TypeVar("T")
+R = TypeVar("R")
+
+
+def call_batch_with_retry(
+    items: list,
+    process_func: Callable,
+    max_retries: int = 3,
+    initial_delay: float = 1.0,
+    max_delay: float = 30.0,
+    backoff_factor: float = 2.0,
+    exceptions: tuple = (Exception,),
+    continue_on_failure: bool = True,
+) -> tuple:
+    """Process a list of items with independent per-item retry and exponential backoff.
+
+    Each item is processed by process_func. If it raises one of the specified
+    exceptions, it is retried up to max_retries times with exponential backoff.
+    If all retries are exhausted, the item is recorded as a failure.
+
+    Args:
+        items: List of items to process.
+        process_func: Callable that takes a single item and returns a result.
+        max_retries: Maximum number of retry attempts per item after first failure.
+        initial_delay: Initial delay in seconds before the first retry.
+        max_delay: Maximum delay cap in seconds between retries.
+        backoff_factor: Multiplier applied to delay on each successive retry.
+        exceptions: Tuple of exception types to catch and retry on.
+        continue_on_failure: If True, continue processing remaining items when an
+            item exhausts all retries. If False, re-raise the exception immediately.
+
+    Returns:
+        A tuple (results, failures) where:
+        - results is a list of successful return values from process_func.
+        - failures is a list of dicts with keys "index", "item", and "error"
+          for each item that failed after all retries.
+
+    Raises:
+        Exception: The last exception for a failed item when continue_on_failure
+            is False.
+    """
+    results = []
+    failures = []
+
+    for index, item in enumerate(items):
+        last_exc = None
+        succeeded = False
+
+        for attempt in range(max_retries + 1):
+            try:
+                result = process_func(item)
+                results.append(result)
+                succeeded = True
+                break
+            except exceptions as exc:
+                last_exc = exc
+                if attempt < max_retries:
+                    delay = min(
+                        initial_delay * (backoff_factor ** attempt),
+                        max_delay,
+                    )
+                    # Add small jitter (up to 10% of delay) to avoid thundering herd
+                    delay += random.uniform(0, delay * 0.1)
+                    time.sleep(delay)
+
+        if not succeeded:
+            if not continue_on_failure:
+                raise last_exc
+            failures.append({
+                "index": index,
+                "item": item,
+                "error": str(last_exc),
+            })
+
+    return results, failures
@@ -0,0 +1,102 @@
+"""Tests para call_batch_with_retry."""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.dirname(__file__))
+
+from call_batch_with_retry import call_batch_with_retry
+
+
+def test_todos_los_items_exito():
+    results, failures = call_batch_with_retry(
+        items=[1, 2, 3],
+        process_func=lambda x: x * 2,
+        max_retries=3,
+    )
+    assert results == [2, 4, 6]
+    assert failures == []
+
+
+def test_item_falla_permanentemente_continue_true():
+    def process(x):
+        if x == 2:
+            raise ValueError("fallo permanente")
+        return x * 10
+
+    results, failures = call_batch_with_retry(
+        items=[1, 2, 3],
+        process_func=process,
+        max_retries=2,
+        initial_delay=0.0,
+        continue_on_failure=True,
+    )
+    assert results == [10, 30]
+    assert len(failures) == 1
+    assert failures[0]["index"] == 1
+    assert failures[0]["item"] == 2
+    assert "fallo permanente" in failures[0]["error"]
+
+
+def test_item_falla_abort_continue_false():
+    call_count = {"n": 0}
+
+    def process(x):
+        call_count["n"] += 1
+        if x == 2:
+            raise RuntimeError("error fatal")
+        return x
+
+    try:
+        call_batch_with_retry(
+            items=[1, 2, 3],
+            process_func=process,
+            max_retries=1,
+            initial_delay=0.0,
+            continue_on_failure=False,
+        )
+        assert False, "Deberia haber lanzado excepcion"
+    except RuntimeError as e:
+        assert "error fatal" in str(e)
+    # item 3 nunca fue procesado
+    assert call_count["n"] < 6  # 1 ok + 2 intentos para item 2 + 0 para item 3
+
+
+def test_item_falla_luego_exito_retry_funciona():
+    attempt_counts = {}
+
+    def process(x):
+        attempt_counts[x] = attempt_counts.get(x, 0) + 1
+        # item 5 falla las primeras 2 veces, exito en la tercera
+        if x == 5 and attempt_counts[x] < 3:
+            raise ValueError("fallo temporal")
+        return x * 2
+
+    results, failures = call_batch_with_retry(
+        items=[1, 5, 9],
+        process_func=process,
+        max_retries=3,
+        initial_delay=0.0,
+        continue_on_failure=True,
+    )
+    assert results == [2, 10, 18]
+    assert failures == []
+    assert attempt_counts[5] == 3
+
+
+def test_failures_contiene_index_correcto():
+    def process(x):
+        if x in (0, 2, 4):
+            raise ValueError(f"fallo en {x}")
+        return x
+
+    results, failures = call_batch_with_retry(
+        items=[0, 1, 2, 3, 4],
+        process_func=process,
+        max_retries=0,
+        initial_delay=0.0,
+        continue_on_failure=True,
+    )
+    assert results == [1, 3]
+    assert [f["index"] for f in failures] == [0, 2, 4]
+    assert [f["item"] for f in failures] == [0, 2, 4]
@@ -0,0 +1,66 @@
+---
+name: circuit_breaker
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "class CircuitBreaker:\n  def __init__(self, failure_threshold: int = 5, reset_timeout: float = 300.0): ...\n  def check(self) -> None: ...\n  def record_success(self) -> None: ...\n  def record_failure(self, error: Exception) -> None: ...\n  @property\n  def retry_after(self) -> float: ..."
+description: "Patron circuit breaker thread-safe para proteger llamadas a APIs externas. Tres estados: CLOSED (normal), OPEN (bloqueando), HALF_OPEN (permitiendo 1 request de prueba). Integra con classify_api_error para distinguir errores permanentes de transitorios."
+tags: [circuit-breaker, resilience, api, retry, error-handling, thread-safe]
+uses_functions: [classify_api_error_py_core]
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [threading, time, enum]
+tested: true
+tests:
+  - "Transicion CLOSED → OPEN despues de N fallos"
+  - "Transicion OPEN → HALF_OPEN despues de timeout"
+  - "Transicion HALF_OPEN → CLOSED en exito"
+  - "Transicion HALF_OPEN → OPEN en fallo"
+  - "Error permanente abre inmediatamente"
+  - "Thread safety (concurrencia)"
+  - "retry_after retorna 0 cuando no esta OPEN"
+test_file_path: "python/functions/core/circuit_breaker_test.py"
+file_path: "python/functions/core/circuit_breaker.py"
+---
+
+## Ejemplo
+
+```python
+from circuit_breaker import CircuitBreaker, CircuitBreakerOpen
+
+cb = CircuitBreaker(failure_threshold=3, reset_timeout=60.0)
+
+def call_api() -> dict:
+    cb.check()  # raises CircuitBreakerOpen if circuit is open
+    try:
+        result = requests.get("https://api.example.com/data").json()
+        cb.record_success()
+        return result
+    except Exception as exc:
+        cb.record_failure(exc)
+        raise
+
+# After 3 consecutive failures the circuit opens:
+# CircuitBreakerOpen: Circuit breaker is open. Retry after 30.0s
+try:
+    cb.check()
+except CircuitBreakerOpen as e:
+    print(f"Circuit open, retry in {e.retry_after}s")
+
+# retry_after property (capped at 30s):
+print(cb.retry_after)  # e.g. 28.4
+```
+
+## Notas
+
+- **CLOSED**: Requests pasan normalmente. Tras `failure_threshold` fallos consecutivos transiciona a OPEN.
+- **OPEN**: Requests bloqueados con `CircuitBreakerOpen`. Tras `reset_timeout` segundos transiciona a HALF_OPEN.
+- **HALF_OPEN**: Permite 1 request de prueba. Exito → CLOSED. Fallo → OPEN.
+- Errores permanentes (p. ej. 400, 401, 403) abren el circuito inmediatamente sin esperar al umbral.
+- `retry_after` devuelve 0.0 cuando el estado no es OPEN; en OPEN devuelve el tiempo restante, cap 30s.
+- Thread-safe via `threading.Lock` protegiendo todo el estado interno.
+- La dependencia en `classify_api_error` es opcional: si no se puede importar, hay fallback de texto.
@@ -0,0 +1,141 @@
+"""Circuit breaker pattern for protecting external API calls."""
+
+import threading
+import time
+from enum import Enum
+
+
+class CircuitBreakerState(Enum):
+    """States a circuit breaker can be in."""
+
+    # Requests pass through normally.
+    CLOSED = "closed"
+    # Requests are blocked with CircuitBreakerOpen.
+    OPEN = "open"
+    # Trial state entered from OPEN once the reset timeout has elapsed.
+    HALF_OPEN = "half_open"
+
+
+class CircuitBreakerOpen(Exception):
+    """Raised when the circuit breaker is open and blocking requests."""
+
+    def __init__(self, retry_after: float) -> None:
+        self.retry_after = retry_after
+        super().__init__(f"Circuit breaker is open. Retry after {retry_after:.1f}s")
+
+
+def _is_permanent_error(error: Exception) -> bool:
+    """Return True if the error is permanent (should open circuit immediately).
+
+    Delegates to ``classify_api_error`` when that module can be imported.
+    Otherwise falls back to looking for well-known markers in the text of
+    the error and of its direct ``__cause__``.
+    """
+    try:
+        from classify_api_error import classify_api_error
+
+        verdict = classify_api_error(error)
+        return verdict == "permanent"
+    except ImportError:
+        # Fallback: inspect error text directly
+        cause = error.__cause__
+        haystack = str(error)
+        if cause is not None:
+            haystack = haystack + " " + str(cause)
+        for marker in ("400", "401", "403", "Forbidden", "Unauthorized"):
+            if marker in haystack:
+                return True
+        return False
+
+
+class CircuitBreaker:
+    """Thread-safe circuit breaker for protecting external API calls.
+
+    Implements three states:
+    - CLOSED: requests pass through normally.
+    - OPEN: requests are blocked with CircuitBreakerOpen.
+    - HALF_OPEN: one probe request is allowed through.
+
+    Args:
+        failure_threshold: Consecutive failures before opening. Default 5.
+        reset_timeout: Seconds to wait in OPEN before trying HALF_OPEN. Default 300.0.
+    """
+
+    def __init__(
+        self,
+        failure_threshold: int = 5,
+        reset_timeout: float = 300.0,
+    ) -> None:
+        self._failure_threshold = failure_threshold
+        self._reset_timeout = reset_timeout
+        self._lock = threading.Lock()
+
+        self._state = CircuitBreakerState.CLOSED
+        self._failure_count = 0
+        self._opened_at: float | None = None
+
+    # ------------------------------------------------------------------
+    # Public interface
+    # ------------------------------------------------------------------
+
+    def check(self) -> None:
+        """Check whether a request is allowed through.
+
+        Raises:
+            CircuitBreakerOpen: If the circuit is open and reset_timeout
+                has not elapsed yet.
+        """
+        with self._lock:
+            if self._state is CircuitBreakerState.CLOSED:
+                return
+
+            if self._state is CircuitBreakerState.OPEN:
+                elapsed = time.monotonic() - self._opened_at  # type: ignore[operator]
+                if elapsed >= self._reset_timeout:
+                    self._state = CircuitBreakerState.HALF_OPEN
+                    return
+                remaining = self._reset_timeout - elapsed
+                raise CircuitBreakerOpen(min(remaining, 30.0))
+
+            # HALF_OPEN: allow exactly one probe — caller holds the slot
+            if self._state is CircuitBreakerState.HALF_OPEN:
+                return
+
+    def record_success(self) -> None:
+        """Record a successful request. Resets the breaker to CLOSED."""
+        with self._lock:
+            self._state = CircuitBreakerState.CLOSED
+            self._failure_count = 0
+            self._opened_at = None
+
+    def record_failure(self, error: Exception) -> None:
+        """Record a failed request.
+
+        If the error is permanent (e.g. 401/403), opens immediately.
+        Otherwise increments the failure counter and opens once it
+        reaches failure_threshold.
+
+        Args:
+            error: The exception that was raised.
+        """
+        with self._lock:
+            if _is_permanent_error(error):
+                self._trip()
+                return
+
+            if self._state is CircuitBreakerState.HALF_OPEN:
+                self._trip()
+                return
+
+            self._failure_count += 1
+            if self._failure_count >= self._failure_threshold:
+                self._trip()
+
+    @property
+    def retry_after(self) -> float:
+        """Seconds until the circuit transitions to HALF_OPEN.
+
+        Returns 0.0 when not in OPEN state, capped at 30 seconds.
+        """
+        with self._lock:
+            if self._state is not CircuitBreakerState.OPEN:
+                return 0.0
+            elapsed = time.monotonic() - self._opened_at  # type: ignore[operator]
+            remaining = self._reset_timeout - elapsed
+            return min(max(remaining, 0.0), 30.0)
+
+    # ------------------------------------------------------------------
+    # Internal helpers
+    # ------------------------------------------------------------------
+
+    def _trip(self) -> None:
+        """Open the circuit (must be called with _lock held)."""
+        self._state = CircuitBreakerState.OPEN
+        self._failure_count = 0
+        self._opened_at = time.monotonic()
@@ -0,0 +1,156 @@
+"""Tests para circuit_breaker."""
+
+import sys
+import os
+import threading
+import time
+
+sys.path.insert(0, os.path.dirname(__file__))
+
+from circuit_breaker import CircuitBreaker, CircuitBreakerOpen, CircuitBreakerState
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _transient_error() -> Exception:
+    return Exception("HTTP 503 Service Unavailable")
+
+
+def _permanent_error() -> Exception:
+    return Exception("HTTP 401 Unauthorized")
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+
+def test_closed_to_open_after_n_failures() -> None:
+    """Transicion CLOSED → OPEN despues de N fallos"""
+    cb = CircuitBreaker(failure_threshold=3, reset_timeout=60.0)
+
+    cb.check()  # Should not raise
+
+    cb.record_failure(_transient_error())
+    cb.record_failure(_transient_error())
+    assert cb._state is CircuitBreakerState.CLOSED  # Still closed after 2
+
+    cb.record_failure(_transient_error())
+    assert cb._state is CircuitBreakerState.OPEN
+
+    try:
+        cb.check()
+        assert False, "Should have raised CircuitBreakerOpen"
+    except CircuitBreakerOpen:
+        pass
+
+    print("PASS: Transicion CLOSED → OPEN despues de N fallos")
+
+
+def test_open_to_half_open_after_timeout() -> None:
+    """Transicion OPEN → HALF_OPEN despues de timeout"""
+    cb = CircuitBreaker(failure_threshold=1, reset_timeout=0.05)
+    cb.record_failure(_transient_error())
+    assert cb._state is CircuitBreakerState.OPEN
+
+    time.sleep(0.1)
+
+    cb.check()  # Should not raise — transitions to HALF_OPEN
+    assert cb._state is CircuitBreakerState.HALF_OPEN
+
+    print("PASS: Transicion OPEN → HALF_OPEN despues de timeout")
+
+
+def test_half_open_to_closed_on_success() -> None:
+    """Transicion HALF_OPEN → CLOSED en exito"""
+    cb = CircuitBreaker(failure_threshold=1, reset_timeout=0.05)
+    cb.record_failure(_transient_error())
+    time.sleep(0.1)
+    cb.check()  # enters HALF_OPEN
+    assert cb._state is CircuitBreakerState.HALF_OPEN
+
+    cb.record_success()
+    assert cb._state is CircuitBreakerState.CLOSED
+
+    cb.check()  # Should not raise
+
+    print("PASS: Transicion HALF_OPEN → CLOSED en exito")
+
+
+def test_half_open_to_open_on_failure() -> None:
+    """Transicion HALF_OPEN → OPEN en fallo"""
+    cb = CircuitBreaker(failure_threshold=1, reset_timeout=0.05)
+    cb.record_failure(_transient_error())
+    time.sleep(0.1)
+    cb.check()  # enters HALF_OPEN
+    assert cb._state is CircuitBreakerState.HALF_OPEN
+
+    cb.record_failure(_transient_error())
+    assert cb._state is CircuitBreakerState.OPEN
+
+    print("PASS: Transicion HALF_OPEN → OPEN en fallo")
+
+
+def test_permanent_error_opens_immediately() -> None:
+    """Error permanente abre inmediatamente"""
+    cb = CircuitBreaker(failure_threshold=10, reset_timeout=60.0)
+    assert cb._state is CircuitBreakerState.CLOSED
+
+    cb.record_failure(_permanent_error())
+    assert cb._state is CircuitBreakerState.OPEN
+
+    print("PASS: Error permanente abre inmediatamente")
+
+
+def test_thread_safety() -> None:
+    """Thread safety (concurrencia)"""
+    cb = CircuitBreaker(failure_threshold=5, reset_timeout=60.0)
+    errors: list[Exception] = []
+
+    def worker() -> None:
+        try:
+            for _ in range(10):
+                cb.check()
+                cb.record_failure(_transient_error())
+        except CircuitBreakerOpen:
+            pass
+        except Exception as exc:
+            errors.append(exc)
+
+    threads = [threading.Thread(target=worker) for _ in range(20)]
+    for t in threads:
+        t.start()
+    for t in threads:
+        t.join()
+
+    assert not errors, f"Thread errors: {errors}"
+    # After concurrent failures the circuit must be OPEN or HALF_OPEN
+    assert cb._state in (CircuitBreakerState.OPEN, CircuitBreakerState.HALF_OPEN, CircuitBreakerState.CLOSED)
+
+    print("PASS: Thread safety (concurrencia)")
+
+
+def test_retry_after_returns_zero_when_not_open() -> None:
+    """retry_after retorna 0 cuando no esta OPEN"""
+    cb = CircuitBreaker(failure_threshold=5, reset_timeout=60.0)
+    assert cb.retry_after == 0.0
+
+    cb.record_failure(_transient_error())
+    # Still CLOSED (threshold not reached)
+    assert cb.retry_after == 0.0
+
+    print("PASS: retry_after retorna 0 cuando no esta OPEN")
+
+
+if __name__ == "__main__":
+    test_closed_to_open_after_n_failures()
+    test_open_to_half_open_after_timeout()
+    test_half_open_to_closed_on_success()
+    test_half_open_to_open_on_failure()
+    test_permanent_error_opens_immediately()
+    test_thread_safety()
+    test_retry_after_returns_zero_when_not_open()
+    print("\nAll tests passed.")
@@ -0,0 +1,41 @@
+---
+name: classify_api_error
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def classify_api_error(error: Exception) -> str"
+description: "Clasifica un error de API como permanente (no reintentar), transitorio (reintentar) o desconocido. Permanente tiene prioridad sobre transitorio."
+tags: [retry, error, classification, api, backoff]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: []
+tested: true
+tests: ["error 429 es transitorio", "error 401 es permanente", "error timeout es transitorio", "error desconocido retorna unknown", "error con __cause__ transitorio"]
+test_file_path: "python/functions/core/classify_api_error_test.py"
+file_path: "python/functions/core/classify_api_error.py"
+---
+
+## Ejemplo
+
+```python
+err = Exception("HTTP 429 TooManyRequests")
+classify_api_error(err)  # "transient"
+
+err = Exception("HTTP 401 Unauthorized")
+classify_api_error(err)  # "permanent"
+
+err = Exception("Connection timeout")
+classify_api_error(err)  # "transient"
+
+err = Exception("Something unexpected happened")
+classify_api_error(err)  # "unknown"
+```
+
+## Notas
+
+Funcion pura: solo inspecciona el texto del error y su causa directa (`__cause__`). No tiene I/O ni dependencias externas. La prioridad permanente > transitorio evita reintentar errores 400/401/403 que nunca tendran exito.
@@ -0,0 +1,38 @@
+"""Classify an API exception as permanent, transient, or unknown."""
+
+
+def classify_api_error(error: Exception) -> str:
+    """Classify an API error as permanent, transient, or unknown.
+
+    Permanent errors should not be retried (e.g. auth failures, bad requests).
+    Transient errors are safe to retry (e.g. rate limits, timeouts, server errors).
+    Permanent classification takes priority over transient.
+
+    Only the message of ``error`` and of its direct ``__cause__`` are inspected.
+    HTTP status codes are matched as standalone numbers, so longer digit runs
+    that merely contain a code (``"4000 ms"``, ``"port 5000"``) do not count.
+
+    Args:
+        error: The exception to classify.
+
+    Returns:
+        "permanent" | "transient" | "unknown"
+    """
+    import re  # local import: keeps the block self-contained without touching file-level imports
+
+    parts = [str(error)]
+    if error.__cause__ is not None:
+        parts.append(str(error.__cause__))
+    text = " ".join(parts)
+
+    permanent_codes = ("400", "401", "403")
+    permanent_markers = ("Forbidden", "Unauthorized")
+    transient_codes = ("429", "500", "502", "503", "504")
+    transient_markers = (
+        "TooManyRequests", "RateLimit",
+        # "timed out" is the wording used by socket.timeout / TimeoutError messages.
+        "timeout", "Timeout", "timed out",
+        "ConnectionError", "Connection refused", "Connection reset",
+    )
+
+    def mentions_status(codes: tuple) -> bool:
+        # Digit look-arounds reject codes embedded in a longer number.
+        pattern = r"(?<!\d)(?:" + "|".join(codes) + r")(?!\d)"
+        return re.search(pattern, text) is not None
+
+    def mentions_marker(markers: tuple) -> bool:
+        return any(marker in text for marker in markers)
+
+    # Permanent is checked first so a message carrying both kinds is not retried.
+    if mentions_status(permanent_codes) or mentions_marker(permanent_markers):
+        return "permanent"
+    if mentions_status(transient_codes) or mentions_marker(transient_markers):
+        return "transient"
+    return "unknown"
@@ -0,0 +1,50 @@
+"""Tests para classify_api_error."""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.dirname(__file__))
+from classify_api_error import classify_api_error
+
+
+def test_error_429_es_transitorio():
+    """A 429 rate-limit message is classified as transient."""
+    rate_limited = Exception("HTTP 429 TooManyRequests")
+    outcome = classify_api_error(rate_limited)
+    assert outcome == "transient"
+
+
+def test_error_401_es_permanente():
+    """A 401 authentication failure is classified as permanent."""
+    unauthorized = Exception("HTTP 401 Unauthorized")
+    outcome = classify_api_error(unauthorized)
+    assert outcome == "permanent"
+
+
+def test_error_timeout_es_transitorio():
+    """A timeout message is classified as transient."""
+    timed_out = Exception("Connection timeout occurred")
+    outcome = classify_api_error(timed_out)
+    assert outcome == "transient"
+
+
+def test_error_desconocido_retorna_unknown():
+    """A message matching no known pattern falls back to "unknown"."""
+    unrecognized = Exception("Something completely unexpected happened")
+    outcome = classify_api_error(unrecognized)
+    assert outcome == "unknown"
+
+
+def test_error_con___cause___transitorio():
+    """The message of the direct __cause__ also takes part in the classification."""
+    wrapped = Exception("Request failed")
+    wrapped.__cause__ = Exception("Connection reset by peer")
+    assert classify_api_error(wrapped) == "transient"
+
+
+def test_permanente_tiene_prioridad_sobre_transitorio():
+    """When both kinds of pattern appear, permanent wins."""
+    # The message carries a permanent code (401) and a transient one (503).
+    mixed = Exception("401 503 mixed error")
+    assert classify_api_error(mixed) == "permanent"
+
+
+def test_error_403_forbidden_es_permanente():
+    """A 403 Forbidden message is classified as permanent."""
+    forbidden = Exception("403 Forbidden")
+    assert classify_api_error(forbidden) == "permanent"
+
+
+def test_error_500_es_transitorio():
+    """A 500 server error is classified as transient."""
+    server_error = Exception("Internal server error 500")
+    assert classify_api_error(server_error) == "transient"
@@ -0,0 +1,49 @@
+---
+name: coerce_types
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def coerce_types(data: dict, schema: dict[str, str]) -> tuple[dict, list[str]]"
+description: "Convierte valores de un dict a los tipos esperados segun un schema declarativo. Soporta int, float, str, bool, datetime, list[str]. Util para normalizar datos de CSV, JSON o query params. Nunca muta el original. Coerciones imposibles generan warning y mantienen el valor original."
+tags: [coercion, types, normalization, pure, core, csv, json]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: [datetime]
+tested: true
+tests:
+  - "string 42 a int 42"
+  - "string 3.14 a float 3.14"
+  - "string true a bool true"
+  - "string iso8601 a datetime"
+  - "coercion fallida genera warning sin crash"
+  - "dict con mix de tipos ya correctos y strings"
+  - "campo ausente en schema pass through sin tocar"
+  - "string lista a list str"
+test_file_path: "python/functions/core/coerce_types_test.py"
+file_path: "python/functions/core/coerce_types.py"
+---
+
+## Ejemplo
+
+```python
+data = {"age": "25", "score": "9.5", "active": "yes", "tags": "go, python"}
+schema = {"age": "int", "score": "float", "active": "bool", "tags": "list[str]"}
+
+result, warnings = coerce_types(data, schema)
+# result = {"age": 25, "score": 9.5, "active": True, "tags": ["go", "python"]}
+# warnings = []
+
+# Coercion fallida — mantiene original y avisa
+result2, warnings2 = coerce_types({"n": "abc"}, {"n": "int"})
+# result2 = {"n": "abc"}
+# warnings2 = ["n: cannot coerce 'abc' to int: cannot parse 'abc' as int"]
+```
+
+## Notas
+
+Funcion pura. Solo usa `datetime` de la stdlib. No muta el dict original — retorna uno nuevo. Schema es flat (no anidado); para validacion de estructura compleja combinar con `validate_json_schema`. Lossy coercions (float "3.7" → int 3) generan warning adicional. Campo ausente en schema se copia sin tocar.
@@ -0,0 +1,135 @@
+"""Coercion de valores de un dict a tipos esperados segun un schema declarativo."""
+
+from datetime import datetime, timezone
+
+
+def coerce_types(
+    data: dict, schema: dict[str, str]
+) -> tuple[dict, list[str]]:
+    """Convert the values of a dict to the types declared in a flat schema.
+
+    ``schema`` maps field name to one of: "int", "float", "str", "bool",
+    "datetime", "list[str]".
+
+    Supported coercions:
+    - str -> int: exact integer parse; decimal/exponent text is truncated via
+      float, with a warning when a fractional part is dropped
+    - str -> float: float(v)
+    - str -> bool: "true/1/yes" -> True, "false/0/no" -> False (case-insensitive)
+    - str -> datetime: ISO 8601 parse (a trailing "Z" is accepted)
+    - str -> list[str]: split on "," and strip each element
+    - value already of the target type -> returned as is
+    - field present in data but not in schema -> copied untouched
+    - impossible coercion -> original value kept, warning appended
+
+    Args:
+        data: Dict holding the values to coerce. It is not modified.
+        schema: Dict of {field: expected_type}.
+
+    Returns:
+        (coerced_data, warnings): a new dict with coerced values, and the list
+        of warnings produced by lossy or failed coercions.
+    """
+    result = dict(data)
+    warnings: list[str] = []
+
+    for field, target_type in schema.items():
+        if field not in data:
+            continue
+
+        value = data[field]
+        try:
+            result[field] = _coerce_value(value, target_type, field, warnings)
+        except Exception as exc:
+            # Deliberately broad: any failure is reported as a warning and the
+            # original value is kept, so one bad field never aborts the batch.
+            warnings.append(
+                f"{field}: cannot coerce {value!r} to {target_type}: {exc}"
+            )
+            result[field] = value
+
+    return result, warnings
+
+
+# Accepted spellings for booleans, compared after strip() + lower().
+_BOOL_TRUE = {"true", "1", "yes"}
+_BOOL_FALSE = {"false", "0", "no"}
+
+
+def _coerce_value(
+    value: object, target: str, field: str, warnings: list[str]
+) -> object:
+    """Coerce one value to ``target``.
+
+    Appends to ``warnings`` for lossy conversions and raises ValueError or
+    TypeError when the conversion is impossible or ``target`` is unknown.
+    """
+    # --- int ---
+    if target == "int":
+        # bool is a subclass of int; it is not accepted as an already-correct int.
+        if isinstance(value, int) and not isinstance(value, bool):
+            return value
+        if isinstance(value, float):
+            if value != int(value):
+                warnings.append(
+                    f"{field}: lossy coercion float→int: {value} → {int(value)}"
+                )
+            return int(value)
+        if isinstance(value, str):
+            stripped = value.strip()
+            # Parse as an exact integer first: going through float would
+            # silently corrupt values above 2**53.
+            try:
+                return int(stripped)
+            except ValueError:
+                pass
+            # Fallback for decimal / exponent notation ("3.7", "1e3").
+            try:
+                as_float = float(stripped)
+                truncated = int(as_float)
+            except (ValueError, OverflowError):
+                # ValueError: not numeric, or nan; OverflowError: inf.
+                raise ValueError(f"cannot parse {value!r} as int") from None
+            if as_float != truncated:
+                warnings.append(
+                    f"{field}: lossy coercion str→int: {value!r} → {truncated}"
+                )
+            return truncated
+        raise TypeError(f"cannot coerce {type(value).__name__} to int")
+
+    # --- float ---
+    if target == "float":
+        if isinstance(value, float):
+            return value
+        if isinstance(value, int) and not isinstance(value, bool):
+            return float(value)
+        if isinstance(value, str):
+            return float(value.strip())
+        raise TypeError(f"cannot coerce {type(value).__name__} to float")
+
+    # --- str ---
+    if target == "str":
+        if isinstance(value, str):
+            return value
+        return str(value)
+
+    # --- bool ---
+    if target == "bool":
+        if isinstance(value, bool):
+            return value
+        if isinstance(value, str):
+            low = value.strip().lower()
+            if low in _BOOL_TRUE:
+                return True
+            if low in _BOOL_FALSE:
+                return False
+            raise ValueError(
+                f"cannot parse {value!r} as bool; expected true/false/1/0/yes/no"
+            )
+        if isinstance(value, int):
+            return bool(value)
+        raise TypeError(f"cannot coerce {type(value).__name__} to bool")
+
+    # --- datetime ---
+    if target == "datetime":
+        if isinstance(value, datetime):
+            return value
+        if isinstance(value, str):
+            s = value.strip()
+            # Rewrite a trailing "Z" as an explicit UTC offset before parsing.
+            if s.endswith("Z"):
+                s = s[:-1] + "+00:00"
+            return datetime.fromisoformat(s)
+        raise TypeError(f"cannot coerce {type(value).__name__} to datetime")
+
+    # --- list[str] ---
+    if target == "list[str]":
+        if isinstance(value, list):
+            return [str(item) for item in value]
+        if isinstance(value, str):
+            return [item.strip() for item in value.split(",")]
+        raise TypeError(f"cannot coerce {type(value).__name__} to list[str]")
+
+    raise ValueError(f"unknown target type: {target!r}")
@@ -0,0 +1,84 @@
+"""Tests para coerce_types."""
+
+import sys
+import os
+from datetime import datetime, timezone
+
+sys.path.insert(0, os.path.dirname(__file__))
+
+from coerce_types import coerce_types
+
+
+def test_string_42_a_int_42():
+    """The string "42" becomes the int 42 without warnings."""
+    coerced, notes = coerce_types({"n": "42"}, {"n": "int"})
+    number = coerced["n"]
+    assert number == 42
+    assert isinstance(number, int)
+    assert notes == []
+
+
+def test_string_3_14_a_float_3_14():
+    """The string "3.14" becomes a float close to 3.14 without warnings."""
+    coerced, notes = coerce_types({"x": "3.14"}, {"x": "float"})
+    assert abs(coerced["x"] - 3.14) < 1e-9
+    assert notes == []
+
+
+def test_string_true_a_bool_true():
+    """Accepted boolean spellings map to real True / False."""
+    schema = {"flag": "bool"}
+
+    coerced, notes = coerce_types({"flag": "true"}, schema)
+    assert coerced["flag"] is True
+    assert notes == []
+
+    from_yes, _ = coerce_types({"flag": "yes"}, schema)
+    assert from_yes["flag"] is True
+
+    from_one, _ = coerce_types({"flag": "1"}, schema)
+    assert from_one["flag"] is True
+
+    from_false, _ = coerce_types({"flag": "false"}, schema)
+    assert from_false["flag"] is False
+
+
+def test_string_iso8601_a_datetime():
+    """An ISO 8601 string with a trailing "Z" is parsed into a datetime."""
+    coerced, notes = coerce_types(
+        {"ts": "2024-01-15T10:30:00Z"}, {"ts": "datetime"}
+    )
+    parsed = coerced["ts"]
+    assert isinstance(parsed, datetime)
+    assert parsed.year == 2024
+    assert parsed.month == 1
+    assert parsed.day == 15
+    assert notes == []
+
+
+def test_coercion_fallida_genera_warning_sin_crash():
+    """A failed coercion keeps the original value and reports exactly one warning."""
+    coerced, notes = coerce_types({"n": "not-a-number"}, {"n": "int"})
+    # The original value is kept.
+    assert coerced["n"] == "not-a-number"
+    assert len(notes) == 1
+    assert "n" in notes[0]
+
+
+def test_dict_con_mix_de_tipos_ya_correctos_y_strings():
+    """Already-correct values pass through while strings are converted."""
+    payload = {"a": "10", "b": 3.14, "c": True, "d": "hello"}
+    expected_types = {"a": "int", "b": "float", "c": "bool", "d": "str"}
+    coerced, notes = coerce_types(payload, expected_types)
+    assert coerced["a"] == 10
+    assert abs(coerced["b"] - 3.14) < 1e-9
+    assert coerced["c"] is True
+    assert coerced["d"] == "hello"
+    assert notes == []
+
+
+def test_campo_ausente_en_schema_pass_through_sin_tocar():
+    """A field that the schema does not mention is copied untouched."""
+    payload = {"a": "42", "b": [1, 2, 3]}
+    only_a = {"a": "int"}  # "b" is not part of the schema
+    coerced, notes = coerce_types(payload, only_a)
+    assert coerced["a"] == 42
+    assert coerced["b"] == [1, 2, 3]
+    assert notes == []
+
+
+def test_string_lista_a_list_str():
+    """A comma-separated string is split into stripped items."""
+    coerced, notes = coerce_types(
+        {"tags": "python, go, bash"}, {"tags": "list[str]"}
+    )
+    assert coerced["tags"] == ["python", "go", "bash"]
+    assert notes == []
@@ -0,0 +1,41 @@
+---
+name: compute_backoff_delay
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def compute_backoff_delay(attempt: int, base_delay: float = 0.5, max_delay: float = 8.0, jitter: bool = True) -> float"
+description: "Calcula el delay para exponential backoff con jitter opcional. delay = min(base_delay * 2^attempt, max_delay). Con jitter anade random.uniform(0, min(base_delay, delay))."
+tags: [retry, backoff, exponential, delay, jitter]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: [random]
+tested: true
+tests: ["attempt 0 retorna base_delay sin jitter", "attempt alto se cappea a max_delay", "sin jitter es determinista"]
+test_file_path: "python/functions/core/compute_backoff_delay_test.py"
+file_path: "python/functions/core/compute_backoff_delay.py"
+---
+
+## Ejemplo
+
+```python
+# Primer reintento (attempt=0): delay = 0.5 * 2^0 = 0.5s
+compute_backoff_delay(0, jitter=False)  # 0.5
+
+# Tercer reintento (attempt=2): delay = 0.5 * 2^2 = 2.0s
+compute_backoff_delay(2, jitter=False)  # 2.0
+
+# Intento alto, capped a 8.0s
+compute_backoff_delay(10, jitter=False)  # 8.0
+
+# Con jitter (no determinista)
+compute_backoff_delay(1)  # entre 1.0 y 1.5
+```
+
+## Notas
+
+Usa `random` de la stdlib. Con jitter=True el resultado no es determinista, pero la funcion es clasificada como pura conceptualmente dado que el jitter es intencional y no hay I/O. Para tests deterministicos usar jitter=False.
@@ -0,0 +1,26 @@
+"""Compute exponential backoff delay with optional jitter."""
+
+import random
+
+
+def compute_backoff_delay(
+    attempt: int,
+    base_delay: float = 0.5,
+    max_delay: float = 8.0,
+    jitter: bool = True,
+) -> float:
+    """Compute exponential backoff delay for a given attempt number.
+
+    The delay is ``min(base_delay * 2 ** attempt, max_delay)``. With ``jitter``
+    enabled, ``random.uniform(0, min(base_delay, delay))`` is added on top, so
+    the result may exceed ``max_delay`` by at most ``base_delay``.
+
+    Args:
+        attempt: Zero-based attempt index (0 = first retry).
+        base_delay: Base delay in seconds before exponential scaling.
+        max_delay: Maximum delay cap in seconds.
+        jitter: If True, adds random jitter to avoid thundering herd.
+
+    Returns:
+        Delay in seconds to wait before the next attempt.
+    """
+    # 2 ** attempt is an exact int; multiplying it by a float raises
+    # OverflowError once it no longer fits in a double (attempt >= 1024).
+    # Clamping the exponent keeps very long retry loops capped at max_delay
+    # instead of crashing; an overflowing float product just becomes inf.
+    exponent = min(attempt, 1023)
+    delay = min(base_delay * (2 ** exponent), max_delay)
+    if jitter:
+        delay += random.uniform(0, min(base_delay, delay))
+    return delay
@@ -0,0 +1,42 @@
+"""Tests para compute_backoff_delay."""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.dirname(__file__))
+from compute_backoff_delay import compute_backoff_delay
+
+
+def test_attempt_0_retorna_base_delay_sin_jitter():
+    """Attempt 0 without jitter yields exactly base_delay."""
+    delay = compute_backoff_delay(0, base_delay=0.5, max_delay=8.0, jitter=False)
+    assert delay == 0.5
+
+
+def test_attempt_alto_se_cappea_a_max_delay():
+    """A high attempt index is capped at max_delay."""
+    delay = compute_backoff_delay(10, base_delay=0.5, max_delay=8.0, jitter=False)
+    assert delay == 8.0
+
+
+def test_sin_jitter_es_determinista():
+    """Two calls with the same arguments and no jitter agree."""
+    first = compute_backoff_delay(3, base_delay=1.0, max_delay=16.0, jitter=False)
+    second = compute_backoff_delay(3, base_delay=1.0, max_delay=16.0, jitter=False)
+    assert first == second
+    # attempt=3: 1.0 * 2^3 = 8.0
+    assert first == 8.0
+
+
+def test_escala_exponencial():
+    """The delay doubles with every attempt while below the cap."""
+    observed = [
+        compute_backoff_delay(0, base_delay=1.0, max_delay=100.0, jitter=False),
+        compute_backoff_delay(1, base_delay=1.0, max_delay=100.0, jitter=False),
+        compute_backoff_delay(2, base_delay=1.0, max_delay=100.0, jitter=False),
+    ]
+    assert observed[0] == 1.0
+    assert observed[1] == 2.0
+    assert observed[2] == 4.0
+
+
+def test_con_jitter_no_excede_max_delay_mas_base():
+    """With jitter the delay stays within [base_delay, max_delay + base_delay]."""
+    for attempt in range(5):
+        delay = compute_backoff_delay(attempt, base_delay=0.5, max_delay=8.0, jitter=True)
+        assert delay >= 0.5
+        assert delay <= 8.0 + 0.5
@@ -0,0 +1,59 @@
+---
+name: convert_github_to_raw_url
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def convert_github_to_raw_url(url: str) -> str"
+description: "Convierte una URL de blob de GitHub/GitLab a su URL raw. Ej: github.com/org/repo/blob/main/file.py → raw.githubusercontent.com/org/repo/main/file.py. Retorna la URL sin cambios si no aplica."
+tags: [github, gitlab, url, raw, blob, convert, transform]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: ["urllib.parse"]
+tested: true
+tests:
+  - "URL GitHub blob"
+  - "URL GitLab blob"
+  - "URL que no es blob retorna sin cambios"
+  - "URL no-GitHub retorna sin cambios"
+test_file_path: "python/functions/core/convert_github_to_raw_url_test.py"
+file_path: "python/functions/core/convert_github_to_raw_url.py"
+---
+
+## Ejemplo
+
+```python
+from core.convert_github_to_raw_url import convert_github_to_raw_url
+
+# GitHub blob → raw.githubusercontent.com
+url = convert_github_to_raw_url(
+    "https://github.com/openai/whisper/blob/main/README.md"
+)
+# "https://raw.githubusercontent.com/openai/whisper/main/README.md"
+
+# GitLab blob → raw
+url = convert_github_to_raw_url(
+    "https://gitlab.com/org/repo/-/blob/main/file.py"
+)
+# "https://gitlab.com/org/repo/-/raw/main/file.py"
+
+# URL sin blob → sin cambios
+url = convert_github_to_raw_url("https://github.com/org/repo")
+# "https://github.com/org/repo"
+```
+
+## Notas
+
+Algoritmo:
+1. Parsear la URL con `urllib.parse.urlparse`.
+2. Si host es `github.com`: buscar segmento `blob` en el path.
+   - Si existe: eliminar el segmento `blob` y cambiar el dominio a `raw.githubusercontent.com`.
+3. Si host es `gitlab.com` o empieza con `gitlab.`: reemplazar `/-/blob/` por `/-/raw/`
+   o `/blob/` por `/raw/`.
+4. Cualquier otro host: retornar la URL sin cambios.
+
+Funcion pura. No hace I/O ni tiene efectos secundarios.
@@ -0,0 +1,69 @@
+"""Convierte URLs de blob de GitHub/GitLab a su equivalente raw."""
+
+from urllib.parse import urlparse, urlunparse
+
+
+def convert_github_to_raw_url(url: str) -> str:
+    """Convert a GitHub or GitLab blob URL to its raw-content URL.
+
+    GitHub blob:
+        https://github.com/org/repo/blob/main/path/file.py
+        -> https://raw.githubusercontent.com/org/repo/main/path/file.py
+
+    GitLab blob:
+        https://gitlab.com/org/repo/-/blob/main/path/file.py
+        -> https://gitlab.com/org/repo/-/raw/main/path/file.py
+
+    Only the blob marker itself is rewritten: a directory or file named
+    ``blob`` deeper in the path is left alone. URLs that are not blob URLs,
+    or that belong to another host, are returned unchanged (after stripping
+    surrounding whitespace).
+
+    Args:
+        url: GitHub or GitLab URL, possibly pointing at a blob.
+
+    Returns:
+        The raw URL when the transformation applies; the input URL otherwise.
+    """
+    url = url.strip()
+    if not url:
+        return url
+
+    parsed = urlparse(url)
+    host = parsed.hostname or ""
+
+    # --- GitHub ---
+    if host in ("github.com", "www.github.com"):
+        # Expected layout: ["", owner, repo, "blob", ref, path...]
+        segments = parsed.path.split("/")
+        # The marker must sit right after owner/repo; a "blob" segment found
+        # anywhere else (e.g. /tree/main/blob) is part of the content path.
+        if len(segments) > 3 and segments[3] == "blob":
+            new_path = "/".join(segments[:3] + segments[4:])
+            return urlunparse((
+                "https",
+                "raw.githubusercontent.com",
+                new_path,
+                parsed.params,
+                parsed.query,
+                parsed.fragment,
+            ))
+        return url
+
+    # --- GitLab ---
+    if host in ("gitlab.com", "www.gitlab.com") or host.startswith("gitlab."):
+        # Layouts: /org/repo/-/blob/ref/path or, without "/-/", /org/repo/blob/ref/path.
+        # Replace the first marker only, so "blob" directories inside the file
+        # path survive.
+        path = parsed.path
+        if "/-/blob/" in path:
+            new_path = path.replace("/-/blob/", "/-/raw/", 1)
+        else:
+            new_path = path.replace("/blob/", "/raw/", 1)
+        if new_path != path:
+            return urlunparse((
+                parsed.scheme,
+                parsed.netloc,
+                new_path,
+                parsed.params,
+                parsed.query,
+                parsed.fragment,
+            ))
+        return url
+
+    # Any other host: no transformation applies.
+    return url
@@ -0,0 +1,77 @@
+"""Tests para convert_github_to_raw_url."""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+from core.convert_github_to_raw_url import convert_github_to_raw_url
+
+
+def test_url_github_blob():
+    """A GitHub blob URL is converted to raw.githubusercontent.com."""
+    blob_url = "https://github.com/openai/whisper/blob/main/README.md"
+    expected = "https://raw.githubusercontent.com/openai/whisper/main/README.md"
+    assert convert_github_to_raw_url(blob_url) == expected
+
+
+def test_url_github_blob_subdirectorio():
+    """A GitHub blob URL pointing into a subdirectory is converted correctly."""
+    blob_url = "https://github.com/org/repo/blob/main/src/utils/helper.py"
+    expected = "https://raw.githubusercontent.com/org/repo/main/src/utils/helper.py"
+    assert convert_github_to_raw_url(blob_url) == expected
+
+
+def test_url_github_blob_otra_rama():
+    """A GitHub blob URL on a branch other than main is converted correctly."""
+    blob_url = "https://github.com/org/repo/blob/develop/config.yaml"
+    expected = "https://raw.githubusercontent.com/org/repo/develop/config.yaml"
+    assert convert_github_to_raw_url(blob_url) == expected
+
+
+def test_url_gitlab_blob():
+    """A GitLab blob URL is converted to its raw form."""
+    blob_url = "https://gitlab.com/org/repo/-/blob/main/README.md"
+    expected = "https://gitlab.com/org/repo/-/raw/main/README.md"
+    assert convert_github_to_raw_url(blob_url) == expected
+
+
+def test_url_gitlab_blob_sin_guion():
+    """A GitLab blob URL without the '/-/' separator is converted as well."""
+    blob_url = "https://gitlab.com/org/repo/blob/main/README.md"
+    expected = "https://gitlab.com/org/repo/raw/main/README.md"
+    assert convert_github_to_raw_url(blob_url) == expected
+
+
+def test_url_que_no_es_blob_retorna_sin_cambios():
+    """A GitHub URL with no blob segment is returned unchanged."""
+    repo_url = "https://github.com/org/repo"
+    assert convert_github_to_raw_url(repo_url) == repo_url
+
+
+def test_url_github_tree_retorna_sin_cambios():
+    """A GitHub tree URL (not a blob) is returned unchanged."""
+    tree_url = "https://github.com/org/repo/tree/main/src"
+    assert convert_github_to_raw_url(tree_url) == tree_url
+
+
+def test_url_no_github_retorna_sin_cambios():
+    """A URL on an unrelated domain is returned unchanged."""
+    foreign_url = "https://example.com/org/repo/blob/main/file.py"
+    assert convert_github_to_raw_url(foreign_url) == foreign_url
+
+
+def test_url_vacia_retorna_sin_cambios():
+    """An empty URL yields an empty string."""
+    converted = convert_github_to_raw_url("")
+    assert converted == ""
+
+
+def test_url_raw_githubusercontent_retorna_sin_cambios():
+    """A URL already on raw.githubusercontent.com is not modified."""
+    raw_url = "https://raw.githubusercontent.com/org/repo/main/file.py"
+    assert convert_github_to_raw_url(raw_url) == raw_url
@@ -1,7 +1,9 @@
 """Core functional programming utilities — pure functions for list/collection operations."""

+import hashlib
+import re
 from functools import reduce as _reduce
-from typing import Any, Callable, Dict, List, Tuple
+from typing import Any, Callable, Dict, List, Optional, Tuple


 def filter_list(xs: list, pred: Callable) -> list:
@@ -133,3 +135,680 @@ def compose(*fns: Callable) -> Callable:
            result = fn(result)
        return result
    return composed
+
+
+# ── Tree manipulation ────────────────────────────────────────────────────────
+
+
+def flatten_tree(structure: Any) -> List[Dict]:
+    """Return every node of a tree as a flat list of copies without their 'nodes' key.
+
+    A dict contributes a deep copy of itself (minus 'nodes') followed by the
+    flattened content of every key whose name contains 'nodes'. A list is
+    flattened element by element. Anything else yields an empty list.
+    """
+    import copy
+    if isinstance(structure, list):
+        return [flat for element in structure for flat in flatten_tree(element)]
+    if not isinstance(structure, dict):
+        return []
+
+    own = copy.deepcopy(structure)
+    own.pop('nodes', None)
+    collected = [own]
+    for key in list(structure):
+        # Substring test: any key containing 'nodes' is treated as a child container.
+        if 'nodes' in key:
+            collected += flatten_tree(structure[key])
+    return collected
+
+
+def tree_to_flat_list(structure: Any) -> List[Dict]:
+    """List every node of a tree in depth-first order, parents before children.
+
+    The returned entries are the original dict objects (not copies), so each
+    one still carries its own 'nodes' key. Non dict/list input yields [].
+    """
+    if isinstance(structure, list):
+        return [node for element in structure for node in tree_to_flat_list(element)]
+    if not isinstance(structure, dict):
+        return []
+
+    ordered = [structure]
+    if 'nodes' in structure:
+        ordered += tree_to_flat_list(structure['nodes'])
+    return ordered
+
+
+def get_leaf_nodes(structure: Any) -> List[Dict]:
+    """Collect the leaves of a tree as deep copies without their 'nodes' key.
+
+    A dict is a leaf when its 'nodes' entry is missing or falsy. Otherwise the
+    search continues into every key whose name contains 'nodes'. Lists are
+    searched element by element; any other input yields [].
+    """
+    import copy
+    if isinstance(structure, list):
+        return [leaf for element in structure for leaf in get_leaf_nodes(element)]
+    if not isinstance(structure, dict):
+        return []
+
+    if structure.get('nodes'):
+        leaves = []
+        for key in list(structure):
+            if 'nodes' in key:
+                leaves += get_leaf_nodes(structure[key])
+        return leaves
+
+    terminal = copy.deepcopy(structure)
+    terminal.pop('nodes', None)
+    return [terminal]
+
+
+def write_node_ids(data: Any, node_id: int = 0) -> int:
+    """Stamp every dict in a tree with a sequential, 4-digit zero-padded 'node_id'.
+
+    Numbering starts at ``node_id`` (so the default yields "0000", "0001", ...)
+    and follows depth-first order. The tree is modified in place.
+
+    Returns:
+        The next unused counter value.
+    """
+    counter = node_id
+    if isinstance(data, list):
+        for element in data:
+            counter = write_node_ids(element, counter)
+        return counter
+
+    if isinstance(data, dict):
+        data['node_id'] = str(counter).zfill(4)
+        counter += 1
+        # Snapshot the keys: 'node_id' may have just been inserted above.
+        for key in list(data):
+            if 'nodes' in key:
+                counter = write_node_ids(data[key], counter)
+    return counter
+
+
+def list_to_tree(data: List[Dict]) -> List[Dict]:
+    """Nest a flat list of items by their dotted 'structure' code ('1', '1.2', '1.2.3').
+
+    Each output node keeps 'title', 'start_index' and 'end_index'. An item is
+    attached to the node whose code equals its own code minus the last dotted
+    part, provided that node has already been seen; otherwise it becomes a
+    root. Nodes without children carry no 'nodes' key.
+    """
+    by_code = {}
+    roots = []
+
+    for item in data:
+        code = item.get('structure')
+        built = {
+            'title': item.get('title'),
+            'start_index': item.get('start_index'),
+            'end_index': item.get('end_index'),
+            'nodes': []
+        }
+        by_code[code] = built
+
+        # Parent code = everything before the last '.', if there is one.
+        parent_code = None
+        if code:
+            prefix, dot, _ = str(code).rpartition('.')
+            if dot:
+                parent_code = prefix
+
+        if parent_code and parent_code in by_code:
+            by_code[parent_code]['nodes'].append(built)
+        else:
+            roots.append(built)
+
+    # Drop the empty 'nodes' lists that were only needed while building.
+    pending = list(roots)
+    while pending:
+        current = pending.pop()
+        if current['nodes']:
+            pending.extend(current['nodes'])
+        else:
+            del current['nodes']
+
+    return list(roots)
+
+
+def remove_tree_fields(data: Any, fields: List[str] = None) -> Any:
+    """Return a copy of a nested dict/list structure without the given keys.
+
+    Args:
+        data: Tree made of dicts and lists; other values are returned as is.
+        fields: Keys to drop at every depth. Defaults to ['text'].
+    """
+    excluded = ['text'] if fields is None else fields
+
+    if isinstance(data, list):
+        return [remove_tree_fields(element, excluded) for element in data]
+
+    if isinstance(data, dict):
+        kept = {}
+        for key, value in data.items():
+            if key in excluded:
+                continue
+            kept[key] = remove_tree_fields(value, excluded)
+        return kept
+
+    return data
+
+
+def format_tree_structure(structure: Any, order: List[str] = None) -> Any:
+    """Reorder (and filter) the fields of every node in a tree by a given key order.
+
+    Each dict is rebuilt with only the keys listed in ``order``, in that order.
+    Children under 'nodes' are formatted recursively, and a missing or empty
+    'nodes' entry is left out of the result. The input tree is not modified.
+
+    Args:
+        structure: A node dict, a list of nodes, or any other value.
+        order: Keys to keep, in output order. When empty or None the input is
+            returned unchanged.
+
+    Returns:
+        The formatted copy (dicts and lists are rebuilt; other values pass through).
+    """
+    if not order:
+        return structure
+    if isinstance(structure, list):
+        return [format_tree_structure(item, order) for item in structure]
+    if not isinstance(structure, dict):
+        return structure
+
+    # Format children into a local value instead of writing them back into
+    # the caller's dict, which would strip fields from the original tree.
+    children = None
+    if 'nodes' in structure:
+        children = format_tree_structure(structure['nodes'], order)
+
+    formatted = {}
+    for key in order:
+        if key not in structure:
+            continue
+        if key == 'nodes':
+            # Empty child lists are dropped rather than emitted.
+            if children:
+                formatted['nodes'] = children
+        else:
+            formatted[key] = structure[key]
+    return formatted
+
+
+def create_node_mapping(tree: List[Dict]) -> Dict[str, Dict]:
+    """Index the nodes of a tree by 'node_id' for direct lookup.
+
+    Nodes with a missing or falsy 'node_id' are skipped (their children are
+    still visited). The values are the original node dicts, not copies.
+    """
+    by_id: Dict[str, Dict] = {}
+
+    def _collect(level):
+        for entry in level:
+            identifier = entry.get('node_id')
+            if identifier:
+                by_id[identifier] = entry
+            children = entry.get('nodes')
+            if children:
+                _collect(children)
+
+    _collect(tree)
+    return by_id
+
+
+# ── Text / JSON extraction ───────────────────────────────────────────────────
+
+
+def extract_json_from_llm(content: str) -> Dict:
+    """Extract and parse JSON from an LLM response, repairing common defects.
+
+    If the text contains a ```json fence, only the fenced part is parsed; an
+    unterminated fence is read up to the end of the text. The candidate text
+    is parsed as is first, so valid JSON is never altered. Only when that
+    fails are repairs applied: Python ``None`` -> ``null``, whitespace
+    collapsed, and finally trailing commas before ``]`` / ``}`` removed.
+
+    Args:
+        content: Raw model output.
+
+    Returns:
+        The parsed JSON value, or ``{}`` when nothing parseable is found
+        (including when ``content`` is not a string).
+    """
+    import json
+    import re
+
+    if not isinstance(content, str):
+        return {}
+
+    opening = "```json"
+    fence = "```"
+    candidates = []
+    start_idx = content.find(opening)
+    if start_idx != -1:
+        start_idx += len(opening)
+        first_close = content.find(fence, start_idx)
+        if first_close == -1:
+            # Unterminated fence (e.g. truncated output): take the rest.
+            candidates.append(content[start_idx:])
+        else:
+            candidates.append(content[start_idx:first_close])
+            # Fallback to the last fence, in case the JSON itself contains ```.
+            last_close = content.rfind(fence)
+            if last_close != first_close:
+                candidates.append(content[start_idx:last_close])
+    else:
+        candidates.append(content)
+
+    for raw in candidates:
+        text = raw.strip()
+        normalized = ' '.join(text.replace('None', 'null').split())
+        without_trailing_commas = re.sub(r',\s*([\]}])', r'\1', normalized)
+        # Least invasive variant first; each later one is a best-effort repair.
+        for attempt in (text, normalized, without_trailing_commas):
+            try:
+                return json.loads(attempt)
+            except (ValueError, RecursionError):
+                continue
+
+    return {}
+
+
+def parse_page_range(pages: str) -> List[int]:
+    """Expand a page specification such as '5-7', '3,8' or '12' into sorted unique ints.
+
+    Comma-separated parts are either a single page or an inclusive 'start-end'
+    range.
+
+    Raises:
+        ValueError: If a range has start > end, or a part is not an integer.
+    """
+    collected = set()
+    for chunk in pages.split(','):
+        token = chunk.strip()
+        if '-' not in token:
+            collected.add(int(token))
+            continue
+
+        # Split on the first '-' only.
+        low_text, _, high_text = token.partition('-')
+        low = int(low_text.strip())
+        high = int(high_text.strip())
+        if low > high:
+            raise ValueError(f"Invalid range '{token}': start must be <= end")
+        collected.update(range(low, high + 1))
+
+    return sorted(collected)
+
+
+# ── Markdown parsing ─────────────────────────────────────────────────────────
+
+
+def extract_markdown_headers(markdown_content: str) -> Tuple[List[Dict], List[str]]:
+    """Find the ATX headers (h1-h6) of a markdown text, ignoring fenced code blocks.
+
+    Returns:
+        (headers, lines): ``headers`` is a list of
+        {'title', 'level', 'line_num'} dicts with 1-based line numbers, and
+        ``lines`` is the input split on '\\n'.
+    """
+    import re
+    heading_re = re.compile(r'^(#{1,6})\s+(.+)$')
+    fence_re = re.compile(r'^```')
+
+    lines = markdown_content.split('\n')
+    headers = []
+    inside_fence = False
+
+    for number, raw in enumerate(lines, start=1):
+        text = raw.strip()
+        # A ``` line toggles the fenced state and is never a header itself.
+        if fence_re.match(text):
+            inside_fence = not inside_fence
+            continue
+        if inside_fence or not text:
+            continue
+
+        found = heading_re.match(text)
+        if found is None:
+            continue
+        hashes, heading = found.groups()
+        headers.append({'title': heading.strip(), 'level': len(hashes), 'line_num': number})
+
+    return headers, lines
+
+
+def build_tree_from_headers(node_list: List[Dict]) -> List[Dict]:
+    """Build nested tree from flat list of headers with levels (h1>h2>h3)."""
+    if not node_list:
+        return []
+
+    stack = []
+    root_nodes = []
+    node_counter = 1
+
+    for node in node_list:
+        current_level = node['level']
+        tree_node = {
+            'title': node['title'],
+            'node_id': str(node_counter).zfill(4),
+            'line_num': node['line_num'],
+            'nodes': []
+        }
+        node_counter += 1
+
+        while stack and stack[-1][1] >= current_level:
+            stack.pop()
+
+        if not stack:
+            root_nodes.append(tree_node)
+        else:
+            parent_node, _ = stack[-1]
+            parent_node['nodes'].append(tree_node)
+
+        stack.append((tree_node, current_level))
+
+    def clean_empty_nodes(nodes):
+        for n in nodes:
+            if n['nodes']:
+                clean_empty_nodes(n['nodes'])
+            else:
+                del n['nodes']
+        return nodes
+
+    return clean_empty_nodes(root_nodes)
+
+
+# ── Pagination / chunking ────────────────────────────────────────────────────
+
+
+def page_list_to_groups(page_contents: List[str], token_lengths: List[int],
+                        max_tokens: int = 20000, overlap_pages: int = 1) -> List[str]:
+    """Group pages into text chunks respecting token limit with configurable overlap."""
+    import math
+    num_tokens = sum(token_lengths)
+
+    if num_tokens <= max_tokens:
+        return ["".join(page_contents)]
+
+    subsets = []
+    current_subset = []
+    current_token_count = 0
+
+    expected_parts = math.ceil(num_tokens / max_tokens)
+    avg_tokens = math.ceil(((num_tokens / expected_parts) + max_tokens) / 2)
+
+    for i, (page_content, page_tokens) in enumerate(zip(page_contents, token_lengths)):
+        if current_token_count + page_tokens > avg_tokens:
+            subsets.append(''.join(current_subset))
+            overlap_start = max(i - overlap_pages, 0)
+            current_subset = list(page_contents[overlap_start:i])
+            current_token_count = sum(token_lengths[overlap_start:i])
+
+        current_subset.append(page_content)
+        current_token_count += page_tokens
+
+    if current_subset:
+        subsets.append(''.join(current_subset))
+
+    return subsets
+
+
+def calculate_page_offset(pairs: List[Dict]) -> int:
+    """Calculate offset between logical page numbers and physical indices using reference pairs."""
+    differences = []
+    for pair in pairs:
+        try:
+            difference = pair['physical_index'] - pair['page']
+            differences.append(difference)
+        except (KeyError, TypeError):
+            continue
+
+    if not differences:
+        return 0
+
+    counts: Dict[int, int] = {}
+    for diff in differences:
+        counts[diff] = counts.get(diff, 0) + 1
+
+    return max(counts.items(), key=lambda x: x[1])[0]
+
+
+# ── Text preprocessing ───────────────────────────────────────────────────────
+
+
+def preprocess_text(text: str) -> str:
+    """Normalize whitespace and newlines in raw text.
+
+    Args:
+        text: Raw text to normalize.
+
+    Returns:
+        Normalized text with consistent newlines, stripped lines, and no
+        excessive blank lines.
+    """
+    # Normalize line endings: \r\n and \r -> \n
+    text = text.replace('\r\n', '\n').replace('\r', '\n')
+    # Reduce 3+ consecutive newlines to at most 2
+    text = re.sub(r'\n{3,}', '\n\n', text)
+    # Strip whitespace from each line
+    text = '\n'.join(line.strip() for line in text.split('\n'))
+    # Strip globally
+    return text.strip()
+
+
+def get_text_stats(text: str) -> dict:
+    """Compute basic statistics of a text: characters, lines, words.
+
+    Args:
+        text: Input text to analyze.
+
+    Returns:
+        Dict with keys total_chars (int), total_lines (int), total_words (int).
+    """
+    return {
+        'total_chars': len(text),
+        'total_lines': text.count('\n') + 1,
+        'total_words': len(text.split()),
+    }
+
+
+# ── Git URL parsing ──────────────────────────────────────────────────────────
+
+# Hostnames accepted by parse_git_url / is_git_repo_url when the caller
+# does not pass known_hosts.
+_DEFAULT_GIT_HOSTS = ["github.com", "gitlab.com"]
+
+
+def _sanitize_git_segment(segment: str) -> str:
+    """Strip .git suffix then keep only [a-zA-Z0-9_-] chars."""
+    if segment.endswith(".git"):
+        segment = segment[:-4]
+    return re.sub(r"[^a-zA-Z0-9_\-]", "", segment)
+
+
+def parse_git_url(url: str, known_hosts: Optional[List[str]] = None) -> Optional[str]:
+    """Parse a code-hosting URL and return the 'org/repo' path component.
+
+    Supports HTTPS, HTTP, git://, ssh:// and SSH shorthand (git@host:path).
+    Returns None if the URL does not match any known host or is malformed.
+
+    Args:
+        url: Repository URL in any supported format.
+        known_hosts: List of accepted hostnames. Defaults to github.com and gitlab.com.
+
+    Returns:
+        'org/repo' string or None.
+    """
+    from urllib.parse import urlparse
+
+    hosts = known_hosts if known_hosts is not None else _DEFAULT_GIT_HOSTS
+    url = url.strip()
+
+    if url.startswith("git@"):
+        # git@github.com:org/repo.git
+        rest = url[len("git@"):]
+        if ":" not in rest:
+            return None
+        host, path = rest.split(":", 1)
+        if host not in hosts:
+            return None
+        segments = [s for s in path.split("/") if s]
+        if len(segments) < 2:
+            return None
+        org = _sanitize_git_segment(segments[0])
+        repo = _sanitize_git_segment(segments[1])
+        if not org or not repo:
+            return None
+        return f"{org}/{repo}"
+
+    for prefix in ("http://", "https://", "git://", "ssh://"):
+        if url.startswith(prefix):
+            parsed = urlparse(url)
+            netloc = parsed.hostname or ""
+            if netloc not in hosts:
+                return None
+            segments = [s for s in parsed.path.split("/") if s]
+            if len(segments) < 2:
+                return None
+            org = _sanitize_git_segment(segments[0])
+            repo = _sanitize_git_segment(segments[1])
+            if not org or not repo:
+                return None
+            return f"{org}/{repo}"
+
+    return None
+
+
+def is_git_repo_url(url: str, known_hosts: Optional[List[str]] = None) -> bool:
+    """Return True only if url points to a clonable git repository.
+
+    Accepts org/repo and org/repo/tree/<ref> paths.
+    Rejects paths that navigate to sub-resources (issues, blobs, PRs, etc.).
+
+    Args:
+        url: URL to verify.
+        known_hosts: Accepted hostnames. Defaults to github.com and gitlab.com.
+
+    Returns:
+        True if url is a clonable repository URL.
+    """
+    from urllib.parse import urlparse
+
+    hosts = known_hosts if known_hosts is not None else _DEFAULT_GIT_HOSTS
+    url = url.strip()
+
+    # SSH shorthand — always repo-level if host matches
+    if url.startswith("git@"):
+        rest = url[len("git@"):]
+        if ":" not in rest:
+            return False
+        host, _ = rest.split(":", 1)
+        return host in hosts
+
+    # git:// and ssh:// — always repo-level if host matches
+    for prefix in ("ssh://", "git://"):
+        if url.startswith(prefix):
+            parsed = urlparse(url)
+            return (parsed.hostname or "") in hosts
+
+    # http:// and https:// — must have exactly org/repo or org/repo/tree/<ref>
+    for prefix in ("http://", "https://"):
+        if url.startswith(prefix):
+            parsed = urlparse(url)
+            if (parsed.hostname or "") not in hosts:
+                return False
+            segments = [s for s in parsed.path.split("/") if s]
+            if len(segments) == 2:
+                return True
+            if len(segments) == 4 and segments[2] == "tree":
+                return True
+            return False
+
+    return False
+
+
+def validate_git_ssh_uri(url: str) -> None:
+    """Validate a git SSH URI of the form git@host:path.
+
+    Raises ValueError with a descriptive message if the URI is malformed.
+
+    Args:
+        url: URI string to validate.
+
+    Raises:
+        ValueError: If the URI does not conform to git SSH format.
+    """
+    if not url.startswith("git@"):
+        raise ValueError(f"git SSH URI must start with 'git@', got: {url!r}")
+    rest = url[len("git@"):]
+    if ":" not in rest:
+        raise ValueError(f"git SSH URI must contain ':', got: {url!r}")
+    _, path = rest.split(":", 1)
+    if not path:
+        raise ValueError(f"git SSH URI must have a non-empty path after ':', got: {url!r}")
+
+
+# ---------------------------------------------------------------------------
+# Markdown parsing utilities
+# ---------------------------------------------------------------------------
+
+
+def extract_frontmatter(content: str) -> Tuple[str, Optional[Dict]]:
+    """Extract YAML frontmatter delimited by '---' from the start of a markdown string.
+
+    Args:
+        content: Raw markdown string, optionally starting with YAML frontmatter.
+
+    Returns:
+        Tuple of (content_without_frontmatter, frontmatter_dict).
+        frontmatter_dict is None when no frontmatter is found.
+    """
+    pattern = re.compile(r'^---\n(.*?)\n---\n', re.DOTALL)
+    match = pattern.match(content)
+    if not match:
+        return content, None
+
+    raw = match.group(1)
+    remaining = content[match.end():]
+
+    try:
+        import yaml  # type: ignore
+        data = yaml.safe_load(raw)
+        if not isinstance(data, dict):
+            data = None
+    except Exception:
+        # Fallback: simple key: value parser (no yaml dependency)
+        data = {}
+        for line in raw.splitlines():
+            if ':' in line:
+                key, _, value = line.partition(':')
+                data[key.strip()] = value.strip()
+
+    return remaining, data
+
+
+def find_headings(content: str) -> List[Tuple[int, int, str, int]]:
+    """Find all markdown headings (# to ######), excluding those inside code blocks,
+    HTML comments, and indented blocks.
+
+    Args:
+        content: Markdown text to search.
+
+    Returns:
+        List of (start_pos, end_pos, title, level) for each heading found.
+    """
+    excluded: List[Tuple[int, int]] = []
+
+    # Code blocks (triple backtick)
+    for m in re.finditer(r'```.*?```', content, re.DOTALL):
+        excluded.append((m.start(), m.end()))
+
+    # HTML comments
+    for m in re.finditer(r'<!--.*?-->', content, re.DOTALL):
+        excluded.append((m.start(), m.end()))
+
+    # Indented blocks (lines starting with 4 spaces or a tab)
+    for m in re.finditer(r'^(    |\t).+$', content, re.MULTILINE):
+        excluded.append((m.start(), m.end()))
+
+    def is_excluded(pos: int) -> bool:
+        return any(start <= pos < end for start, end in excluded)
+
+    results: List[Tuple[int, int, str, int]] = []
+    for m in re.finditer(r'^(#{1,6})\s+(.+)$', content, re.MULTILINE):
+        # Skip escaped headings (\#)
+        before = content[m.start() - 1] if m.start() > 0 else ''
+        if before == '\\':
+            continue
+        if is_excluded(m.start()):
+            continue
+        level = len(m.group(1))
+        title = m.group(2).strip()
+        results.append((m.start(), m.end(), title, level))
+
+    return results
+
+
+def estimate_token_count(content: str) -> int:
+    """Estimate token count without a tokenizer.
+
+    CJK characters count as ~0.7 tokens each; other non-whitespace characters
+    count as ~0.3 tokens each.
+
+    Args:
+        content: Text to estimate.
+
+    Returns:
+        Estimated integer token count.
+    """
+    cjk = re.findall(r'[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af]', content)
+    without_cjk = re.sub(r'[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af]', '', content)
+    others = re.findall(r'\S', without_cjk)
+    return int(len(cjk) * 0.7 + len(others) * 0.3)
+
+
+def smart_split_content(
+    content: str,
+    max_tokens: int = 1024,
+    max_chars: int = 8000,
+) -> List[str]:
+    """Split large content into parts respecting token and character limits.
+
+    Splits by paragraphs (double newline). If a single paragraph exceeds the
+    limit it is force-cut into chunks of max_chars.
+
+    Args:
+        content: Text to split.
+        max_tokens: Maximum estimated tokens per part.
+        max_chars: Maximum characters per part.
+
+    Returns:
+        List of string parts.
+    """
+    paragraphs = content.split('\n\n')
+    parts: List[str] = []
+    current_parts: List[str] = []
+    current_tokens = 0
+    current_chars = 0
+
+    def flush() -> None:
+        if current_parts:
+            parts.append('\n\n'.join(current_parts))
+            current_parts.clear()
+
+    for para in paragraphs:
+        para_tokens = estimate_token_count(para)
+        para_chars = len(para)
+
+        # Single paragraph exceeds limits — force-cut it
+        if para_tokens > max_tokens or para_chars > max_chars:
+            flush()
+            current_tokens = 0
+            current_chars = 0
+            for i in range(0, len(para), max_chars):
+                parts.append(para[i:i + max_chars])
+            continue
+
+        # Would exceed limits if added — flush first
+        if (current_tokens + para_tokens > max_tokens or
+                current_chars + para_chars > max_chars):
+            flush()
+            current_tokens = 0
+            current_chars = 0
+
+        current_parts.append(para)
+        current_tokens += para_tokens
+        current_chars += para_chars
+
+    flush()
+    return parts if parts else [content]
+
+
+def sanitize_for_path(text: str, max_length: int = 50) -> str:
+    """Convert text to a safe string for use in file paths.
+
+    Keeps word characters, CJK characters, spaces and hyphens. Replaces spaces
+    with underscores. Truncates with a sha256 suffix if the result exceeds
+    max_length.
+
+    Args:
+        text: Input text to sanitize.
+        max_length: Maximum length of the returned string.
+
+    Returns:
+        Safe path-friendly string.
+    """
+    cleaned = re.sub(
+        r'[^\w\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af \-]',
+        '',
+        text,
+    )
+    cleaned = cleaned.replace(' ', '_').strip('_')
+
+    if not cleaned:
+        return 'section'
+
+    if len(cleaned) <= max_length:
+        return cleaned
+
+    suffix = '_' + hashlib.sha256(text.encode()).hexdigest()[:8]
+    return cleaned[:max_length - len(suffix)] + suffix
@@ -0,0 +1,36 @@
+---
+name: create_node_mapping
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def create_node_mapping(tree: list[dict]) -> dict[str, dict]"
+description: "Crea dict plano node_id->node para lookup O(1) en un arbol jerarquico."
+tags: [tree, mapping, index, lookup]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: []
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/core.py"
+source_repo: "https://github.com/VectifyAI/PageIndex"
+source_license: "MIT"
+source_file: "pageindex/utils.py"
+---
+
+## Ejemplo
+
+```python
+tree = [{"node_id": "0001", "title": "A", "nodes": [{"node_id": "0002", "title": "B"}]}]
+mapping = create_node_mapping(tree)
+mapping["0002"]["title"]  # "B"
+```
+
+## Notas
+
+Funcion pura. Los valores son referencias a los nodos originales, no copias.
@@ -0,0 +1,66 @@
+---
+name: cursor_paginate
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "def cursor_paginate(fetch_page: Callable[..., list[T]], get_cursor: Callable[[T], str | None], page_size: int = 100, max_items: int = 2000, max_retries: int = 3, retry_delay: float = 2.0, retryable_exceptions: tuple[type[Exception], ...] = (ConnectionError, TimeoutError, OSError)) -> list[T]"
+description: "Paginador generico basado en cursor que funciona con cualquier API que use cursor-based pagination. Cada pagina se obtiene con retry automatico con exponential backoff. Se detiene cuando la pagina esta vacia, el batch es menor que page_size, se alcanza max_items, o el cursor del ultimo item es None."
+tags: [pagination, cursor, retry, generic, api, backoff]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: ["time", "typing.Callable", "typing.TypeVar"]
+tested: true
+tests:
+  - "API que retorna 3 paginas de 10 items"
+  - "API que falla 1 vez por pagina (retry funciona)"
+  - "max_items limita correctamente"
+  - "API que retorna pagina parcial (ultima pagina)"
+  - "Cursor None en ultimo item (se detiene)"
+test_file_path: "python/functions/core/cursor_paginate_test.py"
+file_path: "python/functions/core/cursor_paginate.py"
+---
+
+## Ejemplo
+
+```python
+from cursor_paginate import cursor_paginate
+
+def fetch_users(limit: int, cursor: str | None) -> list[dict]:
+    params = {"limit": limit}
+    if cursor:
+        params["cursor"] = cursor
+    return requests.get("https://api.example.com/users", params=params).json()["items"]
+
+def get_cursor(user: dict) -> str | None:
+    return user.get("next_cursor")
+
+users = cursor_paginate(
+    fetch_page=fetch_users,
+    get_cursor=get_cursor,
+    page_size=100,
+    max_items=5000,
+    max_retries=3,
+    retry_delay=2.0,
+)
+```
+
+## Notas
+
+El caller solo necesita proveer dos callables:
+- `fetch_page(limit, cursor)`: recibe `limit` y `cursor` como kwargs, retorna lista de items.
+- `get_cursor(item)`: extrae el cursor del ultimo item de la pagina; retornar None indica fin de datos.
+
+El exponential backoff interno aplica `retry_delay * 2^attempt` sin jitter. Solo se reintentan las excepciones en `retryable_exceptions`; cualquier otra excepcion propaga inmediatamente.
+
+Condiciones de parada (cualquiera de ellas):
+1. La pagina retornada esta vacia.
+2. La pagina retornada tiene menos items que `page_size` (pagina parcial = ultima pagina).
+3. El total acumulado alcanza o supera `max_items` (se trunca y se para).
+4. `get_cursor(batch[-1])` retorna `None`.
+
+Funcion impura: llama a `fetch_page` que tipicamente hace I/O de red y usa `time.sleep` en los reintentos.
@@ -0,0 +1,105 @@
+"""Generic cursor-based paginator for any API that uses cursor pagination."""
+
+import time
+from typing import Callable, TypeVar
+
+T = TypeVar("T")
+
+
+def cursor_paginate(
+    fetch_page: Callable[..., list[T]],
+    get_cursor: Callable[[T], str | None],
+    page_size: int = 100,
+    max_items: int = 2000,
+    max_retries: int = 3,
+    retry_delay: float = 2.0,
+    retryable_exceptions: tuple[type[Exception], ...] = (
+        ConnectionError,
+        TimeoutError,
+        OSError,
+    ),
+) -> list[T]:
+    """Paginate through a cursor-based API, collecting all items.
+
+    Fetches pages one at a time by calling fetch_page with limit and cursor
+    kwargs. Retries each page on transient errors using exponential backoff.
+    Stops when a page is empty, a partial page is returned, max_items is
+    reached, or the cursor from the last item is None.
+
+    Args:
+        fetch_page: Callable that accepts ``limit`` and ``cursor`` as keyword
+            arguments and returns a list of items for that page.
+        get_cursor: Callable that receives the last item of a page and returns
+            the cursor string to use for the next page, or None if there are
+            no more pages.
+        page_size: Number of items to request per page.
+        max_items: Hard cap on total items collected. Collection stops and the
+            list is truncated once this limit is reached.
+        max_retries: Maximum number of retry attempts per page after the first
+            failure.
+        retry_delay: Base delay in seconds between retries (doubled each
+            attempt — exponential backoff without jitter).
+        retryable_exceptions: Tuple of exception types that trigger a retry.
+            Any other exception propagates immediately.
+
+    Returns:
+        List of all collected items, in the order they were returned by the
+        API, truncated to max_items.
+
+    Raises:
+        Exception: Re-raises the last exception if all retries for a page are
+            exhausted.
+    """
+    all_items: list[T] = []
+    cursor: str | None = None
+
+    while True:
+        batch = _fetch_with_retry(
+            fetch_page=fetch_page,
+            page_size=page_size,
+            cursor=cursor,
+            max_retries=max_retries,
+            retry_delay=retry_delay,
+            retryable_exceptions=retryable_exceptions,
+        )
+
+        if not batch:
+            break
+
+        all_items.extend(batch)
+
+        if len(all_items) >= max_items:
+            del all_items[max_items:]
+            break
+
+        if len(batch) < page_size:
+            break
+
+        cursor = get_cursor(batch[-1])
+        if cursor is None:
+            break
+
+    return all_items
+
+
+def _fetch_with_retry(
+    fetch_page: Callable[..., list[T]],
+    page_size: int,
+    cursor: str | None,
+    max_retries: int,
+    retry_delay: float,
+    retryable_exceptions: tuple[type[Exception], ...],
+) -> list[T]:
+    """Call fetch_page once, retrying on retryable_exceptions with exponential backoff."""
+    last_exc: Exception | None = None
+    for attempt in range(max_retries + 1):
+        try:
+            return fetch_page(limit=page_size, cursor=cursor)
+        except retryable_exceptions as exc:
+            last_exc = exc
+            if attempt >= max_retries:
+                raise
+            delay = retry_delay * (2 ** attempt)
+            time.sleep(delay)
+
+    raise last_exc  # unreachable; satisfies type checkers
@@ -0,0 +1,148 @@
+"""Tests para cursor_paginate."""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.dirname(__file__))
+
+import pytest
+from cursor_paginate import cursor_paginate
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def make_api(pages: list[list[dict]]) -> callable:
+    """Return a fetch_page callable that serves pages from a pre-built list."""
+    call_count = [0]
+
+    def fetch_page(limit: int, cursor: str | None) -> list[dict]:
+        idx = call_count[0]
+        call_count[0] += 1
+        if idx >= len(pages):
+            return []
+        return pages[idx][:limit]
+
+    return fetch_page
+
+
+def get_cursor(item: dict) -> str | None:
+    return item.get("cursor")
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+
+def test_api_retorna_3_paginas_de_10_items():
+    pages = [
+        [{"id": i, "cursor": str(i)} for i in range(0, 10)],
+        [{"id": i, "cursor": str(i)} for i in range(10, 20)],
+        [{"id": i, "cursor": str(i)} for i in range(20, 30)],
+        [],  # sentinel: empty page ends pagination
+    ]
+    api = make_api(pages)
+    result = cursor_paginate(
+        fetch_page=api,
+        get_cursor=get_cursor,
+        page_size=10,
+        max_items=2000,
+        max_retries=0,
+    )
+    assert len(result) == 30
+    assert result[0]["id"] == 0
+    assert result[-1]["id"] == 29
+
+
+def test_api_falla_1_vez_por_pagina_retry_funciona():
+    """fetch_page falla en el primer intento de cada llamada, pero el retry recupera."""
+    call_counter = [0]
+    # Cada pagina tiene 5 items. 2 paginas en total, luego vacio.
+    items_by_page = [
+        [{"id": i, "cursor": str(i)} for i in range(0, 5)],
+        [{"id": i, "cursor": str(i)} for i in range(5, 10)],
+    ]
+    page_idx = [0]
+    fail_flags = [True, True]  # falla una vez por pagina
+
+    def fetch_page(limit: int, cursor: str | None) -> list[dict]:
+        idx = page_idx[0]
+        if idx < len(fail_flags) and fail_flags[idx]:
+            fail_flags[idx] = False
+            raise ConnectionError("transient failure")
+        page_idx[0] += 1
+        if idx >= len(items_by_page):
+            return []
+        return items_by_page[idx]
+
+    result = cursor_paginate(
+        fetch_page=fetch_page,
+        get_cursor=get_cursor,
+        page_size=5,
+        max_items=2000,
+        max_retries=3,
+        retry_delay=0.0,
+        retryable_exceptions=(ConnectionError, TimeoutError, OSError),
+    )
+    assert len(result) == 10
+
+
+def test_max_items_limita_correctamente():
+    # 50 items disponibles en 5 paginas de 10, pero max_items=25
+    pages = [
+        [{"id": i, "cursor": str(i)} for i in range(j * 10, j * 10 + 10)]
+        for j in range(5)
+    ]
+    api = make_api(pages)
+    result = cursor_paginate(
+        fetch_page=api,
+        get_cursor=get_cursor,
+        page_size=10,
+        max_items=25,
+        max_retries=0,
+    )
+    assert len(result) == 25
+    assert result[-1]["id"] == 24
+
+
+def test_api_retorna_pagina_parcial_ultima_pagina():
+    pages = [
+        [{"id": i, "cursor": str(i)} for i in range(10)],  # full page
+        [{"id": i, "cursor": str(i)} for i in range(10, 17)],  # partial — 7 items
+    ]
+    api = make_api(pages)
+    result = cursor_paginate(
+        fetch_page=api,
+        get_cursor=get_cursor,
+        page_size=10,
+        max_items=2000,
+        max_retries=0,
+    )
+    assert len(result) == 17
+    assert result[-1]["id"] == 16
+
+
+def test_cursor_none_en_ultimo_item_se_detiene():
+    """Cuando el ultimo item no tiene cursor, la paginacion debe detenerse."""
+    pages = [
+        [{"id": i, "cursor": str(i)} for i in range(10)],
+        # last item has no cursor — signals end of data
+        [{"id": i, "cursor": (str(i) if i < 19 else None)} for i in range(10, 20)],
+    ]
+    api = make_api(pages)
+
+    def get_cursor_nullable(item: dict) -> str | None:
+        return item.get("cursor")
+
+    result = cursor_paginate(
+        fetch_page=api,
+        get_cursor=get_cursor_nullable,
+        page_size=10,
+        max_items=2000,
+        max_retries=0,
+    )
+    assert len(result) == 20
+    assert result[-1]["id"] == 19
@@ -0,0 +1,37 @@
+---
+name: detect_headings_by_font
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "def detect_headings_by_font(pdf, min_delta: float = 2.0, max_levels: int = 4) -> list[dict]"
+description: "Detecta headings en un PDF analizando la distribucion de font sizes. El font size mas comun es el body; sizes significativamente mayores se clasifican como heading levels. Filtra headers/footers repetitivos."
+tags: [pdf, headings, font, detection, parsing, pdfplumber]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [pdfplumber, collections]
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/detect_headings_by_font.py"
+---
+
+## Ejemplo
+
+```python
+import pdfplumber
+from detect_headings_by_font import detect_headings_by_font
+
+with pdfplumber.open("document.pdf") as pdf:
+    headings = detect_headings_by_font(pdf, min_delta=2.0, max_levels=4)
+    for h in headings:
+        print(f"Page {h['page_num']}: {'#' * h['level']} {h['title']}")
+```
+
+## Notas
+
+Muestrea una de cada 5 paginas (indices 0, 5, 10, ...) para construir el Counter de font sizes (optimizacion de rendimiento). El body_size es el font size mas frecuente. Los heading sizes deben ser >= body_size + min_delta Y tener frecuencia < 50% de la frecuencia del body. Se limita a max_levels heading sizes ordenados desc (el mas grande = nivel 1). Titulos que aparecen en >30% de paginas son considerados headers/footers y se eliminan. Impure porque accede al estado interno de un objeto PDF ya abierto.
@@ -0,0 +1,135 @@
+"""Detect headings in a PDF by analyzing font size distribution."""
+
+from collections import Counter
+
+import pdfplumber
+
+
+def detect_headings_by_font(
+    pdf: pdfplumber.PDF,
+    min_delta: float = 2.0,
+    max_levels: int = 4,
+) -> list[dict]:
+    """Detect headings by analyzing font size distribution across pages.
+
+    The most common font size is treated as body text. Font sizes significantly
+    larger than body (by at least min_delta) and appearing in fewer than 50% of
+    chars are classified as heading levels.
+
+    Args:
+        pdf: An open pdfplumber.PDF object.
+        min_delta: Minimum size difference above body size to qualify as heading.
+        max_levels: Maximum number of heading levels to detect.
+
+    Returns:
+        list[dict]: List of {"level": int, "title": str, "page_num": int}
+                    sorted by page number. Returns empty list if no headings detected.
+    """
+    if not pdf.pages:
+        return []
+
+    # Step 1: Sample font sizes from every 5th page to determine body size
+    size_counter: Counter = Counter()
+    sample_pages = [pdf.pages[i] for i in range(0, len(pdf.pages), 5)]
+    if not sample_pages:
+        sample_pages = [pdf.pages[0]]
+
+    for page in sample_pages:
+        try:
+            chars = page.chars
+            for ch in chars:
+                size = ch.get("size")
+                if size is not None:
+                    size_counter[round(float(size), 1)] += 1
+        except Exception:
+            continue
+
+    if not size_counter:
+        return []
+
+    # Step 2: Determine body size (most common font size)
+    body_size, body_count = size_counter.most_common(1)[0]
+
+    # Step 3: Identify heading sizes
+    # Must be >= body_size + min_delta and frequency < 50% of body count
+    heading_sizes = sorted(
+        [
+            size
+            for size, count in size_counter.items()
+            if size >= body_size + min_delta and count < body_count * 0.5
+        ],
+        reverse=True,
+    )[:max_levels]
+
+    if not heading_sizes:
+        return []
+
+    # Build size -> level mapping
+    size_to_level = {size: i + 1 for i, size in enumerate(heading_sizes)}
+
+    # Step 4: Collect heading text per page
+    raw_headings: list[dict] = []
+    total_pages = len(pdf.pages)
+
+    for page_idx, page in enumerate(pdf.pages):
+        page_num = page_idx + 1
+        try:
+            chars = page.chars
+        except Exception:
+            continue
+
+        # Group consecutive chars of same heading size into text blocks
+        current_size = None
+        current_text = []
+
+        for ch in chars:
+            size = ch.get("size")
+            if size is None:
+                continue
+            rounded = round(float(size), 1)
+            if rounded in size_to_level:
+                if rounded == current_size:
+                    current_text.append(ch.get("text", ""))
+                else:
+                    if current_text and current_size is not None:
+                        text = "".join(current_text).strip()
+                        if text:
+                            raw_headings.append({
+                                "level": size_to_level[current_size],
+                                "title": text,
+                                "page_num": page_num,
+                            })
+                    current_size = rounded
+                    current_text = [ch.get("text", "")]
+            else:
+                if current_text and current_size is not None:
+                    text = "".join(current_text).strip()
+                    if text:
+                        raw_headings.append({
+                            "level": size_to_level[current_size],
+                            "title": text,
+                            "page_num": page_num,
+                        })
+                current_size = None
+                current_text = []
+
+        # Flush remaining
+        if current_text and current_size is not None:
+            text = "".join(current_text).strip()
+            if text:
+                raw_headings.append({
+                    "level": size_to_level[current_size],
+                    "title": text,
+                    "page_num": page_num,
+                })
+
+    if not raw_headings:
+        return []
+
+    # Step 5: Deduplicate — remove titles appearing on > 30% of pages (headers/footers)
+    title_page_counts: Counter = Counter(h["title"] for h in raw_headings)
+    threshold = total_pages * 0.3
+
+    filtered = [h for h in raw_headings if title_page_counts[h["title"]] <= threshold]
+
+    return filtered
@@ -0,0 +1,59 @@
+---
+name: detect_url_type
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "detect_url_type(url: str, timeout: float = 10.0) -> tuple[str, dict]"
+description: "Detecta el tipo de contenido de una URL. Retorna tipo ('webpage', 'pdf', 'markdown', 'text', 'code_repository') y metadata. Hace HTTP HEAD request solo si no puede determinarse por patron o extension."
+tags: [url, content-type, http, detect, classification, head-request]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: ["urllib.parse", "httpx"]
+tested: true
+tests:
+  - "URL .pdf por extension"
+  - "URL github repo"
+  - "URL markdown por extension"
+  - "URL SSH git"
+  - "URL .html por extension"
+test_file_path: "python/functions/core/detect_url_type_test.py"
+file_path: "python/functions/core/detect_url_type.py"
+---
+
+## Ejemplo
+
+```python
+from core.detect_url_type import detect_url_type
+
+# Por patron URL (sin HTTP request)
+url_type, meta = detect_url_type("https://github.com/openai/whisper")
+# url_type = "code_repository", meta = {"detection": "url_pattern", ...}
+
+# Por extension (sin HTTP request)
+url_type, meta = detect_url_type("https://example.com/doc.pdf")
+# url_type = "pdf", meta = {"detection": "extension", ...}
+
+# Por HTTP HEAD request (cuando no se puede determinar sin red)
+url_type, meta = detect_url_type("https://example.com/page")
+# url_type = "webpage", meta = {"detection": "content_type_header", "content_type": "text/html", ...}
+```
+
+## Notas
+
+Algoritmo en orden de prioridad:
+1. SSH git shorthand (`git@host:path`) → `code_repository` inmediatamente.
+2. Patron URL de repos conocidos (github.com/org/repo, gitlab.com/org/repo) → `code_repository`.
+3. Extension del path de la URL (.pdf, .md, .txt, .html, .git y otras extensiones de texto/codigo como .json, .csv, .py) → tipo correspondiente.
+4. HTTP HEAD request → leer `Content-Type` header.
+5. Default: `"webpage"`.
+
+Hosts reconocidos como repos de codigo: github.com, gitlab.com, bitbucket.org, codeberg.org.
+
+Sub-recursos (issues, pulls, blob, tree, etc.) NO se clasifican como `code_repository`.
+
+Lanza `Exception` con mensaje descriptivo si el HEAD request falla (timeout, DNS, red).
@@ -0,0 +1,144 @@
+"""Detecta el tipo de contenido de una URL (webpage, pdf, markdown, text, code_repository)."""
+
+import re
+from urllib.parse import urlparse
+
+
+# Hostnames of the code-hosting services whose URLs may be classified as
+# "code_repository" from the URL pattern alone.
+_CODE_REPO_HOSTS = {"github.com", "gitlab.com", "bitbucket.org", "codeberg.org"}
+
+# Recognised file extensions (lowercase, with leading dot) -> detected type.
+_EXT_TYPE_MAP = {
+    ".pdf": "pdf",
+    ".md": "markdown",
+    ".markdown": "markdown",
+    ".rst": "text",
+    ".txt": "text",
+    ".html": "webpage",
+    ".htm": "webpage",
+    ".xml": "text",
+    ".json": "text",
+    ".csv": "text",
+    ".py": "text",
+    ".js": "text",
+    ".ts": "text",
+    ".go": "text",
+    ".rs": "text",
+    ".cpp": "text",
+    ".c": "text",
+    ".java": "text",
+    ".rb": "text",
+    ".git": "code_repository",
+}
+
+# Content-Type header prefixes (lowercase media types) -> detected type.
+_CONTENT_TYPE_MAP = {
+    "application/pdf": "pdf",
+    "text/markdown": "markdown",
+    "text/x-markdown": "markdown",
+    "text/plain": "text",
+    "text/html": "webpage",
+    "text/xml": "text",
+    "application/xml": "text",
+    "application/json": "text",
+}
+
+
+def _is_code_repo_url(parsed, path_segments: list[str]) -> bool:
+    """Return True si la URL apunta a la raiz de un repositorio de codigo."""
+    host = parsed.hostname or ""
+    if host not in _CODE_REPO_HOSTS:
+        return False
+    # Acepta org/repo o org/repo/ o org/repo.git (2 segmentos minimos)
+    if len(path_segments) < 2:
+        return False
+    # Rechaza sub-recursos conocidos: issues, pulls, blob, tree, releases, etc.
+    _SUB_RESOURCES = {"issues", "pulls", "blob", "tree", "releases", "tags",
+                      "commits", "compare", "wiki", "discussions", "actions",
+                      "security", "pulse", "graphs", "-", "settings"}
+    if len(path_segments) >= 3 and path_segments[2].rstrip(".git") in _SUB_RESOURCES:
+        return False
+    return True
+
+
+def _is_ssh_git_url(url: str) -> bool:
+    """Return True si la URL es un SSH git shorthand (git@host:path)."""
+    return url.strip().startswith("git@")
+
+
+def _type_from_extension(path: str) -> str | None:
+    """Detecta tipo segun la extension del path de la URL. Retorna None si no aplica."""
+    # Ignorar query string / fragment
+    clean_path = path.split("?")[0].split("#")[0]
+    for ext, url_type in _EXT_TYPE_MAP.items():
+        if clean_path.lower().endswith(ext):
+            return url_type
+    return None
+
+
+def _type_from_content_type(content_type_header: str) -> str:
+    """Mapea un Content-Type header al tipo de URL."""
+    ct = content_type_header.lower().split(";")[0].strip()
+    for prefix, url_type in _CONTENT_TYPE_MAP.items():
+        if ct.startswith(prefix):
+            return url_type
+    return "webpage"
+
+
+def detect_url_type(url: str, timeout: float = 10.0) -> tuple[str, dict]:
+    """Detecta el tipo de contenido de una URL.
+
+    Algoritmo:
+    1. Verificar si la URL es un patron de repo de codigo (git@, github.com/org/repo).
+    2. Verificar extension en el path de la URL (.pdf, .md, .txt, .html, .git).
+    3. Si no se determino: HTTP HEAD request para leer Content-Type header.
+    4. Default: "webpage".
+
+    Args:
+        url: URL a analizar.
+        timeout: Timeout en segundos para el HTTP HEAD request (si es necesario).
+
+    Returns:
+        Tuple de (tipo, metadata) donde tipo es uno de:
+        "webpage", "pdf", "markdown", "text", "code_repository".
+        metadata incluye la informacion disponible (extension, content_type, etc.).
+
+    Raises:
+        Exception: Si falla la conexion HTTP cuando es necesaria.
+    """
+    import httpx
+
+    url = url.strip()
+    metadata: dict = {"url": url}
+
+    # 1. SSH git shorthand
+    if _is_ssh_git_url(url):
+        metadata["detection"] = "ssh_pattern"
+        return "code_repository", metadata
+
+    parsed = urlparse(url)
+    path_segments = [s for s in parsed.path.split("/") if s]
+
+    # 2. Code repo by URL pattern
+    if _is_code_repo_url(parsed, path_segments):
+        metadata["detection"] = "url_pattern"
+        metadata["host"] = parsed.hostname
+        return "code_repository", metadata
+
+    # 3. Extension-based detection
+    ext_type = _type_from_extension(parsed.path)
+    if ext_type is not None:
+        metadata["detection"] = "extension"
+        metadata["path"] = parsed.path
+        return ext_type, metadata
+
+    # 4. HTTP HEAD request
+    try:
+        response = httpx.head(url, timeout=timeout, follow_redirects=True)
+        content_type = response.headers.get("content-type", "")
+        metadata["detection"] = "content_type_header"
+        metadata["content_type"] = content_type
+        metadata["status_code"] = response.status_code
+        return _type_from_content_type(content_type), metadata
+    except Exception as exc:
+        raise Exception(f"detect_url_type: HEAD request failed for {url!r}: {exc}") from exc
@@ -0,0 +1,89 @@
+"""Tests para detect_url_type (tests que no requieren red)."""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+from core.detect_url_type import detect_url_type, _type_from_extension, _type_from_content_type, _is_ssh_git_url
+
+
+def test_url_pdf_por_extension():
+    """URL .pdf se detecta por extension sin hacer request HTTP."""
+    url_type, metadata = detect_url_type("https://example.com/report.pdf")
+    assert url_type == "pdf"
+    assert metadata["detection"] == "extension"
+
+
+def test_url_github_repo():
+    """URL de GitHub org/repo se detecta como code_repository por patron URL."""
+    url_type, metadata = detect_url_type("https://github.com/openai/whisper")
+    assert url_type == "code_repository"
+    assert metadata["detection"] == "url_pattern"
+
+
+def test_url_github_con_git_suffix():
+    """URL github terminada en .git se detecta como code_repository."""
+    url_type, metadata = detect_url_type("https://github.com/openai/whisper.git")
+    assert url_type == "code_repository"
+
+
+def test_url_markdown_por_extension():
+    """URL .md se detecta como markdown por extension."""
+    url_type, metadata = detect_url_type("https://example.com/README.md")
+    assert url_type == "markdown"
+    assert metadata["detection"] == "extension"
+
+
+def test_url_ssh_git():
+    """URL SSH git@ se detecta como code_repository."""
+    url_type, metadata = detect_url_type("git@github.com:openai/whisper.git")
+    assert url_type == "code_repository"
+    assert metadata["detection"] == "ssh_pattern"
+
+
+def test_url_html_por_extension():
+    """URL .html se detecta como webpage por extension."""
+    url_type, metadata = detect_url_type("https://example.com/page.html")
+    assert url_type == "webpage"
+    assert metadata["detection"] == "extension"
+
+
+def test_url_txt_por_extension():
+    """URL .txt se detecta como text por extension."""
+    url_type, metadata = detect_url_type("https://example.com/data.txt")
+    assert url_type == "text"
+
+
+def test_github_subrepo_no_es_repo():
+    """URL de GitHub apuntando a un issue/blob no se trata como code_repository."""
+    # Debe intentar HEAD request (que fallara sin red) — verificamos que no clasifica como repo
+    # Solo comprobamos que no devuelve code_repository por patron URL
+    url = "https://github.com/openai/whisper/blob/main/README.md"
+    # Extension .md deberia detectarse primero
+    url_type, metadata = detect_url_type(url)
+    assert url_type == "markdown"
+
+
+def test_helper_type_from_extension():
+    """_type_from_extension funciona para extensiones conocidas."""
+    assert _type_from_extension("/doc.pdf") == "pdf"
+    assert _type_from_extension("/README.md") == "markdown"
+    assert _type_from_extension("/notes.txt") == "text"
+    assert _type_from_extension("/unknown.xyz") is None
+
+
+def test_helper_type_from_content_type():
+    """_type_from_content_type mapea headers correctamente."""
+    assert _type_from_content_type("application/pdf; charset=utf-8") == "pdf"
+    assert _type_from_content_type("text/html; charset=utf-8") == "webpage"
+    assert _type_from_content_type("text/plain") == "text"
+    assert _type_from_content_type("text/markdown") == "markdown"
+    assert _type_from_content_type("application/octet-stream") == "webpage"
+
+
+def test_helper_is_ssh_git_url():
+    """_is_ssh_git_url detecta formato git@."""
+    assert _is_ssh_git_url("git@github.com:org/repo.git") is True
+    assert _is_ssh_git_url("https://github.com/org/repo") is False
+    assert _is_ssh_git_url("ssh://git@github.com/org/repo") is False
@@ -0,0 +1,40 @@
+---
+name: docx_to_markdown
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "docx_to_markdown(docx_path: str) -> str"
+description: "Convierte un documento Word (.docx) a markdown preservando estructura (headings), formato inline (bold, italic, underline) y tablas en su posicion original."
+tags: [docx, markdown, word, conversion, document, parsing, text]
+uses_functions: [format_table_to_markdown_py_core]
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [python-docx, lxml]
+tested: true
+tests: ["docx con headings y parrafos", "docx con tablas intercaladas", "docx con formato bold/italic", "docx vacio", "archivo no encontrado lanza FileNotFoundError"]
+test_file_path: "python/functions/core/docx_to_markdown_test.py"
+file_path: "python/functions/core/docx_to_markdown.py"
+---
+
+## Ejemplo
+
+```python
+md = docx_to_markdown("informe.docx")
+# # Titulo
+#
+# Primer parrafo.
+#
+# | Col1 | Col2 |
+# | ---- | ---- |
+# | a    | b    |
+#
+# Parrafo despues de la tabla.
+```
+
+## Notas
+
+Recorre `doc.element.body` en orden (no `doc.paragraphs` + `doc.tables` por separado) para preservar la posicion original de las tablas. Construye un mapa `{id(tbl_element): Table}` para lookup O(1). El formato inline aplica underline (`<ins>`), italic (`*`) y bold (`**`) en ese orden de mas interno a mas externo. Los headings se detectan por el estilo del parrafo (`Heading 1`, `Heading 2`, etc.). Requiere `python-docx` instalado en el entorno.
@@ -0,0 +1,153 @@
+"""Convert a Word .docx document to Markdown, preserving structure, inline
+formatting and tables in their original document order."""
+
+import os
+from lxml import etree
+
+from format_table_to_markdown import format_table_to_markdown
+
+
+# WordprocessingML namespace; the tags below are written in Clark notation
+# ("{namespace}localname") so they can be compared with element.tag directly.
+_W = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
+_TAG_P = f"{{{_W}}}p"  # paragraph
+_TAG_TBL = f"{{{_W}}}tbl"  # table
+_TAG_TR = f"{{{_W}}}tr"  # table row
+_TAG_TC = f"{{{_W}}}tc"  # table cell
+_TAG_R = f"{{{_W}}}r"  # run
+_TAG_T = f"{{{_W}}}t"  # run text
+_TAG_RPR = f"{{{_W}}}rPr"  # run properties
+_TAG_B = f"{{{_W}}}b"  # bold
+_TAG_I = f"{{{_W}}}i"  # italic
+_TAG_U = f"{{{_W}}}u"  # underline
+_TAG_PSTYLE = f"{{{_W}}}pStyle"  # paragraph style reference
+_TAG_PPR = f"{{{_W}}}pPr"  # paragraph properties
+
+
+def _heading_level(paragraph) -> int:
+    """Return heading level (1-6) if the paragraph is a heading, else 0."""
+    pPr = paragraph._p.find(_TAG_PPR)
+    if pPr is None:
+        return 0
+    pStyle = pPr.find(_TAG_PSTYLE)
+    if pStyle is None:
+        return 0
+    val = pStyle.get(f"{{{_W}}}val", "")
+    if val.lower().startswith("heading"):
+        parts = val.split()
+        if len(parts) == 2:
+            try:
+                return int(parts[1])
+            except ValueError:
+                pass
+        # Some locales use "Heading1" (no space)
+        suffix = val[len("heading"):]
+        if suffix.isdigit():
+            return int(suffix)
+    return 0
+
+
+def _run_to_md(run_elem) -> str:
+    """Convert a single <w:r> element to a markdown-formatted string."""
+    # Collect text
+    text_parts = []
+    for t in run_elem.findall(_TAG_T):
+        text_parts.append(t.text or "")
+    text = "".join(text_parts)
+    if not text:
+        return ""
+
+    # Read formatting from <w:rPr>
+    rPr = run_elem.find(_TAG_RPR)
+    bold = False
+    italic = False
+    underline = False
+    if rPr is not None:
+        bold = rPr.find(_TAG_B) is not None
+        italic = rPr.find(_TAG_I) is not None
+        u_elem = rPr.find(_TAG_U)
+        if u_elem is not None:
+            u_val = u_elem.get(f"{{{_W}}}val", "")
+            underline = u_val not in ("none", "")
+
+    # Apply markdown formatting (innermost first: underline → italic → bold)
+    if underline:
+        text = f"<ins>{text}</ins>"
+    if italic:
+        text = f"*{text}*"
+    if bold:
+        text = f"**{text}**"
+    return text
+
+
+def _paragraph_to_md(paragraph) -> str:
+    """Convert a python-docx Paragraph to a markdown string."""
+    level = _heading_level(paragraph)
+    runs_md = "".join(
+        _run_to_md(elem)
+        for elem in paragraph._p
+        if elem.tag == _TAG_R
+    )
+    if level:
+        return f"{'#' * level} {runs_md}"
+    return runs_md
+
+
+def _table_to_md(table) -> str:
+    """Convert a python-docx Table to a markdown table string."""
+    rows: list[list[str]] = []
+    for row in table.rows:
+        cells = []
+        for cell in row.cells:
+            # Join all paragraphs in the cell with a space
+            cell_text = " ".join(p.text for p in cell.paragraphs).strip()
+            cells.append(cell_text)
+        rows.append(cells)
+    return format_table_to_markdown(rows, has_header=True)
+
+
+def docx_to_markdown(docx_path: str) -> str:
+    """Convert a Word .docx document to Markdown.
+
+    Preserves document structure (headings), inline formatting (bold, italic,
+    underline) and tables in their original position.
+
+    Args:
+        docx_path: Absolute or relative path to the .docx file.
+
+    Returns:
+        Markdown string representing the document.
+
+    Raises:
+        FileNotFoundError: If the file does not exist.
+        Exception: If the file cannot be parsed as a .docx document.
+    """
+    import docx  # deferred so the module is importable without python-docx installed
+
+    if not os.path.exists(docx_path):
+        raise FileNotFoundError(f"File not found: {docx_path}")
+
+    doc = docx.Document(docx_path)
+
+    # Build a mapping from the XML element id to the Table object for O(1) lookup
+    table_map: dict[int, object] = {
+        id(table._tbl): table for table in doc.tables
+    }
+
+    parts: list[str] = []
+
+    for child in doc.element.body:
+        if child.tag == _TAG_P:
+            # Wrap in a temporary paragraph object to reuse _paragraph_to_md
+            from docx.text.paragraph import Paragraph
+            para = Paragraph(child, doc)
+            md = _paragraph_to_md(para)
+            if md.strip():
+                parts.append(md)
+        elif child.tag == _TAG_TBL:
+            table = table_map.get(id(child))
+            if table is not None:
+                md = _table_to_md(table)
+                if md:
+                    parts.append(md)
+
+    return "\n\n".join(parts)
@@ -0,0 +1,129 @@
+"""Tests para docx_to_markdown."""
+
+import os
+import sys
+import tempfile
+
+import pytest
+
+sys.path.insert(0, os.path.dirname(__file__))
+
+import docx as python_docx
+from docx_to_markdown import docx_to_markdown
+
+
+def _make_docx(builder_fn) -> str:
+    """Create a temporary .docx file using builder_fn(doc) and return its path."""
+    doc = python_docx.Document()
+    builder_fn(doc)
+    tmp = tempfile.NamedTemporaryFile(suffix=".docx", delete=False)
+    doc.save(tmp.name)
+    tmp.close()
+    return tmp.name
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+
+def test_docx_con_headings_y_parrafos():
+    """docx con headings y parrafos"""
+
+    def build(doc):
+        doc.add_heading("Titulo Principal", level=1)
+        doc.add_paragraph("Primer parrafo de contenido.")
+        doc.add_heading("Seccion", level=2)
+        doc.add_paragraph("Segundo parrafo.")
+
+    path = _make_docx(build)
+    try:
+        result = docx_to_markdown(path)
+        assert "# Titulo Principal" in result
+        assert "## Seccion" in result
+        assert "Primer parrafo de contenido." in result
+        assert "Segundo parrafo." in result
+    finally:
+        os.unlink(path)
+
+
+def test_docx_con_tablas_intercaladas():
+    """docx con tablas intercaladas"""
+
+    def build(doc):
+        doc.add_paragraph("Texto antes de la tabla.")
+        table = doc.add_table(rows=2, cols=3)
+        table.cell(0, 0).text = "Col1"
+        table.cell(0, 1).text = "Col2"
+        table.cell(0, 2).text = "Col3"
+        table.cell(1, 0).text = "a"
+        table.cell(1, 1).text = "b"
+        table.cell(1, 2).text = "c"
+        doc.add_paragraph("Texto despues de la tabla.")
+
+    path = _make_docx(build)
+    try:
+        result = docx_to_markdown(path)
+        # Table must appear BETWEEN the two paragraphs
+        before_idx = result.index("Texto antes de la tabla.")
+        table_idx = result.index("| Col1")
+        after_idx = result.index("Texto despues de la tabla.")
+        assert before_idx < table_idx < after_idx
+        assert "| Col2" in result
+        assert "| a" in result
+    finally:
+        os.unlink(path)
+
+
+def test_docx_con_formato_bold_italic():
+    """docx con formato bold/italic"""
+
+    def build(doc):
+        para = doc.add_paragraph()
+        run_bold = para.add_run("negrita")
+        run_bold.bold = True
+        run_normal = para.add_run(" texto normal ")
+        run_italic = para.add_run("cursiva")
+        run_italic.italic = True
+
+    path = _make_docx(build)
+    try:
+        result = docx_to_markdown(path)
+        assert "**negrita**" in result
+        assert "*cursiva*" in result
+        assert "texto normal" in result
+    finally:
+        os.unlink(path)
+
+
+def test_docx_vacio():
+    """docx vacio"""
+
+    def build(doc):
+        # python-docx adds a default empty paragraph; remove all content
+        # by just not adding anything — the default empty paragraph will
+        # produce an empty string that gets filtered out.
+        pass
+
+    path = _make_docx(build)
+    try:
+        result = docx_to_markdown(path)
+        # Empty document should produce empty or whitespace-only output
+        assert result.strip() == ""
+    finally:
+        os.unlink(path)
+
+
+def test_archivo_no_encontrado():
+    """archivo no encontrado lanza FileNotFoundError"""
+    with pytest.raises(FileNotFoundError):
+        docx_to_markdown("/tmp/nonexistent_file_fn_registry.docx")
+
+
+if __name__ == "__main__":
+    test_docx_con_headings_y_parrafos()
+    test_docx_con_tablas_intercaladas()
+    test_docx_con_formato_bold_italic()
+    test_docx_vacio()
+    test_archivo_no_encontrado()
+    print("All tests passed.")
@@ -0,0 +1,52 @@
+---
+name: epub_to_markdown
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "def epub_to_markdown(epub_path: str) -> str"
+description: "Convierte un ebook EPUB a markdown. Intenta ebooklib primero para extraccion estructurada (titulo, autor, documentos); fallback a extraccion manual con zipfile si ebooklib no esta instalado."
+tags: [epub, markdown, ebook, parsing, conversion, html, text-extraction]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [zipfile, html, re, ebooklib]
+tested: true
+tests:
+  - "conversion de headings h1-h3"
+  - "conversion de bold e italic"
+  - "script y style se eliminan del output"
+  - "HTML entities se convierten a caracteres"
+  - "epub sin ebooklib extrae texto de archivos html"
+  - "epub con ebooklib incluye titulo y autor en el output"
+  - "epub corrupto lanza excepcion"
+test_file_path: "python/functions/core/epub_to_markdown_test.py"
+file_path: "python/functions/core/epub_to_markdown.py"
+---
+
+## Ejemplo
+
+```python
+md = epub_to_markdown("/path/to/book.epub")
+print(md[:500])
+# # Mi Libro
+# **Author:** Ana Perez
+#
+# # Introduccion
+# Primer parrafo...
+```
+
+## Notas
+
+Conversion HTML a markdown cubre: headings h1-h6, bold (`<strong>`/`<b>`), italic (`<em>`/`<i>`), paragraphs, line breaks. Elimina `<script>` y `<style>`. Desescapa entidades HTML y normaliza whitespace.
+
+Con ebooklib: extrae metadata DC (titulo, autor) del OPF y procesa todos los items de tipo ITEM_DOCUMENT del libro, en el orden en que ebooklib los devuelve (no se recorre el spine explicitamente).
+
+Sin ebooklib (fallback ZIP): lista archivos `.html`/`.xhtml`/`.htm` en orden alfabetico y extrae su contenido. No hay metadata de titulo/autor en este modo.
+
+Dependencia opcional: `pip install ebooklib`. Si no esta instalada la funcion sigue funcionando via zipfile.
+
+Reimplementacion conceptual desde OpenViking `openviking/parse/parsers/epub.py` (AGPL-3.0). El codigo es original.
@@ -0,0 +1,128 @@
+"""Convert an EPUB file to markdown text."""
+
+import re
+import zipfile
+from html import unescape
+from html.parser import HTMLParser
+
+
+def _remove_tags(html: str, tag: str) -> str:
+    """Remove a tag and its content from HTML string."""
+    pattern = re.compile(rf'<{tag}[^>]*>.*?</{tag}>', re.IGNORECASE | re.DOTALL)
+    return pattern.sub('', html)
+
+
+def _html_to_markdown(html: str) -> str:
+    """Convert basic HTML to markdown.
+
+    Handles headings, bold, italic, paragraphs, line breaks
+    and strips remaining tags.
+
+    Args:
+        html: HTML string to convert.
+
+    Returns:
+        Markdown-formatted string.
+    """
+    # Remove script and style blocks
+    text = _remove_tags(html, 'script')
+    text = _remove_tags(text, 'style')
+
+    # Headings h1-h6
+    for level in range(6, 0, -1):
+        hashes = '#' * level
+        text = re.sub(
+            rf'<h{level}[^>]*>(.*?)</h{level}>',
+            lambda m, h=hashes: f'{h} {m.group(1).strip()}',
+            text,
+            flags=re.IGNORECASE | re.DOTALL,
+        )
+
+    # Bold
+    text = re.sub(r'<strong[^>]*>(.*?)</strong>', r'**\1**', text, flags=re.IGNORECASE | re.DOTALL)
+    text = re.sub(r'<b[^>]*>(.*?)</b>', r'**\1**', text, flags=re.IGNORECASE | re.DOTALL)
+
+    # Italic
+    text = re.sub(r'<em[^>]*>(.*?)</em>', r'*\1*', text, flags=re.IGNORECASE | re.DOTALL)
+    text = re.sub(r'<i[^>]*>(.*?)</i>', r'*\1*', text, flags=re.IGNORECASE | re.DOTALL)
+
+    # Paragraphs — append double newline after content
+    text = re.sub(r'<p[^>]*>(.*?)</p>', lambda m: m.group(1).strip() + '\n\n', text, flags=re.IGNORECASE | re.DOTALL)
+
+    # Line breaks
+    text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
+
+    # Strip remaining HTML tags
+    text = re.sub(r'<[^>]+>', '', text)
+
+    # Unescape HTML entities
+    text = unescape(text)
+
+    # Normalize whitespace: collapse multiple blank lines into two
+    text = re.sub(r'\n{3,}', '\n\n', text)
+    text = re.sub(r'[ \t]+', ' ', text)
+
+    return text.strip()
+
+
+def _epub_via_ebooklib(epub_path: str) -> str:
+    """Extract markdown from EPUB using ebooklib."""
+    import ebooklib
+    from ebooklib import epub
+
+    book = epub.read_epub(epub_path)
+
+    # Metadata
+    title_meta = book.get_metadata('DC', 'title')
+    author_meta = book.get_metadata('DC', 'creator')
+    title = title_meta[0][0] if title_meta else 'Unknown Title'
+    author = author_meta[0][0] if author_meta else 'Unknown Author'
+
+    parts = [f'# {title}', f'**Author:** {author}']
+
+    for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
+        content = item.get_content().decode('utf-8', errors='replace')
+        md = _html_to_markdown(content)
+        if md:
+            parts.append(md)
+
+    return '\n\n'.join(parts)
+
+
+def _epub_via_zipfile(epub_path: str) -> str:
+    """Extract markdown from EPUB using zipfile (fallback)."""
+    parts = []
+    with zipfile.ZipFile(epub_path, 'r') as zf:
+        html_files = sorted(
+            name for name in zf.namelist()
+            if name.lower().endswith(('.html', '.xhtml', '.htm'))
+        )
+        for name in html_files:
+            raw = zf.read(name).decode('utf-8', errors='replace')
+            md = _html_to_markdown(raw)
+            if md:
+                parts.append(md)
+
+    return '\n\n'.join(parts)
+
+
+def epub_to_markdown(epub_path: str) -> str:
+    """Convert an EPUB ebook to markdown.
+
+    Attempts to use ebooklib for structured extraction (title, author,
+    document items). Falls back to manual ZIP extraction if ebooklib is
+    not installed.
+
+    Args:
+        epub_path: Path to the .epub file.
+
+    Returns:
+        Markdown string with the book content.
+
+    Raises:
+        Exception: If the file cannot be read or is not a valid EPUB.
+    """
+    try:
+        return _epub_via_ebooklib(epub_path)
+    except ImportError:
+        return _epub_via_zipfile(epub_path)
@@ -0,0 +1,163 @@
+"""Tests para epub_to_markdown."""
+
+import io
+import os
+import struct
+import sys
+import zipfile
+
+import pytest
+
+sys.path.insert(0, os.path.dirname(__file__))
+from epub_to_markdown import _html_to_markdown, _epub_via_zipfile, epub_to_markdown
+
+
+# ---------------------------------------------------------------------------
+# Helpers para construir EPUBs minimos en memoria
+# ---------------------------------------------------------------------------
+
+def _build_epub(files: dict[str, str]) -> str:
+    """Crea un EPUB minimo como ZIP en disco y retorna el path."""
+    import tempfile
+    tmp = tempfile.NamedTemporaryFile(suffix='.epub', delete=False)
+    with zipfile.ZipFile(tmp, 'w') as zf:
+        for name, content in files.items():
+            zf.writestr(name, content)
+    tmp.close()
+    return tmp.name
+
+
+def _build_epub_with_opf(title: str, author: str, body_html: str) -> str:
+    """Build an EPUB with an OPF package, an NCX file and one XHTML chapter.
+
+    The archive is meant to be readable by ebooklib. ``title`` and ``author``
+    are interpolated into the Dublin Core metadata and ``body_html`` into the
+    chapter's <body>, all without escaping. Returns the path of the file
+    created by ``_build_epub``.
+    """
+    # Package document: metadata, a manifest with the chapter and the NCX,
+    # and a spine that references the single chapter.
+    opf = f"""<?xml version='1.0' encoding='utf-8'?>
+<package xmlns='http://www.idpf.org/2007/opf' unique-identifier='uid' version='2.0'>
+  <metadata xmlns:dc='http://purl.org/dc/elements/1.1/'>
+    <dc:title>{title}</dc:title>
+    <dc:creator>{author}</dc:creator>
+    <dc:identifier id='uid'>test-uid</dc:identifier>
+    <dc:language>en</dc:language>
+  </metadata>
+  <manifest>
+    <item id='ch1' href='chapter1.xhtml' media-type='application/xhtml+xml'/>
+    <item id='ncx' href='toc.ncx' media-type='application/x-dtbncx+xml'/>
+  </manifest>
+  <spine toc='ncx'>
+    <itemref idref='ch1'/>
+  </spine>
+</package>"""
+
+    # Table of contents with an empty navMap.
+    ncx = """<?xml version='1.0' encoding='utf-8'?>
+<ncx xmlns='http://www.daisy.org/z3986/2005/ncx/' version='2005-1'>
+  <head><meta name='dtb:uid' content='test-uid'/></head>
+  <docTitle><text>Test</text></docTitle>
+  <navMap/>
+</ncx>"""
+
+    # The only chapter; body_html is inserted verbatim.
+    chapter = f"""<?xml version='1.0' encoding='utf-8'?>
+<!DOCTYPE html PUBLIC '-//W3C//DTD XHTML 1.1//EN' 'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd'>
+<html xmlns='http://www.w3.org/1999/xhtml'>
+<head><title>Chapter</title></head>
+<body>{body_html}</body>
+</html>"""
+
+    return _build_epub({
+        'mimetype': 'application/epub+zip',
+        # container.xml points at the package document (content.opf).
+        'META-INF/container.xml': """<?xml version='1.0'?>
+<container version='1.0' xmlns='urn:oasis:names:tc:opendocument:xmlns:container'>
+  <rootfiles>
+    <rootfile full-path='content.opf' media-type='application/oebps-package+xml'/>
+  </rootfiles>
+</container>""",
+        'content.opf': opf,
+        'toc.ncx': ncx,
+        'chapter1.xhtml': chapter,
+    })
+
+
+# ---------------------------------------------------------------------------
+# Tests de _html_to_markdown (pura, sin disco)
+# ---------------------------------------------------------------------------
+
+def test_html_heading_conversion():
+    """conversion de headings h1-h3."""
+    html = '<h1>Titulo</h1><h2>Subtitulo</h2><h3>Seccion</h3>'
+    result = _html_to_markdown(html)
+    assert '# Titulo' in result
+    assert '## Subtitulo' in result
+    assert '### Seccion' in result
+
+
+def test_html_bold_italic():
+    """conversion de bold e italic."""
+    html = '<p><strong>negrita</strong> y <em>italica</em></p>'
+    result = _html_to_markdown(html)
+    assert '**negrita**' in result
+    assert '*italica*' in result
+
+
+def test_html_script_style_removed():
+    """script y style se eliminan del output."""
+    html = '<script>alert(1)</script><style>body{}</style><p>Contenido</p>'
+    result = _html_to_markdown(html)
+    assert 'alert' not in result
+    assert 'body{}' not in result
+    assert 'Contenido' in result
+
+
+def test_html_entities_unescaped():
+    """HTML entities se convierten a caracteres."""
+    html = '<p>Tom &amp; Jerry &lt;show&gt;</p>'
+    result = _html_to_markdown(html)
+    assert 'Tom & Jerry' in result
+    assert '<show>' in result
+
+
+# ---------------------------------------------------------------------------
+# Tests de epub_via_zipfile (sin ebooklib)
+# ---------------------------------------------------------------------------
+
+def test_epub_via_zipfile_extrae_html():
+    """epub sin ebooklib extrae texto de archivos html."""
+    path = _build_epub({
+        'chapter.html': '<html><body><h1>Capitulo Uno</h1><p>Hola mundo.</p></body></html>',
+    })
+    try:
+        result = _epub_via_zipfile(path)
+        assert 'Capitulo Uno' in result
+        assert 'Hola mundo' in result
+    finally:
+        os.unlink(path)
+
+
+# ---------------------------------------------------------------------------
+# Tests de epub_to_markdown (integracion)
+# ---------------------------------------------------------------------------
+
+def test_epub_con_ebooklib_metadata():
+    """epub con ebooklib incluye titulo y autor en el output."""
+    pytest.importorskip('ebooklib')
+    path = _build_epub_with_opf(
+        title='Mi Libro',
+        author='Ana Perez',
+        body_html='<h1>Introduccion</h1><p>Primer parrafo.</p>',
+    )
+    try:
+        result = epub_to_markdown(path)
+        assert '# Mi Libro' in result
+        assert 'Ana Perez' in result
+        assert 'Introduccion' in result
+    finally:
+        os.unlink(path)
+
+
+def test_epub_corrupto_lanza_excepcion():
+    """epub corrupto lanza Exception."""
+    import tempfile
+    tmp = tempfile.NamedTemporaryFile(suffix='.epub', delete=False)
+    tmp.write(b'esto no es un epub valido')
+    tmp.close()
+    try:
+        with pytest.raises(Exception):
+            epub_to_markdown(tmp.name)
+    finally:
+        os.unlink(tmp.name)
@@ -0,0 +1,37 @@
+---
+name: estimate_token_count
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def estimate_token_count(content: str) -> int"
+description: "Estimacion rapida de tokens sin tokenizer. CJK chars cuentan ~0.7 token/char, otros non-whitespace ~0.3 token/char."
+tags: [tokens, estimation, nlp, cjk, text]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: [re]
+tested: true
+tests:
+  - "texto vacio retorna cero"
+  - "solo latin"
+  - "solo CJK"
+  - "texto mixto"
+test_file_path: "python/functions/core/parse_markdown_test.py"
+file_path: "python/functions/core/core.py"
+---
+
+## Ejemplo
+
+```python
+estimate_token_count("hello world")  # 3
+estimate_token_count("中文语")        # 2  (3 * 0.7 = 2.1, parte entera 2)
+estimate_token_count("")             # 0
+```
+
+## Notas
+
+Funcion pura. No requiere ninguna dependencia externa. Precision aproximada: util para guardianes de limite de contexto antes de llamar a LLMs, no para conteo exacto de tokens BPE. CJK range: `[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af]` (CJK unificado, Hiragana/Katakana, Hangul).
@@ -0,0 +1,58 @@
+---
+name: excel_to_markdown
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "def excel_to_markdown(path: str, max_rows_per_sheet: int = 1000) -> str"
+description: "Convierte un archivo Excel (.xlsx, .xls, .xlsm) a markdown con cada sheet como seccion H2. Soporta tipos de celda: fechas ISO, booleanos, errores Excel, numeros enteros y flotantes. Trunca sheets que superen max_rows_per_sheet."
+tags: [excel, markdown, xlsx, xls, conversion, parser, io]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: ["openpyxl", "xlrd"]
+tested: true
+tests:
+  - "xlsx con multiples sheets produce una seccion H2 por sheet"
+  - "sheet vacio produce nota de sheet vacio"
+  - "sheet truncado con nota de filas omitidas"
+  - "sheet con formulas data_only muestra valores calculados"
+  - "extension no soportada lanza ValueError"
+  - "archivo inexistente lanza FileNotFoundError"
+  - "dimensiones del sheet en metadata"
+  - "tabla markdown con formato correcto"
+test_file_path: "python/functions/core/excel_to_markdown_test.py"
+file_path: "python/functions/core/excel_to_markdown.py"
+---
+
+## Ejemplo
+
+```python
+from excel_to_markdown import excel_to_markdown
+
+md = excel_to_markdown("report.xlsx")
+print(md)
+# ## Sheet: Ventas
+#
+# **Dimensions:** 101 x 4
+#
+# | Producto | Precio | Cantidad | Total |
+# | --- | --- | --- | --- |
+# | Manzana | 1 | 100 | 100 |
+# ...
+
+# Con limite de filas
+md = excel_to_markdown("big_file.xlsx", max_rows_per_sheet=50)
+```
+
+## Notas
+
+- `.xlsx` y `.xlsm`: usa `openpyxl` con `data_only=True` (lee valores calculados, no formulas).
+- `.xls` (legacy): usa `xlrd`. Manejo de tipos especiales: EMPTY/BLANK → "", DATE → ISO 8601, BOOLEAN → "TRUE"/"FALSE", ERROR → codigo Excel (#NULL!, #DIV/0!, etc.), NUMBER → entero si no tiene decimales.
+- Fechas sin hora se formatean como `YYYY-MM-DD`; con hora como `YYYY-MM-DDTHH:MM:SS`.
+- Los pipes `|` dentro de celdas se escapan como `\|`.
+- Si `xlwt` no esta disponible, los tests .xls se saltan (xlwt solo se necesita para crear fixtures, no para leer).
+- Reimplementacion desde cero, inspirada conceptualmente en OpenViking (AGPL-3.0). Sin codigo copiado.
@@ -0,0 +1,211 @@
+"""Convierte archivos Excel a Markdown con cada sheet como seccion H2."""
+
+import os
+from pathlib import Path
+
+
+# Codigos de error Excel para xlrd
+_XL_ERROR_CODES = {
+    0: "#NULL!",
+    7: "#DIV/0!",
+    15: "#VALUE!",
+    23: "#REF!",
+    29: "#NAME?",
+    36: "#NUM!",
+    42: "#N/A",
+}
+
+
+def _rows_to_markdown_table(rows: list[list[str]]) -> str:
+    """Convierte filas de strings a tabla markdown."""
+    if not rows:
+        return ""
+
+    header = rows[0]
+    col_count = len(header)
+
+    # Normalizar todas las filas al mismo numero de columnas
+    normalized = []
+    for row in rows:
+        if len(row) < col_count:
+            row = row + [""] * (col_count - len(row))
+        normalized.append(row[:col_count])
+
+    # Escapar pipes en celdas
+    def escape(cell: str) -> str:
+        return cell.replace("|", "\\|").replace("\n", " ")
+
+    lines = []
+    # Header
+    lines.append("| " + " | ".join(escape(c) for c in normalized[0]) + " |")
+    # Separator
+    lines.append("| " + " | ".join("---" for _ in range(col_count)) + " |")
+    # Data rows
+    for row in normalized[1:]:
+        lines.append("| " + " | ".join(escape(c) for c in row) + " |")
+
+    return "\n".join(lines)
+
+
+def _cell_value_xlrd(cell, workbook) -> str:
+    """Convierte una celda xlrd a string segun su tipo."""
+    import xlrd
+
+    ctype = cell.ctype
+
+    if ctype in (xlrd.XL_CELL_EMPTY, xlrd.XL_CELL_BLANK):
+        return ""
+    elif ctype == xlrd.XL_CELL_DATE:
+        try:
+            dt = xlrd.xldate_as_datetime(cell.value, workbook.datemode)
+            if dt.hour == 0 and dt.minute == 0 and dt.second == 0:
+                return dt.date().isoformat()
+            return dt.isoformat()
+        except Exception:
+            return str(cell.value)
+    elif ctype == xlrd.XL_CELL_BOOLEAN:
+        return "TRUE" if cell.value else "FALSE"
+    elif ctype == xlrd.XL_CELL_ERROR:
+        return _XL_ERROR_CODES.get(int(cell.value), "#ERROR!")
+    elif ctype == xlrd.XL_CELL_NUMBER:
+        v = cell.value
+        if v == int(v):
+            return str(int(v))
+        return str(v)
+    elif ctype == xlrd.XL_CELL_TEXT:
+        return str(cell.value)
+    else:
+        return str(cell.value)
+
+
+def _sheet_xlrd(sheet, workbook, max_rows: int) -> str:
+    """Convierte un sheet xlrd a markdown."""
+    nrows = sheet.nrows
+    ncols = sheet.ncols
+
+    lines = []
+    lines.append(f"## Sheet: {sheet.name}")
+    lines.append("")
+    lines.append(f"**Dimensions:** {nrows} x {ncols}")
+    lines.append("")
+
+    if nrows == 0 or ncols == 0:
+        lines.append("*(empty sheet)*")
+        return "\n".join(lines)
+
+    display_rows = min(nrows, max_rows)
+    rows = []
+    for r in range(display_rows):
+        row_data = [_cell_value_xlrd(sheet.cell(r, c), workbook) for c in range(ncols)]
+        rows.append(row_data)
+
+    lines.append(_rows_to_markdown_table(rows))
+
+    if nrows > max_rows:
+        omitted = nrows - max_rows
+        lines.append("")
+        lines.append(f"*{omitted} rows omitted (max_rows_per_sheet={max_rows})*")
+
+    return "\n".join(lines)
+
+
+def _cell_value_openpyxl(cell) -> str:
+    """Convierte una celda openpyxl a string."""
+    v = cell.value
+    if v is None:
+        return ""
+    if isinstance(v, bool):
+        return "TRUE" if v else "FALSE"
+    if isinstance(v, float):
+        if v == int(v):
+            return str(int(v))
+        return str(v)
+    if isinstance(v, int):
+        return str(v)
+    # Fechas y datetimes
+    import datetime
+    if isinstance(v, datetime.datetime):
+        if v.hour == 0 and v.minute == 0 and v.second == 0:
+            return v.date().isoformat()
+        return v.isoformat()
+    if isinstance(v, datetime.date):
+        return v.isoformat()
+    return str(v)
+
+
+def _sheet_openpyxl(ws, max_rows: int) -> str:
+    """Convierte un worksheet openpyxl a markdown."""
+    all_rows = list(ws.iter_rows())
+    nrows = len(all_rows)
+    ncols = ws.max_column or 0
+
+    lines = []
+    lines.append(f"## Sheet: {ws.title}")
+    lines.append("")
+    lines.append(f"**Dimensions:** {nrows} x {ncols}")
+    lines.append("")
+
+    if nrows == 0 or ncols == 0:
+        lines.append("*(empty sheet)*")
+        return "\n".join(lines)
+
+    display_rows = min(nrows, max_rows)
+    rows = []
+    for row in all_rows[:display_rows]:
+        row_data = [_cell_value_openpyxl(cell) for cell in row]
+        rows.append(row_data)
+
+    lines.append(_rows_to_markdown_table(rows))
+
+    if nrows > max_rows:
+        omitted = nrows - max_rows
+        lines.append("")
+        lines.append(f"*{omitted} rows omitted (max_rows_per_sheet={max_rows})*")
+
+    return "\n".join(lines)
+
+
+def excel_to_markdown(path: str, max_rows_per_sheet: int = 1000) -> str:
+    """Convierte un archivo Excel (.xlsx, .xls, .xlsm) a markdown.
+
+    Cada sheet se convierte en una seccion H2. Las filas se representan
+    como tablas markdown. Si el numero de filas supera max_rows_per_sheet,
+    el sheet se trunca y se añade una nota.
+
+    Args:
+        path: Ruta al archivo Excel (.xlsx, .xls, .xlsm).
+        max_rows_per_sheet: Maximo de filas a incluir por sheet (default 1000).
+
+    Returns:
+        String markdown con todos los sheets del archivo.
+
+    Raises:
+        ValueError: Si la extension no es soportada.
+        FileNotFoundError: Si el archivo no existe.
+        Exception: Si hay errores leyendo el archivo.
+    """
+    p = Path(path)
+    if not p.exists():
+        raise FileNotFoundError(f"File not found: {path}")
+
+    ext = p.suffix.lower()
+
+    if ext == ".xls":
+        import xlrd
+        wb = xlrd.open_workbook(path)
+        sections = []
+        for sheet_name in wb.sheet_names():
+            sheet = wb.sheet_by_name(sheet_name)
+            sections.append(_sheet_xlrd(sheet, wb, max_rows_per_sheet))
+        return "\n\n".join(sections)
+
+    elif ext in (".xlsx", ".xlsm"):
+        import openpyxl
+        wb = openpyxl.load_workbook(path, data_only=True)
+        sections = []
+        for ws in wb.worksheets:
+            sections.append(_sheet_openpyxl(ws, max_rows_per_sheet))
+        return "\n\n".join(sections)
+
+    else:
+        raise ValueError(f"Unsupported extension '{ext}'. Use .xlsx, .xls, or .xlsm.")
@@ -0,0 +1,142 @@
+"""Tests para excel_to_markdown."""
+
+import datetime
+import os
+import sys
+import tempfile
+
+import openpyxl
+import pytest
+
+sys.path.insert(0, os.path.dirname(__file__))
+from excel_to_markdown import excel_to_markdown
+
+
+def _make_xlsx(sheets: dict, filename: str) -> str:
+    """Crea un archivo .xlsx temporal con los sheets dados."""
+    wb = openpyxl.Workbook()
+    first = True
+    for sheet_name, rows in sheets.items():
+        if first:
+            ws = wb.active
+            ws.title = sheet_name
+            first = False
+        else:
+            ws = wb.create_sheet(sheet_name)
+        for row in rows:
+            ws.append(row)
+    path = os.path.join(tempfile.mkdtemp(), filename)
+    wb.save(path)
+    return path
+
+
+def test_xlsx_multiples_sheets():
+    """xlsx con multiples sheets produce una seccion H2 por sheet."""
+    path = _make_xlsx(
+        {
+            "Ventas": [["Producto", "Precio", "Cantidad"], ["Manzana", 1.5, 100], ["Pera", 2.0, 50]],
+            "Resumen": [["Total", "Importe"], ["150", "225.0"]],
+        },
+        "multi.xlsx",
+    )
+    result = excel_to_markdown(path)
+
+    assert "## Sheet: Ventas" in result
+    assert "## Sheet: Resumen" in result
+    assert "Producto" in result
+    assert "Manzana" in result
+    assert "Total" in result
+
+
+def test_sheet_vacio():
+    """Sheet sin filas produce nota de sheet vacio."""
+    path = _make_xlsx({"Vacio": []}, "empty.xlsx")
+    result = excel_to_markdown(path)
+
+    assert "## Sheet: Vacio" in result
+    assert "empty sheet" in result
+
+
+def test_sheet_truncado():
+    """Sheet con mas filas que max_rows_per_sheet se trunca con nota."""
+    rows = [["col"]] + [[str(i)] for i in range(20)]
+    path = _make_xlsx({"Data": rows}, "big.xlsx")
+    result = excel_to_markdown(path, max_rows_per_sheet=5)
+
+    assert "omitted" in result
+    # 21 filas totales, 5 mostradas -> 16 omitidas
+    assert "16 rows omitted" in result
+
+
+def test_sheet_con_formulas_data_only():
+    """Archivo xlsx abierto con data_only=True muestra valores calculados (o None si no guardados)."""
+    wb = openpyxl.Workbook()
+    ws = wb.active
+    ws.title = "Formulas"
+    ws.append(["A", "B", "Suma"])
+    ws.append([1, 2, "=A2+B2"])
+    path = os.path.join(tempfile.mkdtemp(), "formulas.xlsx")
+    wb.save(path)
+
+    result = excel_to_markdown(path)
+    assert "## Sheet: Formulas" in result
+    # La celda formula puede ser None con data_only=True si no fue guardada con valor
+    assert "Suma" in result
+
+
+def test_xls_legacy_con_fechas():
+    """xls legacy: la funcion debe aceptar .xls (via xlrd) y manejar fechas."""
+    # Creamos un .xls usando xlwt si disponible, si no lo saltamos
+    pytest.importorskip("xlwt", reason="xlwt no disponible para crear .xls de prueba")
+    import xlwt
+
+    wb = xlwt.Workbook()
+    ws = wb.add_sheet("Fechas")
+    ws.write(0, 0, "Nombre")
+    ws.write(0, 1, "Fecha")
+    ws.write(1, 0, "Evento A")
+
+    date_format = xlwt.XFStyle()
+    date_format.num_format_str = "YYYY-MM-DD"
+    ws.write(1, 1, datetime.date(2024, 1, 15).toordinal() - 693594, date_format)
+
+    path = os.path.join(tempfile.mkdtemp(), "legacy.xls")
+    wb.save(path)
+
+    result = excel_to_markdown(path)
+    assert "## Sheet: Fechas" in result
+    assert "Evento A" in result
+
+
+def test_extension_no_soportada():
+    """Extension no soportada lanza ValueError."""
+    path = os.path.join(tempfile.mkdtemp(), "data.csv")
+    with open(path, "w") as f:
+        f.write("a,b\n1,2\n")
+
+    with pytest.raises(ValueError, match="Unsupported extension"):
+        excel_to_markdown(path)
+
+
+def test_archivo_no_existe():
+    """Archivo inexistente lanza FileNotFoundError."""
+    with pytest.raises(FileNotFoundError):
+        excel_to_markdown("/tmp/no_existe_para_nada.xlsx")
+
+
+def test_dimensiones_en_metadata():
+    """El markdown incluye dimensiones del sheet."""
+    path = _make_xlsx({"Hoja1": [["A", "B"], [1, 2], [3, 4]]}, "dims.xlsx")
+    result = excel_to_markdown(path)
+    assert "**Dimensions:**" in result
+    assert "3 x 2" in result
+
+
+def test_tabla_markdown_formato():
+    """La tabla tiene formato correcto con separador de header."""
+    path = _make_xlsx({"Datos": [["Col1", "Col2"], ["val1", "val2"]]}, "fmt.xlsx")
+    result = excel_to_markdown(path)
+    # Debe tener linea separadora con ---
+    assert "| --- |" in result or "| --- | --- |" in result
+    assert "Col1" in result
+    assert "val1" in result
@@ -0,0 +1,43 @@
+---
+name: extract_frontmatter
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def extract_frontmatter(content: str) -> tuple[str, dict | None]"
+description: "Extrae YAML frontmatter (delimitado por ---) del inicio de un string markdown. Retorna el contenido sin frontmatter y el dict parseado (o None si no hay)."
+tags: [markdown, frontmatter, yaml, parsing]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: [re, yaml]
+tested: true
+tests:
+  - "contenido con frontmatter"
+  - "sin frontmatter retorna None"
+  - "frontmatter vacio"
+  - "frontmatter con listas"
+test_file_path: "python/functions/core/parse_markdown_test.py"
+file_path: "python/functions/core/core.py"
+---
+
+## Ejemplo
+
+```python
+content = "---\ntitle: Hello\nauthor: Alice\n---\n# Body\n"
+remaining, data = extract_frontmatter(content)
+# remaining = "# Body\n"
+# data = {"title": "Hello", "author": "Alice"}
+
+no_fm = "# Just markdown\n\nNo frontmatter."
+remaining, data = extract_frontmatter(no_fm)
+# remaining == no_fm
+# data is None
+```
+
+## Notas
+
+Funcion pura. Usa `yaml.safe_load` si PyYAML esta disponible; si no, recurre a un parser simple de `key: value`. Solo reconoce frontmatter al inicio estricto del string (posicion 0). El bloque debe estar delimitado por `---\n` de apertura y `\n---\n` de cierre.
@@ -0,0 +1,36 @@
+---
+name: extract_json_from_llm
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def extract_json_from_llm(content: str) -> dict"
+description: "Extrae y parsea JSON de respuestas LLM. Maneja bloques ```json, trailing commas, None->null."
+tags: [json, llm, parsing, extraction]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: [json]
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/core.py"
+source_repo: "https://github.com/VectifyAI/PageIndex"
+source_license: "MIT"
+source_file: "pageindex/utils.py"
+---
+
+## Ejemplo
+
+```python
+raw = '```json\n{"key": "value", "items": [1, 2, 3,]}\n```'
+result = extract_json_from_llm(raw)
+# {"key": "value", "items": [1, 2, 3]}
+```
+
+## Notas
+
+Funcion pura. Maneja errores comunes de LLMs: trailing commas, `None` en lugar de `null`, whitespace extra. Retorna dict vacio si el JSON es irrecuperable.
@@ -0,0 +1,36 @@
+---
+name: extract_markdown_headers
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def extract_markdown_headers(markdown_content: str) -> tuple[list[dict], list[str]]"
+description: "Extrae todos los headers (h1-h6) de markdown con nivel y numero de linea, ignorando code blocks."
+tags: [markdown, parsing, headers, extraction]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: [re]
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/core.py"
+source_repo: "https://github.com/VectifyAI/PageIndex"
+source_license: "MIT"
+source_file: "pageindex/page_index_md.py"
+---
+
+## Ejemplo
+
+```python
+md = "# Title\n\nSome text\n\n## Section\n\n```\n# not a header\n```"
+headers, lines = extract_markdown_headers(md)
+# headers = [{"title": "Title", "level": 1, "line_num": 1}, {"title": "Section", "level": 2, "line_num": 5}]
+```
+
+## Notas
+
+Funcion pura. Detecta y omite bloques de codigo (triple backtick). Retorna tupla: (lista de headers, lista de lineas originales).
@@ -0,0 +1,37 @@
+---
+name: extract_pdf_bookmarks
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "def extract_pdf_bookmarks(pdf) -> list[dict]"
+description: "Extrae la estructura de bookmarks/outlines de un PDF abierto con pdfplumber. Retorna lista de dicts con level (1-6), title y page_num."
+tags: [pdf, bookmarks, outlines, parsing, pdfplumber]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [pdfplumber]
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/extract_pdf_bookmarks.py"
+---
+
+## Ejemplo
+
+```python
+import pdfplumber
+from extract_pdf_bookmarks import extract_pdf_bookmarks
+
+with pdfplumber.open("document.pdf") as pdf:
+    bookmarks = extract_pdf_bookmarks(pdf)
+    for bm in bookmarks:
+        print(f"{'#' * bm['level']} {bm['title']} (page {bm['page_num']})")
+```
+
+## Notas
+
+Recibe un objeto `pdfplumber.PDF` ya abierto (no un path). Construye un mapping interno `objid -> page_number` desde `pdf.pages` para resolver los destinos de outline. El nivel se limita al rango [1, 6] para compatibilidad markdown. Retorna lista vacia si el PDF no tiene outlines o si `get_outlines()` falla. Impure porque accede al estado interno de un objeto PDF ya abierto.
@@ -0,0 +1,63 @@
+"""Extract the bookmark/outline structure from a PDF opened with pdfplumber."""
+
+import pdfplumber
+
+
+def extract_pdf_bookmarks(pdf: pdfplumber.PDF) -> list[dict]:
+    """Extract bookmarks/outlines from an open pdfplumber PDF object.
+
+    Args:
+        pdf: An open pdfplumber.PDF object.
+
+    Returns:
+        list[dict]: List of {"level": int, "title": str, "page_num": int | None}.
+                    Level is clamped to [1, 6]. Returns empty list if no outlines.
+    """
+    try:
+        outlines = pdf.doc.get_outlines()
+    except Exception:
+        return []
+
+    if not outlines:
+        return []
+
+    # Build objid -> page_number mapping
+    objid_to_page: dict[int, int] = {}
+    for i, page in enumerate(pdf.pages):
+        try:
+            obj = page.page_obj
+            objid_to_page[obj.objid] = i + 1  # 1-indexed page numbers
+        except Exception:
+            pass
+
+    bookmarks = []
+    for item in outlines:
+        try:
+            level = item[0]  # integer level from get_outlines
+            title = item[1]
+            dest = item[2]  # destination: page object or list
+
+            # Clamp level to [1, 6]
+            level = max(1, min(6, level))
+
+            # Resolve destination to page number
+            page_num = None
+            if dest is not None:
+                if isinstance(dest, list) and len(dest) > 0:
+                    # dest[0] is the page object
+                    page_obj = dest[0]
+                    try:
+                        page_num = objid_to_page.get(page_obj.objid)
+                    except Exception:
+                        pass
+                else:
+                    try:
+                        page_num = objid_to_page.get(dest.objid)
+                    except Exception:
+                        pass
+
+            bookmarks.append({"level": level, "title": str(title), "page_num": page_num})
+        except Exception:
+            continue
+
+    return bookmarks
@@ -0,0 +1,35 @@
+---
+name: extract_pdf_text
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "def extract_pdf_text(pdf_path: str) -> str"
+description: "Extrae todo el texto de un PDF concatenando todas las paginas. Usa PyPDF2."
+tags: [pdf, text, extraction, parsing]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [PyPDF2]
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/extract_pdf_text.py"
+source_repo: "https://github.com/VectifyAI/PageIndex"
+source_license: "MIT"
+source_file: "pageindex/utils.py"
+---
+
+## Ejemplo
+
+```python
+text = extract_pdf_text("/path/to/document.pdf")
+print(len(text))  # total characters
+```
+
+## Notas
+
+Requiere `pip install PyPDF2`. Extraccion basica de texto — no maneja OCR ni PDFs escaneados. Para PDFs complejos considerar PyMuPDF.
@@ -0,0 +1,19 @@
+"""Extract all text from a PDF file using PyPDF2."""
+
+import PyPDF2
+
+
+def extract_pdf_text(pdf_path: str) -> str:
+    """Extract all text from a PDF file.
+
+    Args:
+        pdf_path: Path to the PDF file.
+
+    Returns:
+        str: Concatenated text from all pages.
+    """
+    pdf_reader = PyPDF2.PdfReader(pdf_path)
+    text = ""
+    for page in pdf_reader.pages:
+        text += page.extract_text() or ""
+    return text
@@ -0,0 +1,51 @@
+---
+name: extract_text_from_file
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "def extract_text_from_file(file_path: str) -> str"
+description: "Extrae texto plano de un archivo. Soporta PDF (PyMuPDF), Markdown y TXT con deteccion automatica de encoding."
+tags: [text, pdf, markdown, txt, encoding, extraction, file, io]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: ["os", "fitz (PyMuPDF)", "charset_normalizer", "chardet"]
+tested: true
+tests:
+  - "PDF con texto extrae contenido correctamente"
+  - "archivo MD UTF-8 retorna contenido"
+  - "archivo TXT latin-1 detecta encoding"
+  - "archivo inexistente lanza FileNotFoundError"
+  - "extension no soportada lanza ValueError"
+test_file_path: "python/functions/core/extract_text_from_file_test.py"
+file_path: "python/functions/core/extract_text_from_file.py"
+---
+
+## Ejemplo
+
+```python
+# PDF
+text = extract_text_from_file("report.pdf")
+
+# Markdown
+text = extract_text_from_file("README.md")
+
+# TXT con encoding desconocido
+text = extract_text_from_file("notes.txt")
+```
+
+## Notas
+
+Para PDF usa PyMuPDF (`fitz`) que produce mejor texto que PyPDF2, especialmente en PDFs con columnas o layout complejo. Las paginas se unen con `\n\n`.
+
+La deteccion de encoding para archivos de texto sigue este orden de prioridad:
+1. Intenta UTF-8 directamente
+2. `charset_normalizer.from_bytes().best().encoding`
+3. `chardet.detect(data)["encoding"]`
+4. UTF-8 con `errors='replace'` como ultimo recurso
+
+Diferencia con `extract_pdf_text_py_core`: esa funcion usa PyPDF2 y solo soporta PDF. Esta funcion usa PyMuPDF y soporta ademas MD y TXT con deteccion de encoding.
@@ -0,0 +1,92 @@
+"""Extract plain text from PDF, Markdown, or TXT files."""
+
+
+# Lower-case file extensions (with leading dot) listed in the
+# unsupported-extension error message.
+SUPPORTED_EXTENSIONS = {".pdf", ".md", ".markdown", ".txt"}
+
+
+def _detect_encoding(data: bytes) -> str:
+    """Detect encoding of raw bytes using multiple fallback strategies."""
+    # Strategy 1: UTF-8
+    try:
+        data.decode("utf-8")
+        return "utf-8"
+    except UnicodeDecodeError:
+        pass
+
+    # Strategy 2: charset_normalizer
+    try:
+        from charset_normalizer import from_bytes
+
+        result = from_bytes(data).best()
+        if result is not None and result.encoding:
+            return result.encoding
+    except ImportError:
+        pass
+
+    # Strategy 3: chardet
+    try:
+        import chardet
+
+        detected = chardet.detect(data)
+        if detected and detected.get("encoding"):
+            return detected["encoding"]
+    except ImportError:
+        pass
+
+    # Last resort: UTF-8 with replacement
+    return "utf-8"
+
+
+def extract_text_from_file(file_path: str) -> str:
+    """Extract plain text from a file. Supports PDF, Markdown and TXT.
+
+    For PDF files uses PyMuPDF (fitz) to extract text from each page,
+    joining them with double newlines. For text-based files (.md, .markdown,
+    .txt) reads the file with automatic encoding detection.
+
+    Args:
+        file_path: Absolute or relative path to the file.
+
+    Returns:
+        str: Extracted plain text content.
+
+    Raises:
+        FileNotFoundError: If the file does not exist.
+        ValueError: If the file extension is not supported.
+        ImportError: If PyMuPDF is not installed and a PDF is provided.
+    """
+    import os
+
+    if not os.path.exists(file_path):
+        raise FileNotFoundError(f"File not found: {file_path}")
+
+    _, ext = os.path.splitext(file_path.lower())
+
+    if ext == ".pdf":
+        try:
+            import fitz  # PyMuPDF
+        except ImportError as e:
+            raise ImportError(
+                "PyMuPDF is required for PDF extraction. "
+                "Install it with: pip install PyMuPDF"
+            ) from e
+
+        doc = fitz.open(file_path)
+        pages = [page.get_text() for page in doc]
+        return "\n\n".join(pages)
+
+    elif ext in {".md", ".markdown", ".txt"}:
+        with open(file_path, "rb") as f:
+            raw = f.read()
+
+        encoding = _detect_encoding(raw)
+        try:
+            return raw.decode(encoding)
+        except (UnicodeDecodeError, LookupError):
+            return raw.decode("utf-8", errors="replace")
+
+    else:
+        raise ValueError(
+            f"Unsupported file extension: '{ext}'. "
+            f"Supported: {', '.join(sorted(SUPPORTED_EXTENSIONS))}"
+        )
@@ -0,0 +1,83 @@
+"""Tests para extract_text_from_file."""
+
+import os
+import sys
+import tempfile
+
+import pytest
+
+sys.path.insert(0, os.path.dirname(__file__))
+from extract_text_from_file import extract_text_from_file
+
+
+def test_pdf_con_texto_extrae_contenido_correctamente():
+    """PDF con texto extrae contenido correctamente."""
+    try:
+        import fitz
+    except ImportError:
+        pytest.skip("PyMuPDF no instalado")
+
+    # Create a minimal in-memory PDF using PyMuPDF and write it to a temp file
+    doc = fitz.open()
+    page = doc.new_page()
+    page.insert_text((72, 72), "Hello from PDF")
+    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f:
+        tmp_path = f.name
+    try:
+        doc.save(tmp_path)
+        doc.close()
+        result = extract_text_from_file(tmp_path)
+        assert "Hello from PDF" in result
+    finally:
+        os.unlink(tmp_path)
+
+
+def test_archivo_md_utf8_retorna_contenido():
+    """archivo MD UTF-8 retorna contenido."""
+    content = "# Titulo\n\nParrafo con texto UTF-8: cafe, senor, japon.\n"
+    with tempfile.NamedTemporaryFile(
+        suffix=".md", mode="wb", delete=False
+    ) as f:
+        f.write(content.encode("utf-8"))
+        tmp_path = f.name
+    try:
+        result = extract_text_from_file(tmp_path)
+        assert "# Titulo" in result
+        assert "cafe" in result
+    finally:
+        os.unlink(tmp_path)
+
+
+def test_archivo_txt_latin1_detecta_encoding():
+    """archivo TXT latin-1 detecta encoding."""
+    content = "Texto en latin-1: cafe, hotel, naive\n"
+    with tempfile.NamedTemporaryFile(
+        suffix=".txt", mode="wb", delete=False
+    ) as f:
+        f.write(content.encode("latin-1"))
+        tmp_path = f.name
+    try:
+        result = extract_text_from_file(tmp_path)
+        # The word "cafe" or similar should appear in the decoded result
+        assert len(result) > 0
+        assert "cafe" in result or "caf" in result
+    finally:
+        os.unlink(tmp_path)
+
+
+def test_archivo_inexistente_lanza_filenotfounderror():
+    """archivo inexistente lanza FileNotFoundError."""
+    with pytest.raises(FileNotFoundError):
+        extract_text_from_file("/tmp/no_existe_este_archivo_12345.txt")
+
+
+def test_extension_no_soportada_lanza_valueerror():
+    """extension no soportada lanza ValueError."""
+    with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as f:
+        f.write(b"fake docx content")
+        tmp_path = f.name
+    try:
+        with pytest.raises(ValueError, match="Unsupported file extension"):
+            extract_text_from_file(tmp_path)
+    finally:
+        os.unlink(tmp_path)
@@ -0,0 +1,50 @@
+---
+name: fetch_and_parse_url
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "def fetch_and_parse_url(url: str, timeout: float = 30.0) -> str"
+description: "Descarga una pagina web y la convierte a markdown. Combina detect_url_type + fetch HTML + html_to_markdown en una sola operacion."
+tags: [http, fetch, html, markdown, parse, url, scraping]
+uses_functions:
+  - detect_url_type_py_core
+  - html_to_markdown_py_core
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: ["httpx"]
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/fetch_and_parse_url.py"
+---
+
+## Ejemplo
+
+```python
+from core.fetch_and_parse_url import fetch_and_parse_url
+
+# Descargar y convertir una pagina web
+md = fetch_and_parse_url("https://example.com")
+print(md)
+
+# Con timeout personalizado
+md = fetch_and_parse_url("https://en.wikipedia.org/wiki/Python", timeout=15.0)
+```
+
+## Notas
+
+Algoritmo:
+1. `detect_url_type(url)` determina el tipo de contenido (por patron, extension o HEAD request).
+2. Si es `code_repository` → lanza Exception (requiere git clone, no HTTP fetch).
+3. Si es `pdf` → lanza Exception (requiere pdfminer/pypdf, no incluido).
+4. `httpx.get(url)` descarga el contenido con follow_redirects.
+5. Si es `webpage` o Content-Type HTML → `html_to_markdown(raw_html)`.
+6. Si es `markdown`, `text` o codigo → retorna el texto directamente.
+
+Lanza `Exception` con mensaje descriptivo en cualquier fallo de red o tipo no soportado.
+
+Funcion impura: hace I/O (HTTP requests).
@@ -0,0 +1,64 @@
+"""Descarga una pagina web y la convierte a markdown."""
+
+from __future__ import annotations
+
+
+def fetch_and_parse_url(url: str, timeout: float = 30.0) -> str:
+    """Download a URL and return its content as markdown.
+
+    The URL is first classified with ``detect_url_type``; the result decides
+    how the request is handled:
+
+    - ``code_repository`` and ``pdf``: not supported. An ``Exception`` is
+      raised before any GET request is made (no stub value is returned).
+    - ``webpage``, or any response whose Content-Type contains
+      ``text/html``: the body is converted with ``html_to_markdown``.
+    - anything else (markdown, plain text, code): the response text is
+      returned unchanged.
+
+    Args:
+        url: URL to download and parse.
+        timeout: Timeout in seconds, passed both to ``detect_url_type`` and
+            to the GET request.
+
+    Returns:
+        The content of the URL as markdown (or the raw text for non-HTML
+        responses).
+
+    Raises:
+        Exception: If the URL type is unsupported, the server answers with
+            an HTTP error status, or the request fails for any other reason
+            (the original error is chained as ``__cause__``).
+    """
+    import httpx
+
+    from detect_url_type import detect_url_type
+    from html_to_markdown import html_to_markdown
+
+    # URL types that cannot be served by a plain HTTP GET, with the reason
+    # reported to the caller.
+    unsupported = {
+        "code_repository": "code_repository URLs require git clone, not supported.",
+        "pdf": "PDF parsing requires external dependency (pdfminer/pypdf).",
+    }
+
+    # NOTE(review): per the registry notes this classification may itself
+    # issue a HEAD request; errors raised there propagate unwrapped.
+    url_type, _meta = detect_url_type(url, timeout=timeout)
+
+    if url_type in unsupported:
+        raise Exception(f"fetch_and_parse_url: {unsupported[url_type]} url={url!r}")
+
+    # Fetch content via GET
+    try:
+        response = httpx.get(url, timeout=timeout, follow_redirects=True)
+        response.raise_for_status()
+    except httpx.HTTPStatusError as exc:
+        raise Exception(
+            f"fetch_and_parse_url: HTTP {exc.response.status_code} for {url!r}"
+        ) from exc
+    except Exception as exc:
+        # Deliberately broad: every transport failure is reported with the
+        # same generic exception type that callers already expect.
+        raise Exception(f"fetch_and_parse_url: request failed for {url!r}: {exc}") from exc
+
+    content_type = response.headers.get("content-type", "").lower()
+    raw_text = response.text
+
+    # The server's Content-Type wins even when the URL pattern did not look
+    # like a web page.
+    if url_type == "webpage" or "text/html" in content_type:
+        return html_to_markdown(raw_text)
+
+    # markdown, text, or code files: return as-is
+    return raw_text
@@ -0,0 +1,38 @@
+---
+name: find_headings
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def find_headings(content: str) -> list[tuple[int, int, str, int]]"
+description: "Encuentra todos los headings markdown (# a ######), excluyendo los que estan dentro de code blocks, HTML comments y bloques indentados. Retorna lista de (start_pos, end_pos, title, level)."
+tags: [markdown, headings, parsing, extraction]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: [re]
+tested: true
+tests:
+  - "headings normales detectados correctamente"
+  - "headings dentro de code blocks no detectados"
+  - "headings escapados ignorados"
+  - "headings en HTML comments ignorados"
+test_file_path: "python/functions/core/parse_markdown_test.py"
+file_path: "python/functions/core/core.py"
+---
+
+## Ejemplo
+
+```python
+content = "# Title\n\nSome text\n\n## Section\n\n```\n# Ignored\n```\n"
+headings = find_headings(content)
+# [(0, 7, "Title", 1), (22, 33, "Section", 2)]
+# (positions approximated)
+```
+
+## Notas
+
+Funcion pura. Excluye tres tipos de contextos: bloques de codigo triple backtick, comentarios HTML (`<!-- ... -->`), y lineas indentadas con 4 espacios o tabulacion. Tambien filtra headings precedidos por backslash (`\#`). Diferencia clave respecto a `extract_markdown_headers`: esta funcion retorna posiciones de caracter, no numeros de linea, lo que facilita la extraccion de contenido entre headings.
@@ -0,0 +1,36 @@
+---
+name: flatten_tree
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def flatten_tree(structure: Any) -> list[dict]"
+description: "Aplana un arbol jerarquico (dict con 'nodes') a lista plana sin hijos. Deep copy de cada nodo."
+tags: [tree, flatten, hierarchy, functional]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: [copy]
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/core.py"
+source_repo: "https://github.com/VectifyAI/PageIndex"
+source_license: "MIT"
+source_file: "pageindex/utils.py"
+---
+
+## Ejemplo
+
+```python
+tree = [{"title": "A", "nodes": [{"title": "A1", "nodes": []}]}]
+flatten_tree(tree)
+# [{"title": "A"}, {"title": "A1"}]
+```
+
+## Notas
+
+Funcion pura. Usa deep copy para no mutar el arbol original. Elimina el campo 'nodes' de cada nodo aplanado.
@@ -0,0 +1,49 @@
+---
+name: format_iso8601
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "format_iso8601(dt: datetime) -> str"
+description: "Formatea un datetime a ISO 8601 UTC con milisegundos. Formato: yyyy-MM-ddTHH:mm:ss.SSSZ. Si naive asume UTC, si aware convierte a UTC."
+tags: [datetime, iso8601, format, time, utc]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: ["datetime"]
+tested: true
+tests:
+  - "datetime naive formateado como UTC"
+  - "datetime con timezone convertido a UTC"
+  - "datetime UTC sin conversion"
+test_file_path: "python/functions/core/format_iso8601_test.py"
+file_path: "python/functions/core/format_iso8601.py"
+---
+
+## Ejemplo
+
+```python
+from datetime import datetime, timezone, timedelta
+from format_iso8601 import format_iso8601
+
+# Naive (asume UTC)
+s = format_iso8601(datetime(2026, 2, 21, 13, 20, 23, 147000))
+# "2026-02-21T13:20:23.147Z"
+
+# Con timezone +8
+tz8 = timezone(timedelta(hours=8))
+s = format_iso8601(datetime(2026, 2, 21, 21, 20, 23, 147000, tzinfo=tz8))
+# "2026-02-21T13:20:23.147Z"
+```
+
+## Notas
+
+Algoritmo:
+1. Si naive: `dt.replace(tzinfo=timezone.utc)`.
+2. Si aware: `dt.astimezone(timezone.utc)`.
+3. `dt.isoformat(timespec="milliseconds").replace("+00:00", "Z")`.
+
+Funcion pura. No hace I/O ni tiene efectos secundarios.
@@ -0,0 +1,24 @@
+"""Formatea un datetime a ISO 8601 UTC con milisegundos."""
+
+from datetime import datetime, timezone
+
+
+def format_iso8601(dt: datetime) -> str:
+    """Render ``dt`` as an ISO 8601 UTC timestamp with millisecond precision.
+
+    Output shape: ``yyyy-MM-ddTHH:mm:ss.SSSZ``.
+
+    A naive datetime (no tzinfo) is taken to be UTC already; an aware
+    datetime is converted to UTC first.
+
+    Args:
+        dt: The datetime to format, naive or aware.
+
+    Returns:
+        The UTC timestamp string, ending in ``Z``.
+    """
+    utc_moment = (
+        dt.replace(tzinfo=timezone.utc)
+        if dt.tzinfo is None
+        else dt.astimezone(timezone.utc)
+    )
+    # The value is in UTC at this point, so the "+00:00" suffix that
+    # isoformat would emit is spelled as "Z" instead.
+    wall_clock = utc_moment.replace(tzinfo=None)
+    return wall_clock.isoformat(timespec="milliseconds") + "Z"
@@ -0,0 +1,28 @@
+"""Tests para format_iso8601."""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.dirname(__file__))
+
+from datetime import datetime, timezone, timedelta
+from format_iso8601 import format_iso8601
+
+
+def test_datetime_naive_formateado_como_utc():
+    """A naive datetime is formatted as if it were already expressed in UTC."""
+    naive_moment = datetime(2026, 2, 21, 13, 20, 23, 147000)
+    assert format_iso8601(naive_moment) == "2026-02-21T13:20:23.147Z"
+
+
+def test_datetime_con_timezone_convertido_a_utc():
+    """An aware datetime at UTC+8 is shifted back eight hours before formatting."""
+    plus_eight = timezone(timedelta(hours=8))
+    local_moment = datetime(2026, 2, 21, 21, 20, 23, 147000, tzinfo=plus_eight)
+    assert format_iso8601(local_moment) == "2026-02-21T13:20:23.147Z"
+
+
+def test_datetime_utc_sin_conversion():
+    """A datetime that is already UTC keeps its clock values unchanged."""
+    utc_moment = datetime(2026, 6, 15, 9, 0, 0, 500000, tzinfo=timezone.utc)
+    assert format_iso8601(utc_moment) == "2026-06-15T09:00:00.500Z"
@@ -0,0 +1,54 @@
+---
+name: format_simplified
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "format_simplified(dt: datetime, now: datetime) -> str"
+description: "Formato humano simplificado: si han pasado menos de 24 horas entre dt y now muestra HH:MM:SS, si no muestra YYYY-MM-DD (ventana de 24 h, no dia calendario)."
+tags: [datetime, format, time, human, display]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: ["datetime"]
+tested: true
+tests:
+  - "mismo dia muestra formato hora"
+  - "dia anterior muestra formato fecha"
+  - "exactamente 24h muestra formato fecha"
+test_file_path: "python/functions/core/format_simplified_test.py"
+file_path: "python/functions/core/format_simplified.py"
+---
+
+## Ejemplo
+
+```python
+from datetime import datetime
+from format_simplified import format_simplified
+
+now = datetime(2026, 2, 21, 15, 0, 0)
+
+# Mismo dia
+s = format_simplified(datetime(2026, 2, 21, 9, 30, 0), now)
+# "09:30:00"
+
+# Dia anterior
+s = format_simplified(datetime(2026, 2, 20, 9, 30, 0), now)
+# "2026-02-20"
+```
+
+## Notas
+
+Algoritmo:
+1. Remover tzinfo de ambos datetimes para comparacion simple (`replace(tzinfo=None)`).
+2. Si `(now - dt).days < 1`: retornar `dt.strftime("%H:%M:%S")`.
+3. Si no: retornar `dt.strftime("%Y-%m-%d")`.
+
+El umbral de 1 dia en `timedelta.days` significa que cualquier diferencia
+menor a 24 horas se muestra como hora. Un dt exactamente 24h atras
+tendra `days == 1`, mostrando fecha.
+
+Funcion pura. No hace I/O ni tiene efectos secundarios.
@@ -0,0 +1,25 @@
+"""Formato humano simplificado de datetime: hora si es hoy, fecha si es otro dia."""
+
+from datetime import datetime
+
+
+def format_simplified(dt: datetime, now: datetime) -> str:
+    """Render ``dt`` compactly relative to the reference moment ``now``.
+
+    When ``dt`` lies less than 24 hours away from ``now`` the clock time
+    ``HH:MM:SS`` is returned; otherwise the date ``YYYY-MM-DD``. The window
+    is a rolling 24 hours, not a calendar day, and it is applied in both
+    directions so a datetime far in the future is shown as a date rather
+    than as a bare, misleading time.
+
+    When both arguments are timezone-aware the distance is measured on the
+    real timeline (their UTC offsets are honoured). If either is naive,
+    tzinfo is dropped from both and the wall-clock values are compared.
+
+    Args:
+        dt: The datetime to format. Its own clock/date fields are what get
+            printed (no timezone conversion is applied to the output).
+        now: The reference datetime (the current moment).
+
+    Returns:
+        ``HH:MM:SS`` when within 24 hours of ``now``, else ``YYYY-MM-DD``.
+    """
+    if dt.utcoffset() is not None and now.utcoffset() is not None:
+        # Both aware: plain subtraction already accounts for differing offsets.
+        elapsed = now - dt
+    else:
+        # Mixing naive and aware values cannot be subtracted directly, so
+        # fall back to comparing wall-clock values.
+        elapsed = now.replace(tzinfo=None) - dt.replace(tzinfo=None)
+
+    # abs(): a negative timedelta has days <= -1, which would otherwise make
+    # every future datetime count as "recent".
+    if abs(elapsed).days < 1:
+        return dt.strftime("%H:%M:%S")
+    return dt.strftime("%Y-%m-%d")
@@ -0,0 +1,30 @@
+"""Tests para format_simplified."""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.dirname(__file__))
+
+from datetime import datetime, timedelta
+from format_simplified import format_simplified
+
+
+def test_mismo_dia_muestra_formato_hora():
+    """A datetime a few hours before the reference is shown as a clock time."""
+    reference = datetime(2026, 2, 21, 15, 0, 0)
+    earlier_today = datetime(2026, 2, 21, 9, 30, 45)
+    assert format_simplified(earlier_today, reference) == "09:30:45"
+
+
+def test_dia_anterior_muestra_formato_fecha():
+    """A datetime more than 24 hours before the reference is shown as a date."""
+    reference = datetime(2026, 2, 21, 15, 0, 0)
+    previous_day = datetime(2026, 2, 20, 9, 30, 45)
+    assert format_simplified(previous_day, reference) == "2026-02-20"
+
+
+def test_exactamente_24h_muestra_formato_fecha():
+    """Exactly 24 hours back falls outside the window and is shown as a date."""
+    reference = datetime(2026, 2, 21, 15, 0, 0)
+    one_day_back = reference - timedelta(hours=24)
+    assert format_simplified(one_day_back, reference) == "2026-02-20"
@@ -0,0 +1,36 @@
+---
+name: format_table_to_markdown
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def format_table_to_markdown(rows: list[list[str]], has_header: bool = True) -> str"
+description: "Convierte una lista 2D de celdas a tabla markdown con alineacion de columnas. Escapa pipes en celdas y añade separador header."
+tags: [markdown, table, formatting, text, pure]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: []
+tested: true
+tests: ["tabla normal", "tabla con celdas vacias", "tabla con 1 fila", "tabla vacia", "celdas con pipes", "sin header"]
+test_file_path: "python/functions/core/format_table_to_markdown_test.py"
+file_path: "python/functions/core/format_table_to_markdown.py"
+---
+
+## Ejemplo
+
+```python
+rows = [["Name", "Age"], ["Alice", "30"], ["Bob", "25"]]
+md = format_table_to_markdown(rows)
+# | Name  | Age |
+# | ----- | --- |
+# | Alice | 30  |
+# | Bob   | 25  |
+```
+
+## Notas
+
+Funcion pura. No tiene dependencias externas. Calcula el ancho maximo por columna para alinear. El separador usa minimo 3 guiones por columna para cumplir con la especificacion markdown. Escapa los pipes dentro de celdas con `\|`. Si `has_header=False`, omite la fila separadora.
@@ -0,0 +1,52 @@
+"""Convert a 2D list of cells to a markdown table with column alignment."""
+
+
+def format_table_to_markdown(rows: list[list[str]], has_header: bool = True) -> str:
+    """Convert a 2D collection of cells into an aligned markdown table.
+
+    Rows may be lists or tuples and may have different lengths; short rows
+    are padded with empty cells. Every cell is converted with ``str()``,
+    pipes are escaped as ``\\|`` and line breaks are replaced with ``<br>``
+    so that each table row stays on a single line.
+
+    Args:
+        rows: Sequence of rows, each a sequence of cell values.
+        has_header: If True, the first row is the header and a separator
+            row of dashes is inserted after it. If False, no separator is
+            emitted.
+
+    Returns:
+        The markdown table, rows joined with ``\\n``. Empty string when
+        ``rows`` is empty.
+    """
+    if not rows:
+        return ""
+
+    def escape_cell(cell) -> str:
+        text = str(cell).replace("\r\n", "\n").replace("\r", "\n")
+        # A raw newline would end the table row; <br> keeps the line break
+        # visible while staying on one line.
+        return text.replace("\n", "<br>").replace("|", "\\|")
+
+    # The widest row decides how many columns the table has.
+    col_count = max(len(row) for row in rows)
+
+    # Escape first, then pad; building a new list per row also accepts tuples.
+    escaped = [
+        [escape_cell(cell) for cell in row] + [""] * (col_count - len(row))
+        for row in rows
+    ]
+
+    # Widths are measured on the escaped text; 3 is the minimum so the
+    # separator always has at least three dashes per column.
+    col_widths = [
+        max(3, *(len(row[c]) for row in escaped))
+        for c in range(col_count)
+    ]
+
+    def format_row(row: list[str]) -> str:
+        cells = (cell.ljust(width) for cell, width in zip(row, col_widths))
+        return "| " + " | ".join(cells) + " |"
+
+    lines = [format_row(row) for row in escaped]
+
+    if has_header:
+        separator = "| " + " | ".join("-" * width for width in col_widths) + " |"
+        lines.insert(1, separator)
+
+    return "\n".join(lines)
@@ -0,0 +1,63 @@
+"""Tests para format_table_to_markdown."""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.dirname(__file__))
+from format_table_to_markdown import format_table_to_markdown
+
+
+def test_tabla_normal():
+    """A regular table is aligned per column and gets a dash separator under the header."""
+    rows = [["Name", "Age", "City"], ["Alice", "30", "Madrid"], ["Bob", "25", "Berlin"]]
+    result = format_table_to_markdown(rows)
+    assert "| Name  | Age | City   |" in result
+    # The separator is as wide as each column (5, 3 and 6 characters here).
+    assert "| ----- | --- | ------ |" in result
+    assert "| Alice | 30  | Madrid |" in result
+    assert "| Bob   | 25  | Berlin |" in result
+
+
+def test_tabla_con_celdas_vacias():
+    """Empty cells do not drop rows from the rendered table."""
+    rendered = format_table_to_markdown([["A", "B"], ["", "x"], ["y", ""]])
+    assert "|" in rendered
+    # header + separator + 2 data rows
+    assert len(rendered.split("\n")) == 4
+
+
+def test_tabla_con_1_fila():
+    """A single row is treated as a header followed only by the separator."""
+    rendered = format_table_to_markdown([["Solo", "Row"]])
+    header_line, *remaining = rendered.split("\n")
+    # header + separator (no data rows)
+    assert len(remaining) + 1 == 2
+    assert "Solo" in header_line
+    assert "---" in remaining[0]
+
+
+def test_tabla_vacia():
+    """No rows at all produce an empty string."""
+    assert format_table_to_markdown([]) == ""
+
+
+def test_celdas_con_pipes():
+    """Pipe characters inside a cell are escaped with a backslash."""
+    rendered = format_table_to_markdown([["Header"], ["cell|with|pipes"]])
+    assert "\\|" in rendered
+
+
+def test_sin_header():
+    """With has_header=False no separator row is emitted."""
+    rendered = format_table_to_markdown([["A", "B"], ["C", "D"]], has_header=False)
+    assert "---" not in rendered
+    assert len(rendered.split("\n")) == 2
+
+
+if __name__ == "__main__":
+    # Allow running the suite directly, without pytest, in declaration order.
+    for _case in (
+        test_tabla_normal,
+        test_tabla_con_celdas_vacias,
+        test_tabla_con_1_fila,
+        test_tabla_vacia,
+        test_celdas_con_pipes,
+        test_sin_header,
+    ):
+        _case()
+    print("All tests passed.")
@@ -0,0 +1,36 @@
+---
+name: format_tree_structure
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def format_tree_structure(structure: Any, order: list[str] = None) -> Any"
+description: "Reordena campos de cada nodo de un arbol segun orden de claves especificado."
+tags: [tree, format, order, structure]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: []
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/core.py"
+source_repo: "https://github.com/VectifyAI/PageIndex"
+source_license: "MIT"
+source_file: "pageindex/utils.py"
+---
+
+## Ejemplo
+
+```python
+tree = [{"text": "...", "title": "Intro", "node_id": "0001"}]
+format_tree_structure(tree, order=["title", "node_id", "text"])
+# [{"title": "Intro", "node_id": "0001", "text": "..."}]
+```
+
+## Notas
+
+Funcion pura. Elimina nodos vacios (nodes=[]) automaticamente. Claves no listadas en order se descartan.
@@ -0,0 +1,49 @@
+---
+name: from_csv
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "from_csv(text: str, delimiter: str = ',', has_header: bool = True) -> list[dict]"
+description: "Parser CSV a datos tabulares. Complemento de to_csv. Soporta campos entre comillas con escaping RFC 4180. Si has_header=False, genera keys col_0, col_1, etc."
+tags: [csv, parser, import, tabular, format]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: []
+tested: true
+tests:
+  - "csv simple con header"
+  - "campos con escaping"
+  - "sin header keys generadas"
+  - "lineas vacias ignoradas"
+  - "un solo campo por fila"
+test_file_path: "python/functions/core/from_csv_test.py"
+file_path: "python/functions/core/from_csv.py"
+---
+
+## Ejemplo
+
+```python
+text = "nombre,edad\r\nAna,30\r\nBob,25"
+rows = from_csv(text)
+# [{"nombre": "Ana", "edad": "30"}, {"nombre": "Bob", "edad": "25"}]
+
+# Sin header
+text = "Ana,30\nBob,25"
+rows = from_csv(text, has_header=False)
+# [{"col_0": "Ana", "col_1": "30"}, {"col_0": "Bob", "col_1": "25"}]
+
+# Con escaping
+text = 'a,b\r\n"dijo ""hola""","uno,dos"'
+rows = from_csv(text)
+# [{"a": 'dijo "hola"', "b": "uno,dos"}]
+```
+
+## Notas
+
+Parser manual sin el modulo csv de stdlib. Normaliza CRLF y LF antes de procesar.
+Ignora lineas vacias. Todos los valores son strings — la conversion de tipos queda a cargo del caller.
@@ -0,0 +1,83 @@
+"""Parser CSV a datos tabulares (RFC 4180). Complemento de to_csv."""
+
+
+def _parse_fields(record: str, delimiter: str) -> tuple[list[str], bool]:
+    """Split one CSV record into fields, honouring quoted fields.
+
+    Returns the parsed fields together with a flag that is True when the
+    text ended while a quoted field was still open, meaning the record
+    continues on the next physical line.
+    """
+    fields: list[str] = []
+    buffer: list[str] = []
+    in_quotes = False
+    pos = 0
+    size = len(record)
+
+    while pos < size:
+        ch = record[pos]
+        if in_quotes:
+            if ch != '"':
+                buffer.append(ch)
+            elif pos + 1 < size and record[pos + 1] == '"':
+                # A doubled quote inside a quoted field is one literal quote.
+                buffer.append('"')
+                pos += 1
+            else:
+                in_quotes = False
+        elif ch == '"' and not buffer:
+            # A quote only opens a quoted field at the very start of the
+            # field; anywhere else it is kept as an ordinary character.
+            in_quotes = True
+        elif ch == delimiter:
+            fields.append("".join(buffer))
+            buffer = []
+        else:
+            buffer.append(ch)
+        pos += 1
+
+    fields.append("".join(buffer))
+    return fields, in_quotes
+
+
+def _parse_row(line: str, delimiter: str) -> list[str]:
+    """Parse a single CSV line into its fields, honouring quoted fields."""
+    return _parse_fields(line, delimiter)[0]
+
+
+def from_csv(
+    text: str,
+    delimiter: str = ",",
+    has_header: bool = True,
+) -> list[dict]:
+    """Parse CSV text into a list of dicts (counterpart of to_csv).
+
+    Quoted fields follow RFC 4180: they may contain the delimiter, doubled
+    quotes (``""`` for a literal quote) and line breaks. CRLF and lone CR
+    are normalised to LF first, so a line break inside a quoted field is
+    always returned as ``\\n``. Blank lines between records are skipped.
+    All values are returned as strings.
+
+    Rows shorter than the header are padded with ``""``; fields beyond the
+    header length are discarded.
+
+    Args:
+        text: The complete CSV content.
+        delimiter: Field separator. Defaults to a comma.
+        has_header: If True, the first record supplies the keys. If False,
+            keys ``col_0``, ``col_1``, ... are generated from the width of
+            the first record.
+
+    Returns:
+        One dict per data record. Empty list when the text has no records
+        or only a header.
+    """
+    physical_lines = text.replace("\r\n", "\n").replace("\r", "\n").split("\n")
+
+    records: list[list[str]] = []
+    index = 0
+    total = len(physical_lines)
+    while index < total:
+        record = physical_lines[index]
+        index += 1
+        if not record.strip():
+            continue
+        fields, open_quote = _parse_fields(record, delimiter)
+        # A quoted field left open means the record spans further physical
+        # lines (blank ones included): keep appending until it closes.
+        while open_quote and index < total:
+            record = f"{record}\n{physical_lines[index]}"
+            index += 1
+            fields, open_quote = _parse_fields(record, delimiter)
+        records.append(fields)
+
+    if not records:
+        return []
+
+    if has_header:
+        headers, data_rows = records[0], records[1:]
+    else:
+        # Column count is taken from the first record.
+        headers = [f"col_{i}" for i in range(len(records[0]))]
+        data_rows = records
+
+    return [
+        {
+            header: (fields[i] if i < len(fields) else "")
+            for i, header in enumerate(headers)
+        }
+        for fields in data_rows
+    ]
@@ -0,0 +1,40 @@
+"""Tests para from_csv."""
+
+from from_csv import from_csv
+
+
+def test_csv_simple_con_header():
+    """CRLF-separated CSV with a header yields one dict per data line."""
+    parsed = from_csv("nombre,edad\r\nAna,30\r\nBob,25")
+    assert len(parsed) == 2
+    first, second = parsed
+    assert first == {"nombre": "Ana", "edad": "30"}
+    assert second == {"nombre": "Bob", "edad": "25"}
+
+
+def test_campos_con_escaping():
+    """Quoted fields may hold doubled quotes and the delimiter itself."""
+    first_row = from_csv('a,b\r\n"dijo ""hola""","uno,dos"')[0]
+    assert first_row["a"] == 'dijo "hola"'
+    assert first_row["b"] == "uno,dos"
+
+
+def test_sin_header_keys_generadas():
+    """Without a header, keys col_0, col_1, ... are generated for every line."""
+    parsed = from_csv("foo,bar\nbaz,qux", has_header=False)
+    assert parsed[0] == {"col_0": "foo", "col_1": "bar"}
+    assert parsed[1] == {"col_0": "baz", "col_1": "qux"}
+
+
+def test_lineas_vacias_ignoradas():
+    """Blank lines between records do not produce rows."""
+    parsed = from_csv("x,y\n\n1,2\n\n3,4\n")
+    assert len(parsed) == 2
+    assert parsed[0] == {"x": "1", "y": "2"}
+
+
+def test_un_solo_campo_por_fila():
+    """Single-column input yields one-key dicts."""
+    parsed = from_csv("valor\nhola\nmundo")
+    assert len(parsed) == 2
+    first, second = parsed
+    assert first == {"valor": "hola"}
+    assert second == {"valor": "mundo"}
@@ -0,0 +1,49 @@
+---
+name: from_jsonl
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "from_jsonl(text: str) -> list[dict]"
+description: "Parser JSONL a lista de dicts. Ignora lineas vacias. Lanza ValueError con el numero de linea si una linea contiene JSON invalido. Complemento de to_jsonl."
+tags: [jsonl, json, parser, import, streaming, format]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: ["json"]
+tested: true
+tests:
+  - "jsonl valido"
+  - "lineas vacias intercaladas"
+  - "linea invalida raise con numero"
+test_file_path: "python/functions/core/from_jsonl_test.py"
+file_path: "python/functions/core/from_jsonl.py"
+---
+
+## Ejemplo
+
+```python
+text = '{"id": 1}\n{"id": 2}'
+rows = from_jsonl(text)
+# [{"id": 1}, {"id": 2}]
+
+# Lineas vacias ignoradas
+text = '{"id": 1}\n\n{"id": 2}\n'
+rows = from_jsonl(text)
+# [{"id": 1}, {"id": 2}]
+
+# JSON invalido levanta error con numero de linea
+try:
+    from_jsonl('{"ok": 1}\nnot-json')
+except ValueError as e:
+    print(e)  # "JSON invalido en linea 2: ..."
+```
+
+## Notas
+
+Aunque se declara pure (no hace I/O), puede lanzar ValueError para JSON invalido.
+Esto es consistente con la convencion del registry: funciones puras pueden lanzar
+excepciones de validacion — solo las funciones impuras retornan error como valor.
@@ -0,0 +1,35 @@
+"""Parser JSON Lines (JSONL) a lista de dicts. Complemento de to_jsonl."""
+
+import json
+
+
+def from_jsonl(text: str) -> list[dict]:
+    """Parse JSON Lines text into a list of values (counterpart of to_jsonl).
+
+    Records are separated by ``\\n``, ``\\r\\n`` or ``\\r`` only. Other Unicode
+    line boundaries (U+2028, U+2029, U+0085) are legal raw characters inside
+    a JSON string and therefore must not split a record. Lines that are
+    empty or whitespace-only are skipped but still counted for line numbers.
+
+    Args:
+        text: JSONL content, one JSON document per line.
+
+    Returns:
+        The parsed value of every non-empty line, in order.
+
+    Raises:
+        ValueError: If a line is not valid JSON; the message contains the
+            1-based line number and the decoder's error.
+    """
+    # Not str.splitlines(): it also breaks on U+2028/U+2029/U+0085, which
+    # would cut valid records in half.
+    physical_lines = text.replace("\r\n", "\n").replace("\r", "\n").split("\n")
+
+    result: list[dict] = []
+    for line_num, line in enumerate(physical_lines, start=1):
+        stripped = line.strip()
+        if not stripped:
+            continue
+        try:
+            parsed = json.loads(stripped)
+        except json.JSONDecodeError as exc:
+            raise ValueError(
+                f"JSON invalido en linea {line_num}: {exc}"
+            ) from exc
+        result.append(parsed)
+
+    return result
@@ -0,0 +1,25 @@
+"""Tests para from_jsonl."""
+
+import pytest
+
+from from_jsonl import from_jsonl
+
+
+def test_jsonl_valido():
+    """Two JSON documents on two lines are parsed in order."""
+    assert from_jsonl('{"a": 1}\n{"b": 2}') == [{"a": 1}, {"b": 2}]
+
+
+def test_lineas_vacias_intercaladas():
+    """Blank lines between and after records are ignored."""
+    parsed = from_jsonl('{"x": 1}\n\n{"x": 2}\n\n')
+    assert len(parsed) == 2
+    first, second = parsed
+    assert first == {"x": 1}
+    assert second == {"x": 2}
+
+
+def test_linea_invalida_raise_con_numero():
+    """An invalid line raises ValueError naming its 1-based line number."""
+    broken_on_second_line = '{"ok": 1}\nnot-json\n{"ok": 3}'
+    with pytest.raises(ValueError, match="linea 2"):
+        from_jsonl(broken_on_second_line)
@@ -0,0 +1,70 @@
+---
+name: generate_html_report
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "generate_html_report(title: str, sections: list[dict]) -> str"
+description: "Genera un reporte HTML autocontenido con CSS inline. Soporta secciones de tipo table (list[dict]), text (str con markdown basico), kpi (cards con label/value/delta) y list (list[str]). Para exportar resultados de pipelines sin servidor."
+tags: [html, report, export, table, kpi, template, format]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: ["re"]
+tested: true
+tests:
+  - "reporte con una tabla"
+  - "reporte con multiples secciones mixtas"
+  - "kpi con deltas positivos y negativos"
+  - "caracteres especiales html escapados en data"
+  - "titulo con caracteres especiales"
+test_file_path: "python/functions/core/generate_html_report_test.py"
+file_path: "python/functions/core/generate_html_report.py"
+---
+
+## Ejemplo
+
+```python
+sections = [
+    {
+        "heading": "Resumen ejecutivo",
+        "type": "kpi",
+        "data": [
+            {"label": "Revenue", "value": "$1.2M", "delta": "+15%"},
+            {"label": "Churn", "value": "3.2%", "delta": "-0.5%"},
+        ],
+    },
+    {
+        "heading": "Top usuarios",
+        "type": "table",
+        "data": [
+            {"usuario": "ana@example.com", "compras": 42},
+            {"usuario": "bob@example.com", "compras": 38},
+        ],
+    },
+    {
+        "heading": "Notas",
+        "type": "text",
+        "data": "Datos del **trimestre Q1**. Ver [dashboard](https://example.com).",
+    },
+]
+
+html = generate_html_report("Reporte Mensual", sections)
+# Retorna string HTML completo con DOCTYPE, head con CSS inline, body con secciones
+```
+
+## Tipos de seccion
+
+- **table**: `data` es `list[dict]` — renderiza `<table>` con headers extraidos de las keys
+- **text**: `data` es `str` — soporta `**bold**` y `[text](url)`, escapa HTML
+- **kpi**: `data` es `list[{"label", "value", "delta"}]` — cards con colores para delta positivo/negativo
+- **list**: `data` es `list[str]` — renderiza `<ul><li>...</li></ul>`
+
+## Notas
+
+CSS completamente inline en `<style>`. Tema minimalista con max-width 960px, sans-serif,
+tabla con zebra stripes, cards KPI con colores verde/rojo para deltas.
+Todo el contenido del usuario pasa por HTML escape para proteger contra XSS.
@@ -0,0 +1,164 @@
+"""Genera reportes HTML autocontenidos con CSS inline."""
+
+
+# HTML-significant characters mapped to the entity that replaces each one.
+_HTML_ESCAPES = {
+    "&": "&amp;",
+    "<": "&lt;",
+    ">": "&gt;",
+    '"': "&quot;",
+    "'": "&#x27;",
+}
+
+
+def _esc(value: str) -> str:
+    """Return ``value`` with every character listed in ``_HTML_ESCAPES`` replaced by its entity."""
+    # One pass over the string: an ampersand produced by a replacement is
+    # never escaped a second time.
+    return value.translate(str.maketrans(_HTML_ESCAPES))
+
+
+def _render_table(data: list[dict]) -> str:
+    """Render a list of dicts as an HTML ``<table>``.
+
+    Column headers are the keys of the first row, in order; keys that only
+    appear in later rows are not rendered, and a row lacking a header key
+    gets an empty cell. Headers and cell values are converted with
+    ``str()`` and HTML-escaped. Every second row carries ``class="zebra"``.
+
+    Returns a placeholder paragraph when ``data`` is empty.
+    """
+    if not data:
+        return "<p><em>(sin datos)</em></p>"
+
+    headers = list(data[0])
+    # str(): keys are not guaranteed to be strings (e.g. numeric column ids).
+    headers_html = "".join(f"<th>{_esc(str(header))}</th>" for header in headers)
+
+    body_rows = []
+    for index, row in enumerate(data):
+        zebra = ' class="zebra"' if index % 2 == 1 else ""
+        cells = "".join(
+            f"<td>{_esc(str(row.get(header, '')))}</td>" for header in headers
+        )
+        body_rows.append(f"<tr{zebra}>{cells}</tr>\n")
+    rows_html = "".join(body_rows)
+
+    return (
+        f"<table>\n<thead><tr>{headers_html}</tr></thead>\n"
+        f"<tbody>\n{rows_html}</tbody>\n</table>"
+    )
+
+
+def _render_text(data: str) -> str:
+    """Render text with basic markdown (``**bold**`` and ``[text](url)``) as an HTML paragraph.
+
+    The input is HTML-escaped first, then bold and link markup are turned
+    into tags. A link is only emitted when its URL is relative or uses the
+    ``http``, ``https`` or ``mailto`` scheme; any other scheme (for example
+    ``javascript:`` or ``data:``) leaves the markdown as plain escaped text,
+    because escaping alone does not stop a script URL in ``href``.
+    """
+    import re
+
+    def _link(match):
+        label, url = match.group(1), match.group(2)
+        # Browsers ignore whitespace and control characters when resolving
+        # the scheme, so strip them before inspecting it.
+        compact = "".join(ch for ch in url if ch > " ").lower()
+        # Only a colon before the first path/query/fragment delimiter marks
+        # a scheme; without one the URL is relative.
+        head = re.split(r"[/?#]", compact, maxsplit=1)[0]
+        scheme, colon, _rest = head.partition(":")
+        if colon and scheme not in ("http", "https", "mailto"):
+            return match.group(0)
+        return f'<a href="{url}">{label}</a>'
+
+    text = _esc(str(data))
+    # Bold: **text** (escaping never introduces asterisks or brackets, so it
+    # does not interfere with either pattern).
+    text = re.sub(r"\*\*(.+?)\*\*", r"<strong>\1</strong>", text)
+    # Links: [text](url)
+    text = re.sub(r"\[(.+?)\]\((.+?)\)", _link, text)
+    return f"<p>{text}</p>"
+
+
+def _render_kpi(data: list[dict]) -> str:
+    """Render KPI entries as a grid of cards.
+
+    Each entry contributes an escaped label and value. When ``delta`` is
+    present (not None) a span is added whose class depends on the first
+    character of its string form: ``+`` positive, ``-`` negative, anything
+    else neutral.
+    """
+    rendered_cards = []
+    for entry in data:
+        label = _esc(str(entry.get("label", "")))
+        value = _esc(str(entry.get("value", "")))
+        delta = entry.get("delta")
+
+        if delta is None:
+            badge = ""
+        else:
+            delta_text = str(delta)
+            if delta_text.startswith("+"):
+                css_class = "delta-pos"
+            elif delta_text.startswith("-"):
+                css_class = "delta-neg"
+            else:
+                css_class = "delta-neutral"
+            badge = f'<span class="{css_class}">{_esc(delta_text)}</span>'
+
+        rendered_cards.append(
+            '<div class="kpi-card">'
+            f'<div class="kpi-label">{label}</div>'
+            f'<div class="kpi-value">{value}</div>'
+            f"{badge}"
+            "</div>\n"
+        )
+    return '<div class="kpi-grid">\n' + "".join(rendered_cards) + "</div>"
+
+
+def _render_list(data: list[str]) -> str:
+    """Render the entries as an HTML unordered list, escaping each one."""
+    rendered_items = [f"<li>{_esc(str(entry))}</li>\n" for entry in data]
+    return "<ul>\n" + "".join(rendered_items) + "</ul>"
+
+
+# Stylesheet embedded verbatim in the report's <style> element. It covers the
+# page body, headings, tables (including the "zebra" row class), lists, links
+# and the KPI cards with their delta-pos / delta-neg / delta-neutral spans.
+_CSS = """
+body {
+    font-family: sans-serif;
+    max-width: 960px;
+    margin: 2rem auto;
+    padding: 0 1rem;
+    color: #222;
+    background: #fff;
+}
+h1 { font-size: 1.8rem; border-bottom: 2px solid #ddd; padding-bottom: .5rem; }
+h2 { font-size: 1.3rem; margin-top: 2rem; color: #333; }
+table { border-collapse: collapse; width: 100%; margin: 1rem 0; font-size: .95rem; }
+th { background: #f0f0f0; text-align: left; padding: .5rem .75rem; border: 1px solid #ddd; }
+td { padding: .45rem .75rem; border: 1px solid #ddd; }
+tr.zebra { background: #f9f9f9; }
+ul { padding-left: 1.5rem; }
+li { margin: .3rem 0; }
+p { line-height: 1.6; }
+a { color: #0066cc; }
+.kpi-grid { display: flex; flex-wrap: wrap; gap: 1rem; margin: 1rem 0; }
+.kpi-card {
+    border: 1px solid #ddd;
+    border-radius: 6px;
+    padding: 1rem 1.5rem;
+    min-width: 140px;
+    background: #fafafa;
+}
+.kpi-label { font-size: .85rem; color: #666; margin-bottom: .25rem; }
+.kpi-value { font-size: 1.6rem; font-weight: bold; }
+.delta-pos { color: #16a34a; font-size: .9rem; }
+.delta-neg { color: #dc2626; font-size: .9rem; }
+.delta-neutral { color: #888; font-size: .9rem; }
+""".strip()
+
+
+def generate_html_report(title: str, sections: list[dict]) -> str:
+    """Build a self-contained HTML report with its CSS embedded in ``<style>``.
+
+    Each section is a dict with:
+        heading: str, the section title
+        type: "table" | "text" | "kpi" | "list", the content type; a missing
+            or unrecognised type is rendered as text
+        data: the content, depending on the type:
+            table -> list[dict]
+            text  -> str (supports **bold** and [links](url))
+            kpi   -> list[{"label": str, "value": str|number, "delta": str|None}]
+            list  -> list[str]
+
+    The title and headings are converted with ``str()`` and HTML-escaped.
+    Missing or falsy ``data`` is rendered as empty content for its type.
+
+    Args:
+        title: Report title, used for both ``<title>`` and ``<h1>``.
+        sections: Sections to include, rendered in order.
+
+    Returns:
+        The complete HTML document as a string, starting with the DOCTYPE.
+    """
+    # str(): headings are coerced the same way, so a non-string title
+    # (e.g. a number or date) must not crash the escape step.
+    safe_title = _esc(str(title))
+
+    rendered_sections = []
+    for section in sections:
+        heading = _esc(str(section.get("heading", "")))
+        kind = section.get("type", "text")
+        data = section.get("data")
+
+        if kind == "table":
+            content = _render_table(data or [])
+        elif kind == "kpi":
+            content = _render_kpi(data or [])
+        elif kind == "list":
+            content = _render_list(data or [])
+        else:
+            content = _render_text(str(data or ""))
+
+        rendered_sections.append(
+            f"<section>\n<h2>{heading}</h2>\n{content}\n</section>\n"
+        )
+    sections_html = "".join(rendered_sections)
+
+    return (
+        "<!DOCTYPE html>\n"
+        "<html lang='es'>\n"
+        "<head>\n"
+        "<meta charset='UTF-8'>\n"
+        "<meta name='viewport' content='width=device-width, initial-scale=1'>\n"
+        f"<title>{safe_title}</title>\n"
+        f"<style>\n{_CSS}\n</style>\n"
+        "</head>\n"
+        "<body>\n"
+        f"<h1>{safe_title}</h1>\n"
+        f"{sections_html}"
+        "</body>\n"
+        "</html>"
+    )
@@ -0,0 +1,71 @@
+"""Tests para generate_html_report."""
+
+from generate_html_report import generate_html_report
+
+
+def test_reporte_con_una_tabla():
+    sections = [
+        {
+            "heading": "Datos",
+            "type": "table",
+            "data": [{"nombre": "Ana", "score": 99}, {"nombre": "Bob", "score": 87}],
+        }
+    ]
+    html = generate_html_report("Reporte", sections)
+    assert "<!DOCTYPE html>" in html
+    assert "<title>Reporte</title>" in html
+    assert "<th>nombre</th>" in html
+    assert "<td>Ana</td>" in html
+    assert "zebra" in html  # segunda fila tiene class zebra
+
+
+def test_reporte_con_multiples_secciones_mixtas():
+    sections = [
+        {"heading": "Texto", "type": "text", "data": "Hola mundo"},
+        {"heading": "Lista", "type": "list", "data": ["uno", "dos", "tres"]},
+        {"heading": "KPIs", "type": "kpi", "data": [{"label": "Revenue", "value": "1M", "delta": None}]},
+    ]
+    html = generate_html_report("Multi", sections)
+    assert "<p>Hola mundo</p>" in html
+    assert "<li>uno</li>" in html
+    assert "Revenue" in html
+    assert "1M" in html
+
+
+def test_kpi_con_deltas_positivos_y_negativos():
+    sections = [
+        {
+            "heading": "Metricas",
+            "type": "kpi",
+            "data": [
+                {"label": "Ganancia", "value": "5K", "delta": "+12%"},
+                {"label": "Perdida", "value": "2K", "delta": "-5%"},
+                {"label": "Estable", "value": "1K", "delta": "0%"},
+            ],
+        }
+    ]
+    html = generate_html_report("KPIs", sections)
+    assert 'class="delta-pos"' in html
+    assert 'class="delta-neg"' in html
+    assert 'class="delta-neutral"' in html
+    assert "+12%" in html
+    assert "-5%" in html
+
+
+def test_caracteres_especiales_html_escapados_en_data():
+    sections = [
+        {
+            "heading": "Codigo",
+            "type": "table",
+            "data": [{"expr": "<script>alert('xss')</script>"}],
+        }
+    ]
+    html = generate_html_report("Seguro", sections)
+    assert "<script>" not in html
+    assert "&lt;script&gt;" in html
+
+
+def test_titulo_con_caracteres_especiales():
+    html = generate_html_report("Reporte & Analisis <2024>", [])
+    assert "Reporte &amp; Analisis &lt;2024&gt;" in html
+    assert "<title>Reporte &amp; Analisis &lt;2024&gt;</title>" in html
@@ -0,0 +1,36 @@
+---
+name: get_leaf_nodes
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def get_leaf_nodes(structure: Any) -> list[dict]"
+description: "Extrae solo nodos hoja (sin hijos) de un arbol jerarquico. Deep copy de cada nodo."
+tags: [tree, leaf, hierarchy, functional]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: [copy]
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/core.py"
+source_repo: "https://github.com/VectifyAI/PageIndex"
+source_license: "MIT"
+source_file: "pageindex/utils.py"
+---
+
+## Ejemplo
+
+```python
+tree = [{"title": "A", "nodes": [{"title": "A1", "nodes": []}, {"title": "A2", "nodes": []}]}]
+get_leaf_nodes(tree)
+# [{"title": "A1"}, {"title": "A2"}]
+```
+
+## Notas
+
+Funcion pura. Usa deep copy. Un nodo es hoja si su campo 'nodes' es falsy (vacio o ausente).
@@ -0,0 +1,40 @@
+---
+name: get_pdf_page_tokens
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "def get_pdf_page_tokens(pdf_path, model: str = None, pdf_parser: str = 'PyPDF2') -> list[tuple[str, int]]"
+description: "Extrae texto y cuenta tokens por pagina de un PDF. Soporta PyPDF2 y PyMuPDF como backends."
+tags: [pdf, tokens, extraction, litellm, parsing]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [litellm, PyPDF2]
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/get_pdf_page_tokens.py"
+source_repo: "https://github.com/VectifyAI/PageIndex"
+source_license: "MIT"
+source_file: "pageindex/utils.py"
+---
+
+## Ejemplo
+
+```python
+pages = get_pdf_page_tokens("report.pdf", model="gpt-4o")
+for text, tokens in pages:
+    print(f"{tokens} tokens")
+
+# Con PyMuPDF (mejor para PDFs complejos)
+pages = get_pdf_page_tokens("report.pdf", pdf_parser="PyMuPDF")
+total = sum(t for _, t in pages)
+```
+
+## Notas
+
+Requiere `pip install litellm PyPDF2` (o `pymupdf` para backend PyMuPDF). Acepta path string o BytesIO. Util para estimar costos de procesamiento LLM y para page_list_to_groups.
@@ -0,0 +1,47 @@
+"""Extract text and token count per page from a PDF. Supports PyPDF2 and PyMuPDF."""
+
+import os
+from io import BytesIO
+
+import litellm
+
+
+def get_pdf_page_tokens(pdf_path, model: str = None,
+                        pdf_parser: str = "PyPDF2") -> list[tuple[str, int]]:
+    """Extract text and token count for each page of a PDF.
+
+    Args:
+        pdf_path: Path to PDF file, or BytesIO object.
+        model: Model name for token counting (passed to litellm.token_counter).
+        pdf_parser: Parser backend — 'PyPDF2' or 'PyMuPDF'.
+
+    Returns:
+        list[tuple[str, int]]: List of (page_text, token_count) per page.
+    """
+    if pdf_parser == "PyPDF2":
+        import PyPDF2
+        pdf_reader = PyPDF2.PdfReader(pdf_path)
+        page_list = []
+        for page in pdf_reader.pages:
+            page_text = page.extract_text() or ""
+            token_length = litellm.token_counter(model=model, text=page_text)
+            page_list.append((page_text, token_length))
+        return page_list
+
+    elif pdf_parser == "PyMuPDF":
+        import pymupdf
+        if isinstance(pdf_path, BytesIO):
+            doc = pymupdf.open(stream=pdf_path, filetype="pdf")
+        elif isinstance(pdf_path, str) and os.path.isfile(pdf_path):
+            doc = pymupdf.open(pdf_path)
+        else:
+            raise ValueError(f"Invalid pdf_path: {pdf_path}")
+        page_list = []
+        for page in doc:
+            page_text = page.get_text()
+            token_length = litellm.token_counter(model=model, text=page_text)
+            page_list.append((page_text, token_length))
+        return page_list
+
+    else:
+        raise ValueError(f"Unsupported PDF parser: {pdf_parser}. Use 'PyPDF2' or 'PyMuPDF'.")
@@ -0,0 +1,32 @@
+---
+name: get_text_stats
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def get_text_stats(text: str) -> dict"
+description: "Estadisticas basicas de un texto: total de caracteres, lineas y palabras."
+tags: [text, statistics, stats, characters, words, lines]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: []
+tested: true
+tests: ["texto normal con palabras y lineas", "texto vacio retorna ceros", "texto con solo newlines"]
+test_file_path: "python/functions/core/get_text_stats_test.py"
+file_path: "python/functions/core/core.py"
+---
+
+## Ejemplo
+
+```python
+stats = get_text_stats("hello world\nfoo bar")
+# {"total_chars": 19, "total_lines": 2, "total_words": 4}
+```
+
+## Notas
+
+Funcion pura sin dependencias externas. `total_lines` cuenta newlines + 1, por lo que un texto vacio cuenta como 1 linea (comportamiento consistente con `wc -l` + 1). `total_words` usa `str.split()` que separa por cualquier whitespace y descarta vacios, equivalente a contar tokens separados por espacios.
@@ -0,0 +1,21 @@
+"""Tests para get_text_stats."""
+import sys
+import os
+
+sys.path.insert(0, os.path.dirname(__file__))
+from core import get_text_stats
+
+
+def test_texto_normal_con_palabras_y_lineas():
+    result = get_text_stats("hello world\nfoo bar")
+    assert result == {"total_chars": 19, "total_lines": 2, "total_words": 4}
+
+
+def test_texto_vacio_retorna_ceros():
+    result = get_text_stats("")
+    assert result == {"total_chars": 0, "total_lines": 1, "total_words": 0}
+
+
+def test_texto_con_solo_newlines():
+    result = get_text_stats("\n\n")
+    assert result == {"total_chars": 2, "total_lines": 3, "total_words": 0}
@@ -0,0 +1,66 @@
+---
+name: html_to_markdown
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "html_to_markdown(html: str) -> str"
+description: "Convierte HTML a markdown. Usa readabilipy para extraer contenido principal (filtra nav, ads, boilerplate), luego markdownify para convertir a markdown. Si las librerias opcionales no estan disponibles, usa un parser stdlib como fallback."
+tags: [html, markdown, parse, convert, readabilipy, markdownify, content-extraction]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: ["re", "html.parser"]
+tested: true
+tests:
+  - "HTML con nav/footer filtra boilerplate"
+  - "HTML limpio se convierte correctamente"
+  - "HTML con imagenes lazy-loaded"
+test_file_path: "python/functions/core/html_to_markdown_test.py"
+file_path: "python/functions/core/html_to_markdown.py"
+---
+
+## Ejemplo
+
+```python
+from core.html_to_markdown import html_to_markdown
+
+html = """
+<html>
+  <body>
+    <nav><a href="/">Home</a></nav>
+    <main>
+      <h1>Titulo del articulo</h1>
+      <p>Contenido <strong>relevante</strong> aqui.</p>
+    </main>
+    <footer>Copyright 2026</footer>
+  </body>
+</html>
+"""
+
+md = html_to_markdown(html)
+# "# Titulo del articulo\n\nContenido **relevante** aqui."
+```
+
+## Notas
+
+Algoritmo:
+1. Preprocesar HTML: manejar contenido oculto WeChat (`js_content` con display:none),
+   lazy loading images (`data-src` → `src`).
+2. Extraer contenido principal con `readabilipy` (basado en Mozilla Readability).
+   Si no esta disponible, usa el HTML completo.
+3. Convertir a markdown con `markdownify` (headings ATX, strip script/style).
+   Si no esta disponible, usa el parser stdlib de la misma funcion.
+
+Dependencias opcionales (mejoran la calidad si estan instaladas):
+- `readabilipy` — extraccion del contenido principal (filtra nav, ads, boilerplate)
+- `markdownify` — conversion HTML→markdown de alta fidelidad
+- `beautifulsoup4` — requerida por readabilipy
+
+Sin las dependencias opcionales la funcion sigue siendo pura y funcional,
+usando `html.parser` de stdlib como fallback.
+
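+Funcion pura cuando solo se usa el fallback stdlib: no hace I/O ni tiene efectos secundarios. Con `readabilipy` instalado, `use_readability=True` ejecuta Readability.js invocando Node.js como subproceso, por lo que esa ruta si realiza I/O.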
@@ -0,0 +1,272 @@
+"""Convierte HTML a markdown usando readabilipy + markdownify, con fallback a stdlib."""
+
+import re
+from html.parser import HTMLParser
+from typing import Optional
+
+
+# ---------------------------------------------------------------------------
+# Stdlib fallback parser (no external deps)
+# ---------------------------------------------------------------------------
+
+_BLOCK_TAGS = {
+    "p", "div", "article", "section", "main", "header", "footer", "aside",
+    "nav", "figure", "figcaption", "blockquote", "pre", "ul", "ol", "li",
+    "table", "thead", "tbody", "tr", "th", "td", "h1", "h2", "h3",
+    "h4", "h5", "h6", "br", "hr",
+}
+
+_SKIP_TAGS = {
+    "script", "style", "noscript", "iframe", "svg", "canvas",
+    "nav", "footer", "header", "aside",
+}
+
+_HEADING_TAGS = {"h1": 1, "h2": 2, "h3": 3, "h4": 4, "h5": 5, "h6": 6}
+
+
+class _HTMLToMarkdownParser(HTMLParser):
+    """Minimal HTML → Markdown parser using only stdlib."""
+
+    def __init__(self) -> None:
+        super().__init__(convert_charrefs=True)
+        self._parts: list[str] = []
+        self._skip_depth = 0
+        self._in_pre = False
+        self._tag_stack: list[str] = []
+        self._list_stack: list[str] = []
+
+    def handle_starttag(self, tag: str, attrs: list) -> None:
+        tag = tag.lower()
+        self._tag_stack.append(tag)
+
+        if self._skip_depth > 0:
+            if tag in _SKIP_TAGS:
+                self._skip_depth += 1
+            return
+
+        if tag in _SKIP_TAGS:
+            self._skip_depth += 1
+            return
+
+        attrs_dict = dict(attrs)
+
+        if tag in _HEADING_TAGS:
+            level = _HEADING_TAGS[tag]
+            self._parts.append(f"\n\n{'#' * level} ")
+
+        elif tag == "p":
+            self._parts.append("\n\n")
+
+        elif tag == "br":
+            self._parts.append("  \n")
+
+        elif tag == "hr":
+            self._parts.append("\n\n---\n\n")
+
+        elif tag == "pre":
+            self._in_pre = True
+            self._parts.append("\n\n```\n")
+
+        elif tag == "code" and not self._in_pre:
+            self._parts.append("`")
+
+        elif tag in ("strong", "b"):
+            self._parts.append("**")
+
+        elif tag in ("em", "i"):
+            self._parts.append("*")
+
+        elif tag == "a":
+            href = attrs_dict.get("href", "")
+            self._parts.append("[")
+            self._parts.append(f"_href:{href}_")
+
+        elif tag == "img":
+            # Handle lazy-loaded images: prefer data-src over src
+            src = attrs_dict.get("data-src") or attrs_dict.get("src", "")
+            alt = attrs_dict.get("alt", "")
+            self._parts.append(f"\n\n![{alt}]({src})\n\n")
+
+        elif tag == "ul":
+            self._list_stack.append("ul")
+            self._parts.append("\n")
+
+        elif tag == "ol":
+            self._list_stack.append("ol")
+            self._parts.append("\n")
+
+        elif tag == "li":
+            prefix = "-" if (not self._list_stack or self._list_stack[-1] == "ul") else "1."
+            self._parts.append(f"\n{prefix} ")
+
+        elif tag in ("blockquote",):
+            self._parts.append("\n\n> ")
+
+        elif tag in ("th", "td"):
+            self._parts.append("| ")
+
+        elif tag == "tr":
+            self._parts.append("\n")
+
+    def handle_endtag(self, tag: str) -> None:
+        tag = tag.lower()
+        if self._tag_stack and self._tag_stack[-1] == tag:
+            self._tag_stack.pop()
+
+        if self._skip_depth > 0:
+            if tag in _SKIP_TAGS:
+                self._skip_depth -= 1
+            return
+
+        if tag in _HEADING_TAGS:
+            self._parts.append("\n\n")
+
+        elif tag == "p":
+            self._parts.append("\n\n")
+
+        elif tag == "pre":
+            self._in_pre = False
+            self._parts.append("\n```\n\n")
+
+        elif tag == "code" and not self._in_pre:
+            self._parts.append("`")
+
+        elif tag in ("strong", "b"):
+            self._parts.append("**")
+
+        elif tag in ("em", "i"):
+            self._parts.append("*")
+
+        elif tag == "a":
+            # Find the matching _href: placeholder and rebuild [text](href)
+            text_parts: list[str] = []
+            href = ""
+            while self._parts:
+                part = self._parts.pop()
+                if part.startswith("_href:") and part.endswith("_"):
+                    href = part[6:-1]
+                    # collected text_parts in reverse, also the "[" opener
+                    if self._parts and self._parts[-1] == "[":
+                        self._parts.pop()
+                    break
+                text_parts.insert(0, part)
+            link_text = "".join(text_parts).strip()
+            self._parts.append(f"[{link_text}]({href})")
+
+        elif tag in ("ul", "ol"):
+            if self._list_stack:
+                self._list_stack.pop()
+            self._parts.append("\n")
+
+    def handle_data(self, data: str) -> None:
+        if self._skip_depth > 0:
+            return
+        if self._in_pre:
+            self._parts.append(data)
+        else:
+            self._parts.append(data)
+
+    def get_markdown(self) -> str:
+        raw = "".join(self._parts)
+        # Collapse 3+ consecutive newlines to 2
+        raw = re.sub(r"\n{3,}", "\n\n", raw)
+        return raw.strip()
+
+
+def _stdlib_html_to_markdown(html: str) -> str:
+    """Convert HTML to markdown using only Python stdlib."""
+    parser = _HTMLToMarkdownParser()
+    parser.feed(html)
+    return parser.get_markdown()
+
+
+# ---------------------------------------------------------------------------
+# Public function
+# ---------------------------------------------------------------------------
+
+
+def html_to_markdown(html: str) -> str:
+    """Convierte HTML a markdown.
+
+    Usa readabilipy para extraer el contenido principal (filtra nav, ads,
+    boilerplate) y markdownify para convertir a markdown. Si alguna de esas
+    librerias no esta disponible, usa un parser stdlib como fallback.
+
+    Pasos:
+    1. Preprocesar HTML: manejar contenido oculto (WeChat js_content),
+       lazy loading images (data-src → src).
+    2. Extraer contenido principal con readabilipy (basado en Mozilla
+       Readability). Fallback: usar el HTML completo.
+    3. Convertir a markdown con markdownify (headings ATX, strip
+       script/style). Fallback: parser stdlib.
+
+    Args:
+        html: HTML completo de la pagina.
+
+    Returns:
+        Contenido de la pagina en formato markdown.
+    """
+    # Step 1: preprocess — handle WeChat hidden content and lazy-loaded images
+    html = _preprocess_html(html)
+
+    # Step 2: extract main content with readabilipy (optional dep)
+    main_html = _extract_main_content(html)
+
+    # Step 3: convert to markdown
+    return _convert_to_markdown(main_html)
+
+
+def _preprocess_html(html: str) -> str:
+    """Preprocesar HTML antes de extraer contenido.
+
+    - Expande contenido oculto de WeChat (js_content).
+    - Reemplaza data-src por src en imagenes lazy-loaded.
+    """
+    # WeChat js_content: replace hidden wrapper divs
+    html = re.sub(
+        r'<div[^>]*id=["\']js_content["\'][^>]*style=["\'][^"\']*display\s*:\s*none[^"\']*["\'][^>]*>',
+        '<div id="js_content">',
+        html,
+        flags=re.IGNORECASE,
+    )
+
+    # Lazy loading: copy data-src to src for img tags
+    def replace_lazy_src(m: re.Match) -> str:
+        tag = m.group(0)
+        data_src_match = re.search(r'data-src=["\']([^"\']*)["\']', tag)
+        if data_src_match:
+            data_src = data_src_match.group(1)
+            # Replace or add src attribute
+            if re.search(r'\bsrc=["\']', tag):
+                tag = re.sub(r'\bsrc=["\'][^"\']*["\']', f'src="{data_src}"', tag)
+            else:
+                tag = tag.replace("<img", f'<img src="{data_src}"', 1)
+        return tag
+
+    html = re.sub(r"<img[^>]+>", replace_lazy_src, html, flags=re.IGNORECASE)
+    return html
+
+
+def _extract_main_content(html: str) -> str:
+    """Extraer contenido principal usando readabilipy si esta disponible."""
+    try:
+        from readabilipy import simple_json_from_html_string  # type: ignore
+
+        article = simple_json_from_html_string(html, use_readability=True)
+        return article.get("content") or html
+    except ImportError:
+        return html
+
+
+def _convert_to_markdown(html: str) -> str:
+    """Convertir HTML a markdown usando markdownify si esta disponible."""
+    try:
+        import markdownify  # type: ignore
+
+        return markdownify.markdownify(
+            html,
+            heading_style="ATX",
+            strip=["script", "style"],
+        )
+    except ImportError:
+        return _stdlib_html_to_markdown(html)
@@ -0,0 +1,90 @@
+"""Tests para html_to_markdown."""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+from core.html_to_markdown import html_to_markdown, _preprocess_html
+
+
+def test_html_con_nav_y_footer_filtra_boilerplate():
+    """HTML con nav/footer: el contenido principal debe extraerse (nav no aparece en output)."""
+    html = """
+    <html>
+      <body>
+        <nav><a href="/">Home</a><a href="/about">About</a></nav>
+        <main>
+          <h1>Titulo principal</h1>
+          <p>Este es el contenido relevante del articulo.</p>
+        </main>
+        <footer><p>Copyright 2026</p></footer>
+      </body>
+    </html>
+    """
+    result = html_to_markdown(html)
+    assert "Titulo principal" in result
+    assert "contenido relevante" in result
+
+
+def test_html_limpio_se_convierte_correctamente():
+    """HTML limpio sin boilerplate: headings y parrafos se convierten correctamente."""
+    html = """
+    <html>
+      <body>
+        <h1>Hello World</h1>
+        <p>Parrafo de prueba con <strong>texto en negrita</strong>.</p>
+        <h2>Seccion dos</h2>
+        <p>Mas contenido aqui.</p>
+      </body>
+    </html>
+    """
+    result = html_to_markdown(html)
+    assert "Hello World" in result
+    assert "Parrafo de prueba" in result
+    assert "Seccion dos" in result
+
+
+def test_html_con_imagenes_lazy_loaded():
+    """HTML con imagenes lazy-loaded: data-src debe reemplazar src en el output."""
+    html = """
+    <html>
+      <body>
+        <p>Articulo con imagen</p>
+        <img src="placeholder.gif" data-src="imagen-real.jpg" alt="foto real" />
+      </body>
+    </html>
+    """
+    # Verificar preprocesamiento
+    preprocessed = _preprocess_html(html)
+    assert "imagen-real.jpg" in preprocessed
+    # El resultado final debe contener la URL real
+    result = html_to_markdown(html)
+    assert "imagen-real.jpg" in result
+
+
+def test_preprocess_lazy_loading_reemplaza_src():
+    """_preprocess_html reemplaza src con data-src en imagenes."""
+    html = '<img src="placeholder.gif" data-src="real.jpg" alt="x" />'
+    result = _preprocess_html(html)
+    assert 'src="real.jpg"' in result
+
+
+def test_preprocess_lazy_loading_sin_src_anade_src():
+    """_preprocess_html agrega src cuando la imagen no tiene atributo src."""
+    html = '<img data-src="real.jpg" alt="foto" />'
+    result = _preprocess_html(html)
+    assert 'src="real.jpg"' in result
+
+
+def test_html_vacio_retorna_string():
+    """HTML vacio no lanza excepcion."""
+    result = html_to_markdown("")
+    assert isinstance(result, str)
+
+
+def test_html_solo_texto():
+    """HTML con solo texto plano se convierte sin error."""
+    html = "<p>Solo texto</p>"
+    result = html_to_markdown(html)
+    assert "Solo texto" in result
@@ -0,0 +1,48 @@
+---
+name: is_git_repo_url
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def is_git_repo_url(url: str, known_hosts: list[str] | None = None) -> bool"
+description: "Verifica si una URL apunta a un repositorio git clonable. Acepta org/repo y org/repo/tree/<ref>. Rechaza issues, blobs, PRs y otros sub-recursos."
+tags: [git, url, validation, github, gitlab, repository]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: [urllib.parse]
+tested: true
+tests:
+  - "URL repo valida"
+  - "URL de issue (False)"
+  - "URL de blob/file (False)"
+  - "URL con tree/branch (True)"
+test_file_path: "python/functions/core/parse_git_url_test.py"
+file_path: "python/functions/core/core.py"
+---
+
+## Ejemplo
+
+```python
+is_git_repo_url("https://github.com/psf/requests")
+# True
+
+is_git_repo_url("https://github.com/psf/requests/issues/123")
+# False
+
+is_git_repo_url("https://github.com/psf/requests/blob/main/README.md")
+# False
+
+is_git_repo_url("https://github.com/psf/requests/tree/main")
+# True
+
+is_git_repo_url("git@github.com:psf/requests.git")
+# True
+```
+
+## Notas
+
+Funcion pura. Para SSH y git:// se acepta cualquier path siempre que el host sea conocido (los protocolos de clonacion no navegan a sub-recursos). Para HTTP/HTTPS se exige exactamente 2 segmentos (org/repo) o 4 segmentos con `tree` en posicion 3.
@@ -0,0 +1,47 @@
+---
+name: join_by_key
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def join_by_key(left: list[dict], right: list[dict], key: str, how: str = 'inner') -> list[dict]"
+description: "Join de dos listas de dicts por una clave comun. Soporta inner, left, right y outer. Campos duplicados del right se sufijan con _right. Algoritmo O(n+m)."
+tags: [tabular, join, merge, python, core]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: []
+tested: true
+tests:
+  - "Inner join solo matches"
+  - "Left join todos los left con None para right sin match"
+  - "Right join"
+  - "Outer join"
+  - "Campos duplicados con sufijo _right"
+  - "Key ausente en alguna fila"
+test_file_path: "python/functions/core/join_by_key_test.py"
+file_path: "python/functions/core/join_by_key.py"
+---
+
+## Ejemplo
+
+```python
+left = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]
+right = [{"id": 1, "dept": "eng"}, {"id": 3, "dept": "sales"}]
+
+join_by_key(left, right, key="id", how="inner")
+# [{"id": 1, "name": "Alice", "dept": "eng"}]
+
+join_by_key(left, right, key="id", how="left")
+# [{"id": 1, "name": "Alice", "dept": "eng"},
+#  {"id": 2, "name": "Bob", "dept": None}]
+```
+
+## Notas
+
+Funcion pura sin dependencias externas.
+El algoritmo indexa right en O(n) y luego itera left en O(m), total O(n+m).
+Los campos de right que colisionan con campos de left (excepto la clave) se renombran con sufijo _right.
@@ -0,0 +1,95 @@
+"""Join de dos tablas tabulares por una clave comun."""
+
+
+def join_by_key(
+    left: list[dict],
+    right: list[dict],
+    key: str,
+    how: str = "inner",
+) -> list[dict]:
+    """Une dos listas de dicts por una clave comun.
+
+    Soporta los cuatro tipos de join: inner, left, right, outer.
+    Campos duplicados del lado right (distintos a la clave) se sufijan con _right.
+
+    Algoritmo O(n+m): indexa right por key, luego itera left buscando matches.
+
+    Args:
+        left: Lista de dicts del lado izquierdo.
+        right: Lista de dicts del lado derecho.
+        key: Nombre del campo clave para el join.
+        how: Tipo de join: inner, left, right, outer.
+
+    Returns:
+        Lista de dicts con campos de ambos lados mergeados.
+        Campos del right ausentes en un match left se rellenan con None.
+        Campos del left ausentes en un match right se rellenan con None.
+    """
+    # Indexar right por key
+    right_index: dict[any, list[dict]] = {}
+    for row in right:
+        k = row.get(key)
+        right_index.setdefault(k, []).append(row)
+
+    # Determinar campos del right que podrian colisionar con left
+    left_keys = {k for row in left for k in row}
+    right_only_keys = {k for row in right for k in row if k != key}
+    conflicting = right_only_keys & left_keys - {key}
+
+    def _merge(l_row: dict | None, r_row: dict | None) -> dict:
+        result: dict = {}
+        if l_row is not None:
+            result.update(l_row)
+        if r_row is not None:
+            for k, v in r_row.items():
+                if k == key:
+                    continue
+                if k in conflicting:
+                    result[f"{k}_right"] = v
+                else:
+                    result[k] = v
+        return result
+
+    def _empty_left(left_sample: dict | None) -> dict:
+        if left_sample is None:
+            return {}
+        return {k: None for k in left_sample}
+
+    def _empty_right() -> dict:
+        result: dict = {}
+        for row in right:
+            for k in row:
+                if k == key:
+                    continue
+                dest = f"{k}_right" if k in conflicting else k
+                result[dest] = None
+        return result
+
+    matched_right_keys: set = set()
+    output: list[dict] = []
+
+    for l_row in left:
+        k = l_row.get(key)
+        r_rows = right_index.get(k)
+        if r_rows:
+            matched_right_keys.add(k)
+            for r_row in r_rows:
+                output.append(_merge(l_row, r_row))
+        else:
+            if how in ("left", "outer"):
+                output.append(_merge(l_row, None) | _empty_right())
+
+    if how in ("right", "outer"):
+        for r_row in right:
+            k = r_row.get(key)
+            if k not in matched_right_keys:
+                base = _empty_right()
+                base[key] = k
+                for rk, rv in r_row.items():
+                    if rk == key:
+                        continue
+                    dest = f"{rk}_right" if rk in conflicting else rk
+                    base[dest] = rv
+                output.append(base)
+
+    return output
@@ -0,0 +1,72 @@
+"""Tests para join_by_key."""
+
+import sys
+import os
+sys.path.insert(0, os.path.dirname(__file__))
+
+from join_by_key import join_by_key
+
+
+def test_inner_join_solo_matches():
+    """Inner join solo matches."""
+    left = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]
+    right = [{"id": 1, "dept": "eng"}, {"id": 3, "dept": "sales"}]
+    result = join_by_key(left, right, key="id", how="inner")
+    assert len(result) == 1
+    assert result[0]["id"] == 1
+    assert result[0]["name"] == "Alice"
+    assert result[0]["dept"] == "eng"
+
+
+def test_left_join_todos_los_left_con_none_para_right_sin_match():
+    """Left join todos los left con None para right sin match."""
+    left = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]
+    right = [{"id": 1, "dept": "eng"}]
+    result = join_by_key(left, right, key="id", how="left")
+    assert len(result) == 2
+    alice = next(r for r in result if r["id"] == 1)
+    bob = next(r for r in result if r["id"] == 2)
+    assert alice["dept"] == "eng"
+    assert bob["dept"] is None
+
+
+def test_right_join():
+    """Right join."""
+    left = [{"id": 1, "name": "Alice"}]
+    right = [{"id": 1, "dept": "eng"}, {"id": 2, "dept": "sales"}]
+    result = join_by_key(left, right, key="id", how="right")
+    assert len(result) == 2
+    eng = next(r for r in result if r["id"] == 1)
+    sales = next(r for r in result if r["id"] == 2)
+    assert eng["name"] == "Alice"
+    assert sales.get("name") is None
+
+
+def test_outer_join():
+    """Outer join."""
+    left = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]
+    right = [{"id": 1, "dept": "eng"}, {"id": 3, "dept": "sales"}]
+    result = join_by_key(left, right, key="id", how="outer")
+    ids = {r["id"] for r in result}
+    assert ids == {1, 2, 3}
+
+
+def test_campos_duplicados_con_sufijo_right():
+    """Campos duplicados con sufijo _right."""
+    left = [{"id": 1, "name": "Alice", "score": 90}]
+    right = [{"id": 1, "score": 85, "dept": "eng"}]
+    result = join_by_key(left, right, key="id", how="inner")
+    assert len(result) == 1
+    assert result[0]["score"] == 90
+    assert result[0]["score_right"] == 85
+    assert result[0]["dept"] == "eng"
+
+
+def test_key_ausente_en_alguna_fila():
+    """Key ausente en alguna fila."""
+    left = [{"id": 1, "name": "Alice"}, {"name": "Bob"}]  # Bob sin id
+    right = [{"id": 1, "dept": "eng"}]
+    result = join_by_key(left, right, key="id", how="inner")
+    # Solo Alice matchea
+    assert len(result) == 1
+    assert result[0]["name"] == "Alice"
@@ -0,0 +1,41 @@
+---
+name: list_to_tree
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def list_to_tree(data: list[dict]) -> list[dict]"
+description: "Convierte lista plana con codigos de estructura ('1.2.3') a arbol jerarquico anidado."
+tags: [tree, hierarchy, structure, conversion]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: []
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/core.py"
+source_repo: "https://github.com/VectifyAI/PageIndex"
+source_license: "MIT"
+source_file: "pageindex/utils.py"
+---
+
+## Ejemplo
+
+```python
+flat = [
+    {"structure": "1", "title": "Intro", "start_index": 1, "end_index": 5},
+    {"structure": "1.1", "title": "Background", "start_index": 1, "end_index": 3},
+    {"structure": "1.2", "title": "Scope", "start_index": 3, "end_index": 5},
+    {"structure": "2", "title": "Methods", "start_index": 5, "end_index": 10},
+]
+tree = list_to_tree(flat)
+# [{"title": "Intro", "nodes": [{"title": "Background"}, {"title": "Scope"}]}, {"title": "Methods"}]
+```
+
+## Notas
+
+Funcion pura. Cada item necesita campo 'structure' con codigo jerarquico separado por puntos. Nodos huerfanos se promueven a raiz.
@@ -0,0 +1,40 @@
+---
+name: llm_acompletion_retry
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "async def llm_acompletion_retry(model: str, prompt: str, max_retries: int = 10, temperature: float = 0) -> str"
+description: "Completion LLM asincrono con retry automatico. Soporte multi-modelo via litellm (OpenAI, Anthropic, etc.)."
+tags: [llm, completion, retry, async, litellm, api]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [litellm, asyncio, logging]
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/llm_acompletion_retry.py"
+source_repo: "https://github.com/VectifyAI/PageIndex"
+source_license: "MIT"
+source_file: "pageindex/utils.py"
+---
+
+## Ejemplo
+
+```python
+import asyncio
+
+async def main():
+    response = await llm_acompletion_retry("gpt-4o", "Summarize this text: ...")
+    print(response)
+
+asyncio.run(main())
+```
+
+## Notas
+
+Requiere `pip install litellm`. Version async de llm_completion_retry. Usa asyncio.sleep entre retries. Ideal para procesar multiples prompts en paralelo con asyncio.gather.
@@ -0,0 +1,43 @@
+"""Async LLM completion with retry logic via litellm. Supports 100+ models."""
+
+import asyncio
+import logging
+
+import litellm
+
+litellm.drop_params = True
+
+
+async def llm_acompletion_retry(model: str, prompt: str, max_retries: int = 10,
+                                temperature: float = 0) -> str:
+    """Asynchronous LLM completion with retry. Multi-model support via litellm.
+
+    Args:
+        model: Model name (e.g. 'gpt-4o', 'claude-sonnet-4-20250514'). Strips 'litellm/' prefix.
+        prompt: User prompt text.
+        max_retries: Max retry attempts on failure.
+        temperature: Sampling temperature.
+
+    Returns:
+        str: Response content. Empty string if all retries fail.
+    """
+    if model:
+        model = model.removeprefix("litellm/")
+
+    messages = [{"role": "user", "content": prompt}]
+
+    for i in range(max_retries):
+        try:
+            response = await litellm.acompletion(
+                model=model,
+                messages=messages,
+                temperature=temperature,
+            )
+            return response.choices[0].message.content
+        except Exception as e:
+            logging.error(f"Async LLM completion error (attempt {i+1}/{max_retries}): {e}")
+            if i < max_retries - 1:
+                await asyncio.sleep(1)
+            else:
+                logging.error(f"Max retries reached for model={model}")
+                return ""
@@ -0,0 +1,43 @@
+---
+name: llm_completion_retry
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "def llm_completion_retry(model: str, prompt: str, chat_history: list = None, return_finish_reason: bool = False, max_retries: int = 10, temperature: float = 0) -> str | tuple[str, str]"
+description: "Completion LLM sincrono con retry automatico (max 10). Soporte multi-modelo via litellm (OpenAI, Anthropic, etc.)."
+tags: [llm, completion, retry, litellm, api]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [litellm, logging, time]
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/core/llm_completion_retry.py"
+source_repo: "https://github.com/VectifyAI/PageIndex"
+source_license: "MIT"
+source_file: "pageindex/utils.py"
+---
+
+## Ejemplo
+
+```python
+response = llm_completion_retry("gpt-4o", "Explain quantum computing in one sentence")
+# "Quantum computing uses quantum bits..."
+
+# Con historial de chat
+history = [{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Hello!"}]
+response = llm_completion_retry("claude-sonnet-4-20250514", "What's 2+2?", chat_history=history)
+
+# Con finish reason
+content, reason = llm_completion_retry("gpt-4o", "...", return_finish_reason=True)
+# reason: "finished" | "max_output_reached" | "error"
+```
+
+## Notas
+
+Requiere `pip install litellm`. Soporta 100+ modelos via litellm. Retry con sleep(1) entre intentos. Retorna string vacio si todos los intentos fallan (o la tupla `("", "error")` cuando `return_finish_reason=True`).
@@ -0,0 +1,52 @@
+"""LLM completion with retry logic via litellm. Supports 100+ models."""
+
+import logging
+import time
+
+import litellm
+
+litellm.drop_params = True
+
+
+def llm_completion_retry(model: str, prompt: str, chat_history: list = None,
+                         return_finish_reason: bool = False, max_retries: int = 10,
+                         temperature: float = 0):
+    """Synchronous LLM completion with retry. Multi-model support via litellm.
+
+    Args:
+        model: Model name (e.g. 'gpt-4o', 'claude-sonnet-4-20250514'). Strips 'litellm/' prefix.
+        prompt: User prompt text.
+        chat_history: Optional list of prior messages [{"role": ..., "content": ...}].
+        return_finish_reason: If True, returns (content, reason) tuple.
+        max_retries: Max retry attempts on failure.
+        temperature: Sampling temperature.
+
+    Returns:
+        str or (str, str): Response content, optionally with finish reason.
+    """
+    if model:
+        model = model.removeprefix("litellm/")
+
+    messages = list(chat_history or []) + [{"role": "user", "content": prompt}]
+
+    for i in range(max_retries):
+        try:
+            response = litellm.completion(
+                model=model,
+                messages=messages,
+                temperature=temperature,
+            )
+            content = response.choices[0].message.content
+            if return_finish_reason:
+                reason = "max_output_reached" if response.choices[0].finish_reason == "length" else "finished"
+                return content, reason
+            return content
+        except Exception as e:
+            logging.error(f"LLM completion error (attempt {i+1}/{max_retries}): {e}")
+            if i < max_retries - 1:
+                time.sleep(1)
+            else:
+                logging.error(f"Max retries reached for model={model}")
+                if return_finish_reason:
+                    return "", "error"
+                return ""
@@ -0,0 +1,43 @@
+---
+name: load_translations
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: impure
+signature: "def load_translations(locales_dir: str) -> dict[str, dict]"
+description: "Carga todos los archivos JSON de un directorio de locales. Cada archivo {locale}.json se indexa por nombre sin extension. Retorna {} si el directorio no existe o esta vacio."
+tags: [i18n, translation, locale, json, files]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [json, os]
+tested: true
+tests: ["carga multiples locales", "directorio inexistente retorna dict vacio", "ignora archivos no json", "locale con estructura anidada"]
+test_file_path: "python/functions/core/load_translations_test.py"
+file_path: "python/functions/core/load_translations.py"
+---
+
+## Ejemplo
+
+```python
+from load_translations import load_translations
+from t import _set_translations, t
+
+# Estructura de archivos:
+# locales/
+#   en.json  →  {"report": {"done": "Done", "sectionStart": "Section: {title}"}}
+#   es.json  →  {"report": {"done": "Listo"}}
+
+translations = load_translations("locales/")
+_set_translations(translations, default_locale="en")
+
+t("report.done", locale="es")
+# → "Listo"
+```
+
+## Notas
+
+Lee el filesystem, por eso es impura. Los errores de JSON malformado se propagan directamente (`json.JSONDecodeError`). Los errores de acceso al directorio se propagan como `OSError`. Companera natural de `t_py_core` — el flujo tipico es: `load_translations` al inicio de la app → `_set_translations` → llamadas a `t` durante la ejecucion. Inspirada conceptualmente en el modulo `locale.py` de MiroFish (AGPL-3.0); reimplementada desde cero.
@@ -0,0 +1,46 @@
+"""Carga de archivos JSON de un directorio de locales."""
+
+import json
+import os
+
+
+def load_translations(locales_dir: str) -> dict[str, dict]:
+    """Carga todos los archivos JSON de un directorio de locales.
+
+    Cada archivo `{locale}.json` se carga como diccionario y se indexa
+    por el nombre del archivo sin extension (el locale).
+
+    Args:
+        locales_dir: Ruta al directorio que contiene los archivos JSON de locales.
+
+    Returns:
+        Diccionario {locale: dict_de_traducciones}. Retorna {} si el directorio
+        no existe o no contiene archivos JSON.
+
+    Raises:
+        OSError: Si el directorio no es accesible.
+        json.JSONDecodeError: Si un archivo JSON esta malformado.
+
+    Example:
+        >>> # locales/en.json = {"greeting": "Hello"}
+        >>> # locales/es.json = {"greeting": "Hola"}
+        >>> translations = load_translations("locales/")
+        >>> translations["en"]["greeting"]
+        'Hello'
+        >>> translations["es"]["greeting"]
+        'Hola'
+    """
+    translations: dict[str, dict] = {}
+
+    if not os.path.isdir(locales_dir):
+        return translations
+
+    for filename in os.listdir(locales_dir):
+        if not filename.endswith(".json"):
+            continue
+        locale = filename[:-5]  # quitar ".json"
+        filepath = os.path.join(locales_dir, filename)
+        with open(filepath, encoding="utf-8") as f:
+            translations[locale] = json.load(f)
+
+    return translations
@@ -0,0 +1,80 @@
+"""Tests para load_translations."""
+
+import json
+import os
+import sys
+import tempfile
+import shutil
+
+sys.path.insert(0, os.path.dirname(__file__))
+
+from load_translations import load_translations
+
+
+def test_carga_multiples_locales():
+    tmp = tempfile.mkdtemp()
+    try:
+        with open(os.path.join(tmp, "en.json"), "w") as f:
+            json.dump({"greeting": "Hello"}, f)
+        with open(os.path.join(tmp, "es.json"), "w") as f:
+            json.dump({"greeting": "Hola"}, f)
+
+        result = load_translations(tmp)
+        assert "en" in result, "Debe contener locale 'en'"
+        assert "es" in result, "Debe contener locale 'es'"
+        assert result["en"]["greeting"] == "Hello"
+        assert result["es"]["greeting"] == "Hola"
+    finally:
+        shutil.rmtree(tmp)
+
+
+def test_directorio_inexistente_retorna_dict_vacio():
+    result = load_translations("/tmp/directorio_que_no_existe_xyz_12345")
+    assert result == {}, f"Expected {{}}, got {result}"
+
+
+def test_ignora_archivos_no_json():
+    tmp = tempfile.mkdtemp()
+    try:
+        with open(os.path.join(tmp, "en.json"), "w") as f:
+            json.dump({"key": "value"}, f)
+        with open(os.path.join(tmp, "README.md"), "w") as f:
+            f.write("# Locales")
+        with open(os.path.join(tmp, "notes.txt"), "w") as f:
+            f.write("some notes")
+
+        result = load_translations(tmp)
+        assert list(result.keys()) == ["en"], f"Expected only 'en', got {list(result.keys())}"
+    finally:
+        shutil.rmtree(tmp)
+
+
+def test_locale_con_estructura_anidada():
+    tmp = tempfile.mkdtemp()
+    try:
+        nested = {"report": {"sectionStart": "Section: {title}", "done": "Done"}}
+        with open(os.path.join(tmp, "en.json"), "w") as f:
+            json.dump(nested, f)
+
+        result = load_translations(tmp)
+        assert result["en"]["report"]["done"] == "Done"
+        assert result["en"]["report"]["sectionStart"] == "Section: {title}"
+    finally:
+        shutil.rmtree(tmp)
+
+
+if __name__ == "__main__":
+    test_carga_multiples_locales()
+    print("PASS: carga multiples locales")
+
+    test_directorio_inexistente_retorna_dict_vacio()
+    print("PASS: directorio inexistente retorna dict vacio")
+
+    test_ignora_archivos_no_json()
+    print("PASS: ignora archivos no json")
+
+    test_locale_con_estructura_anidada()
+    print("PASS: locale con estructura anidada")
+
+    print("---")
+    print("All tests passed.")
@@ -0,0 +1,67 @@
+---
+name: merge_entity_attributes
+kind: function
+lang: py
+domain: core
+version: "1.0.0"
+purity: pure
+signature: "def merge_entity_attributes(attr_list: list[dict]) -> dict"
+description: "Combina atributos de multiples candidatos de la misma entidad. Aplica heuristicas de resolucion por tipo de campo: max para numericos, min/max para fechas, union para listas, OR para booleanos, mas largo para strings."
+tags: [merge, entity, attributes, resolution, deduplication, fuzzygraph, python]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: []
+tested: true
+tests:
+  - "Atributos complementarios (A tiene full_name, B tiene nationality) -> ambos"
+  - "Atributos conflictivos en risk_score -> max"
+  - "Atributos first_seen conflictivos -> min"
+  - "Todos null -> null"
+  - "Listas -> union sin duplicados"
+  - "Boolean verified -> True si alguno es True"
+  - "String conflictivo -> usar el mas largo"
+  - "Valores iguales -> usar ese valor"
+  - "Un solo candidato -> retorna sus atributos tal cual"
+  - "Lista vacia -> retorna dict vacio"
+  - "last_seen conflictivo -> max (mas reciente)"
+  - "Un candidato tiene null, otro tiene valor -> usar el valor"
+test_file_path: "python/functions/core/merge_entity_attributes_test.py"
+file_path: "python/functions/core/merge_entity_attributes.py"
+---
+
+## Ejemplo
+
+```python
+a = {"risk_score": 3.5, "first_seen": "2022-05-15", "verified": False}
+b = {"risk_score": 7.2, "first_seen": "2023-01-01", "verified": True, "alias": "Alice"}
+
+result = merge_entity_attributes([a, b])
+# {
+#   "risk_score": 7.2,           # max
+#   "first_seen": "2022-05-15",  # min (mas antigua)
+#   "verified": True,            # OR logico
+#   "alias": "Alice"             # solo en b
+# }
+```
+
+## Heuristicas de resolucion
+
+| Campo / tipo | Conflicto | Resolucion |
+|---|---|---|
+| `risk_score`, `balance`, `cvss` | numerico | `max` |
+| `first_seen`, `created_date` | fecha | `min` (mas antigua) |
+| `last_seen`, `expires_date` | fecha | `max` (mas reciente) |
+| `verified`, `exploited` | booleano | `any` (OR logico) |
+| cualquier `list` | lista | union sin duplicados |
+| cualquier `str` u otro | string | el mas largo |
+
+Los campos fuera de las listas conocidas usan la heuristica por tipo Python (`list`, `bool`, luego `str`/otro).
+
+## Notas
+
+Funcion pura. No tiene dependencias externas. Las listas conocidas de campos especiales (`_NUMERIC_FIELDS`, `_DATE_MIN_FIELDS`, etc.) pueden extenderse si el dominio crece.
+
+Disenada originalmente para el grafo de entidades de fuzzygraph, donde multiples fuentes pueden describir la misma entidad con datos complementarios o contradictorios.
--- a/Show More
+++ b/Show More